diff --git a/.dev_scripts/build_image.sh b/.dev_scripts/build_image.sh new file mode 100644 index 00000000..e6403aed --- /dev/null +++ b/.dev_scripts/build_image.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# default values. +BASE_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04 +BASE_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel +MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope +python_version=3.7.13 +torch_version=1.11.0 +cudatoolkit_version=11.3 +tensorflow_version=1.15.5 +modelscope_version=None +is_ci_test=False +is_dsw=False +is_cpu=False +run_ci_test=False +function usage(){ + echo "usage: build_image.sh " + echo " --python=python_version set python version, default: $python_version" + echo " --torch=torch_version set pytorch version, default: $torch_version" + echo " --cudatoolkit=cudatoolkit_version set cudatoolkit version used for pytorch, default: $cudatoolkit_version" + echo " --tensorflow=tensorflow_version set tensorflow version, default: $tensorflow_version" + echo " --modelscope=modelscope_version set modelscope version, default: $modelscope_version" + echo " --test option to run ci test before pushing, the image is pushed only if the test passes" + echo " --cpu option to build the cpu version" + echo " --dsw option to build the dsw version" + echo " --ci option to build the ci version" + echo " --push option to push the image to the remote repo" +} +for i in "$@"; do + case $i in + --python=*) + python_version="${i#*=}" + shift + ;; + --torch=*) + torch_version="${i#*=}" + shift # pytorch version + ;; + --tensorflow=*) + tensorflow_version="${i#*=}" + shift # tensorflow version + ;; + --cudatoolkit=*) + cudatoolkit_version="${i#*=}" + shift # cudatoolkit for pytorch + ;; + --modelscope=*) + modelscope_version="${i#*=}" + shift # modelscope version + ;; + --test) + run_ci_test=True + shift # will run ci test + ;; + --cpu) + is_cpu=True + shift # is cpu image + ;; + --ci) + is_ci_test=True + shift # is ci, will not install modelscope + ;; + --dsw) + is_dsw=True + shift # is dsw, will set dsw cache location + ;; + --push) + is_push=True + shift # will push image to remote repo + ;; + --help) + usage + exit 0 + ;; + -*|--*) + echo "Unknown option $i" + usage + exit 1 + ;; + *) + ;; + esac +done + +if [ "$modelscope_version" == "None" ]; then + echo "ModelScope version must be specified!"
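+  # --modelscope has no usable default: the target image tag built below embeds the ModelScope version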
+ exit 1 +fi +if [ "$is_cpu" == "True" ]; then + export BASE_IMAGE=$BASE_CPU_IMAGE + base_tag=ubuntu20.04 + export USE_GPU=False +else + export BASE_IMAGE=$BASE_GPU_IMAGE + base_tag=ubuntu20.04-cuda11.3.0 + export USE_GPU=True +fi +if [[ $python_version == 3.7* ]]; then + base_tag=$base_tag-py37 +elif [[ $python_version == 3.8* ]]; then + base_tag=$base_tag-py38 +elif [[ $python_version == 3.9* ]]; then + base_tag=$base_tag-py39 +else + echo "Unsupported python version: $python_version" + exit 1 +fi + +target_image_tag=$base_tag-torch$torch_version-tf$tensorflow_version +if [ "$is_ci_test" == "True" ]; then + target_image_tag=$target_image_tag-$modelscope_version-ci +else + target_image_tag=$target_image_tag-$modelscope_version-test +fi +export IMAGE_TO_BUILD=$MODELSCOPE_REPO_ADDRESS:$target_image_tag +export PYTHON_VERSION=$python_version +export TORCH_VERSION=$torch_version +export CUDATOOLKIT_VERSION=$cudatoolkit_version +export TENSORFLOW_VERSION=$tensorflow_version +echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\nis_ci:$is_ci_test\nis_dsw:$is_dsw\n" +docker_file_content=`cat docker/Dockerfile.ubuntu` +if [ "$is_ci_test" != "True" ]; then + echo "Not a CI image, will install the ModelScope lib into the image" + docker_file_content="${docker_file_content} \nRUN pip install --no-cache-dir modelscope==$modelscope_version -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html" +fi +echo "$is_dsw" +if [ "$is_dsw" == "False" ]; then + echo "Not DSW image" +else + echo "Building DSW image, will set the ModelScope lib cache location." + docker_file_content="${docker_file_content} \nENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope" +fi +printf "$docker_file_content" > Dockerfile +docker build -t $IMAGE_TO_BUILD \ --build-arg USE_GPU \ --build-arg BASE_IMAGE \ --build-arg PYTHON_VERSION \ --build-arg TORCH_VERSION \ --build-arg CUDATOOLKIT_VERSION \ --build-arg TENSORFLOW_VERSION \ -f Dockerfile . + +if [ $? -ne 0 ]; then + echo "Docker build failed, please check the log!" + exit -1 +fi +if [ "$run_ci_test" == "True" ]; then + echo "Running ci case." + export MODELSCOPE_CACHE=/home/mulin.lyh/model_scope_cache + export MODELSCOPE_HOME_CACHE=/home/mulin.lyh/ci_case_home # for credential + export IMAGE_NAME=$MODELSCOPE_REPO_ADDRESS + export IMAGE_VERSION=$target_image_tag + export MODELSCOPE_DOMAIN=www.modelscope.cn + export HUB_DATASET_ENDPOINT=http://www.modelscope.cn + export CI_TEST=True + export TEST_LEVEL=1 + if [ "$is_ci_test" != "True" ]; then + echo "Testing for dsw image or MaaS-lib image" + export CI_COMMAND="python tests/run.py" + fi + bash .dev_scripts/dockerci.sh + if [ $? -ne 0 ]; then + echo "Running unittest failed, please check the log!" + exit -1 + fi +fi +if [ "$is_push" == "True" ]; then + echo "Pushing image: $IMAGE_TO_BUILD" + docker push $IMAGE_TO_BUILD +fi diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 2f68f416..2f18aff7 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -16,5 +16,14 @@ if [ $? -ne 0 ]; then echo "linter test failed, please run 'pre-commit run --all-files' to check" exit -1 fi +# test with install +python setup.py install -PYTHONPATH=. 
python tests/run.py +if [ $# -eq 0 ]; then + ci_command="python tests/run.py --subprocess" +else + ci_command="$@" +fi +echo "Running case with command: $ci_command" +$ci_command +#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index d5ea3c41..dbb79514 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -1,5 +1,4 @@ #!/bin/bash -IMAGE_NAME=reg.docker.alibaba-inc.com/dinger/modelscope MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache CODE_DIR=$PWD CODE_DIR_IN_CONTAINER=/Maas-lib @@ -8,6 +7,8 @@ gpus='7 6 5 4 3 2 1 0' cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58' cpu_sets_arr=($cpu_sets) is_get_file_lock=false +CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND} +echo "ci command: $CI_COMMAND" for gpu in $gpus do exec {lock_fd}>"/tmp/gpu$gpu" || exit 1 @@ -31,10 +32,12 @@ do -e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \ -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ --workdir=$CODE_DIR_IN_CONTAINER \ --net host \ ${IMAGE_NAME}:${IMAGE_VERSION} \ - bash .dev_scripts/ci_container_test.sh + $CI_COMMAND if [ $? -ne 0 ]; then echo "Running test case failed, please check the log!" exit -1 diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..4198ecc0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +.gitignore +tests +data +.dev_scripts +.dockerignore +.git +.gitattributes +.pre-commit-config.yaml +.pre-commit-config_local.yaml +.readthedocs.yaml +Dockerfile diff --git a/.gitattributes b/.gitattributes index 60ff0dd2..1a3015ec 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4,4 +4,6 @@ *.wav filter=lfs diff=lfs merge=lfs -text *.JPEG filter=lfs diff=lfs merge=lfs -text *.jpeg filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text *.avi filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text diff --git a/configs/cv/configuration.json b/configs/cv/configuration.json index 2b0da89d..ae07fa10 100644 --- a/configs/cv/configuration.json +++ b/configs/cv/configuration.json @@ -2,7 +2,6 @@ "framework": "pytorch", "task": "image_classification", - "work_dir": "./work_dir", "model": { "type": "classification", @@ -119,6 +118,7 @@ }, "train": { + "work_dir": "./work_dir", "dataloader": { "batch_size_per_gpu": 2, "workers_per_gpu": 1 diff --git a/data/test/images/image-text-retrieval.jpg b/data/test/images/image-text-retrieval.jpg new file mode 100644 index 00000000..2d20374a --- /dev/null +++ b/data/test/images/image-text-retrieval.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b012c7e966f6550874ccb85ef9602d483aa89b8623dff9ffcdb0faab8f2ca9ab +size 218143 diff --git a/data/test/images/image_panoptic_segmentation.jpg b/data/test/images/image_panoptic_segmentation.jpg new file mode 100644 index 00000000..2a8d826b --- /dev/null +++ b/data/test/images/image_panoptic_segmentation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a +size 245864 diff --git a/data/test/images/image_reid_person.jpg b/data/test/images/image_reid_person.jpg new file mode 100644 index 00000000..078468ec --- /dev/null +++ b/data/test/images/image_reid_person.jpg @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:4c9a7e42edc7065c16972ff56267aad63f5233e36aa5a699b84939f5bad73276 +size 2451 diff --git a/data/test/images/image_segmentation.jpg b/data/test/images/image_segmentation.jpg new file mode 100644 index 00000000..a9c0875c --- /dev/null +++ b/data/test/images/image_segmentation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459 +size 146140 diff --git a/data/test/images/image_semantic_segmentation.jpg b/data/test/images/image_semantic_segmentation.jpg new file mode 100644 index 00000000..2a8d826b --- /dev/null +++ b/data/test/images/image_semantic_segmentation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a +size 245864 diff --git a/data/test/regression/fill_mask_bert_zh.bin b/data/test/regression/fill_mask_bert_zh.bin new file mode 100644 index 00000000..17c28b81 --- /dev/null +++ b/data/test/regression/fill_mask_bert_zh.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541183383bb06aa3ca2c44a68cd51c1be5e3e984a1dee2c58092b9552660f3ce +size 61883 diff --git a/data/test/regression/fill_mask_sbert_en.bin b/data/test/regression/fill_mask_sbert_en.bin new file mode 100644 index 00000000..09aaf300 --- /dev/null +++ b/data/test/regression/fill_mask_sbert_en.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f0afcd9d2aa5ac9569114203bd9db4f1a520c903a88fd4854370cdde0e7eab7 +size 119940 diff --git a/data/test/regression/fill_mask_sbert_zh.bin b/data/test/regression/fill_mask_sbert_zh.bin new file mode 100644 index 00000000..812f7ba2 --- /dev/null +++ b/data/test/regression/fill_mask_sbert_zh.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280 +size 119940 diff --git a/data/test/regression/fill_mask_veco_en.bin b/data/test/regression/fill_mask_veco_en.bin new file mode 100644 index 00000000..be3fddc8 --- /dev/null +++ b/data/test/regression/fill_mask_veco_en.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705 +size 119619 diff --git a/data/test/regression/fill_mask_veco_zh.bin b/data/test/regression/fill_mask_veco_zh.bin new file mode 100644 index 00000000..c0d27e20 --- /dev/null +++ b/data/test/regression/fill_mask_veco_zh.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a +size 119619 diff --git a/data/test/regression/sbert_nli.bin b/data/test/regression/sbert_nli.bin new file mode 100644 index 00000000..a5f680bb --- /dev/null +++ b/data/test/regression/sbert_nli.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44e3925c15d86d8596baeb6bd1d153d86f57b7489798b2cf988a1248e110fd62 +size 62231 diff --git a/data/test/regression/sbert_sen_sim.bin b/data/test/regression/sbert_sen_sim.bin new file mode 100644 index 00000000..a59cbe0b --- /dev/null +++ b/data/test/regression/sbert_sen_sim.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ff17a0272752de4c88d4254b2e881f97f8ef022f03609d03ee1de0ae964368a +size 62235 diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin new file mode 100644 index 00000000..4eb562d6 --- /dev/null +++ b/data/test/regression/sbert_ws_en.bin @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572 +size 60801 diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin new file mode 100644 index 00000000..555f640d --- /dev/null +++ b/data/test/regression/sbert_ws_zh.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c +size 60801 diff --git a/data/test/regression/sbert_zero_shot.bin b/data/test/regression/sbert_zero_shot.bin new file mode 100644 index 00000000..23d40946 --- /dev/null +++ b/data/test/regression/sbert_zero_shot.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85 +size 61589 diff --git a/data/test/videos/Walking.54138969.mp4 b/data/test/videos/Walking.54138969.mp4 new file mode 100644 index 00000000..1716695f --- /dev/null +++ b/data/test/videos/Walking.54138969.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b8f50a0537bfe7e082c5ad91b2b7ece61a0adbeb7489988e553909276bf920c +size 44217644 diff --git a/data/test/videos/movie_scene_segmentation_test_video.mp4 b/data/test/videos/movie_scene_segmentation_test_video.mp4 new file mode 100644 index 00000000..ee6ed528 --- /dev/null +++ b/data/test/videos/movie_scene_segmentation_test_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f +size 126815483 diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu new file mode 100644 index 00000000..97881007 --- /dev/null +++ b/docker/Dockerfile.ubuntu @@ -0,0 +1,84 @@ +ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel +FROM $BASE_IMAGE +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Shanghai +ENV CONDA_DIR /opt/conda +ENV PATH="${CONDA_DIR}/bin:${PATH}" +ENV arch=x86_64 +SHELL ["/bin/bash", "-c"] +COPY docker/rcfiles /tmp/resources +RUN apt-get update && apt-get install -y --reinstall ca-certificates && \ + cp /tmp/resources/ubuntu20.04_sources.tuna /etc/apt/sources.list && \ + apt-get update && \ + apt-get install -y locales wget git vim ffmpeg libsm6 tzdata language-pack-zh-hans ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \ + wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \ + dpkg -i ./git-lfs_3.2.0_amd64.deb && \ + rm -f ./git-lfs_3.2.0_amd64.deb && \ + locale-gen zh_CN && \ + locale-gen zh_CN.utf8 && \ + update-locale LANG=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 && \ + ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ + dpkg-reconfigure --frontend noninteractive tzdata && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 + +#install and config python +ARG PYTHON_VERSION=3.7.13 +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${arch}.sh -O ./miniconda.sh && \ + /bin/bash miniconda.sh -b -p /opt/conda && \ + rm -f miniconda.sh && \ + ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + cp /tmp/resources/conda.tuna ~/.condarc && \ + source /root/.bashrc && \ + conda install --yes python==${PYTHON_VERSION} && \ + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + +ARG USE_GPU=True + +# install pytorch +ARG TORCH_VERSION=1.12.0 +ARG CUDATOOLKIT_VERSION=11.3 +RUN if [ "$USE_GPU" = "True" ] ; then \ + conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cudatoolkit=$CUDATOOLKIT_VERSION -c pytorch && conda clean --yes --all; \ + else \ + conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cpuonly -c pytorch; \ + fi + +# install tensorflow +ARG TENSORFLOW_VERSION=1.15.5 +RUN if [ "$USE_GPU" = "True" ] ; then \ + pip install --no-cache-dir --use-deprecated=legacy-resolver tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ + else \ + pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ + fi + +RUN if [ "$USE_GPU" = "True" ] ; then \ + CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="5.0 5.2 6.0 6.1 7.0 7.5 8.0 8.6" MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_CUDA=1 pip install --no-cache-dir mmcv-full && pip cache purge; \ + else \ + MMCV_WITH_OPS=1 MAX_JOBS=8 pip install --no-cache-dir mmcv-full && pip cache purge; \ + fi + +# install modelscope +COPY requirements /var/modelscope +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip cache purge + +# default shell bash +ENV SHELL=/bin/bash + +# install special packages (version specifiers are quoted so the shell does not treat '>' as a redirection) +RUN pip install --no-cache-dir 'mmcls>=0.21.0' 'mmdet>=2.25.0' 'decord>=0.6.0' numpy==1.18.5 datasets==2.1.0 + +RUN if [ "$USE_GPU" = "True" ] ; then \ + pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ + else \ + pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \ + fi diff --git a/docker/rcfiles/conda.tuna b/docker/rcfiles/conda.tuna new file mode 100644 index 00000000..ce8a2908 --- /dev/null +++ b/docker/rcfiles/conda.tuna @@ -0,0 +1,15 @@ +channels: + - defaults +show_channel_urls: true +default_channels: + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2 +custom_channels: + conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud + msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud + bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud + menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud + pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud + pytorch-lts: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud + simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud diff --git a/docker/rcfiles/ubuntu20.04_sources.tuna b/docker/rcfiles/ubuntu20.04_sources.tuna new file mode 100644 index 00000000..a247bbfa --- /dev/null +++ 
b/docker/rcfiles/ubuntu20.04_sources.tuna @@ -0,0 +1,13 @@ +# Source (deb-src) mirrors are commented out by default to speed up apt update; uncomment them if needed +deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse +# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse +deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse +# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse +deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse +# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse +deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse +# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse + +# Pre-release (focal-proposed) sources, not recommended to enable +# deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse +# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 196f0353..68979c55 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -108,7 +108,7 @@ pip install -e ".[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releas ```shell pip install -e ".[multi-modal]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -### + ### Verify the installation  After a successful installation, you can run the following command to check whether the installation is correct: diff --git a/modelscope/fileio/__init__.py b/modelscope/fileio/__init__.py index 5fd10f85..b526d593 100644 --- a/modelscope/fileio/__init__.py +++ b/modelscope/fileio/__init__.py @@ -1,2 +1,2 @@ -from .file import File +from .file import File, LocalStorage from .io import dump, dumps, load diff --git a/modelscope/fileio/file.py b/modelscope/fileio/file.py index 343cad9a..3fff80c8 100644 --- a/modelscope/fileio/file.py +++ b/modelscope/fileio/file.py @@ -240,7 +240,7 @@ class File(object): @staticmethod def _get_storage(uri): assert isinstance(uri, - str), f'uri should be str type, buf got {type(uri)}' + str), f'uri should be str type, but got {type(uri)}' if '://' not in uri: # local path diff --git a/modelscope/fileio/format/json.py b/modelscope/fileio/format/json.py index 977a8b8c..9979c023 100644 --- a/modelscope/fileio/format/json.py +++ b/modelscope/fileio/format/json.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
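+# NOTE: jsonplus serializes Python types the stdlib json cannot (e.g. datetime, Decimal); it is imported lazily inside each method below, so the dependency is only needed when a handler is actually called.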
-import json import numpy as np from .base import FormatHandler @@ -22,14 +21,16 @@ def set_default(obj): class JsonHandler(FormatHandler): + """Use jsonplus, serialization of Python types to JSON that "just works".""" def load(self, file): - return json.load(file) + import jsonplus + return jsonplus.loads(file.read()) def dump(self, obj, file, **kwargs): - kwargs.setdefault('default', set_default) - json.dump(obj, file, **kwargs) + file.write(self.dumps(obj, **kwargs)) def dumps(self, obj, **kwargs): + import jsonplus kwargs.setdefault('default', set_default) - return json.dumps(obj, **kwargs) + return jsonplus.dumps(obj, **kwargs) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 09bff2c1..721f5637 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -1,7 +1,6 @@ import os import pickle import shutil -import subprocess from collections import defaultdict from http import HTTPStatus from http.cookiejar import CookieJar @@ -16,8 +15,7 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_MESSAGE, API_RESPONSE_FIELD_USERNAME, DEFAULT_CREDENTIALS_PATH) -from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH, - HUB_DATASET_ENDPOINT) +from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, DatasetFormations, DatasetMetaFormats, @@ -26,7 +24,8 @@ from modelscope.utils.logger import get_logger from .errors import (InvalidParameter, NotExistError, RequestError, datahub_raise_on_error, handle_http_response, is_ok, raise_on_error) -from .utils.utils import get_endpoint, model_id_to_group_owner_name +from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, + model_id_to_group_owner_name) logger = get_logger() @@ -35,7 +34,8 @@ class HubApi: def __init__(self, endpoint=None, dataset_endpoint=None): self.endpoint = endpoint if endpoint is not None else get_endpoint() - self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT + self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint( + ) def login( self, @@ -376,6 +376,27 @@ class HubApi: f'ststoken?Revision={revision}' return self.datahub_remote_call(datahub_url) + def get_dataset_access_config_session( + self, + cookies: CookieJar, + dataset_name: str, + namespace: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION): + + datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ + f'ststoken?Revision={revision}' + + cookies = requests.utils.dict_from_cookiejar(cookies) + r = requests.get(url=datahub_url, cookies=cookies) + resp = r.json() + datahub_raise_on_error(datahub_url, resp) + return resp['Data'] + + def on_dataset_download(self, dataset_name: str, namespace: str) -> None: + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' + r = requests.post(url) + r.raise_for_status() + @staticmethod def datahub_remote_call(url): r = requests.get(url) @@ -383,6 +404,9 @@ class HubApi: datahub_raise_on_error(url, resp) return resp['Data'] + def check_cookies_upload_data(self, use_cookies) -> CookieJar: + return self._check_cookie(use_cookies=use_cookies) + class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 702251e3..014a1e59 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -1,3 +1,5 @@ +from 
pathlib import Path + MODELSCOPE_URL_SCHEME = 'http://' DEFAULT_MODELSCOPE_DOMAIN = 'www.modelscope.cn' DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_DOMAIN @@ -6,7 +8,7 @@ DEFAULT_MODELSCOPE_GROUP = 'damo' MODEL_ID_SEPARATOR = '/' FILE_HASH = 'Sha256' LOGGER_NAME = 'ModelScopeHub' -DEFAULT_CREDENTIALS_PATH = '~/.modelscope/credentials' +DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials') API_RESPONSE_FIELD_DATA = 'Data' API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken' API_RESPONSE_FIELD_USERNAME = 'Username' diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index ecd4e1da..e9c008b0 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -49,8 +49,8 @@ def handle_http_response(response, logger, cookies, model_id): except HTTPError: if cookies is None: # code in [403] and logger.error( - f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be private. \ - Please login first.') + f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \ + private. Please login first.') raise diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py index 51ddf954..6f560f7a 100644 --- a/modelscope/hub/repository.py +++ b/modelscope/hub/repository.py @@ -2,7 +2,8 @@ import os from typing import Optional from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException -from modelscope.utils.constant import DEFAULT_MODEL_REVISION +from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, + DEFAULT_MODEL_REVISION) from modelscope.utils.logger import get_logger from .api import ModelScopeConfig from .git import GitCommandWrapper @@ -15,14 +16,12 @@ class Repository: """A local representation of the model git repository. """ - def __init__( - self, - model_dir: str, - clone_from: str, - revision: Optional[str] = DEFAULT_MODEL_REVISION, - auth_token: Optional[str] = None, - git_path: Optional[str] = None, - ): + def __init__(self, + model_dir: str, + clone_from: str, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + auth_token: Optional[str] = None, + git_path: Optional[str] = None): """ Instantiate a Repository object by cloning the remote ModelScopeHub repo Args: @@ -86,6 +85,7 @@ class Repository: branch: Optional[str] = DEFAULT_MODEL_REVISION, force: bool = False): """Push local files to remote; this method will run: + git pull git add git commit git push @@ -117,3 +117,105 @@ url=url, local_branch=branch, remote_branch=branch) + + +class DatasetRepository: + """A local representation of the dataset (metadata) git repository. + """ + + def __init__(self, + repo_work_dir: str, + dataset_id: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + auth_token: Optional[str] = None, + git_path: Optional[str] = None): + """ + Instantiate a DatasetRepository object by cloning the remote ModelScope dataset repo + Args: + repo_work_dir(`str`): + The dataset repo root directory. + dataset_id: + dataset id on ModelScope to clone from + revision(`Optional[str]`): + revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash + auth_token(`Optional[str]`): + token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter + as the token is already saved when you log in the first time; if None, the saved token is used. 
+ git_path(`Optional[str]`): + The git command line path; if None, 'git' is used + """ + self.dataset_id = dataset_id + self.repo_work_dir = repo_work_dir + self.repo_base_dir = os.path.dirname(repo_work_dir) + self.repo_name = os.path.basename(repo_work_dir) + self.revision = revision + if auth_token: + self.auth_token = auth_token + else: + self.auth_token = ModelScopeConfig.get_token() + + self.git_wrapper = GitCommandWrapper(git_path) + os.makedirs(self.repo_work_dir, exist_ok=True) + self.repo_url = self._get_repo_url(dataset_id=dataset_id) + + def clone(self) -> str: + # if the local repo dir is not empty, check whether it is already the right clone + if os.listdir(self.repo_work_dir): + remote_url = self._get_remote_url() + remote_url = self.git_wrapper.remove_token_from_url(remote_url) + # no need to clone again + if remote_url and remote_url == self.repo_url: + return '' + + logger.info('Cloning repo from {} '.format(self.repo_url)) + self.git_wrapper.clone(self.repo_base_dir, self.auth_token, + self.repo_url, self.repo_name, self.revision) + return self.repo_work_dir + + def push(self, + commit_message: str, + branch: Optional[str] = DEFAULT_DATASET_REVISION, + force: bool = False): + """Push local files to remote; this method will run: + git pull + git add + git commit + git push + Args: + commit_message (str): commit message + branch (Optional[str], optional): which branch to push. + force (Optional[bool]): whether to use forced-push. + """ + if commit_message is None or not isinstance(commit_message, str): + msg = 'commit_message must be provided!' + raise InvalidParameter(msg) + + if not isinstance(force, bool): + raise InvalidParameter('force must be bool') + + if not self.auth_token: + raise NotLoginException('Must login to push, please login first.') + + self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token) + self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name) + + remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir) + self.git_wrapper.pull(self.repo_work_dir) + self.git_wrapper.add(self.repo_work_dir, all_files=True) + self.git_wrapper.commit(self.repo_work_dir, commit_message) + self.git_wrapper.push( + repo_dir=self.repo_work_dir, + token=self.auth_token, + url=remote_url, + local_branch=branch, + remote_branch=branch) + + def _get_repo_url(self, dataset_id): + return f'{get_endpoint()}/datasets/{dataset_id}.git' + + def _get_remote_url(self): + try: + remote = self.git_wrapper.get_repo_remote_url(self.repo_work_dir) + except GitError: + remote = None + return remote diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 1a55c9f9..7e219d16 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -1,7 +1,9 @@ import hashlib import os +from typing import Optional -from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, +from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT, + DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_URL_SCHEME) @@ -22,14 +24,16 @@ def model_id_to_group_owner_name(model_id): return group_or_owner, name -def get_cache_dir(): +def get_cache_dir(model_id: Optional[str] = None): """ cache dir precedence: function parameter > environment > ~/.cache/modelscope/hub """ default_cache_dir = get_default_cache_dir() - return os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir, - 'hub')) + base_path = os.getenv('MODELSCOPE_CACHE', + os.path.join(default_cache_dir, 'hub')) + return base_path if model_id is None else os.path.join( + 
base_path, model_id + '/') def get_endpoint(): @@ -38,6 +42,11 @@ def get_endpoint(): return MODELSCOPE_URL_SCHEME + modelscope_domain +def get_dataset_hub_endpoint(): + return os.environ.get('HUB_DATASET_ENDPOINT', + DEFAULT_MODELSCOPE_DATA_ENDPOINT) + + def compute_hash(file_path): BUFFER_SIZE = 1024 * 64 # 64k buffer size sha256_hash = hashlib.sha256() diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index e344fbe7..1d875048 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -11,6 +11,7 @@ class Models(object): """ # vision models detection = 'detection' + realtime_object_detection = 'realtime-object-detection' scrfd = 'scrfd' classification_model = 'ClassificationModel' nafnet = 'nafnet' @@ -19,7 +20,18 @@ class Models(object): gpen = 'gpen' product_retrieval_embedding = 'product-retrieval-embedding' body_2d_keypoints = 'body-2d-keypoints' + body_3d_keypoints = 'body-3d-keypoints' crowd_counting = 'HRNetCrowdCounting' + panoptic_segmentation = 'swinL-panoptic-segmentation' + image_reid_person = 'passvitb' + video_summarization = 'pgl-video-summarization' + swinL_semantic_segmentation = 'swinL-semantic-segmentation' + vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' + resnet50_bert = 'resnet50-bert' + + # EasyCV models + yolox = 'YOLOX' + segformer = 'Segformer' # nlp models bert = 'bert' @@ -32,8 +44,10 @@ class Models(object): space_modeling = 'space-modeling' star = 'star' tcrf = 'transformer-crf' + lcrf = 'lstm-crf' bart = 'bart' gpt3 = 'gpt3' + bert_for_ds = 'bert-for-document-segmentation' # audio models sambert_hifigan = 'sambert-hifigan' @@ -48,12 +62,14 @@ class Models(object): gemm = 'gemm-generative-multi-modal' mplug = 'mplug' diffusion = 'diffusion-text-to-image-synthesis' + team = 'team-multi-modal-similarity' video_clip = 'video-clip-multi-modal-embedding' class TaskModels(object): # nlp task text_classification = 'text-classification' + information_extraction = 'information-extraction' class Heads(object): @@ -63,6 +79,7 @@ class Heads(object): bert_mlm = 'bert-mlm' # roberta mlm roberta_mlm = 'roberta-mlm' + information_extraction = 'information-extraction' class Pipelines(object): @@ -84,9 +101,13 @@ class Pipelines(object): animal_recognition = 'resnet101-animal-recognition' general_recognition = 'resnet101-general-recognition' cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding' + hicossl_video_embedding = 'hicossl-s3dg-video_embedding' body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' + body_3d_keypoints = 'canonical_body-3d-keypoints_video' human_detection = 'resnet18-human-detection' object_detection = 'vit-object-detection' + easycv_detection = 'easycv-detection' + easycv_segmentation = 'easycv-segmentation' salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' @@ -100,6 +121,7 @@ class Pipelines(object): image_super_resolution = 'rrdb-image-super-resolution' face_image_generation = 'gan-face-image-generation' product_retrieval_embedding = 'resnet50-product-retrieval-embedding' + realtime_object_detection = 'cspnet_realtime-object-detection_yolox' face_recognition = 'ir101-face-recognition-cfglint' image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' image2image_translation = 'image-to-image-translation' @@ -112,6 +134,11 @@ class Pipelines(object): tinynas_classification = 'tinynas-classification' crowd_counting = 'hrnet-crowd-counting' video_single_object_tracking = 
'ostrack-vitb-video-single-object-tracking' + image_panoptic_segmentation = 'image-panoptic-segmentation' + video_summarization = 'googlenet_pgl_video_summarization' + image_semantic_segmentation = 'image-semantic-segmentation' + image_reid_person = 'passvitb-image-reid-person' + movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' # nlp tasks sentence_similarity = 'sentence-similarity' @@ -129,7 +156,10 @@ class Pipelines(object): dialog_state_tracking = 'dialog-state-tracking' zero_shot_classification = 'zero-shot-classification' text_error_correction = 'text-error-correction' + faq_question_answering = 'faq-question-answering' conversational_text_to_sql = 'conversational-text-to-sql' + relation_extraction = 'relation-extraction' + document_segmentation = 'document-segmentation' # audio tasks sambert_hifigan_tts = 'sambert-hifigan-tts' @@ -146,8 +176,10 @@ class Pipelines(object): visual_question_answering = 'visual-question-answering' visual_grounding = 'visual-grounding' visual_entailment = 'visual-entailment' + multi_modal_similarity = 'multi-modal-similarity' text_to_image_synthesis = 'text-to-image-synthesis' video_multi_modal_embedding = 'video-multi-modal-embedding' + image_text_retrieval = 'image-text-retrieval' class Trainers(object): @@ -161,6 +193,7 @@ class Trainers(object): """ default = 'trainer' + easycv = 'easycv' # multi-modal trainers clip_multi_modal_embedding = 'clip-multi-modal-embedding' @@ -169,12 +202,17 @@ class Trainers(object): # cv trainers image_instance_segmentation = 'image-instance-segmentation' image_portrait_enhancement = 'image-portrait-enhancement' + video_summarization = 'video-summarization' + movie_scene_segmentation = 'movie-scene-segmentation' # nlp trainers bert_sentiment_analysis = 'bert-sentiment-analysis' nlp_base_trainer = 'nlp-base-trainer' nlp_veco_trainer = 'nlp-veco-trainer' + # audio trainers + speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + class Preprocessors(object): """ Names for different preprocessor. 
@@ -193,6 +231,8 @@ class Preprocessors(object): image_color_enhance_preprocessor = 'image-color-enhance-preprocessor' image_instance_segmentation_preprocessor = 'image-instance-segmentation-preprocessor' image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor' + video_summarization_preprocessor = 'video-summarization-preprocessor' + movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor' # nlp preprocessor sen_sim_tokenizer = 'sen-sim-tokenizer' @@ -210,7 +250,10 @@ class Preprocessors(object): text_error_correction = 'text-error-correction' word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' fill_mask = 'fill-mask' + faq_question_answering_preprocessor = 'faq-question-answering-preprocessor' conversational_text_to_sql = 'conversational-text-to-sql' + re_tokenizer = 're-tokenizer' + document_segmentation = 'document-segmentation' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' @@ -229,6 +272,7 @@ class Metrics(object): # accuracy accuracy = 'accuracy' + audio_noise_metric = 'audio-noise-metric' # metrics for image denoise task image_denoise_metric = 'image-denoise-metric' @@ -245,6 +289,9 @@ class Metrics(object): image_color_enhance_metric = 'image-color-enhance-metric' # metrics for image-portrait-enhancement task image_portrait_enhancement_metric = 'image-portrait-enhancement-metric' + video_summarization_metric = 'video-summarization-metric' + # metric for movie-scene-segmentation task + movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' class Optimizers(object): @@ -294,3 +341,12 @@ class LR_Schedulers(object): LinearWarmup = 'LinearWarmup' ConstantWarmup = 'ConstantWarmup' ExponentialWarmup = 'ExponentialWarmup' + + +class Datasets(object): + """ Names for different datasets. 
+ """ + ClsDataset = 'ClsDataset' + SegDataset = 'SegDataset' + DetDataset = 'DetDataset' + DetImagesMixDataset = 'DetImagesMixDataset' diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index 37f9bfec..d3975a2c 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: + from .audio_noise_metric import AudioNoiseMetric from .base import Metric from .builder import METRICS, build_metric, task_default_metrics from .image_color_enhance_metric import ImageColorEnhanceMetric @@ -14,9 +15,12 @@ if TYPE_CHECKING: from .sequence_classification_metric import SequenceClassificationMetric from .text_generation_metric import TextGenerationMetric from .token_classification_metric import TokenClassificationMetric + from .video_summarization_metric import VideoSummarizationMetric + from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric else: _import_structure = { + 'audio_noise_metric': ['AudioNoiseMetric'], 'base': ['Metric'], 'builder': ['METRICS', 'build_metric', 'task_default_metrics'], 'image_color_enhance_metric': ['ImageColorEnhanceMetric'], @@ -28,6 +32,8 @@ else: 'sequence_classification_metric': ['SequenceClassificationMetric'], 'text_generation_metric': ['TextGenerationMetric'], 'token_classification_metric': ['TokenClassificationMetric'], + 'video_summarization_metric': ['VideoSummarizationMetric'], + 'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], } import sys diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py new file mode 100644 index 00000000..16c5261f --- /dev/null +++ b/modelscope/metrics/audio_noise_metric.py @@ -0,0 +1,38 @@ +from typing import Dict + +from modelscope.metainfo import Metrics +from modelscope.metrics.base import Metric +from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.utils.registry import default_group + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.audio_noise_metric) +class AudioNoiseMetric(Metric): + """ + The metric computation class for acoustic noise suppression task. + """ + + def __init__(self): + self.loss = [] + self.amp_loss = [] + self.phase_loss = [] + self.sisnr = [] + + def add(self, outputs: Dict, inputs: Dict): + self.loss.append(outputs['loss'].data.cpu()) + self.amp_loss.append(outputs['amp_loss'].data.cpu()) + self.phase_loss.append(outputs['phase_loss'].data.cpu()) + self.sisnr.append(outputs['sisnr'].data.cpu()) + + def evaluate(self): + avg_loss = sum(self.loss) / len(self.loss) + avg_sisnr = sum(self.sisnr) / len(self.sisnr) + avg_amp = sum(self.amp_loss) / len(self.amp_loss) + avg_phase = sum(self.phase_loss) / len(self.phase_loss) + total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr + return { + 'total_loss': total_loss.item(), + 'avg_sisnr': avg_sisnr.item(), + MetricKeys.AVERAGE_LOSS: avg_loss.item() + } diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index bd20d37b..800e3508 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import Dict, Mapping, Union from modelscope.metainfo import Metrics from modelscope.utils.config import ConfigDict @@ -15,6 +16,8 @@ class MetricKeys(object): RECALL = 'recall' PSNR = 'psnr' SSIM = 'ssim' + AVERAGE_LOSS = 'avg_loss' + FScore = 'fscore' task_default_metrics = { @@ -28,19 +31,26 @@ task_default_metrics = { Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric], Tasks.image_portrait_enhancement: [Metrics.image_portrait_enhancement_metric], + Tasks.video_summarization: [Metrics.video_summarization_metric], + Tasks.image_captioning: [Metrics.text_gen_metric], + Tasks.visual_question_answering: [Metrics.text_gen_metric], + Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric], } -def build_metric(metric_name: str, +def build_metric(metric_cfg: Union[str, Dict], field: str = default_group, default_args: dict = None): """ Build a metric given a metric name or config and a field. Args: - metric_name (:obj:`str`): The metric name. + metric_cfg (str | dict): The metric name, or a metric config dict containing a 'type' key. field (str, optional): The field of this metric, default value: 'default' for all fields. default_args (dict, optional): Default initialization arguments. """ - cfg = ConfigDict({'type': metric_name}) + if isinstance(metric_cfg, Mapping): + assert 'type' in metric_cfg + else: + metric_cfg = ConfigDict({'type': metric_cfg}) return build_from_cfg( - cfg, METRICS, group_key=field, default_args=default_args) + metric_cfg, METRICS, group_key=field, default_args=default_args) diff --git a/modelscope/metrics/movie_scene_segmentation_metric.py b/modelscope/metrics/movie_scene_segmentation_metric.py new file mode 100644 index 00000000..56bdbd1c --- /dev/null +++ b/modelscope/metrics/movie_scene_segmentation_metric.py @@ -0,0 +1,52 @@ +from typing import Dict + +import numpy as np + +from modelscope.metainfo import Metrics +from modelscope.utils.registry import default_group +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, + module_name=Metrics.movie_scene_segmentation_metric) +class MovieSceneSegmentationMetric(Metric): + """The metric computation class for the movie scene segmentation task. 
+ """ + + def __init__(self): + self.preds = [] + self.labels = [] + self.eps = 1e-5 + + def add(self, outputs: Dict, inputs: Dict): + preds = outputs['pred'] + labels = inputs['label'] + self.preds.extend(preds) + self.labels.extend(labels) + + def evaluate(self): + gts = np.array(torch_nested_numpify(torch_nested_detach(self.labels))) + prob = np.array(torch_nested_numpify(torch_nested_detach(self.preds))) + + gt_one = gts == 1 + gt_zero = gts == 0 + pred_one = prob == 1 + pred_zero = prob == 0 + + tp = (gt_one * pred_one).sum() + fp = (gt_zero * pred_one).sum() + fn = (gt_one * pred_zero).sum() + + precision = 100.0 * tp / (tp + fp + self.eps) + recall = 100.0 * tp / (tp + fn + self.eps) + f1 = 2 * precision * recall / (precision + recall) + + return { + MetricKeys.F1: f1, + MetricKeys.RECALL: recall, + MetricKeys.PRECISION: precision + } diff --git a/modelscope/metrics/video_summarization_metric.py b/modelscope/metrics/video_summarization_metric.py new file mode 100644 index 00000000..d1867600 --- /dev/null +++ b/modelscope/metrics/video_summarization_metric.py @@ -0,0 +1,78 @@ +from typing import Dict + +import numpy as np + +from modelscope.metainfo import Metrics +from modelscope.models.cv.video_summarization.summarizer import \ + generate_summary +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +def evaluate_summary(predicted_summary, user_summary, eval_method): + """ Compare the predicted summary with the user defined one(s). + + :param ndarray predicted_summary: The generated summary from our model. + :param ndarray user_summary: The user defined ground truth summaries (or summary). + :param str eval_method: The proposed evaluation method; either 'max' (SumMe) or 'avg' (TVSum). + :return: The reduced fscore based on the eval_method + """ + max_len = max(len(predicted_summary), user_summary.shape[1]) + S = np.zeros(max_len, dtype=int) + G = np.zeros(max_len, dtype=int) + S[:len(predicted_summary)] = predicted_summary + + f_scores = [] + for user in range(user_summary.shape[0]): + G[:user_summary.shape[1]] = user_summary[user] + overlapped = S & G + + # Compute precision, recall, f-score + precision = sum(overlapped) / sum(S) + recall = sum(overlapped) / sum(G) + if precision + recall == 0: + f_scores.append(0) + else: + f_score = 2 * precision * recall * 100 / (precision + recall) + f_scores.append(f_score) + + if eval_method == 'max': + return max(f_scores) + else: + return sum(f_scores) / len(f_scores) + + +def calculate_f_score(outputs: Dict, inputs: Dict): + scores = outputs['scores'] + scores = scores.squeeze(0).cpu().numpy().tolist() + user_summary = inputs['user_summary'].cpu().numpy()[0] + sb = inputs['change_points'].cpu().numpy()[0] + n_frames = inputs['n_frames'].cpu().numpy()[0] + positions = inputs['positions'].cpu().numpy()[0] + summary = generate_summary([sb], [scores], [n_frames], [positions])[0] + f_score = evaluate_summary(summary, user_summary, 'avg') + return f_score + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.video_summarization_metric) +class VideoSummarizationMetric(Metric): + """The metric for video summarization task. 
+ """ + + def __init__(self): + self.inputs = [] + self.outputs = [] + + def add(self, outputs: Dict, inputs: Dict): + self.outputs.append(outputs) + self.inputs.append(inputs) + + def evaluate(self): + f_scores = [ + calculate_f_score(output, input) + for output, input in zip(self.outputs, self.inputs) + ] + + return {MetricKeys.FScore: sum(f_scores) / len(f_scores)} diff --git a/modelscope/models/audio/ans/frcrn.py b/modelscope/models/audio/ans/frcrn.py index 38e4d720..59411fbe 100644 --- a/modelscope/models/audio/ans/frcrn.py +++ b/modelscope/models/audio/ans/frcrn.py @@ -75,27 +75,37 @@ class FRCRNModel(TorchModel): model_bin_file = os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) if os.path.exists(model_bin_file): - checkpoint = torch.load(model_bin_file) - self.model.load_state_dict(checkpoint, strict=False) + checkpoint = torch.load( + model_bin_file, map_location=torch.device('cpu')) + if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: + self.model.load_state_dict( + checkpoint['state_dict'], strict=False) + else: + self.model.load_state_dict(checkpoint, strict=False) def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - output = self.model.forward(input) - return { - 'spec_l1': output[0], - 'wav_l1': output[1], - 'mask_l1': output[2], - 'spec_l2': output[3], - 'wav_l2': output[4], - 'mask_l2': output[5] + result_list = self.model.forward(input['noisy']) + output = { + 'spec_l1': result_list[0], + 'wav_l1': result_list[1], + 'mask_l1': result_list[2], + 'spec_l2': result_list[3], + 'wav_l2': result_list[4], + 'mask_l2': result_list[5] } - - def to(self, *args, **kwargs): - self.model = self.model.to(*args, **kwargs) - return self - - def eval(self): - self.model = self.model.train(False) - return self + if 'clean' in input: + mix_result = self.model.loss( + input['noisy'], input['clean'], result_list, mode='Mix') + output.update(mix_result) + sisnr_result = self.model.loss( + input['noisy'], input['clean'], result_list, mode='SiSNR') + output.update(sisnr_result) + # logger hooker will use items under 'log_vars' + output['log_vars'] = {k: mix_result[k].item() for k in mix_result} + output['log_vars'].update( + {k: sisnr_result[k].item() + for k in sisnr_result}) + return output class FRCRN(nn.Module): @@ -110,7 +120,8 @@ class FRCRN(nn.Module): win_len=400, win_inc=100, fft_len=512, - win_type='hanning'): + win_type='hanning', + **kwargs): r""" Args: complex: Whether to use complex networks. 
@@ -236,7 +247,7 @@ class FRCRN(nn.Module): if count != 3: loss = self.loss_1layer(noisy, est_spec, est_wav, labels, est_mask, mode) - return loss + return dict(sisnr=loss) elif mode == 'Mix': count = 0 @@ -251,7 +262,7 @@ amp_loss, phase_loss, SiSNR_loss = self.loss_1layer( noisy, est_spec, est_wav, labels, est_mask, mode) loss = amp_loss + phase_loss + SiSNR_loss - return loss, amp_loss, phase_loss + return dict(loss=loss, amp_loss=amp_loss, phase_loss=phase_loss) def loss_1layer(self, noisy, est, est_wav, labels, cmp_mask, mode='Mix'): r""" Compute the loss by mode diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py index 81e47350..428ec367 100644 --- a/modelscope/models/audio/kws/farfield/model.py +++ b/modelscope/models/audio/kws/farfield/model.py @@ -33,6 +33,7 @@ class FSMNSeleNetV2Decorator(TorchModel): ModelFile.TORCH_MODEL_BIN_FILE) self._model = None if os.path.exists(model_bin_file): + kwargs.pop('device', None)  # drop 'device' if present; FSMNSeleNetV2 does not accept it + self._model = FSMNSeleNetV2(*args, **kwargs) checkpoint = torch.load(model_bin_file) self._model.load_state_dict(checkpoint, strict=False) diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index 3b596769..872c42e8 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -1,15 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - +import os import os.path as osp from abc import ABC, abstractmethod -from typing import Dict, Optional, Union - -import numpy as np +from typing import Callable, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model +from modelscope.utils.checkpoint import save_pretrained from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.device import device_placement, verify_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger @@ -24,8 +24,7 @@ class Model(ABC): def __init__(self, model_dir, *args, **kwargs): self.model_dir = model_dir device_name = kwargs.get('device', 'gpu') - assert device_name in ['gpu', - 'cpu'], 'device should be either cpu or gpu.' + verify_device(device_name) self._device_name = device_name def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: @@ -72,6 +71,7 @@ model_name_or_path: str, revision: Optional[str] = DEFAULT_MODEL_REVISION, cfg_dict: Config = None, + device: str = None, *model_args, **kwargs): """ Instantiate a model from local directory or remote model repo. 
Note @@ -97,7 +97,7 @@ osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task model_cfg = cfg.model - # TODO @wenmeng.zwm may should manually initialize model after model building + framework = cfg.framework if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type @@ -105,10 +105,41 @@ model_cfg.model_dir = local_model_dir for k, v in kwargs.items(): model_cfg[k] = v - model = build_model( - model_cfg, task_name=task_name, default_args=kwargs) + if device is not None: + model_cfg.device = device + with device_placement(framework, device): + model = build_model( + model_cfg, task_name=task_name, default_args=kwargs) + else: + model = build_model( + model_cfg, task_name=task_name, default_args=kwargs) # dynamically add pipeline info to model for pipeline inference if hasattr(cfg, 'pipeline'): model.pipeline = cfg.pipeline return model + + def save_pretrained(self, + target_folder: Union[str, os.PathLike], + save_checkpoint_names: Union[str, List[str]] = None, + save_function: Callable = None, + config: Optional[dict] = None, + **kwargs): + """Save the pretrained model, its configuration and other related files to a directory, so that it can be reloaded + + Args: + target_folder (Union[str, os.PathLike]): + Directory to which to save. Will be created if it doesn't exist. + + save_checkpoint_names (Union[str, List[str]]): + The checkpoint names to be saved in the target_folder + + save_function (Callable, optional): + The function to use to save the state dictionary. + + config (Optional[dict], optional): + The config for the configuration.json, might not be identical to model.config + + """ + save_pretrained(self, target_folder, save_checkpoint_names, + save_function, config, **kwargs) diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index f2ecd08e..331f23bd 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -1,9 +1,17 @@ # Copyright (c) Alibaba, Inc. and its affiliates. + +# yapf: disable from . 
import (action_recognition, animal_recognition, body_2d_keypoints, - cartoon, cmdssl_video_embedding, crowd_counting, face_detection, - face_generation, image_classification, image_color_enhance, - image_colorization, image_denoise, image_instance_segmentation, - image_portrait_enhancement, image_to_image_generation, - image_to_image_translation, object_detection, - product_retrieval_embedding, salient_detection, - super_resolution, video_single_object_tracking, virual_tryon) + body_3d_keypoints, cartoon, cmdssl_video_embedding, + crowd_counting, face_detection, face_generation, + image_classification, image_color_enhance, image_colorization, + image_denoise, image_instance_segmentation, + image_panoptic_segmentation, image_portrait_enhancement, + image_reid_person, image_semantic_segmentation, + image_to_image_generation, image_to_image_translation, + movie_scene_segmentation, object_detection, + product_retrieval_embedding, realtime_object_detection, + salient_detection, super_resolution, + video_single_object_tracking, video_summarization, virual_tryon) + +# yapf: enable diff --git a/modelscope/models/cv/action_recognition/models.py b/modelscope/models/cv/action_recognition/models.py index 48e75ae1..a5964e21 100644 --- a/modelscope/models/cv/action_recognition/models.py +++ b/modelscope/models/cv/action_recognition/models.py @@ -1,5 +1,6 @@ import torch.nn as nn +from .s3dg import Inception3D from .tada_convnext import TadaConvNeXt @@ -26,11 +27,25 @@ class BaseVideoModel(nn.Module): super(BaseVideoModel, self).__init__() # the backbone is created according to meta-architectures # defined in models/base/backbone.py - self.backbone = TadaConvNeXt(cfg) + if cfg.MODEL.NAME == 'ConvNeXt_tiny': + self.backbone = TadaConvNeXt(cfg) + elif cfg.MODEL.NAME == 'S3DG': + self.backbone = Inception3D(cfg) + else: + error_str = 'backbone {} is not supported, only ConvNeXt_tiny and S3DG are supported'.format( + cfg.MODEL.NAME) + raise NotImplementedError(error_str) # the head is created according to the heads # defined in models/module_zoo/heads - self.head = BaseHead(cfg) + if cfg.VIDEO.HEAD.NAME == 'BaseHead': + self.head = BaseHead(cfg) + elif cfg.VIDEO.HEAD.NAME == 'AvgHead': + self.head = AvgHead(cfg) + else: + error_str = 'head {} is not supported, only BaseHead and AvgHead are supported'.format( + cfg.VIDEO.HEAD.NAME) + raise NotImplementedError(error_str) def forward(self, x): x = self.backbone(x) @@ -88,3 +103,29 @@ out = self.activation(out) out = out.view(out.shape[0], -1) return out, x.view(x.shape[0], -1) + + +class AvgHead(nn.Module): + """ + Constructs an average pooling head. + """ + + def __init__( + self, + cfg, + ): + """ + Args: + cfg (Config): global config object. + """ + super(AvgHead, self).__init__() + self.cfg = cfg + self.global_avg_pool = nn.AdaptiveAvgPool3d(1) + + def forward(self, x): + if len(x.shape) == 5: + x = self.global_avg_pool(x) + # (N, C, T, H, W) -> (N, T, H, W, C). + x = x.permute((0, 2, 3, 4, 1)) + out = x.view(x.shape[0], -1) + return out, x.view(x.shape[0], -1) diff --git a/modelscope/models/cv/action_recognition/s3dg.py b/modelscope/models/cv/action_recognition/s3dg.py new file mode 100644 index 00000000..f258df16 --- /dev/null +++ b/modelscope/models/cv/action_recognition/s3dg.py @@ -0,0 +1,301 @@ +import torch +import torch.nn as nn + + +class InceptionBaseConv3D(nn.Module): + """ + Constructs basic inception 3D conv. + Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. 
+ """ + + def __init__(self, + cfg, + in_planes, + out_planes, + kernel_size, + stride, + padding=0): + super(InceptionBaseConv3D, self).__init__() + self.conv = nn.Conv3d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False) + self.bn = nn.BatchNorm3d(out_planes) + self.relu = nn.ReLU(inplace=True) + + # init + self.conv.weight.data.normal_( + mean=0, std=0.01) # original s3d is truncated normal within 2 std + self.bn.weight.data.fill_(1) + self.bn.bias.data.zero_() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class InceptionBlock3D(nn.Module): + """ + Element constructing the S3D/S3DG. + See models/base/backbone.py L99-186. + + Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. + """ + + def __init__(self, cfg, in_planes, out_planes): + super(InceptionBlock3D, self).__init__() + + _gating = cfg.VIDEO.BACKBONE.BRANCH.GATING + + assert len(out_planes) == 6 + assert isinstance(out_planes, list) + + [ + num_out_0_0a, num_out_1_0a, num_out_1_0b, num_out_2_0a, + num_out_2_0b, num_out_3_0b + ] = out_planes + + self.branch0 = nn.Sequential( + InceptionBaseConv3D( + cfg, in_planes, num_out_0_0a, kernel_size=1, stride=1), ) + self.branch1 = nn.Sequential( + InceptionBaseConv3D( + cfg, in_planes, num_out_1_0a, kernel_size=1, stride=1), + STConv3d( + cfg, + num_out_1_0a, + num_out_1_0b, + kernel_size=3, + stride=1, + padding=1), + ) + self.branch2 = nn.Sequential( + InceptionBaseConv3D( + cfg, in_planes, num_out_2_0a, kernel_size=1, stride=1), + STConv3d( + cfg, + num_out_2_0a, + num_out_2_0b, + kernel_size=3, + stride=1, + padding=1), + ) + self.branch3 = nn.Sequential( + nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1), + InceptionBaseConv3D( + cfg, in_planes, num_out_3_0b, kernel_size=1, stride=1), + ) + + self.out_channels = sum( + [num_out_0_0a, num_out_1_0b, num_out_2_0b, num_out_3_0b]) + + self.gating = _gating + if _gating: + self.gating_b0 = SelfGating(num_out_0_0a) + self.gating_b1 = SelfGating(num_out_1_0b) + self.gating_b2 = SelfGating(num_out_2_0b) + self.gating_b3 = SelfGating(num_out_3_0b) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + x3 = self.branch3(x) + if self.gating: + x0 = self.gating_b0(x0) + x1 = self.gating_b1(x1) + x2 = self.gating_b2(x2) + x3 = self.gating_b3(x3) + + out = torch.cat((x0, x1, x2, x3), 1) + + return out + + +class SelfGating(nn.Module): + + def __init__(self, input_dim): + super(SelfGating, self).__init__() + self.fc = nn.Linear(input_dim, input_dim) + + def forward(self, input_tensor): + """Feature gating as used in S3D-G""" + spatiotemporal_average = torch.mean(input_tensor, dim=[2, 3, 4]) + weights = self.fc(spatiotemporal_average) + weights = torch.sigmoid(weights) + return weights[:, :, None, None, None] * input_tensor + + +class STConv3d(nn.Module): + """ + Element constructing the S3D/S3DG. + See models/base/backbone.py L99-186. + + Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. 
+ """ + + def __init__(self, + cfg, + in_planes, + out_planes, + kernel_size, + stride, + padding=0): + super(STConv3d, self).__init__() + if isinstance(stride, tuple): + t_stride = stride[0] + stride = stride[-1] + else: # int + t_stride = stride + + self.bn_mmt = cfg.BN.MOMENTUM + self.bn_eps = float(cfg.BN.EPS) + self._construct_branch(cfg, in_planes, out_planes, kernel_size, stride, + t_stride, padding) + + def _construct_branch(self, + cfg, + in_planes, + out_planes, + kernel_size, + stride, + t_stride, + padding=0): + self.conv1 = nn.Conv3d( + in_planes, + out_planes, + kernel_size=(1, kernel_size, kernel_size), + stride=(1, stride, stride), + padding=(0, padding, padding), + bias=False) + self.conv2 = nn.Conv3d( + out_planes, + out_planes, + kernel_size=(kernel_size, 1, 1), + stride=(t_stride, 1, 1), + padding=(padding, 0, 0), + bias=False) + + self.bn1 = nn.BatchNorm3d( + out_planes, eps=self.bn_eps, momentum=self.bn_mmt) + self.bn2 = nn.BatchNorm3d( + out_planes, eps=self.bn_eps, momentum=self.bn_mmt) + self.relu = nn.ReLU(inplace=True) + + # init + self.conv1.weight.data.normal_( + mean=0, std=0.01) # original s3d is truncated normal within 2 std + self.conv2.weight.data.normal_( + mean=0, std=0.01) # original s3d is truncated normal within 2 std + self.bn1.weight.data.fill_(1) + self.bn1.bias.data.zero_() + self.bn2.weight.data.fill_(1) + self.bn2.bias.data.zero_() + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + return x + + +class Inception3D(nn.Module): + """ + Backbone architecture for I3D/S3DG. + Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. + """ + + def __init__(self, cfg): + """ + Args: + cfg (Config): global config object. 
+ """ + super(Inception3D, self).__init__() + _input_channel = cfg.DATA.NUM_INPUT_CHANNELS + self._construct_backbone(cfg, _input_channel) + + def _construct_backbone(self, cfg, input_channel): + # ------------------- Block 1 ------------------- + self.Conv_1a = STConv3d( + cfg, input_channel, 64, kernel_size=7, stride=2, padding=3) + + self.block1 = nn.Sequential(self.Conv_1a) # (64, 32, 112, 112) + + # ------------------- Block 2 ------------------- + self.MaxPool_2a = nn.MaxPool3d( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) + self.Conv_2b = InceptionBaseConv3D( + cfg, 64, 64, kernel_size=1, stride=1) + self.Conv_2c = STConv3d( + cfg, 64, 192, kernel_size=3, stride=1, padding=1) + + self.block2 = nn.Sequential( + self.MaxPool_2a, # (64, 32, 56, 56) + self.Conv_2b, # (64, 32, 56, 56) + self.Conv_2c) # (192, 32, 56, 56) + + # ------------------- Block 3 ------------------- + self.MaxPool_3a = nn.MaxPool3d( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) + self.Mixed_3b = InceptionBlock3D( + cfg, in_planes=192, out_planes=[64, 96, 128, 16, 32, 32]) + self.Mixed_3c = InceptionBlock3D( + cfg, in_planes=256, out_planes=[128, 128, 192, 32, 96, 64]) + + self.block3 = nn.Sequential( + self.MaxPool_3a, # (192, 32, 28, 28) + self.Mixed_3b, # (256, 32, 28, 28) + self.Mixed_3c) # (480, 32, 28, 28) + + # ------------------- Block 4 ------------------- + self.MaxPool_4a = nn.MaxPool3d( + kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1)) + self.Mixed_4b = InceptionBlock3D( + cfg, in_planes=480, out_planes=[192, 96, 208, 16, 48, 64]) + self.Mixed_4c = InceptionBlock3D( + cfg, in_planes=512, out_planes=[160, 112, 224, 24, 64, 64]) + self.Mixed_4d = InceptionBlock3D( + cfg, in_planes=512, out_planes=[128, 128, 256, 24, 64, 64]) + self.Mixed_4e = InceptionBlock3D( + cfg, in_planes=512, out_planes=[112, 144, 288, 32, 64, 64]) + self.Mixed_4f = InceptionBlock3D( + cfg, in_planes=528, out_planes=[256, 160, 320, 32, 128, 128]) + + self.block4 = nn.Sequential( + self.MaxPool_4a, # (480, 16, 14, 14) + self.Mixed_4b, # (512, 16, 14, 14) + self.Mixed_4c, # (512, 16, 14, 14) + self.Mixed_4d, # (512, 16, 14, 14) + self.Mixed_4e, # (528, 16, 14, 14) + self.Mixed_4f) # (832, 16, 14, 14) + + # ------------------- Block 5 ------------------- + self.MaxPool_5a = nn.MaxPool3d( + kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0)) + self.Mixed_5b = InceptionBlock3D( + cfg, in_planes=832, out_planes=[256, 160, 320, 32, 128, 128]) + self.Mixed_5c = InceptionBlock3D( + cfg, in_planes=832, out_planes=[384, 192, 384, 48, 128, 128]) + + self.block5 = nn.Sequential( + self.MaxPool_5a, # (832, 8, 7, 7) + self.Mixed_5b, # (832, 8, 7, 7) + self.Mixed_5c) # (1024, 8, 7, 7) + + def forward(self, x): + if isinstance(x, dict): + x = x['video'] + x = self.block1(x) + x = self.block2(x) + x = self.block3(x) + x = self.block4(x) + x = self.block5(x) + return x diff --git a/modelscope/models/cv/body_3d_keypoints/__init__.py b/modelscope/models/cv/body_3d_keypoints/__init__.py new file mode 100644 index 00000000..4bb83936 --- /dev/null +++ b/modelscope/models/cv/body_3d_keypoints/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .body_3d_pose import BodyKeypointsDetection3D
+
+else:
+    _import_structure = {
+        'body_3d_pose': ['BodyKeypointsDetection3D'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
new file mode 100644
index 00000000..87cd4962
--- /dev/null
+++ b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
@@ -0,0 +1,246 @@
+import logging
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.body_3d_keypoints.canonical_pose_modules import (
+    TemporalModel, TransCan3Dkeys)
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+__all__ = ['BodyKeypointsDetection3D']
+
+
+class KeypointsTypes(object):
+    POSES_CAMERA = 'poses_camera'
+    POSES_TRAJ = 'poses_traj'
+
+
+@MODELS.register_module(
+    Tasks.body_3d_keypoints, module_name=Models.body_3d_keypoints)
+class BodyKeypointsDetection3D(TorchModel):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+
+        super().__init__(model_dir, *args, **kwargs)
+
+        self.model_dir = model_dir
+        model_path = osp.join(self.model_dir, ModelFile.TORCH_MODEL_FILE)
+        cfg_path = osp.join(self.model_dir, ModelFile.CONFIGURATION)
+        self.cfg = Config.from_file(cfg_path)
+        self._create_model()
+
+        if not osp.exists(model_path):
+            raise IOError(f'{model_path} does not exist.')
+
+        if torch.cuda.is_available():
+            self._device = torch.device('cuda')
+        else:
+            self._device = torch.device('cpu')
+        self.pretrained_state_dict = torch.load(
+            model_path, map_location=self._device)
+
+        self.load_pretrained()
+        self.to_device(self._device)
+        self.eval()
+
+    def _create_model(self):
+        self.model_pos = TemporalModel(
+            self.cfg.model.MODEL.IN_NUM_JOINTS,
+            self.cfg.model.MODEL.IN_2D_FEATURE,
+            self.cfg.model.MODEL.OUT_NUM_JOINTS,
+            filter_widths=self.cfg.model.MODEL.FILTER_WIDTHS,
+            causal=self.cfg.model.MODEL.CAUSAL,
+            dropout=self.cfg.model.MODEL.DROPOUT,
+            channels=self.cfg.model.MODEL.CHANNELS,
+            dense=self.cfg.model.MODEL.DENSE)
+
+        receptive_field = self.model_pos.receptive_field()
+        self.pad = (receptive_field - 1) // 2
+        if self.cfg.model.MODEL.CAUSAL:
+            self.causal_shift = self.pad
+        else:
+            self.causal_shift = 0
+
+        self.model_traj = TransCan3Dkeys(
+            in_channels=self.cfg.model.MODEL.IN_NUM_JOINTS
+            * self.cfg.model.MODEL.IN_2D_FEATURE,
+            num_features=1024,
+            out_channels=self.cfg.model.MODEL.OUT_3D_FEATURE,
+            num_blocks=4,
+            time_window=receptive_field)
+
+    def eval(self):
+        self.model_pos.eval()
+        self.model_traj.eval()
+
+    def train(self):
+        self.model_pos.train()
+        self.model_traj.train()
+
+    def to_device(self, device):
+        self.model_pos = self.model_pos.to(device)
+        self.model_traj = self.model_traj.to(device)
+
+    def load_pretrained(self):
+        if 'model_pos' in self.pretrained_state_dict:
+            self.model_pos.load_state_dict(
+                self.pretrained_state_dict['model_pos'], strict=False)
+        else:
+            logger.error(
+                'model_pos not found in pretrained_state_dict, skip loading')
+
+        if 'model_traj' in self.pretrained_state_dict:
+            self.model_traj.load_state_dict(
+                self.pretrained_state_dict['model_traj'], strict=False)
+        else:
+            logger.error(
+                'model_traj not found in pretrained_state_dict, skip loading')
+        logger.info('Loaded pretrained model.')
+
+    def preprocess(self, input: torch.Tensor) -> Dict[str, Any]:
+        """Preprocess the 2D input joints.
+
+        Args:
+            input (torch.Tensor): [NUM_FRAME, NUM_JOINTS, 2], input 2D human body keypoints.
+
+        Returns:
+            Dict[str, Any]: canonical 2D points and root-relative joints.
+        """
+        if 'cuda' == input.device.type:
+            input = input.data.cpu().numpy()
+        elif 'cpu' == input.device.type:
+            input = input.data.numpy()
+        pose2d = input
+
+        pose2d_canonical = self.canonicalize_2Ds(
+            pose2d, self.cfg.model.INPUT.FOCAL_LENGTH,
+            self.cfg.model.INPUT.CENTER)
+        pose2d_normalized = self.normalize_screen_coordinates(
+            pose2d, self.cfg.model.INPUT.RES_W, self.cfg.model.INPUT.RES_H)
+        pose2d_rr = pose2d_normalized
+        pose2d_rr[:, 1:] -= pose2d_rr[:, :1]
+
+        # expand [NUM_FRAME, NUM_JOINTS, 2] to [1, NUM_FRAME, NUM_JOINTS, 2]
+        pose2d_rr = np.expand_dims(
+            np.pad(
+                pose2d_rr,
+                ((self.pad + self.causal_shift, self.pad - self.causal_shift),
+                 (0, 0), (0, 0)), 'edge'),
+            axis=0)
+        pose2d_canonical = np.expand_dims(
+            np.pad(
+                pose2d_canonical,
+                ((self.pad + self.causal_shift, self.pad - self.causal_shift),
+                 (0, 0), (0, 0)), 'edge'),
+            axis=0)
+        pose2d_rr = torch.from_numpy(pose2d_rr.astype(np.float32))
+        pose2d_canonical = torch.from_numpy(
+            pose2d_canonical.astype(np.float32))
+
+        inputs_2d = pose2d_rr.clone()
+        if torch.cuda.is_available():
+            inputs_2d = inputs_2d.cuda(non_blocking=True)
+
+        # Positional model
+        if self.cfg.model.MODEL.USE_2D_OFFSETS:
+            inputs_2d[:, :, 0] = 0
+        else:
+            inputs_2d[:, :, 1:] += inputs_2d[:, :, :1]
+
+        return {
+            'inputs_2d': inputs_2d,
+            'pose2d_rr': pose2d_rr,
+            'pose2d_canonical': pose2d_canonical
+        }
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        """3D human pose estimation.
+
+        Args:
+            input (Dict):
+                inputs_2d: [1, NUM_FRAME, NUM_JOINTS, 2]
+                pose2d_rr: [1, NUM_FRAME, NUM_JOINTS, 2]
+                pose2d_canonical: [1, NUM_FRAME, NUM_JOINTS, 2]
+                NUM_FRAME = max(receptive_field + video_frame_number, video_frame_number)
+
+        Returns:
+            Dict[str, Any]:
+                "camera_pose": Tensor, [1, NUM_FRAME, OUT_NUM_JOINTS, OUT_3D_FEATURE_DIM],
+                    3D human pose keypoints in the camera frame.
+                "camera_traj": Tensor, [1, NUM_FRAME, 1, 3],
+                    root keypoint coordinates in the camera frame.
+ """ + inputs_2d = input['inputs_2d'] + pose2d_rr = input['pose2d_rr'] + pose2d_canonical = input['pose2d_canonical'] + with torch.no_grad(): + # predict 3D pose keypoints + predicted_3d_pos = self.model_pos(inputs_2d) + + # predict global trajectory + b1, w1, n1, d1 = inputs_2d.shape + + input_pose2d_abs = self.get_abs_2d_pts(w1, pose2d_rr, + pose2d_canonical) + b1, w1, n1, d1 = input_pose2d_abs.size() + b2, w2, n2, d2 = predicted_3d_pos.size() + + if torch.cuda.is_available(): + input_pose2d_abs = input_pose2d_abs.cuda(non_blocking=True) + + predicted_3d_traj = self.model_traj( + input_pose2d_abs.view(b1, w1, n1 * d1), + predicted_3d_pos.view(b2 * w2, n2 * d2)).view(b2, w2, -1, 3) + + predict_dict = { + KeypointsTypes.POSES_CAMERA: predicted_3d_pos, + KeypointsTypes.POSES_TRAJ: predicted_3d_traj + } + + return predict_dict + + def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr, + pose2d_canonical): + pad = self.pad + w = input_video_frame_num - pad * 2 + + lst_pose2d_rr = [] + lst_pose2d_cannoical = [] + for i in range(pad, w + pad): + lst_pose2d_rr.append(pose2d_rr[:, i - pad:i + pad + 1]) + lst_pose2d_cannoical.append(pose2d_canonical[:, + i - pad:i + pad + 1]) + + input_pose2d_rr = torch.concat(lst_pose2d_cannoical, axis=0) + input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0) + + if self.cfg.model.MODEL.USE_CANONICAL_COORDS: + input_pose2d_abs = input_pose2d_cannoical.clone() + else: + input_pose2d_abs = input_pose2d_rr.clone() + input_pose2d_abs[:, :, 1:] += input_pose2d_abs[:, :, :1] + + return input_pose2d_abs + + def canonicalize_2Ds(self, pos2d, f, c): + cs = np.array([c[0], c[1]]).reshape(1, 1, 2) + fs = np.array([f[0], f[1]]).reshape(1, 1, 2) + canoical_2Ds = (pos2d - cs) / fs + return canoical_2Ds + + def normalize_screen_coordinates(self, X, w, h): + assert X.shape[-1] == 2 + + # Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio + return X / w * 2 - [1, h / w] diff --git a/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py new file mode 100644 index 00000000..b3eac2e5 --- /dev/null +++ b/modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py @@ -0,0 +1,233 @@ +# The implementation is based on OSTrack, available at https://github.com/facebookresearch/VideoPose3D +import torch +import torch.nn as nn + + +class TemporalModelBase(nn.Module): + """ + Do not instantiate this class. + """ + + def __init__(self, num_joints_in, in_features, num_joints_out, + filter_widths, causal, dropout, channels): + super().__init__() + + # Validate input + for fw in filter_widths: + assert fw % 2 != 0, 'Only odd filter widths are supported' + + self.num_joints_in = num_joints_in + self.in_features = in_features + self.num_joints_out = num_joints_out + self.filter_widths = filter_widths + + self.drop = nn.Dropout(dropout) + self.relu = nn.ReLU(inplace=True) + + self.pad = [filter_widths[0] // 2] + self.expand_bn = nn.BatchNorm1d(channels, momentum=0.1) + self.shrink = nn.Conv1d(channels, num_joints_out * 3, 1) + + def set_bn_momentum(self, momentum): + self.expand_bn.momentum = momentum + for bn in self.layers_bn: + bn.momentum = momentum + + def receptive_field(self): + """ + Return the total receptive field of this model as # of frames. + """ + frames = 0 + for f in self.pad: + frames += f + return 1 + 2 * frames + + def total_causal_shift(self): + """ + Return the asymmetric offset for sequence padding. 
+ The returned value is typically 0 if causal convolutions are disabled, + otherwise it is half the receptive field. + """ + frames = self.causal_shift[0] + next_dilation = self.filter_widths[0] + for i in range(1, len(self.filter_widths)): + frames += self.causal_shift[i] * next_dilation + next_dilation *= self.filter_widths[i] + return frames + + def forward(self, x): + assert len(x.shape) == 4 + assert x.shape[-2] == self.num_joints_in + assert x.shape[-1] == self.in_features + + sz = x.shape[:3] + x = x.view(x.shape[0], x.shape[1], -1) + x = x.permute(0, 2, 1) + + x = self._forward_blocks(x) + + x = x.permute(0, 2, 1) + x = x.view(sz[0], -1, self.num_joints_out, 3) + + return x + + +class TemporalModel(TemporalModelBase): + """ + Reference 3D pose estimation model with temporal convolutions. + This implementation can be used for all use-cases. + """ + + def __init__(self, + num_joints_in, + in_features, + num_joints_out, + filter_widths, + causal=False, + dropout=0.25, + channels=1024, + dense=False): + """ + Initialize this model. + + Arguments: + num_joints_in -- number of input joints (e.g. 17 for Human3.6M) + in_features -- number of input features for each joint (typically 2 for 2D input) + num_joints_out -- number of output joints (can be different than input) + filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field + causal -- use causal convolutions instead of symmetric convolutions (for real-time applications) + dropout -- dropout probability + channels -- number of convolution channels + dense -- use regular dense convolutions instead of dilated convolutions (ablation experiment) + """ + super().__init__(num_joints_in, in_features, num_joints_out, + filter_widths, causal, dropout, channels) + + self.expand_conv = nn.Conv1d( + num_joints_in * in_features, + channels, + filter_widths[0], + bias=False) + + layers_conv = [] + layers_bn = [] + + self.causal_shift = [(filter_widths[0]) // 2 if causal else 0] + next_dilation = filter_widths[0] + for i in range(1, len(filter_widths)): + self.pad.append((filter_widths[i] - 1) * next_dilation // 2) + self.causal_shift.append((filter_widths[i] // 2 + * next_dilation) if causal else 0) + + layers_conv.append( + nn.Conv1d( + channels, + channels, + filter_widths[i] if not dense else (2 * self.pad[-1] + 1), + dilation=next_dilation if not dense else 1, + bias=False)) + layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1)) + layers_conv.append( + nn.Conv1d(channels, channels, 1, dilation=1, bias=False)) + layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1)) + + next_dilation *= filter_widths[i] + + self.layers_conv = nn.ModuleList(layers_conv) + self.layers_bn = nn.ModuleList(layers_bn) + + def _forward_blocks(self, x): + x = self.drop(self.relu(self.expand_bn(self.expand_conv(x)))) + for i in range(len(self.pad) - 1): + pad = self.pad[i + 1] + shift = self.causal_shift[i + 1] + res = x[:, :, pad + shift:x.shape[2] - pad + shift] + x = self.drop( + self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x)))) + x = res + self.drop( + self.relu(self.layers_bn[2 * i + 1]( + self.layers_conv[2 * i + 1](x)))) + + x = self.shrink(x) + return x + + +# regression of the trajectory +class TransCan3Dkeys(nn.Module): + + def __init__(self, + in_channels=74, + num_features=256, + out_channels=44, + time_window=10, + num_blocks=2): + super().__init__() + self.in_channels = in_channels + self.num_features = num_features + self.out_channels = out_channels + self.num_blocks = num_blocks + 
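+        # in_channels is the flattened 2D input per frame (num_joints * 2D
+        # feature dim), out_channels the regressed trajectory dimension, and
+        # time_window is set to the receptive field of the pose model so both
+        # branches see the same temporal context (see body_3d_pose.py above).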
+        self.time_window = time_window
+
+        self.expand_bn = nn.BatchNorm1d(self.num_features, momentum=0.1)
+        self.conv1 = nn.Sequential(
+            nn.ReplicationPad1d(1),
+            nn.Conv1d(
+                self.in_channels, self.num_features, kernel_size=3,
+                bias=False), self.expand_bn, nn.ReLU(inplace=True),
+            nn.Dropout(p=0.25))
+        self._make_blocks()
+        self.pad = nn.ReplicationPad1d(4)
+        self.relu = nn.ReLU(inplace=True)
+        self.drop = nn.Dropout(p=0.25)
+        self.reduce = nn.Conv1d(
+            self.num_features, self.num_features, kernel_size=self.time_window)
+        self.embedding_3d_1 = nn.Linear(in_channels // 2 * 3, 500)
+        self.embedding_3d_2 = nn.Linear(500, 500)
+        self.LReLU1 = nn.LeakyReLU()
+        self.LReLU2 = nn.LeakyReLU()
+        self.LReLU3 = nn.LeakyReLU()
+        self.out1 = nn.Linear(self.num_features + 500, self.num_features)
+        self.out2 = nn.Linear(self.num_features, self.out_channels)
+
+    def _make_blocks(self):
+        layers_conv = []
+        layers_bn = []
+        for i in range(self.num_blocks):
+            layers_conv.append(
+                nn.Conv1d(
+                    self.num_features,
+                    self.num_features,
+                    kernel_size=5,
+                    bias=False,
+                    dilation=2))
+            layers_bn.append(nn.BatchNorm1d(self.num_features))
+        self.layers_conv = nn.ModuleList(layers_conv)
+        self.layers_bn = nn.ModuleList(layers_bn)
+
+    def set_bn_momentum(self, momentum):
+        self.expand_bn.momentum = momentum
+        for bn in self.layers_bn:
+            bn.momentum = momentum
+
+    def forward(self, p2ds, p3d):
+        """
+        Args:
+            p2ds -- (B x T x (J * 2)) flattened 2D keypoint windows
+            p3d -- ((B * T) x (J * 3)) flattened 3D pose predictions
+        """
+        B, T, C = p2ds.shape
+        x = p2ds.permute((0, 2, 1))
+        x = self.conv1(x)
+        for i in range(self.num_blocks):
+            pre = x
+            x = self.pad(x)
+            x = self.layers_conv[i](x)
+            x = self.layers_bn[i](x)
+            x = self.drop(self.relu(x))
+            x = pre + x
+        x_2d = self.relu(self.reduce(x))
+        x_2d = x_2d.view(B, -1)
+        x_3d = self.LReLU1(self.embedding_3d_1(p3d))
+        x = torch.cat((x_2d, x_3d), 1)
+        x = self.LReLU3(self.out1(x))
+        x = self.out2(x)
+        return x
diff --git a/modelscope/models/cv/crowd_counting/cc_model.py b/modelscope/models/cv/crowd_counting/cc_model.py
index 4e3d0e9f..582b26f4 100644
--- a/modelscope/models/cv/crowd_counting/cc_model.py
+++ b/modelscope/models/cv/crowd_counting/cc_model.py
@@ -13,8 +13,8 @@ from modelscope.utils.constant import Tasks
     Tasks.crowd_counting, module_name=Models.crowd_counting)
 class HRNetCrowdCounting(TorchModel):

-    def __init__(self, model_dir: str):
-        super().__init__(model_dir)
+    def __init__(self, model_dir: str, **kwargs):
+        super().__init__(model_dir, **kwargs)

         from .hrnet_aspp_relu import HighResolutionNet as HRNet_aspp_relu
diff --git a/modelscope/models/cv/easycv_base.py b/modelscope/models/cv/easycv_base.py
new file mode 100644
index 00000000..7bc35e84
--- /dev/null
+++ b/modelscope/models/cv/easycv_base.py
@@ -0,0 +1,25 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
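EasyCVBaseModel below bridges two class hierarchies by calling each parent initializer explicitly instead of relying on a single cooperative super() chain. A stripped-down sketch of the same pattern; LibA and LibB are stand-ins, not real classes from either library:

class LibA:  # stand-in for easycv's BaseModel
    def __init__(self):
        self.a_ready = True


class LibB:  # stand-in for modelscope's TorchModel
    def __init__(self, model_dir=None):
        self.model_dir = model_dir


class Bridge(LibA, LibB):
    def __init__(self, model_dir=None):
        # explicit, order-controlled initialization of both bases
        LibA.__init__(self)
        LibB.__init__(self, model_dir=model_dir)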
+from easycv.models.base import BaseModel
+from easycv.utils.ms_utils import EasyCVMeta
+
+from modelscope.models.base import TorchModel
+
+
+class EasyCVBaseModel(BaseModel, TorchModel):
+    """Base model for EasyCV."""
+
+    def __init__(self, model_dir=None, args=(), kwargs={}):
+        kwargs.pop(EasyCVMeta.ARCH, None)  # pop useless keys
+        BaseModel.__init__(self)
+        TorchModel.__init__(self, model_dir=model_dir)
+
+    def forward(self, img, mode='train', **kwargs):
+        if self.training:
+            losses = self.forward_train(img, **kwargs)
+            loss, log_vars = self._parse_losses(losses)
+            return dict(loss=loss, log_vars=log_vars)
+        else:
+            return self.forward_test(img, **kwargs)
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
diff --git a/modelscope/models/cv/image_classification/mmcls_model.py b/modelscope/models/cv/image_classification/mmcls_model.py
index 6a65656e..a6789d0b 100644
--- a/modelscope/models/cv/image_classification/mmcls_model.py
+++ b/modelscope/models/cv/image_classification/mmcls_model.py
@@ -10,7 +10,7 @@ from modelscope.utils.constant import Tasks
     Tasks.image_classification, module_name=Models.classification_model)
 class ClassificationModel(TorchModel):

-    def __init__(self, model_dir: str):
+    def __init__(self, model_dir: str, **kwargs):

         import mmcv
         from mmcls.models import build_classifier
diff --git a/modelscope/models/cv/image_panoptic_segmentation/__init__.py b/modelscope/models/cv/image_panoptic_segmentation/__init__.py
new file mode 100644
index 00000000..2b2be4b7
--- /dev/null
+++ b/modelscope/models/cv/image_panoptic_segmentation/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .panseg_model import SwinLPanopticSegmentation
+
+else:
+    _import_structure = {
+        'panseg_model': ['SwinLPanopticSegmentation'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
new file mode 100644
index 00000000..f9022f90
--- /dev/null
+++ b/modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
@@ -0,0 +1,54 @@
+import os.path as osp
+
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+@MODELS.register_module(
+    Tasks.image_segmentation, module_name=Models.panoptic_segmentation)
+class SwinLPanopticSegmentation(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """Initialize from the model files under model_dir."""
+        super().__init__(model_dir, **kwargs)
+
+        from mmcv.runner import load_checkpoint
+        import mmcv
+        from mmdet.models import build_detector
+
+        config = osp.join(model_dir, 'config.py')
+
+        cfg = mmcv.Config.fromfile(config)
+        if 'pretrained' in cfg.model:
+            cfg.model.pretrained = None
+        elif 'init_cfg' in cfg.model.backbone:
+            cfg.model.backbone.init_cfg = None
+
+        # build model
+        cfg.model.train_cfg = None
+        self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+
+        # load model
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        checkpoint = load_checkpoint(
+            self.model, model_path, map_location='cpu')
+
+        self.CLASSES = checkpoint['meta']['CLASSES']
+        self.num_classes = len(self.CLASSES)
+        self.cfg = cfg
+
+    def inference(self, data):
+        """data is a dict containing img and img_metas, following mmdet conventions."""
+
+        with torch.no_grad():
+            results = self.model(return_loss=False, rescale=True, **data)
+        return results
+
+    def forward(self, Inputs):
+        return self.model(**Inputs)
diff --git a/modelscope/models/cv/image_reid_person/__init__.py b/modelscope/models/cv/image_reid_person/__init__.py
new file mode 100644
index 00000000..0fe0bede
--- /dev/null
+++ b/modelscope/models/cv/image_reid_person/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .pass_model import PASS
+
+else:
+    _import_structure = {
+        'pass_model': ['PASS'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_reid_person/pass_model.py b/modelscope/models/cv/image_reid_person/pass_model.py
new file mode 100644
index 00000000..2222fedb
--- /dev/null
+++ b/modelscope/models/cv/image_reid_person/pass_model.py
@@ -0,0 +1,136 @@
+# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on
+# https://github.com/CASIA-IVA-Lab/PASS-reID
+
+import os
+from enum import Enum
+
+import torch
+import torch.nn as nn
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile, Tasks
+from .transreid_model import vit_base_patch16_224_TransReID
+
+
+class Fusions(Enum):
+    CAT = 'cat'
+    MEAN = 'mean'
+
+
+@MODELS.register_module(
+    Tasks.image_reid_person, module_name=Models.image_reid_person)
+class PASS(TorchModel):
+
+    def __init__(self, cfg: Config, model_dir: str, **kwargs):
+        super(PASS, self).__init__(model_dir=model_dir)
+        size_train = cfg.INPUT.SIZE_TRAIN
+        sie_coe = cfg.MODEL.SIE_COE
+        stride_size = cfg.MODEL.STRIDE_SIZE
+        drop_path = cfg.MODEL.DROP_PATH
+        drop_out = cfg.MODEL.DROP_OUT
+        att_drop_rate = cfg.MODEL.ATT_DROP_RATE
+        gem_pooling = cfg.MODEL.GEM_POOLING
+        stem_conv = cfg.MODEL.STEM_CONV
+        weight = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        self.neck_feat = cfg.TEST.NECK_FEAT
+        self.dropout_rate = cfg.MODEL.DROPOUT_RATE
+        self.num_classes = cfg.DATASETS.NUM_CLASSES
+        self.multi_neck = cfg.MODEL.MULTI_NECK
+        self.feat_fusion = cfg.MODEL.FEAT_FUSION
+
+        self.base = vit_base_patch16_224_TransReID(
+            img_size=size_train,
+            sie_xishu=sie_coe,
+            stride_size=stride_size,
+            drop_path_rate=drop_path,
+            drop_rate=drop_out,
+            attn_drop_rate=att_drop_rate,
+            gem_pool=gem_pooling,
+            stem_conv=stem_conv)
+        self.in_planes = self.base.in_planes
+
+        if self.feat_fusion == Fusions.CAT.value:
+            self.classifier = nn.Linear(
+                self.in_planes * 2, self.num_classes, bias=False)
+        elif self.feat_fusion == Fusions.MEAN.value:
+            self.classifier = nn.Linear(
+                self.in_planes, self.num_classes, bias=False)
+
+        if self.multi_neck:
+            self.bottleneck = nn.BatchNorm1d(self.in_planes)
+            self.bottleneck.bias.requires_grad_(False)
+            self.bottleneck_1 = nn.BatchNorm1d(self.in_planes)
+            self.bottleneck_1.bias.requires_grad_(False)
+            self.bottleneck_2 = nn.BatchNorm1d(self.in_planes)
self.bottleneck_2.bias.requires_grad_(False) + self.bottleneck_3 = nn.BatchNorm1d(self.in_planes) + self.bottleneck_3.bias.requires_grad_(False) + else: + if self.feat_fusion == Fusions.CAT.value: + self.bottleneck = nn.BatchNorm1d(self.in_planes * 2) + self.bottleneck.bias.requires_grad_(False) + elif self.feat_fusion == Fusions.MEAN.value: + self.bottleneck = nn.BatchNorm1d(self.in_planes) + self.bottleneck.bias.requires_grad_(False) + + self.dropout = nn.Dropout(self.dropout_rate) + + self.load_param(weight) + + def forward(self, input): + + global_feat, local_feat_1, local_feat_2, local_feat_3 = self.base( + input) + + # single-neck, almost the same performance + if not self.multi_neck: + if self.feat_fusion == Fusions.MEAN.value: + local_feat = local_feat_1 / 3. + local_feat_2 / 3. + local_feat_3 / 3. + final_feat_before = (global_feat + local_feat) / 2 + elif self.feat_fusion == Fusions.CAT.value: + final_feat_before = torch.cat( + (global_feat, local_feat_1 / 3. + local_feat_2 / 3. + + local_feat_3 / 3.), + dim=1) + + final_feat_after = self.bottleneck(final_feat_before) + # multi-neck + else: + feat = self.bottleneck(global_feat) + local_feat_1_bn = self.bottleneck_1(local_feat_1) + local_feat_2_bn = self.bottleneck_2(local_feat_2) + local_feat_3_bn = self.bottleneck_3(local_feat_3) + + if self.feat_fusion == Fusions.MEAN.value: + final_feat_before = ((global_feat + local_feat_1 / 3 + + local_feat_2 / 3 + local_feat_3 / 3) + / 2.) + final_feat_after = (feat + local_feat_1_bn / 3 + + local_feat_2_bn / 3 + + local_feat_3_bn / 3) / 2. + elif self.feat_fusion == Fusions.CAT.value: + final_feat_before = torch.cat( + (global_feat, local_feat_1 / 3. + local_feat_2 / 3. + + local_feat_3 / 3.), + dim=1) + final_feat_after = torch.cat( + (feat, local_feat_1_bn / 3 + local_feat_2_bn / 3 + + local_feat_3_bn / 3), + dim=1) + + if self.neck_feat == 'after': + return final_feat_after + else: + return final_feat_before + + def load_param(self, trained_path): + param_dict = torch.load(trained_path, map_location='cpu') + for i in param_dict: + try: + self.state_dict()[i.replace('module.', + '')].copy_(param_dict[i]) + except Exception: + continue diff --git a/modelscope/models/cv/image_reid_person/transreid_model.py b/modelscope/models/cv/image_reid_person/transreid_model.py new file mode 100644 index 00000000..275c4e22 --- /dev/null +++ b/modelscope/models/cv/image_reid_person/transreid_model.py @@ -0,0 +1,418 @@ +# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on +# https://github.com/CASIA-IVA-Lab/PASS-reID + +import collections.abc as container_abcs +from functools import partial +from itertools import repeat + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# From PyTorch internals +def _ntuple(n): + + def parse(x): + if isinstance(x, container_abcs.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + + +def vit_base_patch16_224_TransReID( + img_size=(256, 128), + stride_size=16, + drop_path_rate=0.1, + camera=0, + view=0, + local_feature=False, + sie_xishu=1.5, + **kwargs): + model = TransReID( + img_size=img_size, + patch_size=16, + stride_size=stride_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + camera=camera, + view=view, + drop_path_rate=drop_path_rate, + sie_xishu=sie_xishu, + local_feature=local_feature, + **kwargs) + return model + + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic 
Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class TransReID(nn.Module): + """Transformer-based Object Re-Identification + """ + + def __init__(self, + img_size=224, + patch_size=16, + stride_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + camera=0, + view=0, + drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), + local_feature=False, + sie_xishu=1.0, + hw_ratio=1, + gem_pool=False, + stem_conv=False): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.local_feature = local_feature + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + stride_size=stride_size, + in_chans=in_chans, + embed_dim=embed_dim, + stem_conv=stem_conv) + + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.part_token1 = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.part_token2 = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.part_token3 = nn.Parameter(torch.zeros(1, 1, embed_dim)) + + self.cls_pos = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.part1_pos = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.part2_pos = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.part3_pos = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + self.cam_num = camera + self.view_num = view + self.sie_xishu = sie_xishu + self.in_planes = 768 + self.gem_pool = gem_pool + + # Initialize SIE Embedding + if camera > 1 and view > 1: + self.sie_embed = nn.Parameter( + torch.zeros(camera * view, 1, embed_dim)) + elif camera > 1: + self.sie_embed = nn.Parameter(torch.zeros(camera, 1, embed_dim)) + elif view > 1: + self.sie_embed = nn.Parameter(torch.zeros(view, 1, embed_dim)) + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer) for i in range(depth) + ]) + + self.norm = norm_layer(embed_dim) + + # Classifier head + self.fc = nn.Linear(embed_dim, + num_classes) if num_classes > 0 else nn.Identity() + + self.gem = GeneralizedMeanPooling() + + def forward_features(self, x, camera_id, view_id): + B = x.shape[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand( + B, -1, 
-1) # stole cls_tokens impl from Phil Wang, thanks + part_tokens1 = self.part_token1.expand(B, -1, -1) + part_tokens2 = self.part_token2.expand(B, -1, -1) + part_tokens3 = self.part_token3.expand(B, -1, -1) + x = torch.cat( + (cls_tokens, part_tokens1, part_tokens2, part_tokens3, x), dim=1) + + if self.cam_num > 0 and self.view_num > 0: + x = x + self.pos_embed + self.sie_xishu * self.sie_embed[ + camera_id * self.view_num + view_id] + elif self.cam_num > 0: + x = x + self.pos_embed + self.sie_xishu * self.sie_embed[camera_id] + elif self.view_num > 0: + x = x + self.pos_embed + self.sie_xishu * self.sie_embed[view_id] + else: + x = x + torch.cat((self.cls_pos, self.part1_pos, self.part2_pos, + self.part3_pos, self.pos_embed), + dim=1) + + x = self.pos_drop(x) + + if self.local_feature: + for blk in self.blocks[:-1]: + x = blk(x) + return x + else: + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + if self.gem_pool: + gf = self.gem(x[:, 1:].permute(0, 2, 1)).squeeze() + return x[:, 0] + gf + return x[:, 0], x[:, 1], x[:, 2], x[:, 3] + + def forward(self, x, cam_label=None, view_label=None): + global_feat, local_feat_1, local_feat_2, local_feat_3 = self.forward_features( + x, cam_label, view_label) + return global_feat, local_feat_1, local_feat_2, local_feat_3 + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding with overlapping patches + """ + + def __init__(self, + img_size=224, + patch_size=16, + stride_size=16, + in_chans=3, + embed_dim=768, + stem_conv=False): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + stride_size_tuple = to_2tuple(stride_size) + self.num_x = (img_size[1] - patch_size[1]) // stride_size_tuple[1] + 1 + self.num_y = (img_size[0] - patch_size[0]) // stride_size_tuple[0] + 1 + self.num_patches = self.num_x * self.num_y + self.img_size = img_size + self.patch_size = patch_size + + self.stem_conv = stem_conv + if self.stem_conv: + hidden_dim = 64 + stem_stride = 2 + stride_size = patch_size = patch_size[0] // stem_stride + self.conv = nn.Sequential( + nn.Conv2d( + in_chans, + hidden_dim, + kernel_size=7, + stride=stem_stride, + padding=3, + bias=False), + IBN(hidden_dim), + nn.ReLU(inplace=True), + nn.Conv2d( + hidden_dim, + hidden_dim, + kernel_size=3, + stride=1, + padding=1, + bias=False), + IBN(hidden_dim), + nn.ReLU(inplace=True), + nn.Conv2d( + hidden_dim, + hidden_dim, + kernel_size=3, + stride=1, + padding=1, + bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU(inplace=True), + ) + in_chans = hidden_dim + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=stride_size) + + def forward(self, x): + if self.stem_conv: + x = self.conv(x) + x = self.proj(x) + x = x.flatten(2).transpose(1, 2) # [64, 8, 768] + + return x + + +class GeneralizedMeanPooling(nn.Module): + """Applies a 2D power-average adaptive pooling over an input signal composed of several input planes. + The function computed is: :math:`f(X) = pow(sum(pow(X, p)), 1/p)` + - At p = infinity, one gets Max Pooling + - At p = 1, one gets Average Pooling + The output is of size H x W, for any input size. + The number of output features is equal to the number of input planes. + Args: + output_size: the target output size of the image of the form H x W. + Can be a tuple (H, W) or a single H for a square image H x H + H and W can be either a ``int``, or ``None`` which means the size will + be the same as that of the input. 
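+        In this model the pooling is applied along the token dimension via
+        adaptive_avg_pool1d, so the H x W description above is inherited from
+        the original 2D implementation rather than literal here.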
+ """ + + def __init__(self, norm=3, output_size=1, eps=1e-6): + super(GeneralizedMeanPooling, self).__init__() + assert norm > 0 + self.p = float(norm) + self.output_size = output_size + self.eps = eps + + def forward(self, x): + x = x.clamp(min=self.eps).pow(self.p) + return F.adaptive_avg_pool1d(x, self.output_size).pow(1. / self.p) + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/modelscope/models/cv/image_semantic_segmentation/__init__.py b/modelscope/models/cv/image_semantic_segmentation/__init__.py new file mode 100644 index 00000000..df56c5b8 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .semantic_seg_model import SemanticSegmentation + from .segformer import Segformer + +else: + _import_structure = { + 'semantic_seg_model': ['SemanticSegmentation'], + 'segformer': ['Segformer'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py new file mode 100644 index 00000000..2a75f318 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py @@ -0,0 +1 @@ +from .maskformer_semantic_head import MaskFormerSemanticHead diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py new file mode 100644 index 00000000..05e68d89 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +from mmcv.runner import BaseModule +from mmdet.models.builder import build_loss + + +class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta): + """Base class for panoptic heads.""" + + def __init__(self, + num_things_classes=80, + num_stuff_classes=53, + test_cfg=None, + loss_panoptic=None, + init_cfg=None, + **kwargs): + super(BasePanopticFusionHead, self).__init__(init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = num_things_classes + num_stuff_classes + self.test_cfg = test_cfg + + if loss_panoptic: + self.loss_panoptic = build_loss(loss_panoptic) + else: + self.loss_panoptic = None + + @property + def with_loss(self): + """bool: whether the panoptic head contains loss function.""" + return self.loss_panoptic is not None + + @abstractmethod + def forward_train(self, gt_masks=None, gt_semantic_seg=None, **kwargs): + """Forward function during training.""" + + @abstractmethod + def simple_test(self, + img_metas, + det_labels, + mask_preds, + seg_preds, + det_bboxes, + cfg=None, + **kwargs): + """Test without augmentation.""" diff --git a/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py new file mode 100644 index 00000000..6769ebaf --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py @@ -0,0 +1,57 @@ +import torch +import torch.nn.functional as F +from mmdet.models.builder import HEADS + +from .base_panoptic_fusion_head import BasePanopticFusionHead + + +@HEADS.register_module() +class MaskFormerSemanticHead(BasePanopticFusionHead): + + def __init__(self, + num_things_classes=80, + num_stuff_classes=53, + test_cfg=None, + loss_panoptic=None, + init_cfg=None, + **kwargs): + super().__init__(num_things_classes, num_stuff_classes, test_cfg, + loss_panoptic, init_cfg, **kwargs) + + def forward_train(self, **kwargs): + """MaskFormerFusionHead has no training loss.""" + return dict() + + def simple_test(self, + mask_cls_results, + mask_pred_results, + img_metas, + rescale=False, + **kwargs): + results = [] + for mask_cls_result, mask_pred_result, meta in 
zip(
+                mask_cls_results, mask_pred_results, img_metas):
+            # remove padding
+            img_height, img_width = meta['img_shape'][:2]
+            mask_pred_result = mask_pred_result[:, :img_height, :img_width]
+
+            if rescale:
+                # return result in original resolution
+                ori_height, ori_width = meta['ori_shape'][:2]
+                mask_pred_result = F.interpolate(
+                    mask_pred_result[:, None],
+                    size=(ori_height, ori_width),
+                    mode='bilinear',
+                    align_corners=False)[:, 0]
+
+            # semantic inference
+            cls_score = F.softmax(mask_cls_result, dim=-1)[..., :-1]
+            mask_pred = mask_pred_result.sigmoid()
+            seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred)
+            # still need softmax and argmax
+            seg_logit = F.softmax(seg_mask, dim=0)
+            seg_pred = seg_logit.argmax(dim=0)
+            seg_pred = seg_pred.cpu().numpy()
+            results.append(seg_pred)
+
+        return results
diff --git a/modelscope/models/cv/image_semantic_segmentation/segformer.py b/modelscope/models/cv/image_semantic_segmentation/segformer.py
new file mode 100644
index 00000000..46303526
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/segformer.py
@@ -0,0 +1,16 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from easycv.models.segmentation import EncoderDecoder
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.easycv_base import EasyCVBaseModel
+from modelscope.utils.constant import Tasks
+
+
+@MODELS.register_module(
+    group_key=Tasks.image_segmentation, module_name=Models.segformer)
+class Segformer(EasyCVBaseModel, EncoderDecoder):
+
+    def __init__(self, model_dir=None, *args, **kwargs):
+        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
+        EncoderDecoder.__init__(self, *args, **kwargs)
diff --git a/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
new file mode 100644
index 00000000..60acf28f
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
@@ -0,0 +1,76 @@
+import os.path as osp
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_semantic_segmentation import (pan_merge,
+                                                              vit_adapter)
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+@MODELS.register_module(
+    Tasks.image_segmentation, module_name=Models.swinL_semantic_segmentation)
+@MODELS.register_module(
+    Tasks.image_segmentation,
+    module_name=Models.vitadapter_semantic_segmentation)
+class SemanticSegmentation(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """Initialize from the model files under model_dir."""
+        super().__init__(model_dir, **kwargs)
+
+        from mmcv.runner import load_checkpoint
+        import mmcv
+        from mmdet.models import build_detector
+
+        config = osp.join(model_dir, 'mmcv_config.py')
+        cfg = mmcv.Config.fromfile(config)
+        if 'pretrained' in cfg.model:
+            cfg.model.pretrained = None
+        elif 'init_cfg' in cfg.model.backbone:
+            cfg.model.backbone.init_cfg = None
+
+        # build model
+        cfg.model.train_cfg = None
+        self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+
+        # load model
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        _ = load_checkpoint(self.model, model_path, map_location='cpu')
+
+        self.CLASSES = cfg['CLASSES']  # list
+        self.PALETTE = cfg['PALETTE']  # list
+
+        self.num_classes = len(self.CLASSES)
+        self.cfg = cfg
+    def forward(self, Inputs):
+        return self.model(**Inputs)
+
+    def postprocess(self, Inputs):
+        semantic_result = Inputs[0]
+
+        ids = np.unique(semantic_result)[::-1]
+        legal_indices = ids != self.model.num_classes  # for VOID label
+        ids = ids[legal_indices]
+
+        segms = (semantic_result[None] == ids[:, None, None])
+        masks = [it.astype(int) for it in segms]
+        labels_txt = np.array(self.CLASSES)[ids].tolist()
+
+        # semantic segmentation has no per-mask confidence, so a constant
+        # placeholder score is emitted for each label
+        results = {
+            OutputKeys.MASKS: masks,
+            OutputKeys.LABELS: labels_txt,
+            OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))]
+        }
+        return results
+
+    def inference(self, data):
+        with torch.no_grad():
+            results = self.model(return_loss=False, rescale=True, **data)
+
+        return results
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
new file mode 100644
index 00000000..82eec1c6
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
@@ -0,0 +1,3 @@
+from .models import backbone, decode_heads, segmentors
+from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler,
+                    seg_resize)
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
new file mode 100644
index 00000000..ae5c5acf
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
@@ -0,0 +1,3 @@
+from .backbone import BASEBEiT, BEiTAdapter
+from .decode_heads import Mask2FormerHeadFromMMSeg
+from .segmentors import EncoderDecoderMask2Former
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
new file mode 100644
index 00000000..ab4258c1
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
@@ -0,0 +1,4 @@
+from .base import BASEBEiT
+from .beit_adapter import BEiTAdapter
+
+__all__ = ['BEiTAdapter', 'BASEBEiT']
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
new file mode 100644
index 00000000..03080342
--- /dev/null
+++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
@@ -0,0 +1,523 @@
+# The implementation refers to ViT-Adapter, available at
+# https://github.com/czczup/ViT-Adapter.git
+
+import logging
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmdet.models.utils.transformer import MultiScaleDeformableAttention
+from timm.models.layers import DropPath
+
+_logger = logging.getLogger(__name__)
+
+
+def get_reference_points(spatial_shapes, device):
+    reference_points_list = []
+    for lvl, (H_, W_) in enumerate(spatial_shapes):
+        ref_y, ref_x = torch.meshgrid(
+            torch.linspace(
+                0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
+            torch.linspace(
+                0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
+        ref_y = ref_y.reshape(-1)[None] / H_
+        ref_x = ref_x.reshape(-1)[None] / W_
+        ref = torch.stack((ref_x, ref_y), -1)
+        reference_points_list.append(ref)
+    reference_points = torch.cat(reference_points_list, 1)
+    reference_points = reference_points[:, :, None]
+    return reference_points
+
+
+def deform_inputs(x):
+    bs, c, h, w = x.shape
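+    # Two sets of deformable-attention metadata are built below:
+    # deform_inputs1 places queries on the 1/16 ViT grid attending over the
+    # three-level (1/8, 1/16, 1/32) spatial prior (used by the Injector),
+    # while deform_inputs2 reverses the roles (used by the Extractor).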
spatial_shapes = torch.as_tensor([(h // 8, w // 8), (h // 16, w // 16), + (h // 32, w // 32)], + dtype=torch.long, + device=x.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points([(h // 16, w // 16)], x.device) + deform_inputs1 = [reference_points, spatial_shapes, level_start_index] + + spatial_shapes = torch.as_tensor([(h // 16, w // 16)], + dtype=torch.long, + device=x.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points([(h // 8, w // 8), + (h // 16, w // 16), + (h // 32, w // 32)], x.device) + deform_inputs2 = [reference_points, spatial_shapes, level_start_index] + + return deform_inputs1, deform_inputs2 + + +class ConvFFN(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class DWConv(nn.Module): + + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + n = N // 21 + x1 = x[:, 0:16 * n, :].transpose(1, 2).view(B, C, H * 2, + W * 2).contiguous() + x2 = x[:, 16 * n:20 * n, :].transpose(1, 2).view(B, C, H, + W).contiguous() + x3 = x[:, 20 * n:, :].transpose(1, 2).view(B, C, H // 2, + W // 2).contiguous() + x1 = self.dwconv(x1).flatten(2).transpose(1, 2) + x2 = self.dwconv(x2).flatten(2).transpose(1, 2) + x3 = self.dwconv(x3).flatten(2).transpose(1, 2) + x = torch.cat([x1, x2, x3], dim=1) + return x + + +class Extractor(nn.Module): + + def __init__(self, + dim, + num_heads=6, + n_points=4, + n_levels=1, + deform_ratio=1.0, + with_cffn=True, + cffn_ratio=0.25, + drop=0., + drop_path=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), + with_cp=False): + super().__init__() + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + self.attn = MultiScaleDeformableAttention( + embed_dims=dim, + num_heads=num_heads, + num_levels=n_levels, + num_points=n_points, + batch_first=True) + + # modify to fit the deform_ratio + value_proj_in_features = self.attn.value_proj.weight.shape[0] + value_proj_out_features = int(value_proj_in_features * deform_ratio) + self.attn.value_proj = nn.Linear(value_proj_in_features, + value_proj_out_features) + self.attn.output_proj = nn.Linear(value_proj_out_features, + value_proj_in_features) + + self.with_cffn = with_cffn + self.with_cp = with_cp + if with_cffn: + self.ffn = ConvFFN( + in_features=dim, + hidden_features=int(dim * cffn_ratio), + drop=drop) + self.ffn_norm = norm_layer(dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, query, reference_points, feat, spatial_shapes, + level_start_index, H, W): + + def _inner_forward(query, feat): + attn = self.attn( + query=self.query_norm(query), + key=None, + value=self.feat_norm(feat), + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index) + + query = query + attn + + if self.with_cffn: + query = query + self.drop_path( + self.ffn(self.ffn_norm(query), H, W)) + return query + + if self.with_cp and query.requires_grad: + query = cp.checkpoint(_inner_forward, query, feat) + else: + query = _inner_forward(query, feat) + + return query + + +class Injector(nn.Module): + + def __init__(self, + dim, + num_heads=6, + n_points=4, + n_levels=1, + deform_ratio=1.0, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + init_values=0., + with_cp=False): + super().__init__() + self.with_cp = with_cp + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + self.attn = MultiScaleDeformableAttention( + embed_dims=dim, + num_heads=num_heads, + num_levels=n_levels, + num_points=n_points, + batch_first=True) + + # modify to fit the deform_ratio + value_proj_in_features = self.attn.value_proj.weight.shape[0] + value_proj_out_features = int(value_proj_in_features * deform_ratio) + self.attn.value_proj = nn.Linear(value_proj_in_features, + value_proj_out_features) + self.attn.output_proj = nn.Linear(value_proj_out_features, + value_proj_in_features) + + self.gamma = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + + def forward(self, query, reference_points, feat, spatial_shapes, + level_start_index): + + def _inner_forward(query, feat): + input_query = self.query_norm(query) + input_value = self.feat_norm(feat) + attn = self.attn( + query=input_query, + key=None, + value=input_value, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index) + return query + self.gamma * attn + + if self.with_cp and query.requires_grad: + query = cp.checkpoint(_inner_forward, query, feat) + else: + query = _inner_forward(query, feat) + + return query + + +class InteractionBlock(nn.Module): + + def __init__(self, + dim, + num_heads=6, + n_points=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + drop=0., + drop_path=0., + with_cffn=True, + cffn_ratio=0.25, + init_values=0., + deform_ratio=1.0, + extra_extractor=False, + with_cp=False): + super().__init__() + + self.injector = Injector( + dim=dim, + n_levels=3, + num_heads=num_heads, + init_values=init_values, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cp=with_cp) + self.extractor = Extractor( + dim=dim, + n_levels=1, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + drop=drop, + drop_path=drop_path, + with_cp=with_cp) + if extra_extractor: + self.extra_extractors = nn.Sequential(*[ + Extractor( + dim=dim, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + drop=drop, + drop_path=drop_path, + with_cp=with_cp) for _ in range(2) + ]) + else: + self.extra_extractors = None + + def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W): + x = self.injector( + query=x, + reference_points=deform_inputs1[0], + feat=c, + 
spatial_shapes=deform_inputs1[1], + level_start_index=deform_inputs1[2]) + for idx, blk in enumerate(blocks): + x = blk(x, H, W) + c = self.extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H, + W=W) + if self.extra_extractors is not None: + for extractor in self.extra_extractors: + c = extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H, + W=W) + return x, c + + +class InteractionBlockWithCls(nn.Module): + + def __init__(self, + dim, + num_heads=6, + n_points=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + drop=0., + drop_path=0., + with_cffn=True, + cffn_ratio=0.25, + init_values=0., + deform_ratio=1.0, + extra_extractor=False, + with_cp=False): + super().__init__() + + self.injector = Injector( + dim=dim, + n_levels=3, + num_heads=num_heads, + init_values=init_values, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cp=with_cp) + self.extractor = Extractor( + dim=dim, + n_levels=1, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + deform_ratio=deform_ratio, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + drop=drop, + drop_path=drop_path, + with_cp=with_cp) + if extra_extractor: + self.extra_extractors = nn.Sequential(*[ + Extractor( + dim=dim, + num_heads=num_heads, + n_points=n_points, + norm_layer=norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + drop=drop, + drop_path=drop_path, + with_cp=with_cp) for _ in range(2) + ]) + else: + self.extra_extractors = None + + def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W): + x = self.injector( + query=x, + reference_points=deform_inputs1[0], + feat=c, + spatial_shapes=deform_inputs1[1], + level_start_index=deform_inputs1[2]) + x = torch.cat((cls, x), dim=1) + for idx, blk in enumerate(blocks): + x = blk(x, H, W) + cls, x = x[:, :1, ], x[:, 1:, ] + c = self.extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H, + W=W) + if self.extra_extractors is not None: + for extractor in self.extra_extractors: + c = extractor( + query=c, + reference_points=deform_inputs2[0], + feat=x, + spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], + H=H, + W=W) + return x, c, cls + + +class SpatialPriorModule(nn.Module): + + def __init__(self, inplanes=64, embed_dim=384, with_cp=False): + super().__init__() + self.with_cp = with_cp + + self.stem = nn.Sequential(*[ + nn.Conv2d( + 3, inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.Conv2d( + inplanes, + inplanes, + kernel_size=3, + stride=1, + padding=1, + bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.Conv2d( + inplanes, + inplanes, + kernel_size=3, + stride=1, + padding=1, + bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + ]) + self.conv2 = nn.Sequential(*[ + nn.Conv2d( + inplanes, + 2 * inplanes, + kernel_size=3, + stride=2, + padding=1, + bias=False), + nn.SyncBatchNorm(2 * inplanes), + nn.ReLU(inplace=True) + ]) + self.conv3 = nn.Sequential(*[ + nn.Conv2d( + 2 * inplanes, + 4 * inplanes, + kernel_size=3, + stride=2, + padding=1, + bias=False), + nn.SyncBatchNorm(4 * inplanes), + nn.ReLU(inplace=True) + ]) + 
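+        # conv4 adds one more stride-2 stage: with the stem (1/4), conv2
+        # (1/8) and conv3 (1/16), this produces the coarsest 1/32 feature map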
self.conv4 = nn.Sequential(*[ + nn.Conv2d( + 4 * inplanes, + 4 * inplanes, + kernel_size=3, + stride=2, + padding=1, + bias=False), + nn.SyncBatchNorm(4 * inplanes), + nn.ReLU(inplace=True) + ]) + self.fc1 = nn.Conv2d( + inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) + self.fc2 = nn.Conv2d( + 2 * inplanes, + embed_dim, + kernel_size=1, + stride=1, + padding=0, + bias=True) + self.fc3 = nn.Conv2d( + 4 * inplanes, + embed_dim, + kernel_size=1, + stride=1, + padding=0, + bias=True) + self.fc4 = nn.Conv2d( + 4 * inplanes, + embed_dim, + kernel_size=1, + stride=1, + padding=0, + bias=True) + + def forward(self, x): + + def _inner_forward(x): + c1 = self.stem(x) + c2 = self.conv2(c1) + c3 = self.conv3(c2) + c4 = self.conv4(c3) + c1 = self.fc1(c1) + c2 = self.fc2(c2) + c3 = self.fc3(c3) + c4 = self.fc4(c4) + + bs, dim, _, _ = c1.shape + + c2 = c2.view(bs, dim, -1).transpose(1, 2) # 8s + c3 = c3.view(bs, dim, -1).transpose(1, 2) # 16s + c4 = c4.view(bs, dim, -1).transpose(1, 2) # 32s + + return c1, c2, c3, c4 + + if self.with_cp and x.requires_grad: + outs = cp.checkpoint(_inner_forward, x) + else: + outs = _inner_forward(x) + return outs diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py new file mode 100644 index 00000000..40b0fa89 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py @@ -0,0 +1,3 @@ +from .beit import BASEBEiT + +__all__ = ['BASEBEiT'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py new file mode 100644 index 00000000..a5811fb9 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py @@ -0,0 +1,476 @@ +# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# This implementation refers to +# https://github.com/czczup/ViT-Adapter.git +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.runner import _load_checkpoint +from mmdet.models.builder import BACKBONES +from mmdet.utils import get_root_logger +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # commit dropout for the original BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): 
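+    """Multi-head self-attention with optional BEiT-style relative position
+    bias over a fixed window (including the extra cls-token terms)."""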
+ + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + window_size=None, + attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] + - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + self.register_buffer('relative_position_index', + relative_position_index) + + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + (self.q_bias, + torch.zeros_like(self.v_bias, + requires_grad=False), self.v_bias)) + + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) 
+ x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + with_cp=False): + super().__init__() + self.with_cp = with_cp + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + attn_head_dim=attn_head_dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + if init_values is not None: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, H, W, rel_pos_bias=None): + + def _inner_forward(x): + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * ( + img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + # assert H == self.img_size[0] and W == self.img_size[1], \ + # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + + x = x.flatten(2).transpose(1, 2) + return x, Hp, Wp + + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + """ + + def __init__(self, + backbone, + img_size=224, + feature_size=None, + in_chans=3, + embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature + # map for all networks, the feature metadata has reliable channel and stride info, but using + # stride to calc feature dim requires info about padding of each stage that isn't captured. 
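+                # workaround: run a throwaway forward pass in eval mode on a
+                # zero tensor and read the feature size/dim off the output,
+                # then restore the module's original training flag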
+ training = backbone.training + if training: + backbone.eval() + o = self.backbone( + torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +class RelativePositionBias(nn.Module): + + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] + - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, + 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer('relative_position_index', + relative_position_index) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +@BACKBONES.register_module() +class BASEBEiT(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, + img_size=512, + patch_size=16, + in_chans=3, + num_classes=80, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + hybrid_backbone=None, + norm_layer=None, + init_values=None, + use_checkpoint=False, + use_abs_pos_emb=False, + use_rel_pos_bias=True, + use_shared_rel_pos_bias=False, + pretrained=None, + with_cp=False): + super().__init__() + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + self.norm_layer = norm_layer + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.drop_path_rate = drop_path_rate + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, + img_size=img_size, + in_chans=in_chans, + embed_dim=embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + 
in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + if use_abs_pos_emb: + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + 1, embed_dim)) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.use_checkpoint = use_checkpoint + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + with_cp=with_cp, + init_values=init_values, + window_size=self.patch_embed.patch_shape + if use_rel_pos_bias else None) for i in range(depth) + ]) + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + self.init_weights(pretrained) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + init_cfg = dict(type='Pretrained', checkpoint=pretrained) + + checkpoint = _load_checkpoint( + init_cfg['checkpoint'], logger=logger, map_location='cpu') + state_dict = self.resize_rel_pos_embed(checkpoint) + self.load_state_dict(state_dict, False) + + def fix_init_weight(self): + + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_num_layers(self): + return len(self.blocks) diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py new file mode 100644 index 00000000..02a4968e --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py @@ -0,0 +1,169 @@ +# The implementation refers to the VitAdapter +# available at +# https://github.com/czczup/ViT-Adapter.git +import logging +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.models.builder import BACKBONES +from mmdet.models.utils.transformer import MultiScaleDeformableAttention +from timm.models.layers import DropPath, trunc_normal_ +from torch.nn.init import normal_ + +from .adapter_modules import InteractionBlockWithCls as InteractionBlock +from .adapter_modules import SpatialPriorModule, deform_inputs +from .base.beit import BASEBEiT + +_logger = logging.getLogger(__name__) + + +@BACKBONES.register_module() +class BEiTAdapter(BASEBEiT): + + def __init__(self, + pretrain_size=224, + conv_inplane=64, + n_points=4, + deform_num_heads=6, + init_values=0., + cffn_ratio=0.25, + deform_ratio=1.0, + with_cffn=True, + interaction_indexes=None, 
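+                 # interaction_indexes: list of [first, last] pairs; each
+                 # pair selects the slice of ViT blocks that one
+                 # InteractionBlock wraps (see forward below)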
+ add_vit_feature=True, + with_cp=False, + *args, + **kwargs): + + super().__init__( + init_values=init_values, with_cp=with_cp, *args, **kwargs) + + self.num_block = len(self.blocks) + self.pretrain_size = (pretrain_size, pretrain_size) + self.flags = [ + i for i in range(-1, self.num_block, self.num_block // 4) + ][1:] + self.interaction_indexes = interaction_indexes + self.add_vit_feature = add_vit_feature + embed_dim = self.embed_dim + + self.level_embed = nn.Parameter(torch.zeros(3, embed_dim)) + self.spm = SpatialPriorModule( + inplanes=conv_inplane, embed_dim=embed_dim, with_cp=False) + self.interactions = nn.Sequential(*[ + InteractionBlock( + dim=embed_dim, + num_heads=deform_num_heads, + n_points=n_points, + init_values=init_values, + drop_path=self.drop_path_rate, + norm_layer=self.norm_layer, + with_cffn=with_cffn, + cffn_ratio=cffn_ratio, + deform_ratio=deform_ratio, + extra_extractor=True if i == len(interaction_indexes) + - 1 else False, + with_cp=with_cp) for i in range(len(interaction_indexes)) + ]) + + self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2) + self.norm1 = nn.SyncBatchNorm(embed_dim) + self.norm2 = nn.SyncBatchNorm(embed_dim) + self.norm3 = nn.SyncBatchNorm(embed_dim) + self.norm4 = nn.SyncBatchNorm(embed_dim) + + self.up.apply(self._init_weights) + self.spm.apply(self._init_weights) + self.interactions.apply(self._init_weights) + self.apply(self._init_deform_weights) + normal_(self.level_embed) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm) or isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def _get_pos_embed(self, pos_embed, H, W): + pos_embed = pos_embed.reshape(1, self.pretrain_size[0] // 16, + self.pretrain_size[1] // 16, + -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). 
\ + reshape(1, -1, H * W).permute(0, 2, 1) + return pos_embed + + def _init_deform_weights(self, m): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + + def _add_level_embed(self, c2, c3, c4): + c2 = c2 + self.level_embed[0] + c3 = c3 + self.level_embed[1] + c4 = c4 + self.level_embed[2] + return c2, c3, c4 + + def forward(self, x): + deform_inputs1, deform_inputs2 = deform_inputs(x) + + # SPM forward + c1, c2, c3, c4 = self.spm(x) + c2, c3, c4 = self._add_level_embed(c2, c3, c4) + c = torch.cat([c2, c3, c4], dim=1) + + # Patch Embedding forward + x, H, W = self.patch_embed(x) + bs, n, dim = x.shape + cls = self.cls_token.expand( + bs, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + + if self.pos_embed is not None: + pos_embed = self._get_pos_embed(self.pos_embed, H, W) + x = x + pos_embed + x = self.pos_drop(x) + + # Interaction + outs = list() + for i, layer in enumerate(self.interactions): + indexes = self.interaction_indexes[i] + x, c, cls = layer(x, c, cls, + self.blocks[indexes[0]:indexes[-1] + 1], + deform_inputs1, deform_inputs2, H, W) + outs.append(x.transpose(1, 2).view(bs, dim, H, W).contiguous()) + + # Split & Reshape + c2 = c[:, 0:c2.size(1), :] + c3 = c[:, c2.size(1):c2.size(1) + c3.size(1), :] + c4 = c[:, c2.size(1) + c3.size(1):, :] + + c2 = c2.transpose(1, 2).view(bs, dim, H * 2, W * 2).contiguous() + c3 = c3.transpose(1, 2).view(bs, dim, H, W).contiguous() + c4 = c4.transpose(1, 2).view(bs, dim, H // 2, W // 2).contiguous() + c1 = self.up(c2) + c1 + + if self.add_vit_feature: + x1, x2, x3, x4 = outs + x1 = F.interpolate( + x1, scale_factor=4, mode='bilinear', align_corners=False) + x2 = F.interpolate( + x2, scale_factor=2, mode='bilinear', align_corners=False) + x4 = F.interpolate( + x4, scale_factor=0.5, mode='bilinear', align_corners=False) + c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4 + + # Final Norm + f1 = self.norm1(c1) + f2 = self.norm2(c2) + f3 = self.norm3(c3) + f4 = self.norm4(c4) + return [f1, f2, f3, f4] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py new file mode 100644 index 00000000..9367806f --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py @@ -0,0 +1,3 @@ +from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg + +__all__ = ['Mask2FormerHeadFromMMSeg'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py new file mode 100644 index 00000000..36660520 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py @@ -0,0 +1,267 @@ +# The implementation refers to the VitAdapter +# available at +# https://github.com/czczup/ViT-Adapter.git +from abc import ABCMeta, abstractmethod + +import torch +import torch.nn as nn +from mmcv.runner import BaseModule, auto_fp16, force_fp32 +from mmdet.models.builder import build_loss +from mmdet.models.losses import accuracy + +from ...utils import build_pixel_sampler, seg_resize + + +class BaseDecodeHead(BaseModule, metaclass=ABCMeta): + """Base class for BaseDecodeHead. + + Args: + in_channels (int|Sequence[int]): Input channels. + channels (int): Channels after modules, before conv_seg. + num_classes (int): Number of classes. 
+ dropout_ratio (float): Ratio of dropout layer. Default: 0.1. + conv_cfg (dict|None): Config of conv layers. Default: None. + norm_cfg (dict|None): Config of norm layers. Default: None. + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU') + in_index (int|Sequence[int]): Input feature index. Default: -1 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + Default: None. + loss_decode (dict | Sequence[dict]): Config of decode loss. + The `loss_name` is property of corresponding loss function which + could be shown in training log. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_ce'. + e.g. dict(type='CrossEntropyLoss'), + [dict(type='CrossEntropyLoss', loss_name='loss_ce'), + dict(type='DiceLoss', loss_name='loss_dice')] + Default: dict(type='CrossEntropyLoss'). + ignore_index (int | None): The label index to be ignored. When using + masked BCE loss, ignore_index should be set to None. Default: 255. + sampler (dict|None): The config of segmentation map sampler. + Default: None. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + channels, + *, + num_classes, + dropout_ratio=0.1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + in_index=-1, + input_transform=None, + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + ignore_index=255, + sampler=None, + align_corners=False, + init_cfg=dict( + type='Normal', std=0.01, override=dict(name='conv_seg'))): + super(BaseDecodeHead, self).__init__(init_cfg) + self._init_inputs(in_channels, in_index, input_transform) + self.channels = channels + self.num_classes = num_classes + self.dropout_ratio = dropout_ratio + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_index = in_index + + self.ignore_index = ignore_index + self.align_corners = align_corners + + if isinstance(loss_decode, dict): + self.loss_decode = build_loss(loss_decode) + elif isinstance(loss_decode, (list, tuple)): + self.loss_decode = nn.ModuleList() + for loss in loss_decode: + self.loss_decode.append(build_loss(loss)) + else: + raise TypeError(f'loss_decode must be a dict or sequence of dict,\ + but got {type(loss_decode)}') + + if sampler is not None: + self.sampler = build_pixel_sampler(sampler, context=self) + else: + self.sampler = None + + self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + self.fp16_enabled = False + + def extra_repr(self): + """Extra repr.""" + s = f'input_transform={self.input_transform}, ' \ + f'ignore_index={self.ignore_index}, ' \ + f'align_corners={self.align_corners}' + return s + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. 
So in_channels and in_index must be of type int.
+        When input_transform is not None, in_channels and in_index must be
+        sequences (list or tuple) of the same length.
+
+        Args:
+            in_channels (int|Sequence[int]): Input channels.
+            in_index (int|Sequence[int]): Input feature index.
+            input_transform (str|None): Transformation type of input features.
+                Options: 'resize_concat', 'multiple_select', None.
+                'resize_concat': Multiple feature maps will be resized to the
+                    same size as the first one and then concatenated together.
+                    Usually used in FCN head of HRNet.
+                'multiple_select': Multiple feature maps will be bundled into
+                    a list and passed into the decode head.
+                None: Only one selected feature map is allowed.
+        """
+
+        if input_transform is not None:
+            assert input_transform in ['resize_concat', 'multiple_select']
+        self.input_transform = input_transform
+        self.in_index = in_index
+        if input_transform is not None:
+            assert isinstance(in_channels, (list, tuple))
+            assert isinstance(in_index, (list, tuple))
+            assert len(in_channels) == len(in_index)
+            if input_transform == 'resize_concat':
+                self.in_channels = sum(in_channels)
+            else:
+                self.in_channels = in_channels
+        else:
+            assert isinstance(in_channels, int)
+            assert isinstance(in_index, int)
+            self.in_channels = in_channels
+
+    def _transform_inputs(self, inputs):
+        """Transform inputs for decoder.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+
+        Returns:
+            Tensor: The transformed inputs.
+        """
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                seg_resize(
+                    input=x,
+                    size=inputs[0].shape[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = torch.cat(upsampled_inputs, dim=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index]
+
+        return inputs
+
+    @auto_fp16()
+    @abstractmethod
+    def forward(self, inputs):
+        """Placeholder of forward function."""
+        pass
+
+    def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
+        """Forward function for training.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            gt_semantic_seg (Tensor): Semantic segmentation masks
+                used if the architecture supports a semantic segmentation task.
+            train_cfg (dict): The training config.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        seg_logits = self.forward(inputs)
+        losses = self.losses(seg_logits, gt_semantic_seg)
+        return losses
+
+    def forward_test(self, inputs, img_metas, test_cfg):
+        """Forward function for testing.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            img_metas (list[dict]): List of image info dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            test_cfg (dict): The testing config.
+
+        Returns:
+            Tensor: Output segmentation map.
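+
+        Note:
+            This base implementation ignores ``img_metas`` and ``test_cfg``
+            and simply returns the output of :meth:`forward`.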
+ """ + return self.forward(inputs) + + def cls_seg(self, feat): + """Classify each pixel.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + @force_fp32(apply_to=('seg_logit', )) + def losses(self, seg_logit, seg_label): + """Compute segmentation loss.""" + loss = dict() + seg_logit = seg_resize( + input=seg_logit, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + if self.sampler is not None: + seg_weight = self.sampler.sample(seg_logit, seg_label) + else: + seg_weight = None + seg_label = seg_label.squeeze(1) + + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + for loss_decode in losses_decode: + if loss_decode.loss_name not in loss: + loss[loss_decode.loss_name] = loss_decode( + seg_logit, + seg_label, + weight=seg_weight, + ignore_index=self.ignore_index) + else: + loss[loss_decode.loss_name] += loss_decode( + seg_logit, + seg_label, + weight=seg_weight, + ignore_index=self.ignore_index) + + loss['acc_seg'] = accuracy( + seg_logit, seg_label, ignore_index=self.ignore_index) + return loss diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py new file mode 100644 index 00000000..ad8b1586 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py @@ -0,0 +1,581 @@ +# The implementation refers to the VitAdapter +# available at +# https://github.com/czczup/ViT-Adapter.git + +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init +from mmcv.cnn.bricks.transformer import (build_positional_encoding, + build_transformer_layer_sequence) +from mmcv.ops import point_sample +from mmcv.runner import ModuleList, force_fp32 +from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean +from mmdet.models.builder import HEADS, build_loss +from mmdet.models.utils import get_uncertain_point_coords_with_randomness + +from .base_decode_head import BaseDecodeHead + + +@HEADS.register_module() +class Mask2FormerHeadFromMMSeg(BaseDecodeHead): + """Implements the Mask2Former head. + + See `Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + num_things_classes (int): Number of things. + num_stuff_classes (int): Number of stuff. + num_queries (int): Number of query in Transformer decoder. + pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel + decoder. Defaults to None. + enforce_decoder_input_project (bool, optional): Whether to add + a layer to change the embed_dim of tranformer encoder in + pixel decoder to the embed_dim of transformer decoder. + Defaults to False. + transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for + transformer decoder. Defaults to None. + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for + transformer decoder position encoding. Defaults to None. + loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification + loss. Defaults to None. 
+ loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss. + Defaults to None. + loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss. + Defaults to None. + train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of + Mask2Former head. + test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of + Mask2Former head. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels, + feat_channels, + out_channels, + num_things_classes=80, + num_stuff_classes=53, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=None, + enforce_decoder_input_project=False, + transformer_decoder=None, + positional_encoding=None, + loss_cls=None, + loss_mask=None, + loss_dice=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + **kwargs): + super(Mask2FormerHeadFromMMSeg, self).__init__( + in_channels=in_channels, + channels=feat_channels, + num_classes=(num_things_classes + num_stuff_classes), + init_cfg=init_cfg, + input_transform='multiple_select', + **kwargs) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = self.num_things_classes + self.num_stuff_classes + self.num_queries = num_queries + self.num_transformer_feat_level = num_transformer_feat_level + self.num_heads = transformer_decoder.transformerlayers. \ + attn_cfgs.num_heads + self.num_transformer_decoder_layers = transformer_decoder.num_layers + assert pixel_decoder.encoder.transformerlayers.attn_cfgs.num_levels == num_transformer_feat_level + pixel_decoder_ = copy.deepcopy(pixel_decoder) + pixel_decoder_.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1] + self.transformer_decoder = build_transformer_layer_sequence( + transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + + self.decoder_input_projs = ModuleList() + # from low resolution to high resolution + for _ in range(num_transformer_feat_level): + if (self.decoder_embed_dims != feat_channels + or enforce_decoder_input_project): + self.decoder_input_projs.append( + Conv2d( + feat_channels, self.decoder_embed_dims, kernel_size=1)) + else: + self.decoder_input_projs.append(nn.Identity()) + self.decoder_positional_encoding = build_positional_encoding( + positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, feat_channels) + self.query_feat = nn.Embedding(self.num_queries, feat_channels) + # from low resolution to high resolution + self.level_embed = nn.Embedding(self.num_transformer_feat_level, + feat_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + self.conv_seg = None # fix a bug here (conv_seg is not used) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + self.sampler = build_sampler(self.train_cfg.sampler, context=self) + self.num_points = self.train_cfg.get('num_points', 12544) + self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) + self.importance_sample_ratio = self.train_cfg.get( + 'importance_sample_ratio', 0.75) + + self.class_weight = loss_cls.class_weight + self.loss_cls = build_loss(loss_cls) + self.loss_mask = 
build_loss(loss_mask) + self.loss_dice = build_loss(loss_dice) + + def init_weights(self): + for m in self.decoder_input_projs: + if isinstance(m, Conv2d): + caffe2_xavier_init(m, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list, + gt_masks_list, img_metas): + """Compute classification and mask targets for all images for a decoder + layer. + + Args: + cls_scores_list (list[Tensor]): Mask score logits from a single + decoder layer for all images. Each with shape [num_queries, + cls_out_channels]. + mask_preds_list (list[Tensor]): Mask logits from a single decoder + layer for all images. Each with shape [num_queries, h, w]. + gt_labels_list (list[Tensor]): Ground truth class indices for all + images. Each with shape (n, ), n is the sum of number of stuff + type and number of instance in a image. + gt_masks_list (list[Tensor]): Ground truth mask for each image, + each with shape (n, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + tuple[list[Tensor]]: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels of all images. + Each with shape [num_queries, ]. + - label_weights_list (list[Tensor]): Label weights of all + images.Each with shape [num_queries, ]. + - mask_targets_list (list[Tensor]): Mask targets of all images. + Each with shape [num_queries, h, w]. + - mask_weights_list (list[Tensor]): Mask weights of all images. + Each with shape [num_queries, ]. + - num_total_pos (int): Number of positive samples in all + images. + - num_total_neg (int): Number of negative samples in all + images. + """ + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + pos_inds_list, + neg_inds_list) = multi_apply(self._get_target_single, cls_scores_list, + mask_preds_list, gt_labels_list, + gt_masks_list, img_metas) + + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, mask_targets_list, + mask_weights_list, num_total_pos, num_total_neg) + + def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, + img_metas): + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_labels (Tensor): Ground truth class indices for one image with + shape (num_gts, ). + gt_masks (Tensor): Ground truth mask for each image, each with + shape (num_gts, h, w). + img_metas (dict): Image informtation. + + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each \ + image. + - neg_inds (Tensor): Sampled negative indices for each \ + image. 
+ """ + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + # shape (num_queries, num_points) + mask_points_pred = point_sample( + mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, + 1)).squeeze(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample( + gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, + 1)).squeeze(1) + + # assign and sample + assign_result = self.assigner.assign(cls_score, mask_points_pred, + gt_labels, gt_points_masks, + img_metas) + sampling_result = self.sampler.sample(assign_result, mask_pred, + gt_masks) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones((self.num_queries, )) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds) + + def loss_single(self, cls_scores, mask_preds, gt_labels_list, + gt_masks_list, img_metas): + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, h, w). + gt_labels_list (list[Tensor]): Ground truth class indices for each + image, each with shape (num_gts, ). + gt_masks_list (list[Tensor]): Ground truth mask for each image, + each with shape (num_gts, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single \ + decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + num_total_pos, + num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list, + gt_labels_list, gt_masks_list, + img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.unsqueeze(1), None, self.num_points, + self.oversample_ratio, self.importance_sample_ratio) + # shape (num_total_gts, h, w) -> (num_total_gts, num_points) + mask_point_targets = point_sample( + mask_targets.unsqueeze(1).float(), points_coords).squeeze(1) + # shape (num_queries, h, w) -> (num_queries, num_points) + mask_point_preds = point_sample( + mask_preds.unsqueeze(1), points_coords).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_point_preds, mask_point_targets, avg_factor=num_total_masks) + + # mask loss + # shape (num_queries, num_points) -> (num_queries * num_points, ) + mask_point_preds = mask_point_preds.reshape(-1, 1) + # shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) + mask_point_targets = mask_point_targets.reshape(-1) + loss_mask = self.loss_mask( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks * self.num_points) + + return loss_cls, loss_mask, loss_dice + + @force_fp32(apply_to=('all_cls_scores', 'all_mask_preds')) + def loss(self, all_cls_scores, all_mask_preds, gt_labels_list, + gt_masks_list, img_metas): + """Loss function. + + Args: + all_cls_scores (Tensor): Classification scores for all decoder + layers with shape [num_decoder, batch_size, num_queries, + cls_out_channels]. + all_mask_preds (Tensor): Mask scores for all decoder layers with + shape [num_decoder, batch_size, num_queries, h, w]. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (n, ). n is the sum of number of stuff type + and number of instance in a image. + gt_masks_list (list[Tensor]): Ground truth mask for each image with + shape (n, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + num_dec_layers = len(all_cls_scores) + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers)] + img_metas_list = [img_metas for _ in range(num_dec_layers)] + losses_cls, losses_mask, losses_dice = multi_apply( + self.loss_single, all_cls_scores, all_mask_preds, + all_gt_labels_list, all_gt_masks_list, img_metas_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_mask'] = losses_mask[-1] + loss_dict['loss_dice'] = losses_dice[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_mask_i, loss_dice_i in zip( + losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i + loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i + num_dec_layer += 1 + return loss_dict + + def forward_head(self, decoder_out, mask_feature, attn_mask_target_size): + """Forward for head part which is called after every decoder layer. + + Args: + decoder_out (Tensor): in shape (num_queries, batch_size, c). + mask_feature (Tensor): in shape (batch_size, c, h, w). + attn_mask_target_size (tuple[int, int]): target attention + mask size. + + Returns: + tuple: A tuple contain three elements. + + - cls_pred (Tensor): Classification scores in shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should includes background. + - mask_pred (Tensor): Mask scores in shape \ + (batch_size, num_queries,h, w). + - attn_mask (Tensor): Attention mask in shape \ + (batch_size * num_heads, num_queries, h, w). + """ + decoder_out = self.transformer_decoder.post_norm(decoder_out) + decoder_out = decoder_out.transpose(0, 1) + # shape (num_queries, batch_size, c) + cls_pred = self.cls_embed(decoder_out) + # shape (num_queries, batch_size, c) + mask_embed = self.mask_embed(decoder_out) + # shape (num_queries, batch_size, h, w) + mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature) + attn_mask = F.interpolate( + mask_pred, + attn_mask_target_size, + mode='bilinear', + align_corners=False) + # shape (num_queries, batch_size, h, w) -> + # (batch_size * num_head, num_queries, h, w) + attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat( + (1, self.num_heads, 1, 1)).flatten(0, 1) + attn_mask = attn_mask.sigmoid() < 0.5 + attn_mask = attn_mask.detach() + + return cls_pred, mask_pred, attn_mask + + def forward(self, feats, img_metas): + """Forward function. + + Args: + feats (list[Tensor]): Multi scale Features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + + Returns: + tuple: A tuple contains two elements. + + - cls_pred_list (list[Tensor)]: Classification logits \ + for each decoder layer. Each is a 3D-tensor with shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should includes background. + - mask_pred_list (list[Tensor]): Mask logits for each \ + decoder layer. Each with shape (batch_size, num_queries, \ + h, w). 
+ """ + batch_size = len(img_metas) + mask_features, multi_scale_memorys = self.pixel_decoder(feats) + # multi_scale_memorys (from low resolution to high resolution) + decoder_inputs = [] + decoder_positional_encodings = [] + for i in range(self.num_transformer_feat_level): + decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i]) + # shape (batch_size, c, h, w) -> (h*w, batch_size, c) + decoder_input = decoder_input.flatten(2).permute(2, 0, 1) + level_embed = self.level_embed.weight[i].view(1, 1, -1) + decoder_input = decoder_input + level_embed + # shape (batch_size, c, h, w) -> (h*w, batch_size, c) + mask = decoder_input.new_zeros( + (batch_size, ) + multi_scale_memorys[i].shape[-2:], + dtype=torch.bool) + decoder_positional_encoding = self.decoder_positional_encoding( + mask) + decoder_positional_encoding = decoder_positional_encoding.flatten( + 2).permute(2, 0, 1) + decoder_inputs.append(decoder_input) + decoder_positional_encodings.append(decoder_positional_encoding) + # shape (num_queries, c) -> (num_queries, batch_size, c) + query_feat = self.query_feat.weight.unsqueeze(1).repeat( + (1, batch_size, 1)) + query_embed = self.query_embed.weight.unsqueeze(1).repeat( + (1, batch_size, 1)) + + cls_pred_list = [] + mask_pred_list = [] + cls_pred, mask_pred, attn_mask = self.forward_head( + query_feat, mask_features, multi_scale_memorys[0].shape[-2:]) + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + for i in range(self.num_transformer_decoder_layers): + level_idx = i % self.num_transformer_feat_level + # if a mask is all True(all background), then set it all False. + attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + + # cross_attn + self_attn + layer = self.transformer_decoder.layers[i] + attn_masks = [attn_mask, None] + query_feat = layer( + query=query_feat, + key=decoder_inputs[level_idx], + value=decoder_inputs[level_idx], + query_pos=query_embed, + key_pos=decoder_positional_encodings[level_idx], + attn_masks=attn_masks, + query_key_padding_mask=None, + # here we do not apply masking on padded region + key_padding_mask=None) + cls_pred, mask_pred, attn_mask = self.forward_head( + query_feat, mask_features, multi_scale_memorys[ + (i + 1) % self.num_transformer_feat_level].shape[-2:]) + + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + return cls_pred_list, mask_pred_list + + def forward_train(self, x, img_metas, gt_semantic_seg, gt_labels, + gt_masks): + """Forward function for training mode. + + Args: + x (list[Tensor]): Multi-level features from the upstream network, + each is a 4D-tensor. + img_metas (list[Dict]): List of image information. + gt_semantic_seg (list[tensor]):Each element is the ground truth + of semantic segmentation with the shape (N, H, W). + train_cfg (dict): The training config, which not been used in + maskformer. + gt_labels (list[Tensor]): Each element is ground truth labels of + each box, shape (num_gts,). + gt_masks (list[BitmapMasks]): Each element is masks of instances + of a image, shape (num_gts, h, w). + + Returns: + losses (dict[str, Tensor]): a dictionary of loss components + """ + + # forward + all_cls_scores, all_mask_preds = self(x, img_metas) + + # loss + losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks, + img_metas) + + return losses + + def forward_test(self, inputs, img_metas, test_cfg): + """Test segment without test-time aumengtation. + + Only the output of last decoder layers was used. 
+ + Args: + inputs (list[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + test_cfg (dict): Testing config. + + Returns: + seg_mask (Tensor): Predicted semantic segmentation logits. + """ + all_cls_scores, all_mask_preds = self(inputs, img_metas) + cls_score, mask_pred = all_cls_scores[-1], all_mask_preds[-1] + ori_h, ori_w, _ = img_metas[0]['ori_shape'] + + # semantic inference + cls_score = F.softmax(cls_score, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + seg_mask = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred) + return seg_mask diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py new file mode 100644 index 00000000..1f2c8b04 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py @@ -0,0 +1,3 @@ +from .encoder_decoder_mask2former import EncoderDecoderMask2Former + +__all__ = ['EncoderDecoderMask2Former'] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py new file mode 100644 index 00000000..8bd8fa3f --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py @@ -0,0 +1,314 @@ +# The implementation refers to the VitAdapter +# available at +# https://github.com/czczup/ViT-Adapter.git +import warnings +from abc import ABCMeta, abstractmethod +from collections import OrderedDict + +import mmcv +import numpy as np +import torch +import torch.distributed as dist +from mmcv.runner import BaseModule, auto_fp16 + + +class BaseSegmentor(BaseModule, metaclass=ABCMeta): + """Base class for segmentors.""" + + def __init__(self, init_cfg=None): + super(BaseSegmentor, self).__init__(init_cfg) + self.fp16_enabled = False + + @property + def with_neck(self): + """bool: whether the segmentor has neck""" + return hasattr(self, 'neck') and self.neck is not None + + @property + def with_auxiliary_head(self): + """bool: whether the segmentor has auxiliary head""" + return hasattr(self, + 'auxiliary_head') and self.auxiliary_head is not None + + @property + def with_decode_head(self): + """bool: whether the segmentor has decode head""" + return hasattr(self, 'decode_head') and self.decode_head is not None + + @abstractmethod + def extract_feat(self, imgs): + """Placeholder for extract features from images.""" + pass + + @abstractmethod + def encode_decode(self, img, img_metas): + """Placeholder for encode images with backbone and decode into a + semantic segmentation map of the same size as input.""" + pass + + @abstractmethod + def forward_train(self, imgs, img_metas, **kwargs): + """Placeholder for Forward function for training.""" + pass + + @abstractmethod + def simple_test(self, img, img_meta, **kwargs): + """Placeholder for single image test.""" + pass + + @abstractmethod + def aug_test(self, imgs, img_metas, **kwargs): + """Placeholder for augmentation test.""" + pass + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) 
and the inner list indicates + images in a batch. + """ + for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got ' + f'{type(var)}') + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(imgs)}) != ' + f'num of image meta ({len(img_metas)})') + + # all images in the same aug batch should have the same ori_shape and + # pad_shape + def tensor_to_tuple(input_tensor): + return tuple(input_tensor.cpu().numpy()) + + for img_meta in img_metas: + ori_shapes = [_['ori_shape'] for _ in img_meta] + if isinstance(ori_shapes[0], torch.Tensor): + assert all( + tensor_to_tuple(shape) == tensor_to_tuple(ori_shapes[0]) + for shape in ori_shapes) + else: + assert all(shape == ori_shapes[0] for shape in ori_shapes) + + img_shapes = [_['img_shape'] for _ in img_meta] + if isinstance(img_shapes[0], torch.Tensor): + assert all( + tensor_to_tuple(shape) == tensor_to_tuple(img_shapes[0]) + for shape in img_shapes) + else: + assert all(shape == img_shapes[0] for shape in img_shapes) + + pad_shapes = [_['pad_shape'] for _ in img_meta] + if isinstance(pad_shapes[0], torch.Tensor): + assert all( + tensor_to_tuple(shape) == tensor_to_tuple(pad_shapes[0]) + for shape in pad_shapes) + else: + assert all(shape == pad_shapes[0] for shape in pad_shapes) + + if num_augs == 1: + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + return self.aug_test(imgs, img_metas, **kwargs) + + @auto_fp16(apply_to=('img', )) + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor + and List[dict]), and when ``return_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data_batch (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self(**data_batch) + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(data_batch['img_metas'])) + + return outputs
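For intuition, `_parse_losses` (defined below) averages every entry, sums the entries whose key contains 'loss' into the scalar that is actually backpropagated, and keeps everything else purely for logging. A toy single-process walk-through of that contract, assuming no distributed reduction (the dict keys here are made up for illustration):

import torch
from collections import OrderedDict

raw = {
    'decode.loss_cls': torch.tensor(0.7),
    'decode.loss_mask': [torch.tensor(0.2), torch.tensor(0.4)],  # lists are averaged, then summed
    'decode.acc_seg': torch.tensor(93.0),  # no 'loss' in the key: logged, never optimized
}
log_vars = OrderedDict()
for name, value in raw.items():
    log_vars[name] = value.mean() if isinstance(value, torch.Tensor) else sum(v.mean() for v in value)
loss = sum(v for k, v in log_vars.items() if 'loss' in k)
print(float(loss))  # 1.3 == 0.7 + (0.2 + 0.4)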
+ + def val_step(self, data_batch, optimizer=None, **kwargs): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but is + used during val epochs. Note that the evaluation after training epochs + is not implemented with this method, but with an evaluation hook. + """ + losses = self(**data_batch) + loss, log_vars = self._parse_losses(losses) + + log_vars_ = dict() + for loss_name, loss_value in log_vars.items(): + k = loss_name + '_val' + log_vars_[k] = loss_value + + outputs = dict( + loss=loss, + log_vars=log_vars_, + num_samples=len(data_batch['img_metas'])) + + return outputs + + @staticmethod + def _parse_losses(losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor + which may be a weighted sum of all losses, log_vars contains + all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(_value for _key, _value in log_vars.items() + if 'loss' in _key) + + # If log_vars has different lengths across GPUs, raise an assertion + # error to prevent them from waiting on each other indefinitely. + if dist.is_available() and dist.is_initialized(): + log_var_length = torch.tensor(len(log_vars), device=loss.device) + dist.all_reduce(log_var_length) + message = (f'rank {dist.get_rank()}' + + f' len(log_vars): {len(log_vars)}' + ' keys: ' + + ','.join(log_vars.keys()) + '\n') + assert log_var_length == len(log_vars) * dist.get_world_size(), \ + 'loss log variables are different across GPUs!\n' + message + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars + + def show_result(self, + img, + result, + palette=None, + win_name='', + show=False, + wait_time=0, + out_file=None, + opacity=0.5): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (Tensor): The semantic segmentation results to draw over + `img`. + palette (list[list[int]] | np.ndarray | None): The palette of + segmentation map. If None is given, a random palette will be + generated. Default: None + win_name (str): The window name. + wait_time (int): Value of waitKey param. + Default: 0. + show (bool): Whether to show the image. + Default: False. + out_file (str or None): The filename to write the image. + Default: None. + opacity (float): Opacity of painted segmentation map. + Default 0.5. + Must be in (0, 1] range. + Returns: + img (Tensor): The drawn image, returned only if `show` is False + and `out_file` is not specified. + """ + img = mmcv.imread(img) + img = img.copy() + seg = result[0] + if palette is None: + if self.PALETTE is None: + # Get random state before set seed, + # and restore random state later. + # It will prevent loss of randomness, as the palette + # may be different in each iteration if not specified.
+ # See: https://github.com/open-mmlab/mmdetection/issues/5844 + state = np.random.get_state() + np.random.seed(42) + # random palette + palette = np.random.randint( + 0, 255, size=(len(self.CLASSES), 3)) + np.random.set_state(state) + else: + palette = self.PALETTE + palette = np.array(palette) + assert palette.shape[0] == len(self.CLASSES) + assert palette.shape[1] == 3 + assert len(palette.shape) == 2 + assert 0 < opacity <= 1.0 + color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) + for label, color in enumerate(palette): + color_seg[seg == label, :] = color + # convert to BGR + color_seg = color_seg[..., ::-1] + + img = img * (1 - opacity) + color_seg * opacity + img = img.astype(np.uint8) + # if out_file specified, do not show image in window + if out_file is not None: + show = False + + if show: + mmcv.imshow(img, win_name, wait_time) + if out_file is not None: + mmcv.imwrite(img, out_file) + + if not (show or out_file): + warnings.warn('show==False and out_file is not specified, only ' + 'result image will be returned') + return img diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py new file mode 100644 index 00000000..9287e8aa --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py @@ -0,0 +1,303 @@ +# The implementation refers to the VitAdapter +# available at +# https://github.com/czczup/ViT-Adapter.git +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.models import builder +from mmdet.models.builder import DETECTORS + +from ...utils import add_prefix, seg_resize +from .base_segmentor import BaseSegmentor + + +@DETECTORS.register_module() +class EncoderDecoderMask2Former(BaseSegmentor): + """Encoder Decoder segmentors. + + EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. + Note that auxiliary_head is only used for deep supervision during training, + which could be dumped during inference. 
+ """ + + def __init__(self, + backbone, + decode_head, + neck=None, + auxiliary_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(EncoderDecoderMask2Former, self).__init__(init_cfg) + if pretrained is not None: + assert backbone.get('pretrained') is None, \ + 'both backbone and segmentor set pretrained weight' + backbone.pretrained = pretrained + self.backbone = builder.build_backbone(backbone) + if neck is not None: + self.neck = builder.build_neck(neck) + decode_head.update(train_cfg=train_cfg) + decode_head.update(test_cfg=test_cfg) + self._init_decode_head(decode_head) + self._init_auxiliary_head(auxiliary_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + assert self.with_decode_head + + def _init_decode_head(self, decode_head): + """Initialize ``decode_head``""" + self.decode_head = builder.build_head(decode_head) + self.align_corners = self.decode_head.align_corners + self.num_classes = self.decode_head.num_classes + + def _init_auxiliary_head(self, auxiliary_head): + """Initialize ``auxiliary_head``""" + if auxiliary_head is not None: + if isinstance(auxiliary_head, list): + self.auxiliary_head = nn.ModuleList() + for head_cfg in auxiliary_head: + self.auxiliary_head.append(builder.build_head(head_cfg)) + else: + self.auxiliary_head = builder.build_head(auxiliary_head) + + def extract_feat(self, img): + """Extract features from images.""" + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, img, img_metas): + """Encode images with backbone and decode into a semantic segmentation + map of the same size as input.""" + x = self.extract_feat(img) + out = self._decode_head_forward_test(x, img_metas) + out = seg_resize( + input=out, + size=img.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + return out + + def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg, + **kwargs): + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.forward_train(x, img_metas, + gt_semantic_seg, **kwargs) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def _decode_head_forward_test(self, x, img_metas): + """Run forward function and calculate loss for decode head in + inference.""" + seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) + return seg_logits + + def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg): + """Run forward function and calculate loss for auxiliary head in + training.""" + losses = dict() + if isinstance(self.auxiliary_head, nn.ModuleList): + for idx, aux_head in enumerate(self.auxiliary_head): + loss_aux = aux_head.forward_train(x, img_metas, + gt_semantic_seg, + self.train_cfg) + losses.update(add_prefix(loss_aux, f'aux_{idx}')) + else: + loss_aux = self.auxiliary_head.forward_train( + x, img_metas, gt_semantic_seg, self.train_cfg) + losses.update(add_prefix(loss_aux, 'aux')) + + return losses + + def forward_dummy(self, img): + """Dummy forward function.""" + seg_logit = self.encode_decode(img, None) + + return seg_logit + + def forward_train(self, img, img_metas, gt_semantic_seg, **kwargs): + """Forward function for training. + + Args: + img (Tensor): Input images. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. 
+ For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + x = self.extract_feat(img) + + losses = dict() + + loss_decode = self._decode_head_forward_train(x, img_metas, + gt_semantic_seg, + **kwargs) + losses.update(loss_decode) + + if self.with_auxiliary_head: + loss_aux = self._auxiliary_head_forward_train( + x, img_metas, gt_semantic_seg) + losses.update(loss_aux) + + return losses + + # TODO refactor + def slide_inference(self, img, img_meta, rescale): + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = img.size() + num_classes = self.num_classes + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = img.new_zeros((batch_size, num_classes, h_img, w_img)) + count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = img[:, :, y1:y2, x1:x2] + crop_seg_logit = self.encode_decode(crop_img, img_meta) + preds += F.pad(crop_seg_logit, + (int(x1), int(preds.shape[3] - x2), int(y1), + int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + if torch.onnx.is_in_onnx_export(): + # cast count_mat to constant while exporting to ONNX + count_mat = torch.from_numpy( + count_mat.cpu().detach().numpy()).to(device=img.device) + preds = preds / count_mat + + def tensor_to_tuple(input_tensor): + return tuple(input_tensor.cpu().numpy()) + + if rescale: + preds = seg_resize( + preds, + size=tensor_to_tuple(img_meta[0]['ori_shape'])[:2] + if isinstance(img_meta[0]['ori_shape'], torch.Tensor) else + img_meta[0]['ori_shape'], + mode='bilinear', + align_corners=self.align_corners, + warning=False) + return preds + + def whole_inference(self, img, img_meta, rescale): + """Inference with full image.""" + + seg_logit = self.encode_decode(img, img_meta) + if rescale: + # support dynamic shape for onnx + if torch.onnx.is_in_onnx_export(): + size = img.shape[2:] + else: + size = img_meta[0]['ori_shape'][:2] + seg_logit = seg_resize( + seg_logit, + size=size, + mode='bilinear', + align_corners=self.align_corners, + warning=False) + + return seg_logit + + def inference(self, img, img_meta, rescale): + """Inference with slide/whole style. + + Args: + img (Tensor): The input image of shape (N, 3, H, W). + img_meta (dict): Image info dict where each dict has: 'img_shape', + 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + rescale (bool): Whether rescale back to original shape. + + Returns: + Tensor: The output segmentation map. 
+ """ + + assert self.test_cfg.mode in ['slide', 'whole'] + ori_shape = img_meta[0]['ori_shape'] + + def tensor_to_tuple(input_tensor): + return tuple(input_tensor.cpu().numpy()) + + if isinstance(ori_shape, torch.Tensor): + assert all( + tensor_to_tuple(_['ori_shape']) == tensor_to_tuple(ori_shape) + for _ in img_meta) + else: + assert all(_['ori_shape'] == ori_shape for _ in img_meta) + if self.test_cfg.mode == 'slide': + seg_logit = self.slide_inference(img, img_meta, rescale) + else: + seg_logit = self.whole_inference(img, img_meta, rescale) + output = F.softmax(seg_logit, dim=1) + flip = img_meta[0]['flip'] + if flip: + flip_direction = img_meta[0]['flip_direction'] + assert flip_direction in ['horizontal', 'vertical'] + if flip_direction == 'horizontal': + output = output.flip(dims=(3, )) + elif flip_direction == 'vertical': + output = output.flip(dims=(2, )) + + return output + + def simple_test(self, img, img_meta, rescale=True): + """Simple test with single image.""" + seg_logit = self.inference(img, img_meta, rescale) + seg_pred = seg_logit.argmax(dim=1) + if torch.onnx.is_in_onnx_export(): + # our inference backend only support 4D output + seg_pred = seg_pred.unsqueeze(0) + return seg_pred + seg_pred = seg_pred.cpu().numpy() + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred + + def aug_test(self, imgs, img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. + """ + # aug_test rescale all imgs back to ori_shape for now + assert rescale + # to save memory, we get augmented seg logit inplace + seg_logit = self.inference(imgs[0], img_metas[0], rescale) + for i in range(1, len(imgs)): + cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale) + seg_logit += cur_seg_logit + seg_logit /= len(imgs) + seg_pred = seg_logit.argmax(dim=1) + seg_pred = seg_pred.cpu().numpy() + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py new file mode 100644 index 00000000..dec8a5f2 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py @@ -0,0 +1,7 @@ +from .builder import build_pixel_sampler +from .data_process_func import ResizeToMultiple +from .seg_func import add_prefix, seg_resize + +__all__ = [ + 'seg_resize', 'add_prefix', 'build_pixel_sampler', 'ResizeToMultiple' +] diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py new file mode 100644 index 00000000..63d77fea --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py @@ -0,0 +1,11 @@ +# The implementation refers to the VitAdapter +# available at +# https://github.com/czczup/ViT-Adapter.git +from mmcv.utils import Registry, build_from_cfg + +PIXEL_SAMPLERS = Registry('pixel sampler') + + +def build_pixel_sampler(cfg, **default_args): + """Build pixel sampler for segmentation map.""" + return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py new file mode 100644 index 00000000..194361af --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py @@ -0,0 +1,60 @@ +# 
diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py new file mode 100644 index 00000000..194361af --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +from mmdet.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class ResizeToMultiple(object): + """Resize images & seg to a multiple of the divisor. + + Args: + size_divisor (int): images and gt seg maps are resized to a multiple + of size_divisor. Default: 32. + interpolation (str, optional): The interpolation mode of image resize. + Default: None + """ + + def __init__(self, size_divisor=32, interpolation=None): + self.size_divisor = size_divisor + self.interpolation = interpolation + + def __call__(self, results): + """Resize images and the semantic segmentation map to a multiple of + the size divisor. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape' keys are updated. + """ + # Align image to multiple of size divisor. + img = results['img'] + img = mmcv.imresize_to_multiple( + img, + self.size_divisor, + scale_factor=1, + interpolation=self.interpolation + if self.interpolation else 'bilinear') + + results['img'] = img + results['img_shape'] = img.shape + results['pad_shape'] = img.shape + + # Align segmentation map to multiple of size divisor. + for key in results.get('seg_fields', []): + gt_seg = results[key] + gt_seg = mmcv.imresize_to_multiple( + gt_seg, + self.size_divisor, + scale_factor=1, + interpolation='nearest') + results[key] = gt_seg + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(size_divisor={self.size_divisor}, ' + f'interpolation={self.interpolation})') + return repr_str diff --git a/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py new file mode 100644 index 00000000..fba46b81 --- /dev/null +++ b/modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py @@ -0,0 +1,48 @@ +# The implementation refers to the VitAdapter +# available at +# https://github.com/czczup/ViT-Adapter.git + +import warnings + +import torch.nn.functional as F + + +def seg_resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > input_w: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would be more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + return F.interpolate(input, size, scale_factor, mode, align_corners) + + +def add_prefix(inputs, prefix): + """Add prefix for dict. + + Args: + inputs (dict): The input dict with str keys. + prefix (str): The prefix to add. + + Returns: + dict: The dict with keys updated with ``prefix``. + """ + + outputs = dict() + for name, value in inputs.items(): + outputs[f'{prefix}.{name}'] = value + + return outputs diff --git a/modelscope/models/cv/movie_scene_segmentation/__init__.py b/modelscope/models/cv/movie_scene_segmentation/__init__.py new file mode 100644 index 00000000..25dcda96 --- /dev/null +++ b/modelscope/models/cv/movie_scene_segmentation/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates.
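+# This package follows the ModelScope lazy-import convention used throughout this +# diff: the submodules named in `_import_structure` below are only imported on +# first attribute access, so `from modelscope.models.cv.movie_scene_segmentation +# import MovieSceneSegmentationModel` loads `.model` at that moment and keeps a +# plain `import modelscope` cheap when these CV dependencies are not needed.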
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .model import MovieSceneSegmentationModel + from .datasets import MovieSceneSegmentationDataset + +else: + _import_structure = { + 'model': ['MovieSceneSegmentationModel'], + 'datasets': ['MovieSceneSegmentationDataset'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/movie_scene_segmentation/get_model.py b/modelscope/models/cv/movie_scene_segmentation/get_model.py new file mode 100644 index 00000000..5c66fc02 --- /dev/null +++ b/modelscope/models/cv/movie_scene_segmentation/get_model.py @@ -0,0 +1,45 @@ +# ------------------------------------------------------------------------------------ +# BaSSL +# Copyright (c) 2021 KakaoBrain. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# Github: https://github.com/kakaobrain/bassl +# ------------------------------------------------------------------------------------ + +from .utils.shot_encoder import resnet50 +from .utils.trn import TransformerCRN + + +def get_shot_encoder(cfg): + name = cfg['model']['shot_encoder']['name'] + shot_encoder_args = cfg['model']['shot_encoder'][name] + if name == 'resnet': + depth = shot_encoder_args['depth'] + if depth == 50: + shot_encoder = resnet50(**shot_encoder_args['params'], ) + else: + raise NotImplementedError + else: + raise NotImplementedError + + return shot_encoder + + +def get_contextual_relation_network(cfg): + crn = None + + if cfg['model']['contextual_relation_network']['enabled']: + name = cfg['model']['contextual_relation_network']['name'] + crn_args = cfg['model']['contextual_relation_network']['params'][name] + if name == 'trn': + sampling_name = cfg['model']['loss']['sampling_method']['name'] + crn_args['neighbor_size'] = ( + 2 * cfg['model']['loss']['sampling_method']['params'] + [sampling_name]['neighbor_size']) + crn = TransformerCRN(crn_args) + else: + raise NotImplementedError + + return crn + + +__all__ = ['get_shot_encoder', 'get_contextual_relation_network'] diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py new file mode 100644 index 00000000..e9576963 --- /dev/null +++ b/modelscope/models/cv/movie_scene_segmentation/model.py @@ -0,0 +1,192 @@ +import os +import os.path as osp +from typing import Any, Dict + +import einops +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.transforms as TF +from PIL import Image +from shotdetect_scenedetect_lgss import shot_detect + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .get_model import get_contextual_relation_network, get_shot_encoder +from .utils.save_op import get_pred_boundary, pred2scene, scene2video + +logger = get_logger() + + +@MODELS.register_module( + Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert) +class MovieSceneSegmentationModel(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, *args, **kwargs) + + model_path = 
osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + params = torch.load(model_path, map_location='cpu') + + config_path = osp.join(model_dir, ModelFile.CONFIGURATION) + self.cfg = Config.from_file(config_path) + + def load_param_with_prefix(prefix, model, src_params): + own_state = model.state_dict() + for name, param in own_state.items(): + src_name = prefix + '.' + name + own_state[name] = src_params[src_name] + + model.load_state_dict(own_state) + + self.shot_encoder = get_shot_encoder(self.cfg) + load_param_with_prefix('shot_encoder', self.shot_encoder, params) + self.crn = get_contextual_relation_network(self.cfg) + load_param_with_prefix('crn', self.crn, params) + + crn_name = self.cfg.model.contextual_relation_network.name + hdim = self.cfg.model.contextual_relation_network.params[crn_name][ + 'hidden_size'] + self.head_sbd = nn.Linear(hdim, 2) + load_param_with_prefix('head_sbd', self.head_sbd, params) + + self.test_transform = TF.Compose([ + TF.Resize(size=256, interpolation=Image.BICUBIC), + TF.CenterCrop(224), + TF.ToTensor(), + TF.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + self.infer_result = {'vid': [], 'sid': [], 'pred': []} + sampling_method = self.cfg.dataset.sampling_method.name + self.neighbor_size = self.cfg.dataset.sampling_method.params[ + sampling_method].neighbor_size + + self.eps = 1e-5 + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]: + data = inputs['video'] + labels = inputs['label'] + outputs = self.shared_step(data) + + loss = F.cross_entropy( + outputs.squeeze(), labels.squeeze(), reduction='none') + lpos = labels == 1 + lneg = labels == 0 + + # class-balancing weights; named n_pos/n_neg so the torch.nn alias + # `nn` is not shadowed + n_pos, n_neg = 1, 1 + wp = (n_pos / float(n_pos + n_neg)) * lpos / (lpos.sum() + self.eps) + wn = (n_neg / float(n_pos + n_neg)) * lneg / (lneg.sum() + self.eps) + w = wp + wn + loss = (w * loss).sum() + + probs = torch.argmax(outputs, dim=1) + + re = dict(pred=probs, loss=loss) + return re + + def inference(self, batch): + logger.info('Begin scene detect ......') + bs = self.cfg.pipeline.batch_size_per_gpu + sids = batch['sid'] + inputs = batch['shot_feat'] + + shot_num = len(sids) + # ceil division; `shot_num // bs + 1` produced an empty trailing + # batch (and a torch.stack error) whenever bs divides shot_num + cnt = (shot_num + bs - 1) // bs + + for i in range(cnt): + start = i * bs + end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num + input_ = inputs[start:end] + sid_ = sids[start:end] + input_ = torch.stack(input_) + outputs = self.shared_step(input_) # shape [b,2] + prob = F.softmax(outputs, dim=1) + self.infer_result['sid'].extend(sid_.cpu().detach().numpy()) + self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy()) + self.infer_result['pred'] = np.stack(self.infer_result['pred']) + + assert len(self.infer_result['sid']) == len(sids) + assert len(self.infer_result['pred']) == len(inputs) + return self.infer_result + + def shared_step(self, inputs): + with torch.no_grad(): + # infer shot encoder + shot_repr = self.extract_shot_representation(inputs) + assert len(shot_repr.shape) == 3 + + # infer CRN + _, pooled = self.crn(shot_repr, mask=None) + # infer boundary score + pred = self.head_sbd(pooled) + return pred + + def save_shot_feat(self, _repr): + feat = _repr.float().cpu().numpy() + pth = self.cfg.dataset.img_path + '/features' + os.makedirs(pth, exist_ok=True) + + for idx in range(_repr.shape[0]): + name = f'shot_{str(idx).zfill(4)}.npy' + name = osp.join(pth, name) + np.save(name, feat[idx]) + + def extract_shot_representation(self, + inputs: torch.Tensor) -> torch.Tensor: + """ inputs [b s k c h w] -> output [b s d] """ + assert len(inputs.shape) == 6 # (B Shot Keyframe C H W) + b, s, k, c, h, w =
inputs.shape + inputs = einops.rearrange(inputs, 'b s k c h w -> (b s) k c h w', s=s) + keyframe_repr = [self.shot_encoder(inputs[:, _k]) for _k in range(k)] + # [k (b s) d] -> [(b s) d] + shot_repr = torch.stack(keyframe_repr).mean(dim=0) + + shot_repr = einops.rearrange(shot_repr, '(b s) d -> b s d', s=s) + return shot_repr + + def postprocess(self, inputs: Dict[str, Any], **kwargs): + logger.info('Generate scene .......') + + pred_dict = inputs['feat'] + thres = self.cfg.pipeline.save_threshold + + anno_dict = get_pred_boundary(pred_dict, thres) + scene_dict, scene_list = pred2scene(self.shot2keyf, anno_dict) + if self.cfg.pipeline.save_split_scene: + re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) + print(f'Split scene video saved to {re_dir}') + return len(scene_list), scene_dict + + def preprocess(self, inputs): + logger.info('Begin shot detect......') + shot_keyf_lst, anno, shot2keyf = shot_detect( + inputs, **self.cfg.preprocessor.shot_detect) + logger.info('Shot detect done!') + + single_shot_feat, sid = [], [] + for idx, one_shot in enumerate(shot_keyf_lst): + one_shot = [ + self.test_transform(one_frame) for one_frame in one_shot + ] + one_shot = torch.stack(one_shot, dim=0) + single_shot_feat.append(one_shot) + sid.append(idx) + single_shot_feat = torch.stack(single_shot_feat, dim=0) + shot_feat = [] + for idx, one_shot in enumerate(anno): + shot_idx = int(one_shot['shot_id']) + np.arange( + -self.neighbor_size, self.neighbor_size + 1) + shot_idx = np.clip(shot_idx, 0, one_shot['num_shot']) + _one_shot = single_shot_feat[shot_idx] + shot_feat.append(_one_shot) + self.shot2keyf = shot2keyf + self.anno = anno + return shot_feat, sid diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py new file mode 100644 index 00000000..3682726f --- /dev/null +++ b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py @@ -0,0 +1,3 @@ +from .save_op import get_pred_boundary, pred2scene, scene2video +from .shot_encoder import resnet50 +from .trn import TransformerCRN diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/head.py b/modelscope/models/cv/movie_scene_segmentation/utils/head.py new file mode 100644 index 00000000..20a87e66 --- /dev/null +++ b/modelscope/models/cv/movie_scene_segmentation/utils/head.py @@ -0,0 +1,29 @@ +# ------------------------------------------------------------------------------------ +# BaSSL +# Copyright (c) 2021 KakaoBrain. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# Github: https://github.com/kakaobrain/bassl +# ------------------------------------------------------------------------------------ + +import torch.nn as nn +import torch.nn.functional as F + + +class MlpHead(nn.Module): + + def __init__(self, input_dim=2048, hidden_dim=2048, output_dim=128): + super().__init__() + self.output_dim = output_dim + self.input_dim = input_dim + self.hidden_dim = hidden_dim + + self.model = nn.Sequential( + nn.Linear(self.input_dim, self.hidden_dim, bias=True), + nn.ReLU(), + nn.Linear(self.hidden_dim, self.output_dim, bias=True), + ) + + def forward(self, x): + # x shape: [b t d] where t means the number of views + x = self.model(x) + return F.normalize(x, dim=-1) diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py new file mode 100644 index 00000000..d7c8c0ed --- /dev/null +++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py @@ -0,0 +1,118 @@ +# ---------------------------------------------------------------------------------- +# The codes below partially refer to the SceneSeg LGSS. +# Github: https://github.com/AnyiRao/SceneSeg +# ---------------------------------------------------------------------------------- +import os +import os.path as osp +import subprocess + +import cv2 +import numpy as np +from tqdm import tqdm + + +def get_pred_boundary(pred_dict, threshold=0.5): + pred = pred_dict['pred'] + tmp = (pred > threshold).astype(np.int32) + anno_dict = {} + for idx in range(len(tmp)): + anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])}) + return anno_dict + + +def pred2scene(shot2keyf, anno_dict): + scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) + + scene_dict = {} + assert len(scene_list) == len(pair_list) + for scene_ind, scene_item in enumerate(scene_list): + scene_dict.update( + {scene_ind: { + 'shot': pair_list[scene_ind], + 'frame': scene_item + }}) + + return scene_dict, scene_list + + +def scene2video(source_movie_fn, scene_list, thres): + + vcap = cv2.VideoCapture(source_movie_fn) + fps = vcap.get(cv2.CAP_PROP_FPS) # video.fps + out_video_dir_fn = os.path.join(os.getcwd(), + f'pred_result/scene_video_{thres}') + os.makedirs(out_video_dir_fn, exist_ok=True) + + for scene_ind, scene_item in tqdm(enumerate(scene_list)): + scene = str(scene_ind).zfill(4) + start_frame = int(scene_item[0]) + end_frame = int(scene_item[1]) + start_time, end_time = start_frame / fps, end_frame / fps + duration_time = end_time - start_time + out_video_fn = os.path.join(out_video_dir_fn, + 'scene_{}.mp4'.format(scene)) + if os.path.exists(out_video_fn): + continue + call_list = ['ffmpeg'] + call_list += ['-v', 'quiet'] + call_list += [ + '-y', '-ss', + str(start_time), '-t', + str(duration_time), '-i', source_movie_fn + ] + call_list += ['-map_chapters', '-1'] + call_list += [out_video_fn] + subprocess.call(call_list) + return osp.join(os.getcwd(), 'pred_result') + + +def get_demo_scene_list(shot2keyf, anno_dict): + pair_list = get_pair_list(anno_dict) + + scene_list = [] + for pair in pair_list: + start_shot, end_shot = int(pair[0]), int(pair[-1]) + start_frame = shot2keyf[start_shot].split(' ')[0] + end_frame = shot2keyf[end_shot].split(' ')[1] + scene_list.append((start_frame, end_frame)) + return scene_list, pair_list + + +def get_pair_list(anno_dict): + sort_anno_dict_key = sorted(anno_dict.keys()) + tmp = 0 + tmp_list = [] + tmp_label_list = [] + 
anno_list = [] + anno_label_list = [] + for key in sort_anno_dict_key: + value = anno_dict.get(key) + tmp += value + tmp_list.append(key) + tmp_label_list.append(value) + if tmp == 1: + anno_list.append(tmp_list) + anno_label_list.append(tmp_label_list) + tmp = 0 + tmp_list = [] + tmp_label_list = [] + continue + if key == sort_anno_dict_key[-1]: + if len(tmp_list) > 0: + anno_list.append(tmp_list) + anno_label_list.append(tmp_label_list) + if len(anno_list) == 0: + return None + while [] in anno_list: + anno_list.remove([]) + tmp_anno_list = [anno_list[0]] + pair_list = [] + for ind in range(len(anno_list) - 1): + cont_count = int(anno_list[ind + 1][0]) - int(anno_list[ind][-1]) + if cont_count > 1: + pair_list.extend(tmp_anno_list) + tmp_anno_list = [anno_list[ind + 1]] + continue + tmp_anno_list.append(anno_list[ind + 1]) + pair_list.extend(tmp_anno_list) + return pair_list diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py new file mode 100644 index 00000000..7ad1907f --- /dev/null +++ b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py @@ -0,0 +1,331 @@ +""" +Modified from original implementation in torchvision +""" + +from typing import Any, Callable, List, Optional, Type, Union + +import torch +import torch.nn as nn +from torch import Tensor + + +def conv3x3(in_planes: int, + out_planes: int, + stride: int = 1, + groups: int = 1, + dilation: int = 1) -> nn.Conv2d: + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, + ) + + +def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: + """1x1 convolution""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class BasicBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + 'Dilation > 1 not supported in BasicBlock') + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. 
+ # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion: int = 4 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.0)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__( + self, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + in_channel_dim: int = 3, + zero_init_residual: bool = False, + use_last_block_grid: bool = False, + groups: int = 1, + width_per_group: int = 64, + replace_stride_with_dilation: Optional[List[bool]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.use_last_block_grid = use_last_block_grid + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError('replace_stride_with_dilation should be None ' + 'or a 3-element tuple, got {}'.format( + replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d( + in_channel_dim, + self.inplanes, + kernel_size=7, + stride=2, + padding=3, + bias=False, + ) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer( + block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer( + block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer( + block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, 
and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, + 0) # type: ignore[arg-type] + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, + 0) # type: ignore[arg-type] + + def _make_layer( + self, + block: Type[Union[BasicBlock, Bottleneck]], + planes: int, + blocks: int, + stride: int = 1, + dilate: bool = False, + ) -> nn.Sequential: + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride, + downsample, + self.groups, + self.base_width, + previous_dilation, + norm_layer, + )) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + norm_layer=norm_layer, + )) + + return nn.Sequential(*layers) + + def _forward_impl(self, x: Tensor, grid: bool, level: List, both: bool, + grid_only: bool) -> Tensor: + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + + if grid: + x_grid = [] + + if 3 in level: + x_grid.append(x.detach().clone()) + if not both and len(level) == 1: + return x_grid + + x = self.layer4(x) + + if 4 in level: + x_grid.append(x.detach().clone()) + if not both and len(level) == 1: + return x_grid + + x = self.avgpool(x) + x = torch.flatten(x, 1) + + if not grid or len(level) == 0: + return x + + if grid_only: + return x_grid + + if both: + return x, x_grid + + return x + + def forward( + self, + x: Tensor, + grid: bool = False, + level: List = [], + both: bool = False, + grid_only: bool = False, + ) -> Tensor: + return self._forward_impl(x, grid, level, both, grid_only) + + +def resnet50(**kwargs: Any) -> ResNet: + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_. + """ + return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/trn.py b/modelscope/models/cv/movie_scene_segmentation/utils/trn.py new file mode 100644 index 00000000..769e9ee4 --- /dev/null +++ b/modelscope/models/cv/movie_scene_segmentation/utils/trn.py @@ -0,0 +1,132 @@ +# ------------------------------------------------------------------------------------ +# BaSSL +# Copyright (c) 2021 KakaoBrain. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# Github: https://github.com/kakaobrain/bassl +# ------------------------------------------------------------------------------------ + +import torch +import torch.nn as nn +from transformers.models.bert.modeling_bert import BertEncoder + + +class ShotEmbedding(nn.Module): + + def __init__(self, cfg): + super().__init__() + + nn_size = cfg.neighbor_size + 2 # +1 for center shot, +1 for cls + self.shot_embedding = nn.Linear(cfg.input_dim, cfg.hidden_size) + self.position_embedding = nn.Embedding(nn_size, cfg.hidden_size) + self.mask_embedding = nn.Embedding(2, cfg.input_dim, padding_idx=0) + + # tf naming convention for layer norm + self.LayerNorm = nn.LayerNorm(cfg.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(cfg.hidden_dropout_prob) + + self.register_buffer('pos_ids', + torch.arange(nn_size, dtype=torch.long)) + + def forward( + self, + shot_emb: torch.Tensor, + mask: torch.Tensor = None, + pos_ids: torch.Tensor = None, + ) -> torch.Tensor: + + assert len(shot_emb.size()) == 3 + + if pos_ids is None: + pos_ids = self.pos_ids + + # this for mask embedding (un-masked ones remain unchanged) + if mask is not None: + self.mask_embedding.weight.data[0, :].fill_(0) + mask_emb = self.mask_embedding(mask.long()) + shot_emb = (shot_emb * (1 - mask).float()[:, :, None]) + mask_emb + + # we set [CLS] token to averaged feature + cls_emb = shot_emb.mean(dim=1) + + # embedding shots + shot_emb = torch.cat([cls_emb[:, None, :], shot_emb], dim=1) + shot_emb = self.shot_embedding(shot_emb) + pos_emb = self.position_embedding(pos_ids) + embeddings = shot_emb + pos_emb[None, :] + embeddings = self.dropout(self.LayerNorm(embeddings)) + return embeddings + + +class TransformerCRN(nn.Module): + + def __init__(self, cfg): + super().__init__() + + self.pooling_method = cfg.pooling_method + self.shot_embedding = ShotEmbedding(cfg) + self.encoder = BertEncoder(cfg) + + nn_size = cfg.neighbor_size + 2 # +1 for center shot, +1 for cls + self.register_buffer( + 'attention_mask', + self._get_extended_attention_mask( + torch.ones((1, nn_size)).float()), + ) + + def forward( + self, + shot: torch.Tensor, + mask: torch.Tensor = None, + pos_ids: torch.Tensor = None, + pooling_method: str = None, + ): + if self.attention_mask.shape[1] != (shot.shape[1] + 1): + n_shot = shot.shape[1] + 1 # +1 for CLS token + attention_mask = self._get_extended_attention_mask( + torch.ones((1, n_shot), dtype=torch.float, device=shot.device)) + else: + attention_mask = self.attention_mask + + shot_emb = self.shot_embedding(shot, mask=mask, pos_ids=pos_ids) + encoded_emb = self.encoder( + shot_emb, attention_mask=attention_mask).last_hidden_state + + return encoded_emb, self.pooler( + encoded_emb, pooling_method=pooling_method) + + def pooler(self, sequence_output, pooling_method=None): + if pooling_method is None: + pooling_method = self.pooling_method + + if pooling_method == 'cls': + return sequence_output[:, 0, :] + elif pooling_method == 'avg': + return sequence_output[:, 1:].mean(dim=1) + elif pooling_method == 'max': + return sequence_output[:, 1:].max(dim=1)[0] + elif pooling_method == 'center': + cidx = sequence_output.shape[1] // 2 + return sequence_output[:, cidx, :] + else: + raise ValueError + + def _get_extended_attention_mask(self, attention_mask): + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
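+ # For example (illustrative values): a padding mask [[1., 1., 0.]] of shape + # [batch_size, seq_len] takes the dim() == 2 branch below and becomes shape + # [1, 1, 1, 3]; the final line then maps it to [[[[0., 0., -10000.]]]], an + # additive score that effectively removes the padded position from attention.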
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f'Wrong shape for attention_mask (shape {attention_mask.shape})' + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask diff --git a/modelscope/models/cv/object_detection/__init__.py b/modelscope/models/cv/object_detection/__init__.py index fa73686d..974375ce 100644 --- a/modelscope/models/cv/object_detection/__init__.py +++ b/modelscope/models/cv/object_detection/__init__.py @@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .mmdet_model import DetectionModel + from .yolox_pai import YOLOX else: _import_structure = { 'mmdet_model': ['DetectionModel'], + 'yolox_pai': ['YOLOX'] } import sys diff --git a/modelscope/models/cv/object_detection/yolox_pai.py b/modelscope/models/cv/object_detection/yolox_pai.py new file mode 100644 index 00000000..985cc136 --- /dev/null +++ b/modelscope/models/cv/object_detection/yolox_pai.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.models.detection.detectors import YOLOX as _YOLOX + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.image_object_detection, module_name=Models.yolox) +class YOLOX(EasyCVBaseModel, _YOLOX): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + _YOLOX.__init__(self, *args, **kwargs) diff --git a/modelscope/models/cv/product_retrieval_embedding/item_model.py b/modelscope/models/cv/product_retrieval_embedding/item_model.py index 2a893669..85a636c0 100644 --- a/modelscope/models/cv/product_retrieval_embedding/item_model.py +++ b/modelscope/models/cv/product_retrieval_embedding/item_model.py @@ -13,8 +13,8 @@ from modelscope.models.cv.product_retrieval_embedding.item_embedding import ( preprocess, resnet50_embed) from modelscope.outputs import OutputKeys from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.device import create_device from modelscope.utils.logger import get_logger -from modelscope.utils.torch_utils import create_device logger = get_logger() @@ -48,9 +48,8 @@ class ProductRetrievalEmbedding(TorchModel): filter_param(src_params, own_state) model.load_state_dict(own_state) - cpu_flag = device == 'cpu' self.device = create_device( - cpu_flag) # device.type == "cpu" or device.type == "cuda" + device) # device.type == "cpu" or device.type == "cuda" self.use_gpu = self.device.type == 'cuda' # config the model path diff --git a/modelscope/models/cv/realtime_object_detection/__init__.py b/modelscope/models/cv/realtime_object_detection/__init__.py new file mode 100644 index 00000000..aed13cec --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .realtime_detector import RealtimeDetector +else: + _import_structure = { + 'realtime_detector': ['RealtimeDetector'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/realtime_object_detection/realtime_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_detector.py new file mode 100644 index 00000000..b147f769 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/realtime_detector.py @@ -0,0 +1,85 @@ +import argparse +import logging as logger +import os +import os.path as osp +import time + +import cv2 +import json +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.preprocessors import LoadImage +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .yolox.data.data_augment import ValTransform +from .yolox.exp import get_exp_by_name +from .yolox.utils import postprocess + + +@MODELS.register_module( + group_key=Tasks.image_object_detection, + module_name=Models.realtime_object_detection) +class RealtimeDetector(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + # model type + self.exp = get_exp_by_name(self.config.model_type) + + # build model + self.model = self.exp.get_model() + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) + ckpt = torch.load(model_path, map_location='cpu') + + # load the model state dict + self.model.load_state_dict(ckpt['model']) + self.model.eval() + + # params setting + self.exp.num_classes = self.config.num_classes + self.confthre = self.config.conf_thr + self.num_classes = self.exp.num_classes + self.nmsthre = self.exp.nmsthre + self.test_size = self.exp.test_size + self.preproc = ValTransform(legacy=False) + + def inference(self, img): + with torch.no_grad(): + outputs = self.model(img) + return outputs + + def forward(self, inputs): + return self.inference(inputs) + + def preprocess(self, img): + img = LoadImage.convert_to_ndarray(img) + height, width = img.shape[:2] + self.ratio = min(self.test_size[0] / img.shape[0], + self.test_size[1] / img.shape[1]) + + img, _ = self.preproc(img, None, self.test_size) + img = torch.from_numpy(img).unsqueeze(0) + img = img.float() + + return img + + def postprocess(self, input): + outputs = postprocess( + input, + self.num_classes, + self.confthre, + self.nmsthre, + class_agnostic=True) + + if len(outputs) == 1: + bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio + scores = outputs[0][:, 5].cpu().numpy() + labels = outputs[0][:, 6].cpu().int().numpy() + + return bboxes, scores, labels diff --git a/modelscope/models/cv/realtime_object_detection/yolox/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py 
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py b/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
new file mode 100644
index 00000000..b52a65fe
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
@@ -0,0 +1,62 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+"""
+Data augmentation functionality. Passed as callable transformations to
+Dataset classes.
+
+The data augmentation procedures were interpreted from @weiliu89's SSD paper
+http://arxiv.org/abs/1512.02325
+"""
+
+import cv2
+import numpy as np
+
+
+def preproc(img, input_size, swap=(2, 0, 1)):
+    if len(img.shape) == 3:
+        padded_img = np.ones(
+            (input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+    else:
+        padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+    resized_img = cv2.resize(
+        img,
+        (int(img.shape[1] * r), int(img.shape[0] * r)),
+        interpolation=cv2.INTER_LINEAR,
+    ).astype(np.uint8)
+    padded_img[:int(img.shape[0] * r), :int(img.shape[1] * r)] = resized_img
+
+    padded_img = padded_img.transpose(swap)
+    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+    return padded_img, r
+
+
+class ValTransform:
+    """
+    Defines the transformations that should be applied to a test image
+    (an ndarray in HWC order) before it is fed to the network:
+
+    resize/pad -> transpose -> (optional legacy color normalization)
+
+    Arguments:
+        swap ((int,int,int)): final order of channels, default (2, 0, 1).
+        legacy (bool): if True, apply the ImageNet mean/std normalization
+            expected by older YOLOX checkpoints.
+
+    Returns:
+        A callable transform applied to test/val data.
+    """
+
+    def __init__(self, swap=(2, 0, 1), legacy=False):
+        self.swap = swap
+        self.legacy = legacy
+
+    # assume input is cv2 img for now
+    def __call__(self, img, res, input_size):
+        img, _ = preproc(img, input_size, self.swap)
+        if self.legacy:
+            img = img[::-1, :, :].copy()
+            img /= 255.0
+            img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
+            img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
+        return img, np.zeros((1, 5))
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
new file mode 100644
index 00000000..e8e3be15
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .base_exp import BaseExp
+from .build import get_exp_by_name
+from .yolox_base import Exp
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
new file mode 100644
index 00000000..a4278cbf
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
@@ -0,0 +1,12 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from abc import ABCMeta, abstractmethod
+
+from torch.nn import Module
+
+
+class BaseExp(metaclass=ABCMeta):
+
+    @abstractmethod
+    def get_model(self) -> Module:
+        pass
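The letterbox arithmetic in `preproc` is worth a quick numeric check: the image is scaled by the smaller of the two ratios, and the rest of the canvas keeps the gray fill value 114. A standalone sketch of the same computation:

import numpy as np

img = np.zeros((480, 640, 3), dtype=np.uint8)  # H x W x C input
input_size = (416, 416)
r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
print(r)  # 0.65, i.e. min(416/480, 416/640)
print(int(img.shape[0] * r), int(img.shape[1] * r))  # 312 416: rows/cols of
# real content; the bottom 104 rows of the 416x416 canvas stay at 114.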
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
new file mode 100644
index 00000000..4858100c
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py
@@ -0,0 +1,15 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+
+def get_exp_by_name(exp_name):
+    exp = exp_name.replace('-',
+                           '_')  # convert a name like "yolox-s" to "yolox_s"
+    if exp == 'yolox_s':
+        from .default import YoloXSExp as YoloXExp
+    elif exp == 'yolox_nano':
+        from .default import YoloXNanoExp as YoloXExp
+    elif exp == 'yolox_tiny':
+        from .default import YoloXTinyExp as YoloXExp
+    else:
+        raise ValueError(f'Unsupported model type: {exp_name}')
+    return YoloXExp()
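A sketch of how the factory is intended to be called (the names mirror the branches above; `get_model` builds the network lazily and caches it on the exp object):

exp = get_exp_by_name('yolox-s')   # dashes are normalized to underscores
model = exp.get_model()            # YOLOPAFPN backbone + YOLOXHead
print(exp.test_size, exp.nmsthre)  # (640, 640) 0.65, inherited from the base Exp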
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
new file mode 100644
index 00000000..552bbccd
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py
@@ -0,0 +1,5 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .yolox_nano import YoloXNanoExp
+from .yolox_s import YoloXSExp
+from .yolox_tiny import YoloXTinyExp
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
new file mode 100644
index 00000000..330eef16
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_nano.py
@@ -0,0 +1,44 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch.nn as nn
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXNanoExp(YoloXExp):
+
+    def __init__(self):
+        super(YoloXNanoExp, self).__init__()
+        self.depth = 0.33
+        self.width = 0.25
+        self.input_size = (416, 416)
+        self.test_size = (416, 416)
+
+    def get_model(self, sublinear=False):
+
+        def init_yolo(M):
+            for m in M.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eps = 1e-3
+                    m.momentum = 0.03
+
+        if 'model' not in self.__dict__:
+            from ...models import YOLOX, YOLOPAFPN, YOLOXHead
+            in_channels = [256, 512, 1024]
+            # The NANO model uses depthwise=True; this is its main difference.
+            backbone = YOLOPAFPN(
+                self.depth,
+                self.width,
+                in_channels=in_channels,
+                act=self.act,
+                depthwise=True,
+            )
+            head = YOLOXHead(
+                self.num_classes,
+                self.width,
+                in_channels=in_channels,
+                act=self.act,
+                depthwise=True)
+            self.model = YOLOX(backbone, head)
+
+        return self.model
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
new file mode 100644
index 00000000..5a123b37
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_s.py
@@ -0,0 +1,11 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXSExp(YoloXExp):
+
+    def __init__(self):
+        super(YoloXSExp, self).__init__()
+        self.depth = 0.33
+        self.width = 0.50
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
new file mode 100644
index 00000000..a80d0f2d
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/yolox_tiny.py
@@ -0,0 +1,20 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import os
+
+from ..yolox_base import Exp as YoloXExp
+
+
+class YoloXTinyExp(YoloXExp):
+
+    def __init__(self):
+        super(YoloXTinyExp, self).__init__()
+        self.depth = 0.33
+        self.width = 0.375
+        self.input_size = (416, 416)
+        self.mosaic_scale = (0.5, 1.5)
+        self.random_size = (10, 20)
+        self.test_size = (416, 416)
+        self.exp_name = os.path.split(
+            os.path.realpath(__file__))[1].split('.')[0]
+        self.enable_mixup = False
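The nano/tiny/s variants above differ only in their depth/width multipliers (plus the depthwise convolutions for nano). A small sketch of how those multipliers translate into concrete channel and block counts, using the same formulas `CSPDarknet` applies further down:

for name, depth, width in [('yolox-nano', 0.33, 0.25),
                           ('yolox-tiny', 0.33, 0.375),
                           ('yolox-s', 0.33, 0.50)]:
    base_channels = int(width * 64)         # stem output channels
    base_depth = max(round(depth * 3), 1)   # CSP bottlenecks per stage
    print(name, base_channels, base_depth)  # nano: 16 1, tiny: 24 1, s: 32 1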
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py
new file mode 100644
index 00000000..a2a41535
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py
@@ -0,0 +1,54 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+import torch.nn as nn
+
+from .base_exp import BaseExp
+
+
+class Exp(BaseExp):
+
+    def __init__(self):
+        super().__init__()
+
+        # ---------------- model config ---------------- #
+        # number of classes the model detects
+        self.num_classes = 80
+        # factor of model depth
+        self.depth = 1.00
+        # factor of model width
+        self.width = 1.00
+        # activation name; e.g. set to "relu" to replace the default "silu"
+        self.act = 'silu'
+        # ----------------- testing config ------------------ #
+        # output image size during evaluation/test
+        self.test_size = (640, 640)
+        # confidence threshold during evaluation/test,
+        # boxes whose scores are less than test_conf will be filtered
+        self.test_conf = 0.01
+        # nms threshold
+        self.nmsthre = 0.65
+
+    def get_model(self):
+        from ..models import YOLOX, YOLOPAFPN, YOLOXHead
+
+        def init_yolo(M):
+            for m in M.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eps = 1e-3
+                    m.momentum = 0.03
+
+        if getattr(self, 'model', None) is None:
+            in_channels = [256, 512, 1024]
+            backbone = YOLOPAFPN(
+                self.depth, self.width, in_channels=in_channels, act=self.act)
+            head = YOLOXHead(
+                self.num_classes,
+                self.width,
+                in_channels=in_channels,
+                act=self.act)
+            self.model = YOLOX(backbone, head)
+
+        self.model.apply(init_yolo)
+        self.model.head.initialize_biases(1e-2)
+        self.model.train()
+        return self.model
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
new file mode 100644
index 00000000..20b1a0d1
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py
@@ -0,0 +1,7 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from .darknet import CSPDarknet, Darknet
+from .yolo_fpn import YOLOFPN
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+from .yolox import YOLOX
diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py b/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py
new file mode 100644
index 00000000..8ece2a1e
--- /dev/null
+++ b/modelscope/models/cv/realtime_object_detection/yolox/models/darknet.py
@@ -0,0 +1,189 @@
+# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
+
+from torch import nn
+
+from .network_blocks import (BaseConv, CSPLayer, DWConv, Focus, ResLayer,
+                             SPPBottleneck)
+
+
+class Darknet(nn.Module):
+    # number of blocks from dark2 to dark5.
+    depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
+
+    def __init__(
+        self,
+        depth,
+        in_channels=3,
+        stem_out_channels=32,
+        out_features=('dark3', 'dark4', 'dark5'),
+    ):
+        """
+        Args:
+            depth (int): depth of the darknet backbone; supported values are 21 and 53.
+            in_channels (int): number of input channels, for example, use 3 for RGB image.
+            stem_out_channels (int): number of output channels of darknet stem.
+                It decides channels of darknet layer2 to layer5.
+            out_features (Tuple[str]): desired output layer name.
+        """
+        super().__init__()
+        assert out_features, 'please provide output features of Darknet'
+        self.out_features = out_features
+        self.stem = nn.Sequential(
+            BaseConv(
+                in_channels, stem_out_channels, ksize=3, stride=1,
+                act='lrelu'),
+            *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
+        )
+        in_channels = stem_out_channels * 2  # 64
+
+        num_blocks = Darknet.depth2blocks[depth]
+        # create darknet with `stem_out_channels` and `num_blocks` layers.
+        # The stages are written out one by one to keep the model structure explicit.
+ self.dark2 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[0], stride=2)) + in_channels *= 2 # 128 + self.dark3 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[1], stride=2)) + in_channels *= 2 # 256 + self.dark4 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[2], stride=2)) + in_channels *= 2 # 512 + + self.dark5 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[3], stride=2), + *self.make_spp_block([in_channels, in_channels * 2], + in_channels * 2), + ) + + def make_group_layer(self, + in_channels: int, + num_blocks: int, + stride: int = 1): + 'starts with conv layer then has `num_blocks` `ResLayer`' + return [ + BaseConv( + in_channels, + in_channels * 2, + ksize=3, + stride=stride, + act='lrelu'), + *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)], + ] + + def make_spp_block(self, filters_list, in_filters): + m = nn.Sequential(*[ + BaseConv(in_filters, filters_list[0], 1, stride=1, act='lrelu'), + BaseConv( + filters_list[0], filters_list[1], 3, stride=1, act='lrelu'), + SPPBottleneck( + in_channels=filters_list[1], + out_channels=filters_list[0], + activation='lrelu', + ), + BaseConv( + filters_list[0], filters_list[1], 3, stride=1, act='lrelu'), + BaseConv( + filters_list[1], filters_list[0], 1, stride=1, act='lrelu'), + ]) + return m + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs['stem'] = x + x = self.dark2(x) + outputs['dark2'] = x + x = self.dark3(x) + outputs['dark3'] = x + x = self.dark4(x) + outputs['dark4'] = x + x = self.dark5(x) + outputs['dark5'] = x + return {k: v for k, v in outputs.items() if k in self.out_features} + + +class CSPDarknet(nn.Module): + + def __init__( + self, + dep_mul, + wid_mul, + out_features=('dark3', 'dark4', 'dark5'), + depthwise=False, + act='silu', + ): + super().__init__() + assert out_features, 'please provide output features of Darknet' + self.out_features = out_features + Conv = DWConv if depthwise else BaseConv + + base_channels = int(wid_mul * 64) # 64 + base_depth = max(round(dep_mul * 3), 1) # 3 + + # stem + self.stem = Focus(3, base_channels, ksize=3, act=act) + + # dark2 + self.dark2 = nn.Sequential( + Conv(base_channels, base_channels * 2, 3, 2, act=act), + CSPLayer( + base_channels * 2, + base_channels * 2, + n=base_depth, + depthwise=depthwise, + act=act, + ), + ) + + # dark3 + self.dark3 = nn.Sequential( + Conv(base_channels * 2, base_channels * 4, 3, 2, act=act), + CSPLayer( + base_channels * 4, + base_channels * 4, + n=base_depth * 3, + depthwise=depthwise, + act=act, + ), + ) + + # dark4 + self.dark4 = nn.Sequential( + Conv(base_channels * 4, base_channels * 8, 3, 2, act=act), + CSPLayer( + base_channels * 8, + base_channels * 8, + n=base_depth * 3, + depthwise=depthwise, + act=act, + ), + ) + + # dark5 + self.dark5 = nn.Sequential( + Conv(base_channels * 8, base_channels * 16, 3, 2, act=act), + SPPBottleneck( + base_channels * 16, base_channels * 16, activation=act), + CSPLayer( + base_channels * 16, + base_channels * 16, + n=base_depth, + shortcut=False, + depthwise=depthwise, + act=act, + ), + ) + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs['stem'] = x + x = self.dark2(x) + outputs['dark2'] = x + x = self.dark3(x) + outputs['dark3'] = x + x = self.dark4(x) + outputs['dark4'] = x + x = self.dark5(x) + outputs['dark5'] = x + return {k: v for k, v in outputs.items() if k in self.out_features} diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py 
b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py new file mode 100644 index 00000000..fd15c1c1 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py @@ -0,0 +1,213 @@ +# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX + +import torch +import torch.nn as nn + + +def get_activation(name='silu', inplace=True): + if name == 'silu': + module = nn.SiLU(inplace=inplace) + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + return module + + +class BaseConv(nn.Module): + """A Conv2d -> Batchnorm -> silu/leaky relu block""" + + def __init__(self, + in_channels, + out_channels, + ksize, + stride, + groups=1, + bias=False, + act='silu'): + super(BaseConv, self).__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = get_activation(act, inplace=True) + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + + def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'): + super(DWConv, self).__init__() + self.dconv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + act=act, + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, act=act) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act='silu', + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class ResLayer(nn.Module): + 'Residual layer with `in_channels` inputs.' 
+ + def __init__(self, in_channels: int): + super().__init__() + mid_channels = in_channels // 2 + self.layer1 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, act='lrelu') + self.layer2 = BaseConv( + mid_channels, in_channels, ksize=3, stride=1, act='lrelu') + + def forward(self, x): + out = self.layer2(self.layer1(x)) + return x + out + + +class SPPBottleneck(nn.Module): + """Spatial pyramid pooling layer used in YOLOv3-SPP""" + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + activation='silu'): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv( + conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act='silu', + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. + """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv3 = BaseConv( + 2 * hidden_channels, out_channels, 1, stride=1, act=act) + module_list = [ + Bottleneck( + hidden_channels, + hidden_channels, + shortcut, + 1.0, + depthwise, + act=act) for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + return self.conv3(x) + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + + def __init__(self, + in_channels, + out_channels, + ksize=1, + stride=1, + act='silu'): + super().__init__() + self.conv = BaseConv( + in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py new file mode 100644 index 00000000..0cbebb09 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_fpn.py @@ -0,0 +1,80 @@ +# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX + +import torch +import torch.nn as nn + +from .darknet import Darknet +from .network_blocks import BaseConv + + +class YOLOFPN(nn.Module): + """ + YOLOFPN module. Darknet 53 is the default backbone of this model. 
+ """ + + def __init__( + self, + depth=53, + in_features=['dark3', 'dark4', 'dark5'], + ): + super(YOLOFPN, self).__init__() + + self.backbone = Darknet(depth) + self.in_features = in_features + + # out 1 + self.out1_cbl = self._make_cbl(512, 256, 1) + self.out1 = self._make_embedding([256, 512], 512 + 256) + + # out 2 + self.out2_cbl = self._make_cbl(256, 128, 1) + self.out2 = self._make_embedding([128, 256], 256 + 128) + + # upsample + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + + def _make_cbl(self, _in, _out, ks): + return BaseConv(_in, _out, ks, stride=1, act='lrelu') + + def _make_embedding(self, filters_list, in_filters): + m = nn.Sequential(*[ + self._make_cbl(in_filters, filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + ]) + return m + + def load_pretrained_model(self, filename='./weights/darknet53.mix.pth'): + with open(filename, 'rb') as f: + state_dict = torch.load(f, map_location='cpu') + print('loading pretrained weights...') + self.backbone.load_state_dict(state_dict) + + def forward(self, inputs): + """ + Args: + inputs (Tensor): input image. + + Returns: + Tuple[Tensor]: FPN output features.. + """ + # backbone + out_features = self.backbone(inputs) + x2, x1, x0 = [out_features[f] for f in self.in_features] + + # yolo branch 1 + x1_in = self.out1_cbl(x0) + x1_in = self.upsample(x1_in) + x1_in = torch.cat([x1_in, x1], 1) + out_dark4 = self.out1(x1_in) + + # yolo branch 2 + x2_in = self.out2_cbl(out_dark4) + x2_in = self.upsample(x2_in) + x2_in = torch.cat([x2_in, x2], 1) + out_dark3 = self.out2(x2_in) + + outputs = (out_dark3, out_dark4, x0) + return outputs diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py new file mode 100644 index 00000000..1eef93a4 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_head.py @@ -0,0 +1,182 @@ +# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import bboxes_iou, meshgrid +from .network_blocks import BaseConv, DWConv + + +class YOLOXHead(nn.Module): + + def __init__( + self, + num_classes, + width=1.0, + strides=[8, 16, 32], + in_channels=[256, 512, 1024], + act='silu', + depthwise=False, + ): + """ + Args: + act (str): activation type of conv. Defalut value: "silu". + depthwise (bool): whether apply depthwise conv in conv branch. Defalut value: False. 
+ """ + super(YOLOXHead, self).__init__() + + self.n_anchors = 1 + self.num_classes = num_classes + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + Conv = DWConv if depthwise else BaseConv + + for i in range(len(in_channels)): + self.stems.append( + BaseConv( + in_channels=int(in_channels[i] * width), + out_channels=int(256 * width), + ksize=1, + stride=1, + act=act, + )) + self.cls_convs.append( + nn.Sequential(*[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ])) + self.reg_convs.append( + nn.Sequential(*[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ])) + self.cls_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * self.num_classes, + kernel_size=1, + stride=1, + padding=0, + )) + self.reg_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=4, + kernel_size=1, + stride=1, + padding=0, + )) + self.obj_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * 1, + kernel_size=1, + stride=1, + padding=0, + )) + + self.use_l1 = False + self.l1_loss = nn.L1Loss(reduction='none') + self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction='none') + # self.iou_loss = IOUloss(reduction="none") + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + + def initialize_biases(self, prior_prob): + for conv in self.cls_preds: + b = conv.bias.view(self.n_anchors, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + for conv in self.obj_preds: + b = conv.bias.view(self.n_anchors, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + def forward(self, xin, labels=None, imgs=None): + outputs = [] + + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin)): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + if self.training: + pass + else: + output = torch.cat( + [reg_output, + obj_output.sigmoid(), + cls_output.sigmoid()], 1) + + outputs.append(output) + + if self.training: + pass + else: + self.hw = [x.shape[-2:] for x in outputs] + # [batch, n_anchors_all, 85] + outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], + dim=2).permute(0, 2, 1) + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + def decode_outputs(self, outputs, dtype): + grids = [] + strides = [] + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = 
torch.cat(grids, dim=1).type(dtype) + strides = torch.cat(strides, dim=1).type(dtype) + + outputs[..., :2] = (outputs[..., :2] + grids) * strides + outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides + return outputs diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py new file mode 100644 index 00000000..cd4258bf --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolo_pafpn.py @@ -0,0 +1,126 @@ +# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX + +import torch +import torch.nn as nn + +from .darknet import CSPDarknet +from .network_blocks import BaseConv, CSPLayer, DWConv + + +class YOLOPAFPN(nn.Module): + """ + YOLOv3 model. Darknet 53 is the default backbone of this model. + """ + + def __init__( + self, + depth=1.0, + width=1.0, + in_features=('dark3', 'dark4', 'dark5'), + in_channels=[256, 512, 1024], + depthwise=False, + act='silu', + ): + super(YOLOPAFPN, self).__init__() + self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.lateral_conv0 = BaseConv( + int(in_channels[2] * width), + int(in_channels[1] * width), + 1, + 1, + act=act) + self.C3_p4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) # cat + + self.reduce_conv1 = BaseConv( + int(in_channels[1] * width), + int(in_channels[0] * width), + 1, + 1, + act=act) + self.C3_p3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv2 = Conv( + int(in_channels[0] * width), + int(in_channels[0] * width), + 3, + 2, + act=act) + self.C3_n3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv1 = Conv( + int(in_channels[1] * width), + int(in_channels[1] * width), + 3, + 2, + act=act) + self.C3_n4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + def forward(self, input): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
+ """ + + # backbone + out_features = self.backbone(input) + features = [out_features[f] for f in self.in_features] + [x2, x1, x0] = features + + fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 + f_out0 = self.upsample(fpn_out0) # 512/16 + f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 + f_out0 = self.C3_p4(f_out0) # 1024->512/16 + + fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 + f_out1 = self.upsample(fpn_out1) # 256/8 + f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 + pan_out2 = self.C3_p3(f_out1) # 512->256/8 + + p_out1 = self.bu_conv2(pan_out2) # 256->256/16 + p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 + pan_out1 = self.C3_n3(p_out1) # 512->512/16 + + p_out0 = self.bu_conv1(pan_out1) # 512->512/32 + p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 + pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 + + outputs = (pan_out2, pan_out1, pan_out0) + return outputs diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py b/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py new file mode 100644 index 00000000..181c368b --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/yolox.py @@ -0,0 +1,33 @@ +# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX + +import torch.nn as nn + +from .yolo_head import YOLOXHead +from .yolo_pafpn import YOLOPAFPN + + +class YOLOX(nn.Module): + """ + YOLOX model module. The module list is defined by create_yolov3_modules function. + The network returns loss values from three YOLO layers during training + and detection results during test. + """ + + def __init__(self, backbone=None, head=None): + super(YOLOX, self).__init__() + if backbone is None: + backbone = YOLOPAFPN() + if head is None: + head = YOLOXHead(80) + + self.backbone = backbone + self.head = head + + def forward(self, x, targets=None): + fpn_outs = self.backbone(x) + if self.training: + raise NotImplementedError('Training is not supported yet!') + else: + outputs = self.head(fpn_outs) + + return outputs diff --git a/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py new file mode 100644 index 00000000..2c1ea489 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/utils/__init__.py @@ -0,0 +1,5 @@ +# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX + +from .boxes import * # noqa + +__all__ = ['bboxes_iou', 'meshgrid', 'postprocess', 'xyxy2cxcywh', 'xyxy2xywh'] diff --git a/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py b/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py new file mode 100644 index 00000000..b29a3a04 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/utils/boxes.py @@ -0,0 +1,107 @@ +# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX + +import torch +import torchvision + +_TORCH_VER = [int(x) for x in torch.__version__.split('.')[:2]] + + +def meshgrid(*tensors): + if _TORCH_VER >= [1, 10]: + return torch.meshgrid(*tensors, indexing='ij') + else: + return torch.meshgrid(*tensors) + + +def xyxy2xywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + return bboxes + + +def xyxy2cxcywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 
0.5 + bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5 + return bboxes + + +def postprocess(prediction, + num_classes, + conf_thre=0.7, + nms_thre=0.45, + class_agnostic=False): + box_corner = prediction.new(prediction.shape) + box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 + box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 + box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 + box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 + prediction[:, :, :4] = box_corner[:, :, :4] + + output = [None for _ in range(len(prediction))] + for i, image_pred in enumerate(prediction): + + # If none are remaining => process next image + if not image_pred.size(0): + continue + # Get score and class with highest confidence + class_conf, class_pred = torch.max( + image_pred[:, 5:5 + num_classes], 1, keepdim=True) + + conf_mask = image_pred[:, 4] * class_conf.squeeze() + conf_mask = (conf_mask >= conf_thre).squeeze() + # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) + detections = torch.cat( + (image_pred[:, :5], class_conf, class_pred.float()), 1) + detections = detections[conf_mask] + if not detections.size(0): + continue + + if class_agnostic: + nms_out_index = torchvision.ops.nms( + detections[:, :4], + detections[:, 4] * detections[:, 5], + nms_thre, + ) + else: + nms_out_index = torchvision.ops.batched_nms( + detections[:, :4], + detections[:, 4] * detections[:, 5], + detections[:, 6], + nms_thre, + ) + + detections = detections[nms_out_index] + if output[i] is None: + output[i] = detections + else: + output[i] = torch.cat((output[i], detections)) + + return output + + +def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): + if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: + raise IndexError + + if xyxy: + tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) + br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + else: + tl = torch.max( + (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), + ) + br = torch.min( + (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), + ) + + area_a = torch.prod(bboxes_a[:, 2:], 1) + area_b = torch.prod(bboxes_b[:, 2:], 1) + en = (tl < br).type(tl.type()).prod(dim=2) + area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) + return area_i / (area_a[:, None] + area_b - area_i) diff --git a/modelscope/models/cv/skin_retouching/retinaface/box_utils.py b/modelscope/models/cv/skin_retouching/retinaface/box_utils.py index 89cf8bf6..a4aeffd1 100644 --- a/modelscope/models/cv/skin_retouching/retinaface/box_utils.py +++ b/modelscope/models/cv/skin_retouching/retinaface/box_utils.py @@ -6,7 +6,8 @@ import torch def point_form(boxes: torch.Tensor) -> torch.Tensor: - """Convert prior_boxes to (x_min, y_min, x_max, y_max) representation for comparison to point form ground truth data. + """Convert prior_boxes to (x_min, y_min, x_max, y_max) representation for comparison to point form \ + ground truth data. Args: boxes: center-size default boxes from priorbox layers. 
diff --git a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py index 772813cf..8be07928 100644 --- a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py +++ b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ from easydict import EasyDict as edict cfg = edict() diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py index 158d88aa..00eb7e1c 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import torch.nn as nn diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py index 45706f71..3505d5e1 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import math import torch diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py index e64b68d7..77706dbc 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import torch import torch.nn as nn diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py index 0e623505..b1099fdf 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import torch.nn as nn from timm.models.layers import to_2tuple diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py index e2d2f80f..de3a7b83 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py +++ 
b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import torch.nn as nn from timm.models.layers import to_2tuple diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py index 977e936d..40ed54f1 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import torch from torch import nn diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py index a49fa50c..e1130069 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import torch diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py index cd393109..9f010332 100644 --- a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ from functools import partial import torch diff --git a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py index 3eff252a..02f4c79e 100644 --- a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py +++ b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import torch from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py index 505b2aa9..51911957 100644 --- a/modelscope/models/cv/video_single_object_tracking/utils/utils.py +++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py @@ -1,5 +1,4 @@ -# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on -# https://github.com/botaoye/OSTrack/ +# The implementation is based on OSTrack, available at https://github.com/botaoye/OSTrack/ import math from 
typing import Optional @@ -238,24 +237,3 @@ def check_box(box: list, image_height, image_width) -> bool: if box[3] < 0 or box[3] >= image_height: return False return True - - -def show_tracking_result(video_in_path, bboxes, video_save_path): - cap = cv2.VideoCapture(video_in_path) - for i in range(len(bboxes)): - box = bboxes[i] - success, frame = cap.read() - if success is False: - raise Exception(video_in_path, - ' can not be correctly decoded by OpenCV.') - if i == 0: - size = (frame.shape[1], frame.shape[0]) - fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') - video_writer = cv2.VideoWriter(video_save_path, fourcc, - cap.get(cv2.CAP_PROP_FPS), size, - True) - cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), - 5) - video_writer.write(frame) - video_writer.release - cap.release() diff --git a/modelscope/models/cv/video_summarization/__init__.py b/modelscope/models/cv/video_summarization/__init__.py new file mode 100644 index 00000000..064110f7 --- /dev/null +++ b/modelscope/models/cv/video_summarization/__init__.py @@ -0,0 +1 @@ +from .summarizer import PGLVideoSummarization diff --git a/modelscope/models/cv/video_summarization/base_model.py b/modelscope/models/cv/video_summarization/base_model.py new file mode 100644 index 00000000..670da251 --- /dev/null +++ b/modelscope/models/cv/video_summarization/base_model.py @@ -0,0 +1,118 @@ +# The implementation is based on pytorch-caffe-models, available at https://github.com/crowsonkb/pytorch-caffe-models. + +import cv2 +import numpy as np +import torch +import torch.nn as nn + + +class Inception(nn.Module): + + def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, + pool_proj): + super().__init__() + self.conv_1x1 = nn.Conv2d(in_channels, ch1x1, 1) + self.relu_1x1 = nn.ReLU(inplace=True) + self.conv_3x3_reduce = nn.Conv2d(in_channels, ch3x3red, 1) + self.relu_3x3_reduce = nn.ReLU(inplace=True) + self.conv_3x3 = nn.Conv2d(ch3x3red, ch3x3, 3, padding=1) + self.relu_3x3 = nn.ReLU(inplace=True) + self.conv_5x5_reduce = nn.Conv2d(in_channels, ch5x5red, 1) + self.relu_5x5_reduce = nn.ReLU(inplace=True) + self.conv_5x5 = nn.Conv2d(ch5x5red, ch5x5, 5, padding=2) + self.relu_5x5 = nn.ReLU(inplace=True) + self.pool = nn.MaxPool2d(3, stride=1, padding=1) + self.pool_proj = nn.Conv2d(in_channels, pool_proj, 1) + self.relu_pool_proj = nn.ReLU(inplace=True) + + def forward(self, x): + branch_1 = self.relu_1x1(self.conv_1x1(x)) + branch_2 = self.relu_3x3_reduce(self.conv_3x3_reduce(x)) + branch_2 = self.relu_3x3(self.conv_3x3(branch_2)) + branch_3 = self.relu_5x5_reduce(self.conv_5x5_reduce(x)) + branch_3 = self.relu_5x5(self.conv_5x5(branch_3)) + branch_4 = self.pool(x) + branch_4 = self.relu_pool_proj(self.pool_proj(branch_4)) + return torch.cat([branch_1, branch_2, branch_3, branch_4], dim=1) + + +class GoogLeNet(nn.Sequential): + + def __init__(self, num_classes=1000): + super().__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + self.pool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + self.norm1 = nn.LocalResponseNorm(5, alpha=0.0001, beta=0.75) + self.conv2_reduce = nn.Conv2d(64, 64, kernel_size=1) + self.relu2_reduce = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(64, 192, kernel_size=3, padding=1) + self.relu2 = nn.ReLU(inplace=True) + self.norm2 = nn.LocalResponseNorm(5, alpha=0.0001, beta=0.75) + self.pool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + self.inception_3a = Inception(192, 64, 96, 128, 16, 32, 32) + self.inception_3b = 
Inception(256, 128, 128, 192, 32, 96, 64) + self.pool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + self.inception_4a = Inception(480, 192, 96, 208, 16, 48, 64) + self.inception_4b = Inception(512, 160, 112, 224, 24, 64, 64) + self.inception_4c = Inception(512, 128, 128, 256, 24, 64, 64) + self.inception_4d = Inception(512, 112, 144, 288, 32, 64, 64) + self.inception_4e = Inception(528, 256, 160, 320, 32, 128, 128) + self.pool4 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + self.inception_5a = Inception(832, 256, 160, 320, 32, 128, 128) + self.inception_5b = Inception(832, 384, 192, 384, 48, 128, 128) + self.pool5 = nn.AdaptiveAvgPool2d((1, 1)) + self.loss3_classifier = nn.Linear(1024, num_classes) + + def forward(self, x): + x = self.relu1(self.conv1(x)) + x = self.pool1(x) + x = self.norm1(x) + x = self.relu2_reduce(self.conv2_reduce(x)) + x = self.relu2(self.conv2(x)) + x = self.norm2(x) + x = self.pool2(x) + x = self.inception_3a(x) + x = self.inception_3b(x) + x = self.pool3(x) + x = self.inception_4a(x) + x = self.inception_4b(x) + x = self.inception_4c(x) + x = self.inception_4d(x) + x = self.inception_4e(x) + x = self.pool4(x) + x = self.inception_5a(x) + x = self.inception_5b(x) + x = self.pool5(x).flatten(1) + return x + + +class bvlc_googlenet(nn.Module): + + def __init__(self, input_size=224): + """model for the BVLC GoogLeNet, trained on ImageNet. + URL: https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet""" + super(bvlc_googlenet, self).__init__() + + self.model = GoogLeNet(num_classes=1000) + + self.input_size = input_size + self.input_mean = (104.0, 117.0, 123.0) + + def forward(self, frame): + x = cv2.resize(frame, + (self.input_size, self.input_size)).astype(np.float32) + x = (x - self.input_mean).astype(np.float32) + x = np.transpose(x, [2, 0, 1]) + + x = np.expand_dims(x, 0) + x = torch.from_numpy(x) + if not next(self.model.parameters()).device.type == 'cpu': + x = x.cuda() + with torch.no_grad(): + frame_feat = self.model(x) + if not frame_feat.device.type == 'cpu': + frame_feat = frame_feat.cpu() + frame_feat = frame_feat.numpy() + frame_feat = frame_feat / np.linalg.norm(frame_feat) + return frame_feat.reshape(-1) diff --git a/modelscope/models/cv/video_summarization/kts/__init__.py b/modelscope/models/cv/video_summarization/kts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_summarization/kts/cpd_auto.py b/modelscope/models/cv/video_summarization/kts/cpd_auto.py new file mode 100644 index 00000000..a794ca26 --- /dev/null +++ b/modelscope/models/cv/video_summarization/kts/cpd_auto.py @@ -0,0 +1,35 @@ +# The implementation is based on KTS, available at https://github.com/TatsuyaShirakawa/KTS. 
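A toy usage sketch for `cpd_auto` (defined just below): stack two well-separated clusters of frame descriptors, form the frame-affinity kernel, and let the penalty term choose the number of change points. The data here is synthetic and only illustrates the expected call pattern:

import numpy as np

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.1, (40, 16)),   # 40 frames of 'scene A'
               rng.normal(1, 0.1, (40, 16))])  # 40 frames of 'scene B'
K = X @ X.T                                    # 80x80 kernel between frames
cps, costs = cpd_auto(K, ncp=10, vmax=1.0)
print(cps)  # a single change point near frame 40 is expected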
+
+import numpy as np
+
+from .cpd_nonlin import cpd_nonlin
+
+
+def cpd_auto(K, ncp, vmax, desc_rate=1, **kwargs):
+    """Detect change points, automatically selecting their number
+
+    :param K: Kernel between each pair of frames in video
+    :param ncp: Maximum number of change points
+    :param vmax: Penalty factor that scales the change-point penalty term
+    :param desc_rate: Rate of descriptor sampling, vmax always corresponds to 1x
+    :param kwargs: Extra parameters for ``cpd_nonlin``
+    :return: Tuple (cps, costs)
+        - cps - best selected change-points
+        - costs - costs for 0,1,2,...,m change-points
+    """
+    m = ncp
+    _, scores = cpd_nonlin(K, m, backtrack=False, **kwargs)
+
+    N = K.shape[0]
+    N2 = N * desc_rate  # length of the video before down-sampling
+
+    penalties = np.zeros(m + 1)
+    # Prevent division by zero (in case of 0 changes)
+    ncp = np.arange(1, m + 1)
+    penalties[1:] = (vmax * ncp / (2.0 * N2)) * (np.log(float(N2) / ncp) + 1)
+
+    costs = scores / float(N) + penalties
+    m_best = np.argmin(costs)
+    cps, scores2 = cpd_nonlin(K, m_best, **kwargs)
+
+    return cps, scores2
diff --git a/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py b/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py
new file mode 100644
index 00000000..ef2eb6ef
--- /dev/null
+++ b/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py
@@ -0,0 +1,102 @@
+# The implementation is based on KTS, available at https://github.com/TatsuyaShirakawa/KTS.
+
+import numpy as np
+
+
+def calc_scatters(K):
+    """Calculate scatter matrix: scatters[i,j] = {scatter of the sequence with
+    starting frame i and ending frame j}
+    """
+    n = K.shape[0]
+    K1 = np.cumsum([0] + list(np.diag(K)))
+    K2 = np.zeros((n + 1, n + 1))
+    # TODO: exploit the fact that K is symmetric
+    K2[1:, 1:] = np.cumsum(np.cumsum(K, 0), 1)
+
+    diagK2 = np.diag(K2)
+
+    i = np.arange(n).reshape((-1, 1))
+    j = np.arange(n).reshape((1, -1))
+
+    ij_f32 = ((j - i + 1).astype(np.float32) + (j == i - 1).astype(np.float32))
+    diagK2_K2 = (
+        diagK2[1:].reshape((1, -1)) + diagK2[:-1].reshape(
+            (-1, 1)) - K2[1:, :-1].T - K2[:-1, 1:])
+    scatters = (
+        K1[1:].reshape((1, -1)) - K1[:-1].reshape(
+            (-1, 1)) - diagK2_K2 / ij_f32)
+
+    scatters[j < i] = 0
+
+    return scatters
+
+
+def cpd_nonlin(K,
+               ncp,
+               lmin=1,
+               lmax=100000,
+               backtrack=True,
+               verbose=True,
+               out_scatters=None):
+    """Change point detection with dynamic programming
+
+    :param K: Square kernel matrix
+    :param ncp: Number of change points to detect (ncp >= 0)
+    :param lmin: Minimal length of a segment
+    :param lmax: Maximal length of a segment
+    :param backtrack: If False - only evaluate objective scores (to save memory)
+    :param verbose: If True, print progress messages
+    :param out_scatters: Output scatters
+    :return: Tuple (cps, obj_vals)
+        - cps - detected array of change points: mean is thought to be constant
+            on [ cps[i], cps[i+1] )
+        - obj_vals - values of the objective function for 0..m changepoints
+    """
+    m = int(ncp)  # prevent numpy.int64
+
+    n, n1 = K.shape
+    assert n == n1, 'Square kernel matrix expected.'
+ assert (m + 1) * lmin <= n <= (m + 1) * lmax + assert 1 <= lmin <= lmax + + if verbose: + print('Precomputing scatters...') + J = calc_scatters(K) + + if out_scatters is not None: + out_scatters[0] = J + + if verbose: + print('Inferring best change points...') + # Iden[k, l] - value of the objective for k change-points and l first frames + Iden = 1e101 * np.ones((m + 1, n + 1)) + Iden[0, lmin:lmax] = J[0, lmin - 1:lmax - 1] + + if backtrack: + # p[k, l] --- 'previous change' --- best t[k] when t[k+1] equals l + p = np.zeros((m + 1, n + 1), dtype=int) + else: + p = np.zeros((1, 1), dtype=int) + + for k in range(1, m + 1): + for l_frame in range((k + 1) * lmin, n + 1): + tmin = max(k * lmin, l_frame - lmax) + tmax = l_frame - lmin + 1 + c = J[tmin:tmax, l_frame - 1].reshape(-1) + \ + Iden[k - 1, tmin:tmax].reshape(-1) + Iden[k, l_frame] = np.min(c) + if backtrack: + p[k, l_frame] = np.argmin(c) + tmin + + # Collect change points + cps = np.zeros(m, dtype=int) + + if backtrack: + cur = n + for k in range(m, 0, -1): + cps[k - 1] = p[k, cur] + cur = cps[k - 1] + + scores = Iden[:, n].copy() + scores[scores > 1e99] = np.inf + return cps, scores diff --git a/modelscope/models/cv/video_summarization/pgl_sum.py b/modelscope/models/cv/video_summarization/pgl_sum.py new file mode 100644 index 00000000..ab3010c9 --- /dev/null +++ b/modelscope/models/cv/video_summarization/pgl_sum.py @@ -0,0 +1,311 @@ +# The implementation is based on PGL-SUM, available at https://github.com/e-apostolidis/PGL-SUM. + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SelfAttention(nn.Module): + + def __init__(self, + input_size=1024, + output_size=1024, + freq=10000, + heads=1, + pos_enc=None): + """ The basic (multi-head) Attention 'cell' containing the learnable parameters of Q, K and V + + :param int input_size: Feature input size of Q, K, V. + :param int output_size: Feature -hidden- size of Q, K, V. + :param int freq: The frequency of the sinusoidal positional encoding. + :param int heads: Number of heads for the attention module. + :param str | None pos_enc: The type of the positional encoding [supported: Absolute, Relative]. + """ + super(SelfAttention, self).__init__() + + self.permitted_encodings = ['absolute', 'relative'] + if pos_enc is not None: + pos_enc = pos_enc.lower() + assert pos_enc in self.permitted_encodings, f'Supported encodings: {*self.permitted_encodings,}' + + self.input_size = input_size + self.output_size = output_size + self.heads = heads + self.pos_enc = pos_enc + self.freq = freq + self.Wk, self.Wq, self.Wv = nn.ModuleList(), nn.ModuleList( + ), nn.ModuleList() + for _ in range(self.heads): + self.Wk.append( + nn.Linear( + in_features=input_size, + out_features=output_size // heads, + bias=False)) + self.Wq.append( + nn.Linear( + in_features=input_size, + out_features=output_size // heads, + bias=False)) + self.Wv.append( + nn.Linear( + in_features=input_size, + out_features=output_size // heads, + bias=False)) + self.out = nn.Linear( + in_features=output_size, out_features=input_size, bias=False) + + self.softmax = nn.Softmax(dim=-1) + self.drop = nn.Dropout(p=0.5) + + def getAbsolutePosition(self, T): + """Calculate the sinusoidal positional encoding based on the absolute position of each considered frame. 
+ Based on 'Attention is all you need' paper (https://arxiv.org/abs/1706.03762) + + :param int T: Number of frames contained in Q, K and V + :return: Tensor with shape [T, T] + """ + freq = self.freq + d = self.input_size + + pos = torch.tensor([k for k in range(T)], + device=self.out.weight.device) + i = torch.tensor([k for k in range(T // 2)], + device=self.out.weight.device) + + # Reshape tensors each pos_k for each i indices + pos = pos.reshape(pos.shape[0], 1) + pos = pos.repeat_interleave(i.shape[0], dim=1) + i = i.repeat(pos.shape[0], 1) + + AP = torch.zeros(T, T, device=self.out.weight.device) + AP[pos, 2 * i] = torch.sin(pos / freq**((2 * i) / d)) + AP[pos, 2 * i + 1] = torch.cos(pos / freq**((2 * i) / d)) + return AP + + def getRelativePosition(self, T): + """Calculate the sinusoidal positional encoding based on the relative position of each considered frame. + r_pos calculations as here: https://theaisummer.com/positional-embeddings/ + + :param int T: Number of frames contained in Q, K and V + :return: Tensor with shape [T, T] + """ + freq = self.freq + d = 2 * T + min_rpos = -(T - 1) + + i = torch.tensor([k for k in range(T)], device=self.out.weight.device) + j = torch.tensor([k for k in range(T)], device=self.out.weight.device) + + # Reshape tensors each i for each j indices + i = i.reshape(i.shape[0], 1) + i = i.repeat_interleave(i.shape[0], dim=1) + j = j.repeat(i.shape[0], 1) + + # Calculate the relative positions + r_pos = j - i - min_rpos + + RP = torch.zeros(T, T, device=self.out.weight.device) + idx = torch.tensor([k for k in range(T // 2)], + device=self.out.weight.device) + RP[:, 2 * idx] = torch.sin( + r_pos[:, 2 * idx] / freq**((i[:, 2 * idx] + j[:, 2 * idx]) / d)) + RP[:, 2 * idx + 1] = torch.cos( + r_pos[:, 2 * idx + 1] + / freq**((i[:, 2 * idx + 1] + j[:, 2 * idx + 1]) / d)) + return RP + + def forward(self, x): + """ Compute the weighted frame features, based on either the global or local (multi-head) attention mechanism. + + :param torch.tensor x: Frame features with shape [T, input_size] + :return: A tuple of: + y: Weighted features based on the attention weights, with shape [T, input_size] + att_weights : The attention weights (before dropout), with shape [T, T] + """ + outputs = [] + for head in range(self.heads): + K = self.Wk[head](x) + Q = self.Wq[head](x) + V = self.Wv[head](x) + + # Q *= 0.06 # scale factor VASNet + # Q /= np.sqrt(self.output_size) # scale factor (i.e 1 / sqrt(d_k) ) + energies = torch.matmul(Q, K.transpose(1, 0)) + if self.pos_enc is not None: + if self.pos_enc == 'absolute': + AP = self.getAbsolutePosition(T=energies.shape[0]) + energies = energies + AP + elif self.pos_enc == 'relative': + RP = self.getRelativePosition(T=energies.shape[0]) + energies = energies + RP + + att_weights = self.softmax(energies) + _att_weights = self.drop(att_weights) + y = torch.matmul(_att_weights, V) + + # Save the current head output + outputs.append(y) + y = self.out(torch.cat(outputs, dim=1)) + return y, att_weights.clone( + ) # for now we don't deal with the weights (probably max or avg pooling) + + +class MultiAttention(nn.Module): + + def __init__(self, + input_size=1024, + output_size=1024, + freq=10000, + pos_enc=None, + num_segments=None, + heads=1, + fusion=None): + """ Class wrapping the MultiAttention part of PGL-SUM; its key modules and parameters. + + :param int input_size: The expected input feature size. + :param int output_size: The hidden feature size of the attention mechanisms. 
+        :param int freq: The frequency of the sinusoidal positional encoding.
+        :param None | str pos_enc: The selected positional encoding [absolute, relative].
+        :param None | int num_segments: The selected number of segments to split the videos.
+        :param int heads: The selected number of global heads.
+        :param None | str fusion: The selected type of feature fusion.
+        """
+        super(MultiAttention, self).__init__()
+
+        # Global Attention, considering differences among all frames
+        self.attention = SelfAttention(
+            input_size=input_size,
+            output_size=output_size,
+            freq=freq,
+            pos_enc=pos_enc,
+            heads=heads)
+
+        self.num_segments = num_segments
+        if self.num_segments is not None:
+            assert self.num_segments >= 2, 'num_segments must be None or 2+'
+            self.local_attention = nn.ModuleList()
+            for _ in range(self.num_segments):
+                # Local Attention, considering differences within the same segment, with reduced hidden size
+                self.local_attention.append(
+                    SelfAttention(
+                        input_size=input_size,
+                        output_size=output_size // num_segments,
+                        freq=freq,
+                        pos_enc=pos_enc,
+                        heads=4))
+        self.permitted_fusions = ['add', 'mult', 'avg', 'max']
+        self.fusion = fusion
+        if self.fusion is not None:
+            self.fusion = self.fusion.lower()
+            assert self.fusion in self.permitted_fusions, f'Fusion method must be: {*self.permitted_fusions,}'
+
+    def forward(self, x):
+        """ Compute the weighted frame features, based on the global and local (multi-head) attention mechanisms.
+
+        :param torch.Tensor x: Tensor with shape [T, input_size] containing the frame features.
+        :return: A tuple of:
+            weighted_value: Tensor with shape [T, input_size] containing the weighted frame features.
+            attn_weights: Tensor with shape [T, T] containing the attention weights.
+        """
+        weighted_value, attn_weights = self.attention(x)  # global attention
+
+        if self.num_segments is not None and self.fusion is not None:
+            segment_size = math.ceil(x.shape[0] / self.num_segments)
+            for segment in range(self.num_segments):
+                left_pos = segment * segment_size
+                right_pos = (segment + 1) * segment_size
+                local_x = x[left_pos:right_pos]
+                weighted_local_value, attn_local_weights = self.local_attention[
+                    segment](local_x)  # local attentions
+
+                # Normalize the feature vectors
+                weighted_value[left_pos:right_pos] = F.normalize(
+                    weighted_value[left_pos:right_pos].clone(), p=2, dim=1)
+                weighted_local_value = F.normalize(
+                    weighted_local_value, p=2, dim=1)
+                if self.fusion == 'add':
+                    weighted_value[left_pos:right_pos] += weighted_local_value
+                elif self.fusion == 'mult':
+                    weighted_value[left_pos:right_pos] *= weighted_local_value
+                elif self.fusion == 'avg':
+                    weighted_value[left_pos:right_pos] += weighted_local_value
+                    weighted_value[left_pos:right_pos] /= 2
+                elif self.fusion == 'max':
+                    weighted_value[left_pos:right_pos] = torch.max(
+                        weighted_value[left_pos:right_pos].clone(),
+                        weighted_local_value)
+
+        return weighted_value, attn_weights
+
+
+class PGL_SUM(nn.Module):
+
+    def __init__(self,
+                 input_size=1024,
+                 output_size=1024,
+                 freq=10000,
+                 pos_enc=None,
+                 num_segments=None,
+                 heads=1,
+                 fusion=None):
+        """ Class wrapping the PGL-SUM model; its key modules and parameters.
+
+        :param int input_size: The expected input feature size.
+        :param int output_size: The hidden feature size of the attention mechanisms.
+        :param int freq: The frequency of the sinusoidal positional encoding.
+        :param None | str pos_enc: The selected positional encoding [absolute, relative].
+        :param None | int num_segments: The selected number of segments to split the videos.
+        :param int heads: The selected number of global heads.
+        :param None | str fusion: The selected type of feature fusion.
+        """
+        super(PGL_SUM, self).__init__()
+
+        self.attention = MultiAttention(
+            input_size=input_size,
+            output_size=output_size,
+            freq=freq,
+            pos_enc=pos_enc,
+            num_segments=num_segments,
+            heads=heads,
+            fusion=fusion)
+        self.linear_1 = nn.Linear(
+            in_features=input_size, out_features=input_size)
+        self.linear_2 = nn.Linear(
+            in_features=self.linear_1.out_features, out_features=1)
+
+        self.drop = nn.Dropout(p=0.5)
+        self.norm_y = nn.LayerNorm(normalized_shape=input_size, eps=1e-6)
+        self.norm_linear = nn.LayerNorm(
+            normalized_shape=self.linear_1.out_features, eps=1e-6)
+        self.relu = nn.ReLU()
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, frame_features):
+        """ Produce frame importance scores from the frame features, using the PGL-SUM model.
+
+        :param torch.Tensor frame_features: Tensor of shape [T, input_size] containing the frame features produced by
+            using the pool5 layer of GoogLeNet.
+        :return: A tuple of:
+            y: Tensor with shape [1, T] containing the frame importance scores in [0, 1].
+            attn_weights: Tensor with shape [T, T] containing the attention weights.
+        """
+        frame_features = frame_features.reshape(-1, frame_features.shape[-1])
+        residual = frame_features
+        weighted_value, attn_weights = self.attention(frame_features)
+        y = weighted_value + residual
+        y = self.drop(y)
+        y = self.norm_y(y)
+
+        # 2-layer NN (Regressor Network)
+        y = self.linear_1(y)
+        y = self.relu(y)
+        y = self.drop(y)
+        y = self.norm_linear(y)
+
+        y = self.linear_2(y)
+        y = self.sigmoid(y)
+        y = y.view(1, -1)
+
+        return y, attn_weights
diff --git a/modelscope/models/cv/video_summarization/summarizer.py b/modelscope/models/cv/video_summarization/summarizer.py
new file mode 100644
index 00000000..c95da025
--- /dev/null
+++ b/modelscope/models/cv/video_summarization/summarizer.py
@@ -0,0 +1,224 @@
+# The implementation is based on PGL-SUM, available at https://github.com/e-apostolidis/PGL-SUM.
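+#
+# Rough pipeline implemented in this file (shapes are illustrative, not exact):
+#   frame features [T, 1024] -> kernel K = feat @ feat.T -> cpd_auto (KTS)
+#   -> shot boundaries -> PGL-SUM per-frame scores -> 0/1 knapsack over shots
+#   with a budget of 15% of the original frames -> binary frame-level summary.
+# Hypothetical call sequence (using only the functions defined below):
+#   cps, _ = get_change_points(features, n_frame)
+#   summary = generate_summary([cps], [scores], [n_frame], [positions])[0]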
+
+import os.path as osp
+from copy import deepcopy
+from typing import Dict, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+
+from modelscope.metainfo import Models
+from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.video_summarization.kts.cpd_auto import cpd_auto
+from modelscope.models.cv.video_summarization.pgl_sum import PGL_SUM
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def get_change_points(video_feat, n_frame):
+    video_feat = np.array(video_feat, np.float32)
+    K = np.dot(video_feat, video_feat.T)
+    change_points, _ = cpd_auto(K, ncp=120, vmax=2.2 / 4.0, lmin=1)
+    change_points = change_points * 15
+    change_points = np.concatenate(([0], change_points, [n_frame - 1]))
+
+    temp_change_points = []
+    for idx in range(len(change_points) - 1):
+        segment = [change_points[idx], change_points[idx + 1] - 1]
+        if idx == len(change_points) - 2:
+            segment = [change_points[idx], change_points[idx + 1]]
+
+        temp_change_points.append(segment)
+    change_points = np.array(list(temp_change_points))
+
+    temp_n_frame_per_seg = []
+    for change_points_idx in range(len(change_points)):
+        n_frame = change_points[change_points_idx][1] - change_points[
+            change_points_idx][0]
+        temp_n_frame_per_seg.append(n_frame)
+    n_frame_per_seg = np.array(list(temp_n_frame_per_seg))
+
+    return change_points, n_frame_per_seg
+
+
+def knap_sack(W, wt, val, n):
+    """ Maximize the value that a knapsack of capacity W can hold. Each shot is either put into the knapsack whole or
+    discarded; there is no concept of putting part of an item into the knapsack (0/1 knapsack).
+
+    :param int W: Maximum capacity -in frames- of the knapsack.
+    :param list[int] wt: The weights (lengths -in frames-) of each video shot.
+    :param list[float] val: The values (importance scores) of each video shot.
+    :param int n: The number of the shots.
+    :return: A list containing the indices of the selected shots.
+    """
+    K = [[0 for _ in range(W + 1)] for _ in range(n + 1)]
+
+    # Build table K[][] in a bottom-up manner
+    for i in range(n + 1):
+        for w in range(W + 1):
+            if i == 0 or w == 0:
+                K[i][w] = 0
+            elif wt[i - 1] <= w:
+                K[i][w] = max(val[i - 1] + K[i - 1][w - wt[i - 1]],
+                              K[i - 1][w])
+            else:
+                K[i][w] = K[i - 1][w]
+
+    selected = []
+    w = W
+    for i in range(n, 0, -1):
+        if K[i][w] != K[i - 1][w]:
+            selected.insert(0, i - 1)
+            w -= wt[i - 1]
+
+    return selected
+
+
+def generate_summary(all_shot_bound, all_scores, all_nframes, all_positions):
+    """ Generate the automatic machine summary, based on the video shots; the frame importance scores; the number of
+    frames in the original video and the position of the sub-sampled frames of the original video.
+
+    :param list[np.ndarray] all_shot_bound: The video shots for all the -original- testing videos.
+    :param list[np.ndarray] all_scores: The calculated frame importance scores for all the sub-sampled testing videos.
+    :param list[np.ndarray] all_nframes: The number of frames for all the -original- testing videos.
+    :param list[np.ndarray] all_positions: The position of the sub-sampled frames for all the -original- testing videos.
+    :return: A list containing the indices of the selected frames for all the -original- testing videos.
+ """ + all_summaries = [] + for video_index in range(len(all_scores)): + # Get shots' boundaries + shot_bound = all_shot_bound[video_index] # [number_of_shots, 2] + frame_init_scores = all_scores[video_index] + n_frames = all_nframes[video_index] + positions = all_positions[video_index] + + # Compute the importance scores for the initial frame sequence (not the sub-sampled one) + frame_scores = np.zeros(n_frames, dtype=np.float32) + if positions.dtype != int: + positions = positions.astype(np.int32) + if positions[-1] != n_frames: + positions = np.concatenate([positions, [n_frames]]) + for i in range(len(positions) - 1): + pos_left, pos_right = positions[i], positions[i + 1] + if i == len(frame_init_scores): + frame_scores[pos_left:pos_right] = 0 + else: + frame_scores[pos_left:pos_right] = frame_init_scores[i] + + # Compute shot-level importance scores by taking the average importance scores of all frames in the shot + shot_imp_scores = [] + shot_lengths = [] + for shot in shot_bound: + shot_lengths.append(shot[1] - shot[0] + 1) + shot_imp_scores.append( + (frame_scores[shot[0]:shot[1] + 1].mean()).item()) + + # Select the best shots using the knapsack implementation + final_shot = shot_bound[-1] + final_max_length = int((final_shot[1] + 1) * 0.15) + + selected = knap_sack(final_max_length, shot_lengths, shot_imp_scores, + len(shot_lengths)) + + # Select all frames from each selected shot (by setting their value in the summary vector to 1) + summary = np.zeros(final_shot[1] + 1, dtype=np.int8) + for shot in selected: + summary[shot_bound[shot][0]:shot_bound[shot][1] + 1] = 1 + + all_summaries.append(summary) + + return all_summaries + + +@MODELS.register_module( + Tasks.video_summarization, module_name=Models.video_summarization) +class PGLVideoSummarization(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the video summarization model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + + self.loss = nn.MSELoss() + self.model = PGL_SUM( + input_size=1024, + output_size=1024, + num_segments=4, + heads=8, + fusion='add', + pos_enc='absolute') + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + self.model = self.model.to(self._device) + + self.model = self.load_pretrained(self.model, model_path) + + if self.training: + self.model.train() + else: + self.model.eval() + + def load_pretrained(self, net, load_path, strict=True, param_key='params'): + if isinstance(net, (DataParallel, DistributedDataParallel)): + net = net.module + load_net = torch.load( + load_path, map_location=lambda storage, loc: storage) + if param_key is not None: + if param_key not in load_net and 'params' in load_net: + param_key = 'params' + logger.info( + f'Loading: {param_key} does not exist, use params.') + if param_key in load_net: + load_net = load_net[param_key] + logger.info( + f'Loading {net.__class__.__name__} model from {load_path}, with param key: [{param_key}].' + ) + # remove unnecessary 'module.' 
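+        # (checkpoints saved from a DataParallel/DistributedDataParallel
+        # wrapper prefix every parameter name with 'module.', so the prefix
+        # is stripped here before loading into the bare module)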
+ for k, v in deepcopy(load_net).items(): + if k.startswith('module.'): + load_net[k[7:]] = v + load_net.pop(k) + net.load_state_dict(load_net, strict=strict) + logger.info('load model done.') + return net + + def _train_forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + frame_features = input['frame_features'] + gtscore = input['gtscore'] + preds, attn_weights = self.model(frame_features) + return {'loss': self.loss(preds, gtscore)} + + def _inference_forward(self, input: Dict[str, + Tensor]) -> Dict[str, Tensor]: + frame_features = input['frame_features'] + y, attn_weights = self.model(frame_features) + return {'scores': y} + + def forward(self, input: Dict[str, + Tensor]) -> Dict[str, Union[list, Tensor]]: + """return the result by the model + + Args: + input (Dict[str, Tensor]): the preprocessed data + + Returns: + Dict[str, Union[list, Tensor]]: results + """ + for key, value in input.items(): + input[key] = input[key].to(self._device) + if self.training: + return self._train_forward(input) + else: + return self._inference_forward(input) diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py index 112b3a58..9219a281 100644 --- a/modelscope/models/multi_modal/__init__.py +++ b/modelscope/models/multi_modal/__init__.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: from .clip import CLIPForMultiModalEmbedding from .gemm import GEMMForMultiModalEmbedding + from .team import TEAMForMultiModalSimilarity from .diffusion import DiffusionForTextToImageSynthesis from .mmr import VideoCLIPForMultiModalEmbedding from .mplug_for_all_tasks import MPlugForAllTasks @@ -19,6 +20,7 @@ else: 'clip': ['CLIPForMultiModalEmbedding'], 'diffusion': ['DiffusionForTextToImageSynthesis'], 'gemm': ['GEMMForMultiModalEmbedding'], + 'team': ['TEAMForMultiModalSimilarity'], 'mmr': ['VideoCLIPForMultiModalEmbedding'], 'mplug_for_all_tasks': ['MPlugForAllTasks'], 'ofa_for_all_tasks': ['OfaForAllTasks'], diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 88a4ddda..4e959a17 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -24,8 +24,8 @@ logger = get_logger() Tasks.video_multi_modal_embedding, module_name=Models.video_clip) class VideoCLIPForMultiModalEmbedding(TorchModel): - def __init__(self, model_dir, device_id=-1): - super().__init__(model_dir=model_dir, device_id=device_id) + def __init__(self, model_dir, **kwargs): + super().__init__(model_dir=model_dir, **kwargs) # model config parameters with open(f'{model_dir}/{ModelFile.CONFIGURATION}', 'r') as json_file: model_config = json.load(json_file) diff --git a/modelscope/models/multi_modal/mplug/configuration_mplug.py b/modelscope/models/multi_modal/mplug/configuration_mplug.py index c275ed15..914678c5 100644 --- a/modelscope/models/multi_modal/mplug/configuration_mplug.py +++ b/modelscope/models/multi_modal/mplug/configuration_mplug.py @@ -64,6 +64,10 @@ class MPlugConfig(PretrainedConfig): clip_transformer_width=768, clip_transformer_heads=12, clip_transformer_layers=12, + # retrieval + queue_size=65536, + embed_dim=256, + temp=0.07, **kwargs): super().__init__(**kwargs) @@ -99,6 +103,10 @@ class MPlugConfig(PretrainedConfig): self.clip_transformer_width = clip_transformer_width self.clip_transformer_heads = clip_transformer_heads self.clip_transformer_layers = clip_transformer_layers + # 
retrieval + self.queue_size = queue_size + self.embed_dim = embed_dim + self.temp = temp @classmethod def from_yaml_file(cls, yaml_file: Union[str, diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py index 50622cc0..78f60f9b 100755 --- a/modelscope/models/multi_modal/mplug/modeling_mplug.py +++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py @@ -1855,7 +1855,8 @@ class MPlug(PreTrainedModel): task_mapping = { Tasks.visual_question_answering: MPlugForVisualQuestionAnswering, - Tasks.image_captioning: MPLUGForImageCaption + Tasks.image_captioning: MPlugForImageCaption, + Tasks.image_text_retrieval: MPlugForImageTextRetrieval, } config = cls.config_class.from_yaml_file( os.path.join(model_dir, CONFIG_NAME)) @@ -1915,6 +1916,33 @@ class MPlug(PreTrainedModel): clip_model.visual.positional_embedding = pos_embed return clip_model + def init_distill(self, config): + self.distill = config.distill + if self.distill: + self.visual_encoder_m = self._initialize_clip(config) + self.text_encoder_m = BertModel( + self.config_encoder, add_pooling_layer=False) + self.fusion_encoder_m = FusionModel( + self.config_fusion, add_pooling_layer=False) + self.text_decoder_m = BertLMHeadModel(self.config_decoder) + self.model_pairs = [ + [self.visual_encoder, self.visual_encoder_m], + [self.text_encoder, self.text_encoder_m], + [self.text_decoder, self.text_decoder_m], + ] + if self.config_encoder.hidden_size != config.vision_width: + self.visn_fc_m = nn.Linear(config.vision_width, + self.config_encoder.hidden_size) + self.visn_layer_norm_m = nn.LayerNorm( + self.config_encoder.hidden_size, eps=1e-12) + self.dropout_m = nn.Dropout( + self.config_encoder.hidden_dropout_prob) + self.model_pairs.extend( + [[self.visn_fc, self.visn_fc_m], + [self.visn_layer_norm, self.visn_layer_norm_m]]) + self.copy_params() + self.momentum = 0.995 + def forward(self, *args, **kwargs): raise NotImplementedError @@ -1969,71 +1997,6 @@ class MPlug(PreTrainedModel): [init_dim * np.arange(n_tile) + i for i in range(init_dim)])) return torch.index_select(x, dim, order_index.to(x.device)) - def rank_answer(self, question_states, question_atts, answer_ids, - answer_atts, k): - - num_ques = question_states.size(0) - start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token - - start_output = self.text_decoder( - start_ids, - encoder_hidden_states=question_states, - encoder_attention_mask=question_atts, - return_dict=True, - reduction='none') - logits = start_output.logits[:, 0, :] # first token's logit - - # topk_probs: top-k probability - # topk_ids: [num_question, k] - answer_first_token = answer_ids[:, 1] - prob_first_token = F.softmax( - logits, dim=1).index_select( - dim=1, index=answer_first_token) - topk_probs, topk_ids = prob_first_token.topk(k, dim=1) - - # answer input: [num_question*k, answer_len] - input_ids = [] - input_atts = [] - for b, topk_id in enumerate(topk_ids): - input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) - input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) - input_ids = torch.cat(input_ids, dim=0) - input_atts = torch.cat(input_atts, dim=0) - - targets_ids = input_ids.masked_fill( - input_ids == self.tokenizer.pad_token_id, -100) - - # repeat encoder's output for top-k answers - question_states = self._tile(question_states, 0, k) - question_atts = self._tile(question_atts, 0, k) - - output = self.text_decoder( - input_ids, - attention_mask=input_atts, - encoder_hidden_states=question_states, - 
encoder_attention_mask=question_atts, - labels=targets_ids, - return_dict=True, - reduction='none') - - answer_loss = output.loss - answer_loss = answer_loss.view(input_ids.size(0), -1) - - # topk_prob: first token probability - topk_probs = topk_probs.view(-1, 1) - log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1) - - # re-calculate log probabilities for the answer sequences using chain rule - log_probs_sum = log_probs.sum(1) - log_probs_sum = log_probs_sum.view(num_ques, k) - - topk_probs = F.softmax(log_probs_sum, dim=-1) - # get top-k after re-ranking - topk_probs, rerank_id = topk_probs.topk(k, dim=1) - topk_ids = torch.gather(topk_ids, 1, rerank_id) - - return topk_ids, topk_probs - class MPlugForVisualQuestionAnswering(MPlug): @@ -2043,33 +2006,6 @@ class MPlugForVisualQuestionAnswering(MPlug): self.beam_generator = TextGenerator(config, self.text_decoder) self.init_distill(config) - def init_distill(self, config): - self.distill = config.distill - if self.distill: - self.visual_encoder_m = self._initialize_clip(config) - self.text_encoder_m = BertModel( - self.config_encoder, add_pooling_layer=False) - self.fusion_encoder_m = FusionModel( - self.config_fusion, add_pooling_layer=False) - self.text_decoder_m = BertLMHeadModel(self.config_decoder) - self.model_pairs = [ - [self.visual_encoder, self.visual_encoder_m], - [self.text_encoder, self.text_encoder_m], - [self.text_decoder, self.text_decoder_m], - ] - if self.config_encoder.hidden_size != config.vision_width: - self.visn_fc_m = nn.Linear(config.vision_width, - self.config_encoder.hidden_size) - self.visn_layer_norm_m = nn.LayerNorm( - self.config_encoder.hidden_size, eps=1e-12) - self.dropout_m = nn.Dropout( - self.config_encoder.hidden_dropout_prob) - self.model_pairs.extend( - [[self.visn_fc, self.visn_fc_m], - [self.visn_layer_norm, self.visn_layer_norm_m]]) - self.copy_params() - self.momentum = 0.995 - def forward(self, image, question, @@ -2111,6 +2047,8 @@ class MPlugForVisualQuestionAnswering(MPlug): merge_text_attention = torch.cat( [image_atts, question.attention_mask], 1) + if k is None: + k = [1] * question_output.shape[0] question_states = [] question_atts = [] for b, n in enumerate(k): @@ -2177,6 +2115,8 @@ class MPlugForVisualQuestionAnswering(MPlug): return_dict=True, reduction='none', ) + if weights is None: + weights = 1 loss = weights * answer_output.loss loss = loss.sum() / image.size(0) @@ -2203,7 +2143,7 @@ class MPlugForVisualQuestionAnswering(MPlug): return topk_ids, topk_probs -class MPLUGForImageCaption(MPlug): +class MPlugForImageCaption(MPlug): def __init__(self, config): super().__init__(config) @@ -2262,50 +2202,278 @@ class MPLUGForImageCaption(MPlug): if train: answer_targets = answer.input_ids.masked_fill( answer.input_ids == self.tokenizer.pad_token_id, -100) - text_output = self.text_encoder( - question.input_ids, - attention_mask=question.attention_mask, - return_dict=True) - text_embeds = text_output.last_hidden_state - fusion_output = self.fusion_encoder( - encoder_embeds=text_embeds, - attention_mask=question.attention_mask, - encoder_hidden_states=image_embeds, - encoder_attention_mask=image_atts, - return_dict=False) - - image_output, question_output = fusion_output - - question_output = torch.cat([image_output, question_output], 1) - merge_text_attention = torch.cat( - [image_atts, question.attention_mask], 1) - answer_output = self.text_decoder( answer.input_ids, attention_mask=answer.attention_mask, - encoder_hidden_states=question_output, - 
encoder_attention_mask=merge_text_attention, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, labels=answer_targets, return_dict=True, reduction='none') loss = answer_output.loss + return loss else: + topk_ids, topk_probs = self.generation(image_embeds, image_atts) + return topk_ids, topk_probs + + +class MPlugForImageTextRetrieval(MPlug): + + def __init__(self, config): + super().__init__(config) + self.embed_dim = config.embed_dim + self.temp = nn.Parameter(torch.ones([]) * config.temp) + self.queue_size = config.queue_size + self.momentum = config.momentum + self.alpha = config.alpha + + self.queue_size = config.queue_size + self.text_width = self.config_encoder.hidden_size + self.embed_dim = config.embed_dim + + self.vision_proj = nn.Linear(self.text_width, self.embed_dim) + self.text_proj = nn.Linear(self.text_width, self.embed_dim) + self.itm_head = nn.Linear(self.text_width, 2) + + self.register_buffer('image_queue', + torch.randn(self.embed_dim, self.queue_size)) + self.register_buffer('text_queue', + torch.randn(self.embed_dim, self.queue_size)) + self.register_buffer('idx_queue', torch.full((1, self.queue_size), + -100)) + self.register_buffer('queue_ptr', torch.zeros(1, dtype=torch.long)) + + self.image_queue = F.normalize(self.image_queue, dim=0) + self.text_queue = F.normalize(self.text_queue, dim=0) + self.init_distill(config) + + def init_distill(self, config): + self.distill = config.distill + if self.distill: + self.visual_encoder_m = self._initialize_clip(config) + self.text_encoder_m = BertModel( + self.config_encoder, add_pooling_layer=False) + self.fusion_encoder_m = FusionModel( + self.config_fusion, add_pooling_layer=False) + self.vision_proj_m = nn.Linear(self.text_width, self.embed_dim) + self.text_proj_m = nn.Linear(self.text_width, self.embed_dim) + self.model_pairs = [ + [self.visual_encoder, self.visual_encoder_m], + [self.text_encoder, self.text_encoder_m], + [self.text_proj, self.text_proj_m], + [self.vision_proj, self.vision_proj_m], + ] + if self.config_encoder.hidden_size != config.vision_width: + self.visn_fc_m = nn.Linear(config.vision_width, + self.config_encoder.hidden_size) + self.visn_layer_norm_m = nn.LayerNorm( + self.config_encoder.hidden_size, eps=1e-12) + self.dropout_m = nn.Dropout( + self.config_encoder.hidden_dropout_prob) + self.model_pairs.extend( + [[self.visn_fc, self.visn_fc_m], + [self.visn_layer_norm, self.visn_layer_norm_m]]) + self.copy_params() + self.momentum = 0.995 + + @torch.no_grad() + def _dequeue_and_enqueue(self, image_feat, text_feat, idx): + + def concat_all_gather(tensor): + """ + Performs all_gather operation on the provided tensors. + *** Warning ***: torch.distributed.all_gather has no gradient. 
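+            (harmless here: the gathered features are only used, under
+            torch.no_grad(), to refresh the negative-sample queues)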
+            """
+            if not torch.distributed.is_initialized():
+                return tensor
+            tensors_gather = [
+                torch.ones_like(tensor)
+                for _ in range(torch.distributed.get_world_size())
+            ]
+            torch.distributed.all_gather(
+                tensors_gather, tensor, async_op=False)
+
+            output = torch.cat(tensors_gather, dim=0)
+            return output
+
+        # gather keys before updating queue
+        image_feats = concat_all_gather(image_feat)
+        text_feats = concat_all_gather(text_feat)
+        idxs = concat_all_gather(idx)
+
+        batch_size = image_feats.shape[0]
+
+        ptr = int(self.queue_ptr)
+        # assert self.queue_size % batch_size == 0  # for simplicity
+
+        # replace the keys at ptr (dequeue and enqueue)
+        self.image_queue[:, ptr:ptr + batch_size] = image_feats.T
+        self.text_queue[:, ptr:ptr + batch_size] = text_feats.T
+        self.idx_queue[:, ptr:ptr + batch_size] = idxs.T
+        ptr = (ptr + batch_size) % self.queue_size  # move pointer
+
+        self.queue_ptr[0] = ptr
+
+    def forward(self, image, text, idx=None, train=True):
+        if train:
+            image_embeds = self.visual_encoder.visual(
+                image, skip_last_layer=True)
+            if self.large:
+                image_embeds = self.dropout(
+                    self.visn_layer_norm(self.visn_fc(image_embeds)))
+            image_atts = torch.ones(
+                image_embeds.size()[:-1], dtype=torch.long).to(image.device)
+
+            image_feat = F.normalize(
+                self.vision_proj(image_embeds[:, 0, :]), dim=-1)
             text_output = self.text_encoder(
-                question.input_ids,
-                attention_mask=question.attention_mask,
+                text.input_ids,
+                attention_mask=text.attention_mask,
                 return_dict=True)
             text_embeds = text_output.last_hidden_state
-            fusion_output = self.fusion_encoder(
+            text_feat = F.normalize(
+                self.text_proj(text_embeds[:, 0, :]), dim=-1)
+
+            idx = idx.view(-1, 1)
+            idx_all = torch.cat(
+                [idx.t(), self.idx_queue.clone().detach()], dim=1)
+            pos_idx = torch.eq(idx, idx_all).float()
+            sim_targets = pos_idx / pos_idx.sum(1, keepdim=True)
+
+            with torch.no_grad():
+                self._momentum_update()
+                image_embeds_m = self.visual_encoder_m.visual(
+                    image, skip_last_layer=True)
+                if self.large:
+                    image_embeds_m = self.dropout_m(
+                        self.visn_layer_norm_m(self.visn_fc_m(image_embeds_m)))
+                image_feat_m = F.normalize(
+                    self.vision_proj_m(image_embeds_m[:, 0, :]), dim=-1)
+                image_feat_all = torch.cat(
+                    [image_feat_m.t(),
+                     self.image_queue.clone().detach()],
+                    dim=1)
+                text_output_m = self.text_encoder_m(
+                    text.input_ids,
+                    attention_mask=text.attention_mask,
+                    return_dict=True)
+                text_feat_m = F.normalize(
+                    self.text_proj_m(text_output_m.last_hidden_state[:, 0, :]),
+                    dim=-1)
+                text_feat_all = torch.cat(
+                    [text_feat_m.t(),
+                     self.text_queue.clone().detach()], dim=1)
+
+                if self.distill:
+                    sim_i2t_m = image_feat_m @ text_feat_all / self.temp
+                    sim_t2i_m = text_feat_m @ image_feat_all / self.temp
+
+                    sim_i2t_targets = self.alpha * F.softmax(
+                        sim_i2t_m, dim=1) + (1 - self.alpha) * sim_targets
+                    sim_t2i_targets = self.alpha * F.softmax(
+                        sim_t2i_m, dim=1) + (1 - self.alpha) * sim_targets
+
+            sim_i2t = image_feat @ text_feat_all / self.temp
+            sim_t2i = text_feat @ image_feat_all / self.temp
+
+            if self.distill:
+                loss_i2t = -torch.sum(
+                    F.log_softmax(sim_i2t, dim=1) * sim_i2t_targets,
+                    dim=1).mean()
+                loss_t2i = -torch.sum(
+                    F.log_softmax(sim_t2i, dim=1) * sim_t2i_targets,
+                    dim=1).mean()
+            else:
+                loss_i2t = -torch.sum(
+                    F.log_softmax(sim_i2t, dim=1) * sim_targets, dim=1).mean()
+                loss_t2i = -torch.sum(
+                    F.log_softmax(sim_t2i, dim=1) * sim_targets, dim=1).mean()
+
+            loss_ita = (loss_i2t + loss_t2i) / 2
+
+            self._dequeue_and_enqueue(image_feat_m, text_feat_m, idx)
+
+            # forward the positive image-text pair
+            _, 
output_pos = self.fusion_encoder( encoder_embeds=text_embeds, - attention_mask=question.attention_mask, + attention_mask=text.attention_mask, encoder_hidden_states=image_embeds, encoder_attention_mask=image_atts, - return_dict=False) - image_output, question_output = fusion_output - question_output = torch.cat([image_output, question_output], 1) - merge_text_attention = torch.cat( - [image_atts, question.attention_mask], 1) - topk_ids, topk_probs = self.generation(question_output, - merge_text_attention) - return topk_ids, topk_probs + return_dict=False, + ) + with torch.no_grad(): + bs = image.size(0) + weights_i2t = F.softmax(sim_i2t[:, :bs], dim=1) + weights_t2i = F.softmax(sim_t2i[:, :bs], dim=1) + + mask = torch.eq(idx, idx.T) + weights_i2t.masked_fill_(mask, 0) + weights_t2i.masked_fill_(mask, 0) + + # select a negative image for each text + image_embeds_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_t2i[b], 1).item() + image_embeds_neg.append(image_embeds[neg_idx]) + image_embeds_neg = torch.stack(image_embeds_neg, dim=0) + + # select a negative text for each image + text_embeds_neg = [] + text_atts_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_i2t[b], 1).item() + text_embeds_neg.append(text_embeds[neg_idx]) + text_atts_neg.append(text.attention_mask[neg_idx]) + text_embeds_neg = torch.stack(text_embeds_neg, dim=0) + text_atts_neg = torch.stack(text_atts_neg, dim=0) + + text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0) + text_atts_all = torch.cat([text.attention_mask, text_atts_neg], + dim=0) + + image_embeds_all = torch.cat([image_embeds_neg, image_embeds], + dim=0) + image_atts_all = torch.cat([image_atts, image_atts], dim=0) + + _, output_neg = self.fusion_encoder( + encoder_embeds=text_embeds_all, + attention_mask=text_atts_all, + encoder_hidden_states=image_embeds_all, + encoder_attention_mask=image_atts_all, + return_dict=False, + ) + + vl_embeddings = torch.cat( + [output_pos[:, 0, :], output_neg[:, 0, :]], dim=0) + vl_output = self.itm_head(vl_embeddings) + + ones_tmp = torch.ones(bs, dtype=torch.long) + zeros_tmp = torch.zeros(2 * bs, dtype=torch.long) + itm_labels = torch.cat([ones_tmp, zeros_tmp], + dim=0).to(image.device) + loss_itm = F.cross_entropy(vl_output, itm_labels) + + return loss_ita + loss_itm + else: + text_output = self.text_encoder( + text.input_ids, attention_mask=text.attention_mask) + text_feat = text_output.last_hidden_state + image_feat = self.visual_encoder.visual( + image, skip_last_layer=True) + image_feat = self.visn_layer_norm(self.visn_fc(image_feat)) + image_att = torch.ones( + image_feat.size()[:-1], + dtype=torch.long, + device=image_feat.device) + _, output = self.fusion_encoder( + encoder_embeds=text_feat, + attention_mask=text.attention_mask, + encoder_hidden_states=image_feat, + encoder_attention_mask=image_att, + return_dict=False, + ) + scores = self.itm_head(output[:, 0, :]) + scores = F.softmax(scores, dim=-1) + + return scores diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py index bb5a9c46..608cc733 100644 --- a/modelscope/models/multi_modal/mplug_for_all_tasks.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, List from modelscope.metainfo import Models from modelscope.models import TorchModel @@ -12,6 +12,7 @@ __all__ = ['MPlugForAllTasks'] @MODELS.register_module( Tasks.visual_question_answering, module_name=Models.mplug) 
 @MODELS.register_module(Tasks.image_captioning, module_name=Models.mplug)
+@MODELS.register_module(Tasks.image_text_retrieval, module_name=Models.mplug)
 class MPlugForAllTasks(TorchModel):
 
     def __init__(self, model_dir: str, *args, **kwargs):
@@ -25,12 +26,6 @@ class MPlugForAllTasks(TorchModel):
         self.model = MPlug.from_pretrained(model_dir)
         self.tokenizer = self.model.tokenizer
 
-    def train(self):
-        return self.model.train()
-
-    def eval(self):
-        return self.model.eval()
-
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         """return the result by the model
@@ -45,13 +40,54 @@ class MPlugForAllTasks(TorchModel):
             }
         """
 
-        topk_ids, _ = self.model(**input)
         replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
                                ('[unused1]', ''), (r' +', ' '),
                                ('[SEP]', ''), ('[unused2]', ''),
                                ('[CLS]', ''), ('[UNK]', ''))
-        pred_string = self.tokenizer.decode(topk_ids[0][0])
-        for _old, _new in replace_tokens_bert:
-            pred_string = pred_string.replace(_old, _new)
-        pred_string = pred_string.strip()
-        return pred_string
+
+        # inference
+        if not self.training and 'question' in input:
+            output = self.model(input['image'], input['question'], train=False)
+            if not isinstance(output, tuple):
+                return output
+            topk_ids, _ = output
+            pred_string: str = self.tokenizer.decode(topk_ids[0][0])
+            for _old, _new in replace_tokens_bert:
+                pred_string = pred_string.replace(_old, _new)
+            pred_string = pred_string.strip()
+            return pred_string
+
+        # train and evaluate
+        import addict
+        image = input['image']
+        answer = addict.Dict(
+            input_ids=input['answer_input_ids'],
+            attention_mask=input['answer_attention_mask'])
+        if 'index' not in input:
+            question = addict.Dict(
+                input_ids=input['question_input_ids'],
+                attention_mask=input['question_attention_mask'])
+            output = self.model(image, question, answer, train=self.training)
+        else:
+            index = input['index']
+            output = self.model(image, answer, index, train=self.training)
+        if self.training:
+            return {'loss': output}
+
+        # evaluate
+        topk_ids, _ = output
+        preds: List[str] = [
+            self.tokenizer.decode(batch[0]) for batch in topk_ids
+        ]
+        for i in range(len(preds)):
+            for _old, _new in replace_tokens_bert:
+                preds[i] = preds[i].replace(_old, _new)
+            preds[i] = preds[i].strip()
+        tgts: List[str] = [
+            self.tokenizer.decode(batch)
+            for batch in input['answer_input_ids'].cpu().numpy().tolist()
+        ]
+        for i in range(len(tgts)):
+            for _old, _new in replace_tokens_bert:
+                tgts[i] = tgts[i].replace(_old, _new)
+            tgts[i] = tgts[i].strip()
+        return {'preds': preds, 'tgts': tgts}
diff --git a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
index 5cdc9668..b942e3fa 100644
--- a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
+++ b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
@@ -6,17 +6,30 @@ import numpy as np
 import torch
 import torch.cuda
 from PIL import Image
+from pkg_resources import packaging
 from taming.models.vqgan import GumbelVQ, VQModel
+from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
+                                    ToTensor)
 
 from modelscope.metainfo import Models
 from modelscope.models.base import Model
 from modelscope.models.builder import MODELS
+from modelscope.models.multi_modal.mmr.models.module_clip import CLIP
+from modelscope.models.multi_modal.mmr.models.tokenization_clip import \
+    SimpleTokenizer as ClipTokenizer
 from modelscope.models.multi_modal.ofa import OFAModel, OFATokenizer
 from
modelscope.models.multi_modal.ofa.generate import sequence_generator as sg from modelscope.models.multi_modal.ofa.generate.search import Sampling from modelscope.models.multi_modal.ofa.generate.utils import move_to_device from modelscope.utils.constant import Tasks +try: + from torchvision.transforms import InterpolationMode + + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + __all__ = ['OfaForTextToImageSynthesis'] @@ -43,6 +56,74 @@ def load_vqgan(config, ckpt_path=None, is_gumbel=False): return model.eval() +def build_clip_model(model_path): + state_dict = torch.load(model_path, map_location='cpu').state_dict() + vit = 'visual.proj' in state_dict + if vit: + vision_width = state_dict['visual.conv1.weight'].shape[0] + vision_layers = len([ + k for k in state_dict.keys() + if k.startswith('visual.') and k.endswith('.attn.in_proj_weight') + ]) + vision_patch_size = state_dict['visual.conv1.weight'].shape[-1] + grid_size = round( + (state_dict['visual.positional_embedding'].shape[0] - 1)**0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [ + len( + set( + k.split('.')[2] for k in state_dict + if k.startswith(f'visual.layer{b}'))) + for b in [1, 2, 3, 4] + ] + vision_layers = tuple(counts) + vision_width = state_dict['visual.layer1.0.conv1.weight'].shape[0] + output_width = round( + (state_dict['visual.attnpool.positional_embedding'].shape[0] + - 1)**0.5) + vision_patch_size = None + assert output_width**2 + 1 == state_dict[ + 'visual.attnpool.positional_embedding'].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict['text_projection'].shape[1] + context_length = state_dict['positional_embedding'].shape[0] + vocab_size = state_dict['token_embedding.weight'].shape[0] + transformer_width = state_dict['ln_final.weight'].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len( + set( + k.split('.')[2] for k in state_dict + if k.startswith('transformer.resblocks'))) + + model = CLIP(embed_dim, image_resolution, vision_layers, vision_width, + vision_patch_size, context_length, vocab_size, + transformer_width, transformer_heads, transformer_layers) + + for key in ['input_resolution', 'context_length', 'vocab_size']: + if key in state_dict: + del state_dict[key] + + model.load_state_dict(state_dict) + return model.eval() + + +def _convert_image_to_rgb(image): + return image.convert('RGB') + + +def build_clip_transform(n_px): + return Compose([ + Resize(n_px, interpolation=BICUBIC), + CenterCrop(n_px), + _convert_image_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + + @MODELS.register_module(Tasks.text_to_image_synthesis, module_name=Models.ofa) class OfaForTextToImageSynthesis(Model): @@ -65,11 +146,23 @@ class OfaForTextToImageSynthesis(Model): vqgan_config, ckpt_path=os.path.join(model_dir, 'vqgan_model.ckpt'), is_gumbel=True).to(self._device) + + # Initialize OpenAI clip + + self.clip_tokenizer = ClipTokenizer(model_dir) + self.clip_model = build_clip_model( + os.path.join(model_dir, 'ViT-B-16.pt')) + self.clip_preprocess = build_clip_transform( + self.clip_model.visual.input_resolution) + + self.clip_model.to(self._device) + self.clip_model.eval() + # Initialize generator sampling = Sampling(self.tokenizer, sampling_topp=0.9) sg_args = { 'tokenizer': self.tokenizer, - 'beam_size': 1, + 'beam_size': 2, 'max_len_b': 1024, 'min_len': 1024, 'search_strategy': sampling, @@ -78,13 +171,68 @@ class 
OfaForTextToImageSynthesis(Model): } self.generator = sg.SequenceGenerator(**sg_args) + def clip_tokenize(self, texts, context_length=77, truncate=False): + + if isinstance(texts, str): + texts = [texts] + + sot_token = self.clip_tokenizer.encoder['<|startoftext|>'] + eot_token = self.clip_tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + self.clip_tokenizer.encode(text) + + [eot_token] for text in texts] + if packaging.version.parse( + torch.__version__) < packaging.version.parse('1.8.0'): + result = torch.zeros( + len(all_tokens), context_length, dtype=torch.long) + else: + result = torch.zeros( + len(all_tokens), context_length, dtype=torch.int) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + if truncate: + tokens = tokens[:context_length] + tokens[-1] = eot_token + else: + raise RuntimeError( + f'Input {texts[i]} is too long for context length {context_length}' + ) + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + def forward(self, input: Dict[str, Any]): + + text = input['samples'][0]['text'] input = move_to_device(input, self._device) + clip_text_input = self.clip_tokenize([text]).to(self._device) + gen_output = self.generator.generate([self.model], input) - gen_tokens = gen_output[0][0]['tokens'][:-1] - codes = gen_tokens.view(1, 32, 32) - 50265 + gen_tokens = torch.stack( + [item['tokens'][:-1] for item in gen_output[0]], dim=0) + codes = gen_tokens.view(-1, 32, 32) - 50265 + quant_b = self.vqgan_model.quantize.get_codebook_entry( codes.view(-1), list(codes.size()) + [self.vqgan_model.quantize.embedding_dim]) - dec = self.vqgan_model.decode(quant_b)[0] - return custom_to_pil(dec) + imgs = self.vqgan_model.decode(quant_b) + + sample_num = imgs.size()[0] + pil_imgs = [custom_to_pil(imgs[i]) for i in range(sample_num)] + + clip_image_input = torch.stack( + [self.clip_preprocess(img) for img in pil_imgs], + dim=0).to(self._device) + + with torch.no_grad(): + hyp_image_features = self.clip_model.encode_image(clip_image_input) + hyp_image_features /= hyp_image_features.norm(dim=-1, keepdim=True) + text_features = self.clip_model.encode_text(clip_text_input) + text_features /= text_features.norm(dim=-1, keepdim=True) + ti_similarity = hyp_image_features @ text_features.T + + sorted_score, ti_indices = torch.sort( + ti_similarity.view(-1), descending=True) + + pil_imgs_orderby_ti = [pil_imgs[index] for index in ti_indices] + return pil_imgs_orderby_ti[0] diff --git a/modelscope/models/multi_modal/team/__init__.py b/modelscope/models/multi_modal/team/__init__.py new file mode 100644 index 00000000..0597040c --- /dev/null +++ b/modelscope/models/multi_modal/team/__init__.py @@ -0,0 +1 @@ +from .team_model import TEAMForMultiModalSimilarity diff --git a/modelscope/models/multi_modal/team/team_model.py b/modelscope/models/multi_modal/team/team_model.py new file mode 100644 index 00000000..4aa77e17 --- /dev/null +++ b/modelscope/models/multi_modal/team/team_model.py @@ -0,0 +1,126 @@ +from typing import Any, Dict + +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from tokenizers import BertWordPieceTokenizer +from torchvision.transforms import Compose, Normalize, Resize, ToTensor + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, 
Tasks +from modelscope.utils.logger import get_logger +from .utils import TEAM, BertWrapper, CLIPVisionWrapper, CrossLayer + +logger = get_logger() + +__all__ = ['TEAMForMultiModalSimilarity'] + + +@MODELS.register_module(Tasks.multi_modal_similarity, module_name=Models.team) +class TEAMForMultiModalSimilarity(TorchModel): + + def __init__(self, model_dir, device_id=0, *args, **kwargs): + super().__init__( + model_dir=model_dir, device_id=device_id, *args, **kwargs) + + text_model = BertWrapper( + config_json='{}/text_config.json'.format(model_dir), + feat_dim=768, + token_dim=1024) + text_model.bert.cls = None + image_model = CLIPVisionWrapper() + + self.model = TEAM( + text_model, + image_model, + pretrained='{}/{}'.format(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE)) + self.model.eval() + + self.device_id = device_id + if self.device_id >= 0 and torch.cuda.is_available(): + self.model.to('cuda:{}'.format(self.device_id)) + logger.info('Use GPU: {}'.format(self.device_id)) + else: + self.device_id = -1 + logger.info('Use CPU for inference') + + self.text_tokenizer = BertWordPieceTokenizer( + '{}/{}'.format(model_dir, ModelFile.VOCAB_FILE), lowercase=False) + self.text_tokenizer.enable_truncation(max_length=30) + + norm_op = Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)) + self.img_preprocessor = Compose([ + Resize((224, 224), interpolation=Image.BICUBIC), + ToTensor(), norm_op + ]) + + def tokenize_text(self, text_str): + tokens = self.text_tokenizer.encode(text_str) + max_tokens = 30 + text_ids_tensor = torch.zeros((1, max_tokens)).long() + text_mask_tensor = torch.zeros((1, max_tokens)) + text_ids, text_mask = tokens.ids, tokens.attention_mask + text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids) + text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask) + return text_ids_tensor, text_mask_tensor + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + with torch.no_grad(): + if 'img' in input and input['img'] is not None: + input_img = input['img'] + input_img = LoadImage.convert_to_img(input_img) + img_tensor = self.img_preprocessor(input_img)[None, ...] 
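+                # img_tensor: [1, 3, 224, 224] float tensor after the
+                # Resize/ToTensor/Normalize pipeline defined in __init__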
+ + if self.device_id >= 0: + img_tensor = img_tensor.to('cuda:{}'.format( + self.device_id)) + _, _, image_feature, image_tensors = self.model.get_feature( + None, None, img_tensor) + image_feature = image_feature.cpu().numpy() + else: + image_feature, image_tensors = None, None + + if 'text' in input and input['text'] is not None: + text_str = input['text'] + if isinstance(text_str, str): + text_ids_tensor, text_mask_tensor = self.tokenize_text( + text_str) + else: + raise TypeError( + f'text should be str, but got {type(text_str)}') + + if self.device_id >= 0: + text_ids_tensor = text_ids_tensor.to('cuda:{}'.format( + self.device_id)) + text_mask_tensor = text_mask_tensor.to('cuda:{}'.format( + self.device_id)) + text_feature, text_tensors, _, _ = self.model.get_feature( + text_ids_tensor, text_mask_tensor, None) + text_feature = text_feature.cpu().numpy() + else: + text_tensors, text_mask_tensor = None, None + + if text_tensors is not None and text_mask_tensor is not None and image_tensors is not None: + score = self.model.get_cross_score(text_tensors, + text_mask_tensor, + image_tensors)[0].item() + else: + score = None + output = { + OutputKeys.IMG_EMBEDDING: image_feature, + OutputKeys.TEXT_EMBEDDING: text_feature, + OutputKeys.SCORES: score + } + return output + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/models/multi_modal/team/utils.py b/modelscope/models/multi_modal/team/utils.py new file mode 100644 index 00000000..3b3e394e --- /dev/null +++ b/modelscope/models/multi_modal/team/utils.py @@ -0,0 +1,326 @@ +""" Generative Multimodal Model +Base Transformer code is adapted from https://github.com/openai/CLIP/, +originally MIT License, Copyright (c) 2021 OpenAI, +""" +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from torch import nn +from transformers import BertConfig, BertForMaskedLM + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + use_gc=False): + super().__init__() + self.use_gc = use_gc + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + def 
forward(self, x: torch.Tensor): + if self.use_gc: + for each_block in self.resblocks: + x = checkpoint.checkpoint(each_block, x) + return x + else: + return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__(self, + input_resolution: int, + patch_size: int, + width: int, + layers: int, + heads: int, + output_dim: int, + use_gc=False): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads, use_gc=use_gc) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + class_embedding = self.class_embedding.to(x.dtype) + \ + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([class_embedding, x], + dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIPVisionWrapper(nn.Module): + + def __init__(self, ): + super().__init__() + self.vision_transformer = VisionTransformer( + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + output_dim=768) + + def forward(self, x): + x = self.vision_transformer.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + class_embedding = self.vision_transformer.class_embedding.to(x.dtype) + \ + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([class_embedding, x], + dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.vision_transformer.positional_embedding.to(x.dtype) + x = self.vision_transformer.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.vision_transformer.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x_tensor = x.clone() + x = self.vision_transformer.ln_post(x[:, 0, :]) + + if self.vision_transformer.proj is not None: + x = x @ self.vision_transformer.proj + + return x, x_tensor + + +class BertWrapper(nn.Module): + + def __init__(self, config_json, feat_dim, token_dim): + super(BertWrapper, self).__init__() + bert_config = BertConfig.from_json_file(config_json) + self.bert = BertForMaskedLM(bert_config).bert + + self.projector = nn.Linear(768, feat_dim, bias=False) + self.projector_token_embeds = nn.Linear(768, token_dim) + + def forward(self, input_ids, attention_mask): + trans_features = { + 'input_ids': input_ids, + 'attention_mask': attention_mask + } + output_states = self.bert(**trans_features, return_dict=False) + output_tokens = output_states[0] + + cls_tokens = output_tokens[:, 0, :] # CLS token is first token + + return self.projector(cls_tokens), self.projector_token_embeds( + 
output_tokens) + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class CrossLayer(nn.Module): + + def __init__(self, feat_dim, mlp_ratio): + super(CrossLayer, self).__init__() + self.norm1 = nn.LayerNorm(feat_dim) + self.norm2 = nn.LayerNorm(feat_dim) + self.norm3 = nn.LayerNorm(feat_dim) + + self.self_attn = nn.MultiheadAttention( + embed_dim=feat_dim, num_heads=16) + self.cross_attn = nn.MultiheadAttention( + embed_dim=feat_dim, num_heads=16) + self.ffn = Mlp( + in_features=feat_dim, + hidden_features=feat_dim * mlp_ratio, + drop=0.1) + + self.dropout1 = nn.Dropout(0.1) + self.dropout2 = nn.Dropout(0.1) + self.dropout3 = nn.Dropout(0.1) + + def forward(self, text_tensors, text_masks, image_tensors, + retrieved_tensors): + retrieved_tensors_res = self.norm1(retrieved_tensors) + retrieved_tensors_res = self.self_attn( + (text_tensors + retrieved_tensors_res).permute(1, 0, 2), + (text_tensors + retrieved_tensors_res).permute(1, 0, 2), + retrieved_tensors_res.permute(1, 0, 2), + key_padding_mask=(text_masks == 0), + )[0].permute(1, 0, 2) + retrieved_tensors = retrieved_tensors + self.dropout1( + retrieved_tensors_res) + + retrieved_tensors_res = self.norm2(retrieved_tensors) + retrieved_tensors_res = self.cross_attn( + (text_tensors + retrieved_tensors_res).permute(1, 0, 2), + image_tensors.permute(1, 0, 2), + image_tensors.permute(1, 0, 2))[0].permute(1, 0, 2) + retrieved_tensors = retrieved_tensors + self.dropout2( + retrieved_tensors_res) + + retrieved_tensors_res = self.norm3(retrieved_tensors) + retrieved_tensors = retrieved_tensors + self.dropout3( + self.ffn(retrieved_tensors_res)) + + return retrieved_tensors + + +class TEAM(nn.Module): + + def __init__(self, text_model, image_model, pretrained): + super(TEAM, self).__init__() + self.text_model = text_model + self.image_model = image_model + + self.cross_model = nn.ModuleList( + [CrossLayer(feat_dim=1024, mlp_ratio=2)]) + + self.image_tensor_fc = nn.Linear(1024, 768) + self.text_tensor_fc = nn.Linear(1024, 768) + + params = torch.load(pretrained, 'cpu') + self.load_state_dict(params, strict=True) + + def get_feature(self, text_data=None, text_mask=None, img_tensor=None): + if text_data is not None: + text_feature, text_tensors = self.text_model(text_data, text_mask) + text_feature = F.normalize(text_feature, p=2.0, dim=1) + else: + text_feature, text_tensors = None, None + + if img_tensor is not None: + image_feature, image_tensors = self.image_model(img_tensor) + image_feature = F.normalize(image_feature, p=2.0, dim=1) + else: + image_feature, image_tensors = None, None + + return text_feature, text_tensors, image_feature, image_tensors + + def get_cross_score(self, text_tensors, text_mask, image_tensors): + retrieved_tensors = torch.zeros_like(text_tensors) + pair_score_list = [] + text_tensors_proj = self.text_tensor_fc(text_tensors) + text_mask_float = text_mask.type(text_tensors_proj.dtype) + for each_cross_model in self.cross_model: + retrieved_tensors = each_cross_model(text_tensors, text_mask, + image_tensors, + 
retrieved_tensors) + retrieved_tensors_proj = self.image_tensor_fc(retrieved_tensors) + + pair_score = torch.sum( + F.normalize(retrieved_tensors_proj, p=2.0, dim=2) + * F.normalize(text_tensors_proj, p=2.0, dim=2), + dim=2) + pair_score_reduced = torch.sum( + pair_score * text_mask_float, dim=1) / torch.clamp( + torch.sum(text_mask_float, dim=1), min=1.0) + pair_score_list.append(pair_score_reduced) + return pair_score_list diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 3fd76f98..e17a1d31 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -7,10 +7,13 @@ if TYPE_CHECKING: from .backbones import SbertModel from .heads import SequenceClassificationHead from .bert_for_sequence_classification import BertForSequenceClassification + from .bert_for_document_segmentation import BertForDocumentSegmentation from .csanmt_for_translation import CsanmtForTranslation from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, BertForMaskedLM) - from .nncrf_for_named_entity_recognition import TransformerCRFForNamedEntityRecognition + from .nncrf_for_named_entity_recognition import ( + TransformerCRFForNamedEntityRecognition, + LSTMCRFForNamedEntityRecognition) from .palm_v2 import PalmForTextGeneration from .token_classification import SbertForTokenClassification from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification @@ -18,9 +21,12 @@ if TYPE_CHECKING: from .space import SpaceForDialogModeling from .space import SpaceForDialogStateTracking from .star_text_to_sql import StarForTextToSql - from .task_models.task_model import SingleBackboneTaskModelBase + from .task_models import (InformationExtractionModel, + SequenceClassificationModel, + SingleBackboneTaskModelBase) from .bart_for_text_error_correction import BartForTextErrorCorrection from .gpt3 import GPT3ForTextGeneration + from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering else: _import_structure = { @@ -29,10 +35,13 @@ else: 'heads': ['SequenceClassificationHead'], 'csanmt_for_translation': ['CsanmtForTranslation'], 'bert_for_sequence_classification': ['BertForSequenceClassification'], + 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], 'masked_language': ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'], - 'nncrf_for_named_entity_recognition': - ['TransformerCRFForNamedEntityRecognition'], + 'nncrf_for_named_entity_recognition': [ + 'TransformerCRFForNamedEntityRecognition', + 'LSTMCRFForNamedEntityRecognition' + ], 'palm_v2': ['PalmForTextGeneration'], 'token_classification': ['SbertForTokenClassification'], 'sequence_classification': @@ -41,9 +50,13 @@ else: 'SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDialogStateTracking' ], - 'task_model': ['SingleBackboneTaskModelBase'], + 'task_models': [ + 'InformationExtractionModel', 'SequenceClassificationModel', + 'SingleBackboneTaskModelBase' + ], 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], 'gpt3': ['GPT3ForTextGeneration'], + 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], } import sys diff --git a/modelscope/models/nlp/bert_for_document_segmentation.py b/modelscope/models/nlp/bert_for_document_segmentation.py new file mode 100644 index 00000000..dfa57597 --- /dev/null +++ b/modelscope/models/nlp/bert_for_document_segmentation.py @@ -0,0 +1,108 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
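+#
+# In short: BERT token encodings -> dropout -> linear classifier over
+# num_labels, with an optional sentence_attention_mask that restricts the
+# cross-entropy loss to sentence-boundary positions (see forward() below).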
+ +from typing import Any, Dict + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import TokenClassifierOutput +from transformers.models.bert.modeling_bert import (BertModel, + BertPreTrainedModel) + +from modelscope.metainfo import Models +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks + +__all__ = ['BertForDocumentSegmentation'] + + +@MODELS.register_module( + Tasks.document_segmentation, module_name=Models.bert_for_ds) +class BertForDocumentSegmentation(Model): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + + def build_with_config(self, config): + self.bert_model = BertForDocumentSegmentationBase.from_pretrained( + self.model_dir, from_tf=False, config=config) + return self.bert_model + + def forward(self, input: Dict[str, Dict]) -> Dict[str, Any]: + pass + + +class BertForDocumentSegmentationBase(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.sentence_pooler_type = None + self.bert = BertModel(config, add_pooling_layer=False) + + classifier_dropout = config.hidden_dropout_prob + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.class_weights = None + self.init_weights() + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + sentence_attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None): + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + if self.sentence_pooler_type is not None: + raise NotImplementedError + else: + sequence_output = self.dropout(sequence_output) + + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(weight=self.class_weights) + if sentence_attention_mask is not None: + active_loss = sentence_attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py index 7cff9ad4..fe1402e8 100644 --- a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py +++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py @@ -60,5 +60,6 @@ class GPT3ForTextGeneration(TorchModel): sample_output = self.model.generate(**gen_params) return { OutputKeys.TEXT: -
self.tokenizer.decode(sample_output[0], skip_special_tokens=True) + self.tokenizer.decode(sample_output[0], + skip_special_tokens=True).replace(' ', '') } diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index f7024713..4e30f697 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -16,9 +16,10 @@ import math import os from typing import Optional, Union +import addict import torch -from addict import Dict -from torch.nn import Dropout, Embedding, LayerNorm, Linear, Module, Softmax +from torch.nn import (CrossEntropyLoss, Dropout, Embedding, LayerNorm, Linear, + Module, Softmax) from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel @@ -308,20 +309,25 @@ class GPT3Model(PreTrainedModel): input_ids, attention_mask=None, position_ids=None, + labels=None, **kwargs): seq_length = input_ids.size(1) - if attention_mask is None: - attention_mask = torch.tril( - torch.ones((1, seq_length, seq_length), - dtype=torch.long, - device=input_ids.device)) + attention_mask = torch.tril( + torch.ones((1, 1, seq_length, seq_length), + dtype=torch.long, + device=input_ids.device)) if position_ids is None: position_ids = torch.arange( seq_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) logits = self.language_model(input_ids, attention_mask, position_ids) - return Dict(logits=logits) + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.config.vocab_size), labels.view(-1)) + return addict.Dict(loss=loss, logits=logits) @classmethod def from_pretrained( diff --git a/modelscope/models/nlp/heads/infromation_extraction_head.py b/modelscope/models/nlp/heads/infromation_extraction_head.py new file mode 100644 index 00000000..cf957834 --- /dev/null +++ b/modelscope/models/nlp/heads/infromation_extraction_head.py @@ -0,0 +1,106 @@ +from typing import Dict + +import torch +import torch.nn.functional as F +from torch import nn + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@HEADS.register_module( + Tasks.information_extraction, module_name=Heads.information_extraction) +class InformationExtractionHead(TorchHead): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + config = self.config + assert config.get('labels') is not None + self.labels = config.labels + self.s_layer = nn.Linear(config.hidden_size, 2) # head, tail, bce + self.o_layer = nn.Linear(2 * config.hidden_size, 2) # head, tail, bce + self.p_layer = nn.Linear(config.hidden_size, + len(self.labels)) # label, ce + self.mha = nn.MultiheadAttention(config.hidden_size, 4) + + def forward(self, sequence_output, text, offsets, threshold=0.5): + # assert batch size == 1 + spos = [] + s_head_logits, s_tail_logits = self.s_layer(sequence_output).split( + 1, dim=-1) # (b, seq_len, 2) + s_head_logits = s_head_logits[0, :, 0].sigmoid() # (seq_len) + s_tail_logits = s_tail_logits[0, :, 0].sigmoid() # (seq_len) + s_masks, subjects = self._get_masks_and_mentions( + text, offsets, s_head_logits, s_tail_logits, None, threshold) + for s_mask, subject in zip(s_masks, subjects): + masked_sequence_output = sequence_output * s_mask.unsqueeze( + 0).unsqueeze(-1) # (b, s, h) + subjected_sequence_output = self.mha( + 
sequence_output.permute(1, 0, 2), + masked_sequence_output.permute(1, 0, 2), + masked_sequence_output.permute(1, 0, + 2))[0].permute(1, 0, + 2) # (b, s, h) + cat_sequence_output = torch.cat( + (sequence_output, subjected_sequence_output), dim=-1) + o_head_logits, o_tail_logits = self.o_layer( + cat_sequence_output).split( + 1, dim=-1) + o_head_logits = o_head_logits[0, :, 0].sigmoid() # (seq_len) + o_tail_logits = o_tail_logits[0, :, 0].sigmoid() # (seq_len) + so_masks, objects = self._get_masks_and_mentions( + text, offsets, o_head_logits, o_tail_logits, s_mask, threshold) + for so_mask, object in zip(so_masks, objects): + masked_sequence_output = ( + sequence_output * so_mask.unsqueeze(0).unsqueeze(-1)).sum( + 1) # (b, h) + lengths = so_mask.unsqueeze(0).sum(-1, keepdim=True) # (b, 1) + pooled_subject_object = masked_sequence_output / lengths # (b, h) + label = self.p_layer(pooled_subject_object).sigmoid().squeeze( + 0) + for i in range(label.size(-1)): + if label[i] > threshold: + predicate = self.labels[i] + spos.append((subject, predicate, object)) + return spos + + def _get_masks_and_mentions(self, + text, + offsets, + heads, + tails, + init_mask=None, + threshold=0.5): + ''' + text: str + heads: tensor (len(heads)) + tails: tensor (len(tails)) + ''' + seq_len = heads.size(-1) + potential_heads = [] + for i in range(seq_len - 1): + if heads[i] > threshold: + potential_heads.append(i) + potential_heads.append(seq_len - 1) + masks = [] + mentions = [] + for i in range(len(potential_heads) - 1): + head_index = potential_heads[i] + tail_index, max_val = None, 0 + for j in range(head_index, potential_heads[i + 1]): + if tails[j] > max_val and tails[j] > threshold: + tail_index = j + max_val = tails[j] + if tail_index is not None: + mask = torch.zeros_like( + heads) if init_mask is None else init_mask.clone() + mask[head_index:tail_index + 1] = 1 + masks.append(mask) # (seq_len) + char_head = offsets[head_index][0] + char_tail = offsets[tail_index][1] + mention = text[char_head:char_tail] + mentions.append(mention) + return masks, mentions diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py index 2015997f..37216510 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/nncrf_for_named_entity_recognition.py @@ -10,27 +10,25 @@ from modelscope.models import TorchModel from modelscope.models.builder import MODELS from modelscope.utils.constant import ModelFile, Tasks -__all__ = ['TransformerCRFForNamedEntityRecognition'] +__all__ = [ + 'TransformerCRFForNamedEntityRecognition', + 'LSTMCRFForNamedEntityRecognition' +] -@MODELS.register_module( - Tasks.named_entity_recognition, module_name=Models.tcrf) -class TransformerCRFForNamedEntityRecognition(TorchModel): - """This model wraps the TransformerCRF model to register into model sets. 
- """ +class SequenceLabelingForNamedEntityRecognition(TorchModel): def __init__(self, model_dir, *args, **kwargs): super().__init__(model_dir, *args, **kwargs) - - self.config = AutoConfig.from_pretrained(model_dir) - num_labels = self.config.num_labels - - self.model = TransformerCRF(model_dir, num_labels) + self.model = self.init_model(model_dir, *args, **kwargs) model_ckpt = os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) self.model.load_state_dict( torch.load(model_ckpt, map_location=torch.device('cpu'))) + def init_model(self, model_dir, *args, **kwargs): + raise NotImplementedError + def train(self): return self.model.train() @@ -64,6 +62,39 @@ class TransformerCRFForNamedEntityRecognition(TorchModel): return output +@MODELS.register_module( + Tasks.named_entity_recognition, module_name=Models.tcrf) +class TransformerCRFForNamedEntityRecognition( + SequenceLabelingForNamedEntityRecognition): + """This model wraps the TransformerCRF model to register into model sets. + """ + + def init_model(self, model_dir, *args, **kwargs): + self.config = AutoConfig.from_pretrained(model_dir) + num_labels = self.config.num_labels + + model = TransformerCRF(model_dir, num_labels) + return model + + +@MODELS.register_module( + Tasks.named_entity_recognition, module_name=Models.lcrf) +class LSTMCRFForNamedEntityRecognition( + SequenceLabelingForNamedEntityRecognition): + """This model wraps the LSTMCRF model to register into model sets. + """ + + def init_model(self, model_dir, *args, **kwargs): + self.config = AutoConfig.from_pretrained(model_dir) + vocab_size = self.config.vocab_size + embed_width = self.config.embed_width + num_labels = self.config.num_labels + lstm_hidden_size = self.config.lstm_hidden_size + + model = LSTMCRF(vocab_size, embed_width, num_labels, lstm_hidden_size) + return model + + class TransformerCRF(nn.Module): """A transformer based model to NER tasks. @@ -105,6 +136,56 @@ class TransformerCRF(nn.Module): return outputs +class LSTMCRF(nn.Module): + """ + A standard bilstm-crf model for fast prediction. + """ + + def __init__(self, + vocab_size, + embed_width, + num_labels, + lstm_hidden_size=100, + **kwargs): + super(LSTMCRF, self).__init__() + self.embedding = Embedding(vocab_size, embed_width) + self.lstm = nn.LSTM( + embed_width, + lstm_hidden_size, + num_layers=1, + bidirectional=True, + batch_first=True) + self.ffn = nn.Linear(lstm_hidden_size * 2, num_labels) + self.crf = CRF(num_labels, batch_first=True) + + def forward(self, inputs): + embedding = self.embedding(inputs['input_ids']) + lstm_output, _ = self.lstm(embedding) + logits = self.ffn(lstm_output) + + if 'label_mask' in inputs: + mask = inputs['label_mask'] + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits + + outputs = {'logits': logits} + return outputs + + def decode(self, inputs): + seq_lens = inputs['label_mask'].sum(-1).long() + mask = torch.arange( + inputs['label_mask'].shape[1], + device=seq_lens.device)[None, :] < seq_lens[:, None] + predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) + outputs = {'predicts': predicts} + return outputs + + class CRF(nn.Module): """Conditional random field. This module implements a conditional random field [LMP01]_. 
The forward computation @@ -547,3 +628,14 @@ class CRF(nn.Module): return torch.where(mask.unsqueeze(-1), best_tags_arr, oor_tag).permute(2, 1, 0) + + +class Embedding(nn.Module): + + def __init__(self, vocab_size, embed_width): + super(Embedding, self).__init__() + + self.embedding = nn.Embedding(vocab_size, embed_width) + + def forward(self, input_ids): + return self.embedding(input_ids) diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py index 1cbf4f58..ff6fd732 100644 --- a/modelscope/models/nlp/palm_v2/modeling_palm.py +++ b/modelscope/models/nlp/palm_v2/modeling_palm.py @@ -6,6 +6,7 @@ import subprocess from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union +import addict import json import numpy as np import torch @@ -726,10 +727,11 @@ class PalmForConditionalGeneration(PalmPreTrainedModel): self.palm.vocab_size, config.label_smoothing) - def forward(self, src, tgt, mask_src): - output = self.palm(src, tgt, mask_src)[0] - loss = self.loss(tgt, output) - return loss + def forward(self, input_ids, attention_mask, labels): + output = self.palm( + src=input_ids, tgt=labels, mask_src=attention_mask)[0] + loss = self.loss(labels, output) + return addict.Dict(loss=loss) class Translator(nn.Module): diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py index e432cc58..ae92427e 100644 --- a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py +++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py @@ -29,20 +29,19 @@ class PalmForTextGeneration(TorchModel): self.generator = Translator(self.model) def _evaluate_postprocess(self, ids_list: List[List[int]]) -> List[str]: - replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), - ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), - ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) + replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), ('[unused1]', + ''), + (r' +', ' '), ('[SEP]', ''), ('[unused2]', ''), + ('[CLS]', ''), ('[UNK]', ''), (' ', '')) replace_tokens_roberta = ((r' +', ' '), ('<mask>', '. '), ('<pad>', ''), ('<s>', ''), ('</s>', ''), ('<unk>', ' '), ('<q>', '. 
')) + replace_tokens = replace_tokens_roberta \ + if self.model.config.encoder == 'roberta' else replace_tokens_bert strings = [self.tokenizer.decode(pred_ids) for pred_ids in ids_list] - for _old, _new in replace_tokens_bert: + for _old, _new in replace_tokens: strings = [s.replace(_old, _new) for s in strings] - for _old, _new in replace_tokens_roberta: - strings = [s.replace(_old, _new) for s in strings] - for s in strings: - s.strip() return strings def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: @@ -64,14 +63,15 @@ class PalmForTextGeneration(TorchModel): } """ if self.training: - return {'loss': self.model(**input)} + return self.model(**input) else: - outputs = self.generator(input['src'], input['mask_src']) + outputs = self.generator(input['input_ids'], + input['attention_mask']) preds = outputs['predictions'] pred_ids_list = [ pred_batch[0].cpu().numpy().tolist() for pred_batch in preds ] - tgt_ids_list = input['tgt'].cpu().numpy().tolist() + tgt_ids_list = input['labels'].cpu().numpy().tolist() return { 'preds': self._evaluate_postprocess(pred_ids_list), 'tgts': self._evaluate_postprocess(tgt_ids_list) diff --git a/modelscope/models/nlp/sbert_for_faq_question_answering.py b/modelscope/models/nlp/sbert_for_faq_question_answering.py new file mode 100644 index 00000000..23ccdcc5 --- /dev/null +++ b/modelscope/models/nlp/sbert_for_faq_question_answering.py @@ -0,0 +1,249 @@ +import math +import os +from collections import namedtuple +from typing import Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import SbertConfig, SbertModel +from modelscope.models.nlp.task_models.task_model import BaseTaskModel +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['SbertForFaqQuestionAnswering'] + + +class SbertForFaqQuestionAnsweringBase(BaseTaskModel): + """base class for faq models + """ + + def __init__(self, model_dir, *args, **kwargs): + super(SbertForFaqQuestionAnsweringBase, + self).__init__(model_dir, *args, **kwargs) + + backbone_cfg = SbertConfig.from_pretrained(model_dir) + self.bert = SbertModel(backbone_cfg) + + model_config = Config.from_file( + os.path.join(model_dir, + ModelFile.CONFIGURATION)).get(ConfigFields.model, {}) + + metric = model_config.get('metric', 'cosine') + pooling_method = model_config.get('pooling', 'avg') + + Arg = namedtuple('args', [ + 'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling' + ]) + args = Arg( + metrics=metric, + proj_hidden_size=self.bert.config.hidden_size, + hidden_size=self.bert.config.hidden_size, + dropout=0.0, + pooling=pooling_method) + + self.metrics_layer = MetricsLayer(args) + self.pooling = PoolingLayer(args) + + def _get_onehot_labels(self, labels, support_size, num_cls): + labels_ = labels.view(support_size, 1) + target_oh = torch.zeros(support_size, num_cls).to(labels) + target_oh.scatter_(dim=1, index=labels_, value=1) + return target_oh.view(support_size, num_cls).float() + + def forward_sentence_embedding(self, inputs: Dict[str, Tensor]): + input_ids = inputs['input_ids'] + input_mask = inputs['attention_mask'] + if not isinstance(input_ids, Tensor): + input_ids = torch.IntTensor(input_ids) + if not isinstance(input_mask, Tensor): + input_mask = torch.IntTensor(input_mask) + rst = self.bert(input_ids, input_mask) + last_hidden_states = 
rst.last_hidden_state + if len(input_mask.shape) == 2: + input_mask = input_mask.unsqueeze(-1) + pooled_representation = self.pooling(last_hidden_states, input_mask) + return pooled_representation + + +@MODELS.register_module( + Tasks.faq_question_answering, module_name=Models.structbert) +class SbertForFaqQuestionAnswering(SbertForFaqQuestionAnsweringBase): + _backbone_prefix = '' + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + assert not self.training + query = input['query'] + support = input['support'] + if isinstance(query, list): + query = torch.stack(query) + if isinstance(support, list): + support = torch.stack(support) + n_query = query.shape[0] + n_support = support.shape[0] + query_mask = torch.ne(query, 0).view([n_query, -1]) + support_mask = torch.ne(support, 0).view([n_support, -1]) + + support_labels = input['support_labels'] + num_cls = torch.max(support_labels) + 1 + onehot_labels = self._get_onehot_labels(support_labels, n_support, + num_cls) + + input_ids = torch.cat([query, support]) + input_mask = torch.cat([query_mask, support_mask], dim=0) + pooled_representation = self.forward_sentence_embedding({ + 'input_ids': + input_ids, + 'attention_mask': + input_mask + }) + z_query = pooled_representation[:n_query] + z_support = pooled_representation[n_query:] + cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5 + protos = torch.matmul(onehot_labels.transpose(0, 1), + z_support) / cls_n_support.unsqueeze(-1) + scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) + if self.metrics_layer.name == 'relation': + scores = torch.sigmoid(scores) + return {'scores': scores} + + +activations = { + 'relu': F.relu, + 'tanh': torch.tanh, + 'linear': lambda x: x, +} + +activation_coeffs = { + 'relu': math.sqrt(2), + 'tanh': 5 / 3, + 'linear': 1., +} + + +class LinearProjection(nn.Module): + + def __init__(self, + in_features, + out_features, + activation='linear', + bias=True): + super().__init__() + self.activation = activations[activation] + activation_coeff = activation_coeffs[activation] + linear = nn.Linear(in_features, out_features, bias=bias) + nn.init.normal_( + linear.weight, std=math.sqrt(1. 
/ in_features) * activation_coeff) + if bias: + nn.init.zeros_(linear.bias) + self.model = nn.utils.weight_norm(linear) + + def forward(self, x): + return self.activation(self.model(x)) + + +class RelationModule(nn.Module): + + def __init__(self, args): + super(RelationModule, self).__init__() + input_size = args.proj_hidden_size * 4 + self.prediction = torch.nn.Sequential( + LinearProjection( + input_size, args.proj_hidden_size * 4, activation='relu'), + nn.Dropout(args.dropout), + LinearProjection(args.proj_hidden_size * 4, 1)) + + def forward(self, query, protos): + n_cls = protos.shape[0] + n_query = query.shape[0] + protos = protos.unsqueeze(0).repeat(n_query, 1, 1) + query = query.unsqueeze(1).repeat(1, n_cls, 1) + input_feat = torch.cat( + [query, protos, (protos - query).abs(), query * protos], dim=-1) + dists = self.prediction(input_feat) # [bsz,n_query,n_cls,1] + return dists.squeeze(-1) + + +class MetricsLayer(nn.Module): + + def __init__(self, args): + super(MetricsLayer, self).__init__() + self.args = args + assert args.metrics in ('relation', 'cosine') + if args.metrics == 'relation': + self.relation_net = RelationModule(args) + + @property + def name(self): + return self.args.metrics + + def forward(self, query, protos): + """ query : [bsz, n_query, dim] + support : [bsz, n_query, n_cls, dim] | [bsz, n_cls, dim] + """ + if self.args.metrics == 'cosine': + supervised_dists = self.cosine_similarity(query, protos) + if self.training: + supervised_dists *= 5 + elif self.args.metrics in ('relation', ): + supervised_dists = self.relation_net(query, protos) + else: + raise NotImplementedError + return supervised_dists + + def cosine_similarity(self, x, y): + # x=[bsz, n_query, dim] + # y=[bsz, n_cls, dim] + n_query = x.shape[0] + n_cls = y.shape[0] + dim = x.shape[-1] + x = x.unsqueeze(1).expand([n_query, n_cls, dim]) + y = y.unsqueeze(0).expand([n_query, n_cls, dim]) + return F.cosine_similarity(x, y, -1) + + +class AveragePooling(nn.Module): + + def forward(self, x, mask, dim=1): + return torch.sum( + x * mask.float(), dim=dim) / torch.sum( + mask.float(), dim=dim) + + +class AttnPooling(nn.Module): + + def __init__(self, input_size, hidden_size=None, output_size=None): + super().__init__() + self.input_proj = nn.Sequential( + LinearProjection(input_size, hidden_size), nn.Tanh(), + LinearProjection(hidden_size, 1, bias=False)) + self.output_proj = LinearProjection( + input_size, output_size) if output_size else lambda x: x + + def forward(self, x, mask): + score = self.input_proj(x) + score = score * mask.float() + -1e4 * (1. - mask.float()) + score = F.softmax(score, dim=1) + features = self.output_proj(x) + return torch.matmul(score.transpose(1, 2), features).squeeze(1) + + +class PoolingLayer(nn.Module): + + def __init__(self, args): + super(PoolingLayer, self).__init__() + if args.pooling == 'attn': + self.pooling = AttnPooling(args.proj_hidden_size, + args.proj_hidden_size, + args.proj_hidden_size) + elif args.pooling == 'avg': + self.pooling = AveragePooling() + else: + raise NotImplementedError(args.pooling) + + def forward(self, x, mask): + return self.pooling(x, mask) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index e69de29b..49cf0ee4 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
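The task_models package init that follows uses the same LazyImportModule convention as the other new __init__.py files in this diff: submodule imports are deferred until one of their attributes is first accessed. Roughly, and only as a simplified sketch of the assumed behavior (LazyModule below is hypothetical, not modelscope's actual implementation):

import importlib
import types


class LazyModule(types.ModuleType):
    """Minimal stand-in for LazyImportModule (assumed behavior)."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported attribute to the submodule that defines it.
        self._attr_to_module = {
            attr: sub
            for sub, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):
        # Triggered only when normal lookup fails: import lazily, delegate.
        if attr in self._attr_to_module:
            module = importlib.import_module(
                '{}.{}'.format(self.__name__, self._attr_to_module[attr]))
            return getattr(module, attr)
        raise AttributeError(attr)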
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .information_extraction import InformationExtractionModel + from .sequence_classification import SequenceClassificationModel + from .task_model import SingleBackboneTaskModelBase + +else: + _import_structure = { + 'information_extraction': ['InformationExtractionModel'], + 'sequence_classification': ['SequenceClassificationModel'], + 'task_model': ['SingleBackboneTaskModelBase'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py new file mode 100644 index 00000000..20a44787 --- /dev/null +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -0,0 +1,49 @@ +from typing import Any, Dict + +import numpy as np +import torch + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.task_models.task_model import \ + SingleBackboneTaskModelBase +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) + +__all__ = ['InformationExtractionModel'] + + +@MODELS.register_module( + Tasks.information_extraction, + module_name=TaskModels.information_extraction) +class InformationExtractionModel(SingleBackboneTaskModelBase): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the information extraction model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + backbone_cfg = self.cfg.backbone + head_cfg = self.cfg.head + self.build_backbone(backbone_cfg) + self.build_head(head_cfg) + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + outputs = super().forward(input) + sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + outputs = self.head.forward(sequence_output, input['text'], + input['offsets']) + return {OutputKeys.SPO_LIST: outputs} + + def extract_backbone_outputs(self, outputs): + sequence_output = None + pooled_output = None + if hasattr(self.backbone, 'extract_sequence_outputs'): + sequence_output = self.backbone.extract_sequence_outputs(outputs) + return sequence_output, pooled_output diff --git a/modelscope/msdatasets/__init__.py b/modelscope/msdatasets/__init__.py index 8e0647bb..073f9396 100644 --- a/modelscope/msdatasets/__init__.py +++ b/modelscope/msdatasets/__init__.py @@ -1 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from . import cv from .ms_dataset import MsDataset diff --git a/modelscope/msdatasets/cv/__init__.py b/modelscope/msdatasets/cv/__init__.py new file mode 100644 index 00000000..fad91bcf --- /dev/null +++ b/modelscope/msdatasets/cv/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from . import (image_classification, image_semantic_segmentation, + object_detection) diff --git a/modelscope/msdatasets/cv/image_classification/__init__.py b/modelscope/msdatasets/cv/image_classification/__init__.py new file mode 100644 index 00000000..95e8d7a1 --- /dev/null +++ b/modelscope/msdatasets/cv/image_classification/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. 
and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .classification_dataset import ClsDataset + +else: + _import_structure = {'classification_dataset': ['ClsDataset']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/image_classification/classification_dataset.py b/modelscope/msdatasets/cv/image_classification/classification_dataset.py new file mode 100644 index 00000000..c7145f2b --- /dev/null +++ b/modelscope/msdatasets/cv/image_classification/classification_dataset.py @@ -0,0 +1,19 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.classification import ClsDataset as _ClsDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.image_classification, module_name=Datasets.ClsDataset) +class ClsDataset(_ClsDataset): + """EasyCV dataset for classification. + For more details, please refer to : + https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/classification/raw.py . + + Args: + data_source: Data source config to parse input data. + pipeline: Sequence of transform object or config dict to be composed. + """ diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py b/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py new file mode 100644 index 00000000..26121bdb --- /dev/null +++ b/modelscope/msdatasets/cv/image_semantic_segmentation/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .segmentation_dataset import SegDataset + +else: + _import_structure = {'segmentation_dataset': ['SegDataset']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py new file mode 100644 index 00000000..21114c11 --- /dev/null +++ b/modelscope/msdatasets/cv/image_semantic_segmentation/segmentation_dataset.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.segmentation import SegDataset as _SegDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.image_segmentation, module_name=Datasets.SegDataset) +class SegDataset(_SegDataset): + """EasyCV dataset for semantic segmentation. + For more details, please refer to : + https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/segmentation/raw.py . + + Args: + data_source: Data source config to parse input data. + pipeline: Sequence of transform object or config dict to be composed. + ignore_index (int): Label index to be ignored. + profiling: If set True, will print transform time. 
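+ + A hypothetical build call (the config keys and values are placeholders; real data_source and pipeline configs follow EasyCV conventions, and the exact build_task_dataset signature is an assumption): + >>> from modelscope.msdatasets.task_datasets.builder import build_task_dataset + >>> cfg = dict(type=Datasets.SegDataset, data_source=dict(...), pipeline=[]) + >>> ds = build_task_dataset(cfg, task_name=Tasks.image_segmentation)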
+ """ diff --git a/modelscope/msdatasets/cv/object_detection/__init__.py b/modelscope/msdatasets/cv/object_detection/__init__.py new file mode 100644 index 00000000..30af2d9b --- /dev/null +++ b/modelscope/msdatasets/cv/object_detection/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .easycv_detection import DetDataset, DetImagesMixDataset + +else: + _import_structure = { + 'easycv_detection': ['DetDataset', 'DetImagesMixDataset'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/object_detection/detection_dataset.py b/modelscope/msdatasets/cv/object_detection/detection_dataset.py new file mode 100644 index 00000000..5b130a3e --- /dev/null +++ b/modelscope/msdatasets/cv/object_detection/detection_dataset.py @@ -0,0 +1,49 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.detection import DetDataset as _DetDataset +from easycv.datasets.detection import \ + DetImagesMixDataset as _DetImagesMixDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.task_datasets import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.image_object_detection, module_name=Datasets.DetDataset) +class DetDataset(_DetDataset): + """EasyCV dataset for object detection. + For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/raw.py . + + Args: + data_source: Data source config to parse input data. + pipeline: Transform config list + profiling: If set True, will print pipeline time + classes: A list of class names, used in evaluation for result and groundtruth visualization + """ + + +@TASK_DATASETS.register_module( + group_key=Tasks.image_object_detection, + module_name=Datasets.DetImagesMixDataset) +class DetImagesMixDataset(_DetImagesMixDataset): + """EasyCV dataset for object detection, a wrapper of multiple images mixed dataset. + Suitable for training on multiple images mixed data augmentation like + mosaic and mixup. For the augmentation pipeline of mixed image data, + the `get_indexes` method needs to be provided to obtain the image + indexes, and you can set `skip_flags` to change the pipeline running + process. At the same time, we provide the `dynamic_scale` parameter + to dynamically change the output image size. + output boxes format: cx, cy, w, h + + For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/mix.py . + + Args: + data_source (:obj:`DetSourceCoco`): Data source config to parse input data. + pipeline (Sequence[dict]): Sequence of transform object or + config dict to be composed. + dynamic_scale (tuple[int], optional): The image scale can be changed + dynamically. Default to None. + skip_type_keys (list[str], optional): Sequence of type string to + be skip pipeline. Default to None. 
+ label_padding: out labeling padding [N, 120, 5] + """ diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 6e4486dd..338c6333 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -1,9 +1,11 @@ +import math import os from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Union) import json import numpy as np +import torch from datasets import Dataset, DatasetDict from datasets import load_dataset as hf_load_dataset from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE @@ -12,9 +14,11 @@ from datasets.utils.download_manager import DownloadConfig from datasets.utils.file_utils import (is_relative_path, relative_to_absolute_path) -from modelscope.msdatasets.config import MS_DATASETS_CACHE +from modelscope.hub.repository import DatasetRepository from modelscope.utils.config import ConfigDict -from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, +from modelscope.utils.config_ds import MS_DATASETS_CACHE +from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, + DEFAULT_DATASET_REVISION, DatasetFormations, DownloadMode, Hubs) from modelscope.utils.logger import get_logger from .task_datasets.builder import build_task_dataset @@ -23,6 +27,7 @@ from .utils.dataset_utils import (get_dataset_files, get_target_dataset_structure, load_dataset_builder) from .utils.download_utils import DatasetDownloadManager +from .utils.upload_utils import DatasetUploadManager logger = get_logger() @@ -37,6 +42,46 @@ def format_list(para) -> List: return para +class MsIterableDataset(torch.utils.data.IterableDataset): + + def __init__(self, dataset: Iterable, preprocessor_list, retained_columns, + columns): + super(MsIterableDataset).__init__() + self.dataset = dataset + self.preprocessor_list = preprocessor_list + self.retained_columns = retained_columns + self.columns = columns + + def __len__(self): + return len(self.dataset) + + def __iter__(self): + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: # single-process data loading + iter_start = 0 + iter_end = len(self.dataset) + else: # in a worker process + per_worker = math.ceil( + len(self.dataset) / float(worker_info.num_workers)) + worker_id = worker_info.id + iter_start = worker_id * per_worker + iter_end = min(iter_start + per_worker, len(self.dataset)) + + for idx in range(iter_start, iter_end): + item_dict = self.dataset[idx] + res = { + k: np.array(item_dict[k]) + for k in self.columns if k in self.retained_columns + } + for preprocessor in self.preprocessor_list: + res.update({ + k: np.array(v) + for k, v in preprocessor(item_dict).items() + if k in self.retained_columns + }) + yield res + + class MsDataset: """ ModelScope Dataset (aka, MsDataset) is backed by a huggingface Dataset to @@ -97,7 +142,7 @@ class MsDataset: @staticmethod def load( dataset_name: Union[str, list], - namespace: Optional[str] = 'modelscope', + namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE, target: Optional[str] = None, version: Optional[str] = DEFAULT_DATASET_REVISION, hub: Optional[Hubs] = Hubs.modelscope, @@ -171,15 +216,17 @@ class MsDataset: Mapping[str, Union[str, Sequence[str]]]]] = None, download_mode: Optional[DownloadMode] = None, **config_kwargs) -> Union[dict, 'MsDataset']: + from modelscope.hub.api import HubApi + api = HubApi() + download_dataset = '' if isinstance(dataset_name, str): + download_dataset = dataset_name dataset_formation = DatasetFormations.native if dataset_name in 
_PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ (os.path.isfile(dataset_name) and dataset_name.endswith('.py')): dataset_formation = DatasetFormations.hf_compatible elif is_relative_path(dataset_name) and dataset_name.count( '/') == 0: - from modelscope.hub.api import HubApi - api = HubApi() dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts( dataset_name, namespace, download_mode, version) # dataset organized to be compatible with hf format @@ -219,6 +266,11 @@ class MsDataset: else: raise TypeError('path must be a str or a list, but got' f' {type(dataset_name)}') + + if download_dataset: + api.on_dataset_download( + dataset_name=download_dataset, namespace=namespace) + return MsDataset.from_hf_dataset(dataset, target=target) @staticmethod @@ -238,15 +290,15 @@ class MsDataset: break target_subset_name, target_dataset_structure = get_target_dataset_structure( dataset_json, subset_name, split) - meta_map, file_map = get_dataset_files(target_dataset_structure, - dataset_name, namespace, - version) + meta_map, file_map, args_map = get_dataset_files( + target_dataset_structure, dataset_name, namespace, version) builder = load_dataset_builder( dataset_name, subset_name, namespace, meta_data_files=meta_map, zip_data_files=file_map, + args_map=args_map, cache_dir=MS_DATASETS_CACHE, version=version, split=list(target_dataset_structure.keys()), @@ -308,45 +360,8 @@ class MsDataset: continue retained_columns.append(k) - import math - import torch - - class MsIterableDataset(torch.utils.data.IterableDataset): - - def __init__(self, dataset: Iterable): - super(MsIterableDataset).__init__() - self.dataset = dataset - - def __len__(self): - return len(self.dataset) - - def __iter__(self): - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: # single-process data loading - iter_start = 0 - iter_end = len(self.dataset) - else: # in a worker process - per_worker = math.ceil( - len(self.dataset) / float(worker_info.num_workers)) - worker_id = worker_info.id - iter_start = worker_id * per_worker - iter_end = min(iter_start + per_worker, len(self.dataset)) - - for idx in range(iter_start, iter_end): - item_dict = self.dataset[idx] - res = { - k: np.array(item_dict[k]) - for k in columns if k in retained_columns - } - for preprocessor in preprocessor_list: - res.update({ - k: np.array(v) - for k, v in preprocessor(item_dict).items() - if k in retained_columns - }) - yield res - - return MsIterableDataset(self._hf_ds) + return MsIterableDataset(self._hf_ds, preprocessor_list, + retained_columns, columns) def to_torch_dataset( self, @@ -539,3 +554,100 @@ class MsDataset: def to_hf_dataset(self) -> Dataset: self._hf_ds.reset_format() return self._hf_ds + + @staticmethod + def upload(object_name: str, + local_file_path: str, + dataset_name: str, + namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE, + version: Optional[str] = DEFAULT_DATASET_REVISION) -> None: + """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first. 
+ + Args: + object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip + local_file_path (str): Local file to upload + dataset_name (str): Name of the dataset + namespace(str, optional): Namespace of the dataset + version (str, optional): Version of the dataset + + Returns: + None + + """ + from modelscope.hub.api import HubApi + _hub_api = HubApi() + cookies = _hub_api.check_cookies_upload_data(use_cookies=True) + _upload_manager = DatasetUploadManager( + dataset_name=dataset_name, + namespace=namespace, + version=version, + cookies=cookies) + _upload_manager.upload(object_name, local_file_path) + + @staticmethod + def clone_meta(dataset_work_dir: str, + dataset_id: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + auth_token: Optional[str] = None, + git_path: Optional[str] = None) -> None: + """Clone meta-file of dataset from the ModelScope Hub. + Args: + dataset_work_dir (str): Current git working directory. + dataset_id (str): Dataset id, in the form your-namespace/your-dataset-name. + revision(`Optional[str]`): + revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash + auth_token(`Optional[str]`): + token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter + as the token is already saved when you login the first time; if None, the saved token is used. + git_path(`Optional[str]`): + The git command line path; if None, 'git' is used. + Returns: + None + """ + + _repo = DatasetRepository( + repo_work_dir=dataset_work_dir, + dataset_id=dataset_id, + revision=revision, + auth_token=auth_token, + git_path=git_path) + clone_work_dir = _repo.clone() + if clone_work_dir: + logger.info('Already cloned repo to: {}'.format(clone_work_dir)) + else: + logger.warning('The repo working dir already exists.') + + @staticmethod + def upload_meta(dataset_work_dir: str, + dataset_id: str, + commit_message: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + auth_token: Optional[str] = None, + git_path: Optional[str] = None, + force: bool = False) -> None: + """Upload meta-file of dataset to the ModelScope Hub. Please clone the meta-data from the ModelScope Hub first. + + Args: + dataset_work_dir (str): Current working directory. + dataset_id (str): Dataset id, in the form your-namespace/your-dataset-name. + commit_message (str): Commit message. + revision(`Optional[str]`): + revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash + auth_token(`Optional[str]`): + token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter + as the token is already saved when you login the first time; if None, the saved token is used. + git_path(`Optional[str]`): + The git command line path; if None, 'git' is used. + force (Optional[bool]): whether to use forced-push. 
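+ + Example (a hypothetical flow; the working directory and dataset id are placeholders): + >>> MsDataset.clone_meta('./ds_meta', 'your-namespace/your-dataset') + >>> # ... edit the meta files under ./ds_meta ... + >>> MsDataset.upload_meta('./ds_meta', 'your-namespace/your-dataset', + ... commit_message='update dataset meta')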
+ + Returns: + None + + """ + _repo = DatasetRepository( + repo_work_dir=dataset_work_dir, + dataset_id=dataset_id, + revision=revision, + auth_token=auth_token, + git_path=git_path) + _repo.push(commit_message=commit_message, branch=revision, force=force) diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index c80f8cd5..f97ff8b2 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -9,6 +9,8 @@ if TYPE_CHECKING: from .torch_base_dataset import TorchTaskDataset from .veco_dataset import VecoDataset from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset + from .movie_scene_segmentation import MovieSceneSegmentationDataset + from .video_summarization_dataset import VideoSummarizationDataset else: _import_structure = { @@ -17,7 +19,9 @@ else: 'torch_base_dataset': ['TorchTaskDataset'], 'veco_dataset': ['VecoDataset'], 'image_instance_segmentation_coco_dataset': - ['ImageInstanceSegmentationCocoDataset'] + ['ImageInstanceSegmentationCocoDataset'], + 'video_summarization_dataset': ['VideoSummarizationDataset'], + 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py index 04c8e142..10cf7bfb 100644 --- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py +++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py @@ -59,18 +59,21 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): preprocessor=None, classes=None, seg_prefix=None, + folder_name=None, + ann_file=None, + img_prefix=None, test_mode=False, filter_empty_gt=True, **kwargs): - self.data_root = next(iter(split_config.values())) + data_root = next(iter(split_config.values())) + self.data_root = osp.join(data_root, + folder_name) if folder_name else data_root self.split = next(iter(split_config.keys())) self.preprocessor = preprocessor - self.ann_file = osp.join(self.data_root, - DATASET_STRUCTURE[self.split]['annotation']) + self.ann_file = osp.join(self.data_root, ann_file) - self.img_prefix = osp.join(self.data_root, - DATASET_STRUCTURE[self.split]['images']) + self.img_prefix = osp.join(self.data_root, img_prefix) self.seg_prefix = seg_prefix self.test_mode = test_mode self.filter_empty_gt = filter_empty_gt diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py new file mode 100644 index 00000000..e56039ac --- /dev/null +++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py @@ -0,0 +1 @@ +from .movie_scene_segmentation_dataset import MovieSceneSegmentationDataset diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py new file mode 100644 index 00000000..925d6281 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py @@ -0,0 +1,173 @@ +# --------------------------------------------------------------------------------------------------- +# The implementation is built upon BaSSL, publicly available at https://github.com/kakaobrain/bassl +# 
--------------------------------------------------------------------------------------------------- +import copy +import os +import os.path as osp +import random + +import json +import torch +from torchvision.datasets.folder import pil_loader + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks +from . import sampler + +DATASET_STRUCTURE = { + 'train': { + 'annotation': 'anno/train.json', + 'images': 'keyf_240p', + 'feat': 'feat' + }, + 'test': { + 'annotation': 'anno/test.json', + 'images': 'keyf_240p', + 'feat': 'feat' + } +} + + +@TASK_DATASETS.register_module( + Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert) +class MovieSceneSegmentationDataset(TorchTaskDataset): + """dataset for movie scene segmentation. + + Args: + split_config (dict): Annotation file path. {"train":"xxxxx"} + data_root (str, optional): Data root for ``ann_file``, + ``img_prefix``, ``seg_prefix``, ``proposal_file`` if specified. + test_mode (bool, optional): If set True, annotation will not be loaded. + """ + + def __init__(self, **kwargs): + split_config = kwargs['split_config'] + + self.data_root = next(iter(split_config.values())) + if not osp.exists(self.data_root): + self.data_root = osp.dirname(self.data_root) + assert osp.exists(self.data_root) + + self.split = next(iter(split_config.keys())) + self.preprocessor = kwargs['preprocessor'] + + self.ann_file = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['annotation']) + self.img_prefix = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['images']) + self.feat_prefix = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['feat']) + + self.test_mode = kwargs['test_mode'] + if self.test_mode: + self.preprocessor.eval() + else: + self.preprocessor.train() + + self.cfg = kwargs.pop('cfg', None) + + self.num_keyframe = self.cfg.num_keyframe if self.cfg is not None else 3 + self.use_single_keyframe = self.cfg.use_single_keyframe if self.cfg is not None else False + + self.load_data() + self.init_sampler(self.cfg) + + def __len__(self): + """Total number of samples of data.""" + return len(self.anno_data) + + def __getitem__(self, idx: int): + data = self.anno_data[ + idx] # {"video_id", "shot_id", "num_shot", "boundary_label"} + vid, sid = data['video_id'], data['shot_id'] + num_shot = data['num_shot'] + + shot_idx = self.shot_sampler(int(sid), num_shot) + + video = self.load_shot_list(vid, shot_idx) + if self.preprocessor is None: + video = torch.stack(video, dim=0) + video = video.view(-1, self.num_keyframe, 3, 224, 224) + else: + video = self.preprocessor(video) + + payload = { + 'idx': idx, + 'vid': vid, + 'sid': sid, + 'video': video, + 'label': abs(data['boundary_label']), # ignore -1 label. 
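+ # 'video' is a (num_shots, num_keyframe, 3, 224, 224) tensor when no + # preprocessor is configured; otherwise it is whatever the preprocessor + # returns for the sampled keyframe list.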
+ } + return payload + + def load_data(self): + self.tmpl = '{}/shot_{}_img_{}.jpg' # video_id, shot_id, shot_num + + if not self.test_mode: + with open(self.ann_file) as f: + self.anno_data = json.load(f) + self.vidsid2label = { + f"{it['video_id']}_{it['shot_id']}": it['boundary_label'] + for it in self.anno_data + } + else: + with open(self.ann_file) as f: + self.anno_data = json.load(f) + + def init_sampler(self, cfg): + # shot sampler + if cfg is not None: + self.sampling_method = cfg.sampling_method.name + sampler_args = copy.deepcopy( + cfg.sampling_method.params.get(self.sampling_method, {})) + if self.sampling_method == 'instance': + self.shot_sampler = sampler.InstanceShotSampler() + elif self.sampling_method == 'temporal': + self.shot_sampler = sampler.TemporalShotSampler(**sampler_args) + elif self.sampling_method == 'shotcol': + self.shot_sampler = sampler.SequenceShotSampler(**sampler_args) + elif self.sampling_method == 'bassl': + self.shot_sampler = sampler.SequenceShotSampler(**sampler_args) + elif self.sampling_method == 'bassl+shotcol': + self.shot_sampler = sampler.SequenceShotSampler(**sampler_args) + elif self.sampling_method == 'sbd': + self.shot_sampler = sampler.NeighborShotSampler(**sampler_args) + else: + raise NotImplementedError + else: + self.shot_sampler = sampler.NeighborShotSampler() + + def load_shot_list(self, vid, shot_idx): + shot_list = [] + cache = {} + for sidx in shot_idx: + vidsid = f'{vid}_{sidx:04d}' + if vidsid in cache: + shot = cache[vidsid] + else: + shot_path = os.path.join( + self.img_prefix, self.tmpl.format(vid, f'{sidx:04d}', + '{}')) + shot = self.load_shot_keyframes(shot_path) + cache[vidsid] = shot + shot_list.extend(shot) + return shot_list + + def load_shot_keyframes(self, path): + shot = None + if not self.test_mode and self.use_single_keyframe: + # load one randomly sampled keyframe + shot = [ + pil_loader( + path.format(random.randint(0, self.num_keyframe - 1))) + ] + else: + # load all keyframes + shot = [ + pil_loader(path.format(i)) for i in range(self.num_keyframe) + ] + assert shot is not None + return shot diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py new file mode 100644 index 00000000..0fc2fe0f --- /dev/null +++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/sampler.py @@ -0,0 +1,102 @@ +# ------------------------------------------------------------------------------------ +# BaSSL +# Copyright (c) 2021 KakaoBrain. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# Github: https://github.com/kakaobrain/bassl +# ------------------------------------------------------------------------------------ + +import random + +import numpy as np + + +class InstanceShotSampler: + """ This is for instance at pre-training stage """ + + def __call__(self, center_sid: int, *args, **kwargs): + return center_sid + + +class TemporalShotSampler: + """ This is for temporal at pre-training stage """ + + def __init__(self, neighbor_size: int): + self.N = neighbor_size + + def __call__(self, center_sid: int, total_num_shot: int): + """ we randomly sample one shot from neighbor shots within local temporal window + """ + shot_idx = center_sid + np.arange( + -self.N, self.N + 1 + ) # total number of neighbor shots = 2N+1 (query (1) + neighbors (2*N)) + shot_idx = np.clip(shot_idx, 0, + total_num_shot) # deal with out-of-boundary indices + shot_idx = random.choice( + np.unique(np.delete(shot_idx, np.where(shot_idx == center_sid)))) + return shot_idx + + +class SequenceShotSampler: + """ This is for bassl or shotcol at pre-training stage """ + + def __init__(self, neighbor_size: int, neighbor_interval: int): + self.interval = neighbor_interval + self.window_size = neighbor_size * self.interval # temporal coverage + + def __call__(self, + center_sid: int, + total_num_shot: int, + sparse_method: str = 'edge'): + """ + Args: + center_sid: index of center shot + total_num_shot: last index of shot for given video + sparse_stride: stride to sample sparse ones from dense sequence + for curriculum learning + """ + + dense_shot_idx = center_sid + np.arange( + -self.window_size, self.window_size + 1, + self.interval) # total number of shots = 2*neighbor_size+1 + + if dense_shot_idx[0] < 0: + # if center_sid is near left-side of video, we shift window rightward + # so that the leftmost index is 0 + dense_shot_idx -= dense_shot_idx[0] + elif dense_shot_idx[-1] > (total_num_shot - 1): + # if center_sid is near right-side of video, we shift window leftward + # so that the rightmost index is total_num_shot - 1 + dense_shot_idx -= dense_shot_idx[-1] - (total_num_shot - 1) + + # to deal with videos that have smaller number of shots than window size + dense_shot_idx = np.clip(dense_shot_idx, 0, total_num_shot) + + if sparse_method == 'edge': + # in this case, we use two edge shots as sparse sequence + sparse_stride = len(dense_shot_idx) - 1 + sparse_idx_to_dense = np.arange(0, len(dense_shot_idx), + sparse_stride) + elif sparse_method == 'edge+center': + # in this case, we use two edge shots + center shot as sparse sequence + sparse_idx_to_dense = np.array( + [0, len(dense_shot_idx) - 1, + len(dense_shot_idx) // 2]) + + shot_idx = [sparse_idx_to_dense, dense_shot_idx] + return shot_idx + + +class NeighborShotSampler: + """ This is for scene boundary detection (sbd), i.e., fine-tuning stage """ + + def __init__(self, neighbor_size: int = 8): + self.neighbor_size = neighbor_size + + def __call__(self, center_sid: int, total_num_shot: int): + # total number of shots = 2 * neighbor_size + 1 + shot_idx = center_sid + np.arange(-self.neighbor_size, + self.neighbor_size + 1) + shot_idx = np.clip(shot_idx, 0, + total_num_shot) # for out-of-boundary indices + + return shot_idx diff --git a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py new file mode 100644 index 00000000..89deb7ba --- /dev/null +++ 
b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py @@ -0,0 +1,69 @@ +import os + +import h5py +import json +import numpy as np +import torch + +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset + + +class VideoSummarizationDataset(TorchTaskDataset): + + def __init__(self, mode, opt, root_dir): + self.mode = mode + self.data_filename = os.path.join(root_dir, opt.dataset_file) + self.split_filename = os.path.join(root_dir, opt.split_file) + self.split_index = opt.split_index # it represents the current split (varies from 0 to 4) + hdf = h5py.File(self.data_filename, 'r') + self.list_frame_features, self.list_gtscores = [], [] + self.list_user_summary = [] + self.list_change_points = [] + self.list_n_frames = [] + self.list_positions = [] + + with open(self.split_filename) as f: + data = json.loads(f.read()) + for i, split in enumerate(data): + if i == self.split_index: + self.split = split + break + + for video_name in self.split[self.mode + '_keys']: + frame_features = torch.Tensor( + np.array(hdf[video_name + '/features'])) + gtscore = torch.Tensor(np.array(hdf[video_name + '/gtscore'])) + user_summary = np.array(hdf[f'{video_name}/user_summary']) + change_points = np.array(hdf[f'{video_name}/change_points']) + n_frames = np.array(hdf[f'{video_name}/n_frames']) + positions = np.array(hdf[f'{video_name}/picks']) + + self.list_frame_features.append(frame_features) + self.list_gtscores.append(gtscore) + self.list_user_summary.append(user_summary) + self.list_change_points.append(change_points) + self.list_n_frames.append(n_frames) + self.list_positions.append(positions) + + hdf.close() + + def __len__(self): + self.len = len(self.split[self.mode + '_keys']) + return self.len + + def __getitem__(self, index): + frame_features = self.list_frame_features[index] + gtscore = self.list_gtscores[index] + user_summary = self.list_user_summary[index] + change_points = self.list_change_points[index] + n_frames = self.list_n_frames[index] + positions = self.list_positions[index] + + return dict( + frame_features=frame_features, + gtscore=gtscore, + user_summary=user_summary, + change_points=change_points, + n_frames=n_frames, + positions=positions) diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py index 85489c58..825400c4 100644 --- a/modelscope/msdatasets/utils/dataset_builder.py +++ b/modelscope/msdatasets/utils/dataset_builder.py @@ -5,10 +5,11 @@ import datasets import pandas as pd import pyarrow as pa from datasets.info import DatasetInfo +from datasets.naming import camelcase_to_snakecase from datasets.packaged_modules import csv from datasets.utils.filelock import FileLock -from modelscope.utils.constant import DownloadMode +from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode from modelscope.utils.logger import get_logger logger = get_logger() @@ -27,7 +28,6 @@ class MsCsvDatasetBuilder(csv.Csv): zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, **config_kwargs, ): - self.namespace = namespace super().__init__( cache_dir=cache_dir, name=subset_name, @@ -35,9 +35,9 @@ class MsCsvDatasetBuilder(csv.Csv): data_files=meta_data_files, **config_kwargs) - self.name = dataset_name - self.info.builder_name = self.name - self._cache_dir = self._build_cache_dir() + self.name = camelcase_to_snakecase(dataset_name) + self.info.builder_name = dataset_name + self._cache_dir = self._build_cache_dir(namespace=namespace) lock_path = os.path.join( 
self._cache_dir_root, self._cache_dir.replace(os.sep, '_') + '.lock') @@ -48,7 +48,6 @@ class MsCsvDatasetBuilder(csv.Csv): logger.info( f'Overwrite dataset info from restored data version, cache_dir is {self._cache_dir}' ) - self.info = DatasetInfo.from_directory(self._cache_dir) # dir exists but no data, remove the empty dir as data aren't available anymore else: logger.warning( @@ -57,14 +56,17 @@ class MsCsvDatasetBuilder(csv.Csv): os.rmdir(self._cache_dir) self.zip_data_files = zip_data_files - def _relative_data_dir(self, with_version=True, with_hash=True) -> str: + def _relative_data_dir(self, + with_version=True, + with_hash=True, + namespace=DEFAULT_DATASET_NAMESPACE) -> str: """Relative path of this dataset in cache_dir: Will be: self.name/self.config.version/self.hash/ or if a namespace has been specified: self.namespace___self.name/self.config.version/self.hash/ """ - builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}' + builder_data_dir = self.info.builder_name if namespace is None else f'{namespace}___{self.info.builder_name}' builder_config = self.config hash = self.hash if builder_config: @@ -76,10 +78,11 @@ class MsCsvDatasetBuilder(csv.Csv): builder_data_dir = os.path.join(builder_data_dir, hash) return builder_data_dir - def _build_cache_dir(self): + def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE): builder_data_dir = os.path.join( self._cache_dir_root, - self._relative_data_dir(with_version=False, with_hash=True)) + self._relative_data_dir( + with_version=False, with_hash=True, namespace=namespace)) return builder_data_dir @@ -97,15 +100,8 @@ class MsCsvDatasetBuilder(csv.Csv): datasets.SplitGenerator( name=split_name, gen_kwargs={ - 'files': - dl_manager.iter_files(files), - 'base_dir': - os.path.join( - zip_data_files.get(split_name), - os.path.splitext( - self.zip_data_files.get(split_name))[0]) - if self.zip_data_files.get(split_name) else - zip_data_files.get(split_name) + 'files': dl_manager.iter_files(files), + 'base_dir': zip_data_files.get(split_name) })) return splits @@ -161,6 +157,7 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): self.zip_data_files = zip_data_files self.split_path_dict = None self.config = None + self.info = DatasetInfo.from_dict({'builder_name': dataset_name}) self._cache_dir_root = os.path.expanduser(cache_dir) self._cache_dir = self._build_cache_dir() self._config_kwargs = config_kwargs @@ -181,12 +178,8 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): self._download_and_prepare(dl_manager=dl_manager) def _download_and_prepare(self, dl_manager): - split_path_dict = dl_manager.download_and_extract(self.zip_data_files) - self.split_path_dict = { - k: os.path.join(v, - os.path.splitext(self.zip_data_files[k])[0]) - for k, v in split_path_dict.items() - } + self.split_path_dict = dl_manager.download_and_extract( + self.zip_data_files) def as_dataset(self): return ExternalDataset(self.split_path_dict, self._config_kwargs) diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index 09556d84..769bed93 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -1,6 +1,6 @@ import os from collections import defaultdict -from typing import Mapping, Optional, Sequence, Union +from typing import Any, Mapping, Optional, Sequence, Union from datasets.builder import DatasetBuilder @@ -11,6 +11,14 @@ from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder 
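+# A toy illustration (hypothetical splits, for documentation only) of
+# format_dataset_structure defined below: it keeps only the splits that
+# declare a 'meta' or 'file' entry, e.g.
+#   format_dataset_structure({
+#       'train': {'meta': 'train.csv'},
+#       'test': {'file': 'test.zip'},
+#       'broken': {},
+#   }) -> {'train': {'meta': 'train.csv'}, 'test': {'file': 'test.zip'}}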
logger = get_logger() +def format_dataset_structure(dataset_structure): + return { + k: v + for k, v in dataset_structure.items() + if (v.get('meta') or v.get('file')) + } + + def get_target_dataset_structure(dataset_structure: dict, subset_name: Optional[str] = None, split: Optional[str] = None): @@ -56,7 +64,8 @@ def get_target_dataset_structure(dataset_structure: dict, f'No subset_name specified, defaulting to the {target_subset_name}' ) # verify dataset split - target_dataset_structure = dataset_structure[target_subset_name] + target_dataset_structure = format_dataset_structure( + dataset_structure[target_subset_name]) if split and split not in target_dataset_structure: raise ValueError( f'split {split} not found. Available: {target_dataset_structure.keys()}' @@ -83,6 +92,7 @@ def get_dataset_files(subset_split_into: dict, """ meta_map = defaultdict(dict) file_map = defaultdict(dict) + args_map = defaultdict(dict) from modelscope.hub.api import HubApi modelscope_api = HubApi() for split, info in subset_split_into.items(): @@ -90,7 +100,8 @@ def get_dataset_files(subset_split_into: dict, info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] - return meta_map, file_map + args_map[split] = info.get('args') + return meta_map, file_map, args_map def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, @@ -98,12 +109,16 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, Sequence[str]]], zip_data_files: Mapping[str, Union[str, Sequence[str]]], - cache_dir: str, version: Optional[Union[str]], - split: Sequence[str], + args_map: Mapping[str, Any], cache_dir: str, + version: Optional[Union[str]], split: Sequence[str], **config_kwargs) -> DatasetBuilder: sub_dir = os.path.join(version, '_'.join(split)) meta_data_file = next(iter(meta_data_files.values())) if not meta_data_file: + args_map = next(iter(args_map.values())) + if args_map is None: + args_map = {} + args_map.update(config_kwargs) builder_instance = TaskSpecificDatasetBuilder( dataset_name=dataset_name, namespace=namespace, @@ -112,7 +127,7 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, meta_data_files=meta_data_files, zip_data_files=zip_data_files, hash=sub_dir, - **config_kwargs) + **args_map) elif meta_data_file.endswith('.csv'): builder_instance = MsCsvDatasetBuilder( dataset_name=dataset_name, diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py index bc637f0e..eb1c99ef 100644 --- a/modelscope/msdatasets/utils/download_utils.py +++ b/modelscope/msdatasets/utils/download_utils.py @@ -34,8 +34,8 @@ class DatasetDownloadManager(DownloadManager): url_or_filename = str(url_or_filename) if is_relative_path(url_or_filename): # fetch oss files - return self.oss_utilities.download(url_or_filename, - self.download_config.cache_dir) + return self.oss_utilities.download( + url_or_filename, download_config=download_config) else: return cached_path( url_or_filename, download_config=download_config) diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py index 83cfc7dd..63a1cf77 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -1,6 +1,5 @@ from __future__ import print_function import os -import sys import oss2 from datasets.utils.file_utils import hash_url_to_filename @@ -19,7 +18,14 @@ class OssUtilities: self.oss_dir = oss_config['Dir'] self.oss_backup_dir = 
oss_config['BackupDir'] - def download(self, oss_file_name, cache_dir): + @staticmethod + def _percentage(consumed_bytes, total_bytes): + if total_bytes: + rate = int(100 * (float(consumed_bytes) / float(total_bytes))) + print('\r{0}% '.format(rate), end='', flush=True) + + def download(self, oss_file_name, download_config): + cache_dir = download_config.cache_dir candidate_key = os.path.join(self.oss_dir, oss_file_name) candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name) file_oss_key = candidate_key if self.bucket.object_exists( @@ -27,11 +33,30 @@ class OssUtilities: filename = hash_url_to_filename(file_oss_key, etag=None) local_path = os.path.join(cache_dir, filename) - def percentage(consumed_bytes, total_bytes): - if total_bytes: - rate = int(100 * (float(consumed_bytes) / float(total_bytes))) - print('\r{0}% '.format(rate), end='', flush=True) - - self.bucket.get_object_to_file( - file_oss_key, local_path, progress_callback=percentage) + if download_config.force_download or not os.path.exists(local_path): + oss2.resumable_download( + self.bucket, + file_oss_key, + local_path, + multiget_threshold=0, + progress_callback=self._percentage) return local_path + + def upload(self, oss_file_name: str, local_file_path: str) -> str: + max_retries = 3 + retry_count = 0 + object_key = os.path.join(self.oss_dir, oss_file_name) + + while True: + try: + retry_count += 1 + self.bucket.put_object_from_file( + object_key, + local_file_path, + progress_callback=self._percentage) + break + except Exception: + if retry_count >= max_retries: + raise + + return object_key diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py new file mode 100644 index 00000000..eff3aca0 --- /dev/null +++ b/modelscope/msdatasets/utils/upload_utils.py @@ -0,0 +1,23 @@ +from http.cookiejar import CookieJar + +from .oss_utils import OssUtilities + + +class DatasetUploadManager(object): + + def __init__(self, dataset_name: str, namespace: str, version: str, + cookies: CookieJar): + from modelscope.hub.api import HubApi + api = HubApi() + oss_config = api.get_dataset_access_config_session( + cookies=cookies, + dataset_name=dataset_name, + namespace=namespace, + revision=version) + + self.oss_utilities = OssUtilities(oss_config) + + def upload(self, oss_file_name: str, local_file_path: str) -> str: + oss_object_key = self.oss_utilities.upload( + oss_file_name=oss_file_name, local_file_path=local_file_path) + return oss_object_key diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 200a03cd..aebb9138 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -7,6 +7,7 @@ class OutputKeys(object): LOSS = 'loss' LOGITS = 'logits' SCORES = 'scores' + SCORE = 'score' LABEL = 'label' LABELS = 'labels' INPUT_IDS = 'input_ids' @@ -34,6 +35,8 @@ class OutputKeys(object): UUID = 'uuid' WORD = 'word' KWS_LIST = 'kws_list' + SPLIT_VIDEO_NUM = 'split_video_num' + SPLIT_META_DICT = 'split_meta_dict' TASK_OUTPUTS = { @@ -188,6 +191,16 @@ TASK_OUTPUTS = { Tasks.body_2d_keypoints: [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES], + # 3D human body keypoints detection result for single sample + # { + # "poses": [ + # [[x, y, z]*17], + # [[x, y, z]*17], + # [[x, y, z]*17] + # ] + # } + Tasks.body_3d_keypoints: [OutputKeys.POSES], + # video single object tracking result for single video # { # "boxes": [ @@ -230,6 +243,22 @@ # } Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG], + # movie scene segmentation result for a single video + { + 
# "split_video_num":3, + # "split_meta_dict": + # { + # scene_id: + # { + # "shot": [0,1,2], + # "frame": [start_frame, end_frame] + # } + # } + # + # } + Tasks.movie_scene_segmentation: + [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_DICT], + # ============ nlp tasks =================== # text classification result for single sample @@ -273,8 +302,7 @@ TASK_OUTPUTS = { # "text": "《父老乡亲》是由是由由中国人民解放军海政文工团创作的军旅歌曲,石顺义作词,王锡仁作曲,范琳琳演唱", # "spo_list": [{"subject": "石顺义", "predicate": "国籍", "object": "中国"}] # } - Tasks.relation_extraction: - [OutputKeys.UUID, OutputKeys.TEXT, OutputKeys.SPO_LIST], + Tasks.relation_extraction: [OutputKeys.SPO_LIST], # translation result for a source sentence # { @@ -488,6 +516,15 @@ TASK_OUTPUTS = { Tasks.generative_multi_modal_embedding: [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION], + # multi-modal similarity result for single sample + # { + # "img_embedding": np.array with shape [1, D], + # "text_embedding": np.array with shape [1, D], + # "similarity": float + # } + Tasks.multi_modal_similarity: + [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES], + # VQA result for a sample # {"text": "this is a text answser. "} Tasks.visual_question_answering: [OutputKeys.TEXT], @@ -503,4 +540,20 @@ TASK_OUTPUTS = { # "labels": ["entailment", "contradiction", "neutral"] # } Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS], + + # { + # 'output': [ + # [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509}, + # {'label': '13421097', 'score': 2.2825044965202324e-08}], + # [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402}, + # {'label': '13421097', 'score': 2.75914817393641e-06}], + # [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402}, + # {'label': '13421097', 'score': 2.75914817393641e-06}]] + # } + Tasks.faq_question_answering: [OutputKeys.OUTPUT], + # image person reid result for single sample + # { + # "img_embedding": np.array with shape [1, D], + # } + Tasks.image_reid_person: [OutputKeys.IMG_EMBEDDING], } diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py index e9cb8db3..5ed4d769 100644 --- a/modelscope/pipelines/audio/ans_pipeline.py +++ b/modelscope/pipelines/audio/ans_pipeline.py @@ -10,20 +10,8 @@ from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.audio.audio_utils import audio_norm from modelscope.utils.constant import Tasks -from modelscope.utils.torch_utils import create_device - - -def audio_norm(x): - rms = (x**2).mean()**0.5 - scalar = 10**(-25 / 20) / rms - x = x * scalar - pow_x = x**2 - avg_pow_x = pow_x.mean() - rmsx = pow_x[pow_x > avg_pow_x].mean()**0.5 - scalarx = 10**(-25 / 20) / rmsx - x = x * scalarx - return x @PIPELINES.register_module( @@ -99,7 +87,8 @@ class ANSPipeline(Pipeline): current_idx = 0 while current_idx + window <= t: print('current_idx: {}'.format(current_idx)) - tmp_input = ndarray[:, current_idx:current_idx + window] + tmp_input = dict(noisy=ndarray[:, current_idx:current_idx + + window]) tmp_output = self.model( tmp_input, )['wav_l2'][0].cpu().numpy() end_index = current_idx + window - give_up_length @@ -112,7 +101,8 @@ class ANSPipeline(Pipeline): give_up_length:-give_up_length] current_idx += 
stride else: - outputs = self.model(ndarray)['wav_l2'][0].cpu().numpy() + outputs = self.model( + dict(noisy=ndarray))['wav_l2'][0].cpu().numpy() outputs = (outputs[:nsamples] * 32768).astype(np.int16).tobytes() return {OutputKeys.OUTPUT_PCM: outputs} diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 041dfb34..c0f3cbd0 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -14,9 +14,10 @@ from modelscope.outputs import TASK_OUTPUTS from modelscope.preprocessors import Preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import Frameworks, ModelFile +from modelscope.utils.device import (create_device, device_placement, + verify_device) from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger -from modelscope.utils.torch_utils import create_device from .util import is_model, is_official_hub_path if is_torch_available(): @@ -41,7 +42,8 @@ class Pipeline(ABC): logger.info(f'initiate model from location {model}.') # expecting model has been prefetched to local cache beforehand return Model.from_pretrained( - model, model_prefetched=True) if is_model(model) else model + model, model_prefetched=True, + device=self.device_name) if is_model(model) else model elif isinstance(model, Model): return model else: @@ -74,11 +76,15 @@ class Pipeline(ABC): config_file(str, optional): Filepath to configuration file. model: (list of) Model name or model object preprocessor: (list of) Preprocessor object - device (str): gpu device or cpu device to use + device (str): device string, should be either cpu, cuda, gpu, gpu:X or cuda:X auto_collate (bool): whether to automatically convert data to tensors. """ if config_file is not None: self.cfg = Config.from_file(config_file) + + verify_device(device) + self.device_name = device + if not isinstance(model, List): self.model = self.initiate_single_model(model) self.models = [self.model] @@ -94,15 +100,15 @@ class Pipeline(ABC): else: self.framework = None - assert device in ['gpu', 'cpu'], 'device should be either cpu or gpu.' - self.device_name = device if self.framework == Frameworks.torch: - self.device = create_device(self.device_name == 'cpu') + self.device = create_device(self.device_name) self._model_prepare = False self._model_prepare_lock = Lock() self._auto_collate = auto_collate def prepare_model(self): + """ Place the model on the target device before the first inference (for PyTorch models) + """ self._model_prepare_lock.acquire(timeout=600) def _prepare_single(model): @@ -125,39 +131,6 @@ class Pipeline(ABC): self._model_prepare = True self._model_prepare_lock.release() - @contextmanager - def place_device(self): - """ device placement function, allow user to specify which device to place pipeline - - Returns: - Context manager - - Examples: - - ```python - # Requests for using pipeline on cuda:0 for gpu - pipeline = pipeline(..., device='gpu') - with pipeline.device(): - output = pipe(...) 
- ``` - """ - if self.framework == Frameworks.tf: - if self.device_name == 'cpu': - with tf.device('/CPU:0'): - yield - else: - with tf.device('/device:GPU:0'): - yield - - elif self.framework == Frameworks.torch: - if self.device_name == 'gpu': - device = create_device() - if device.type == 'gpu': - torch.cuda.set_device(device) - yield - else: - yield - def _get_framework(self) -> str: frameworks = [] for m in self.models: @@ -267,15 +240,16 @@ class Pipeline(ABC): raise ValueError(f'Unsupported data type {type(data)}') def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]: - preprocess_params = kwargs.get('preprocess_params') - forward_params = kwargs.get('forward_params') - postprocess_params = kwargs.get('postprocess_params') + preprocess_params = kwargs.get('preprocess_params', {}) + forward_params = kwargs.get('forward_params', {}) + postprocess_params = kwargs.get('postprocess_params', {}) out = self.preprocess(input, **preprocess_params) - with self.place_device(): - if self.framework == Frameworks.torch and self._auto_collate: + with device_placement(self.framework, self.device_name): + if self.framework == Frameworks.torch: with torch.no_grad(): - out = self._collate_fn(out) + if self._auto_collate: + out = self._collate_fn(out) out = self.forward(out, **forward_params) else: out = self.forward(out, **forward_params) diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 4105e28b..8a1a3646 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -23,6 +23,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.named_entity_recognition: (Pipelines.named_entity_recognition, 'damo/nlp_raner_named-entity-recognition_chinese-base-news'), + Tasks.information_extraction: + (Pipelines.relation_extraction, + 'damo/nlp_bert_relation-extraction_chinese-base'), Tasks.sentence_similarity: (Pipelines.sentence_similarity, 'damo/nlp_structbert_sentence-similarity_chinese-base'), @@ -79,6 +82,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { (Pipelines.generative_multi_modal_embedding, 'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding' ), + Tasks.multi_modal_similarity: + (Pipelines.multi_modal_similarity, + 'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity'), Tasks.visual_question_answering: (Pipelines.visual_question_answering, 'damo/mplug_visual-question-answering_coco_large_en'), @@ -89,6 +95,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_diffusion_text-to-image-synthesis_tiny'), Tasks.body_2d_keypoints: (Pipelines.body_2d_keypoints, 'damo/cv_hrnetv2w32_body-2d-keypoints_image'), + Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints, + 'damo/cv_canonical_body-3d-keypoints_video'), Tasks.face_detection: (Pipelines.face_detection, 'damo/cv_resnet_facedetection_scrfd10gkps'), Tasks.face_recognition: (Pipelines.face_recognition, @@ -129,11 +137,19 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_convnextTiny_ocr-recognition-general_damo'), Tasks.skin_retouching: (Pipelines.skin_retouching, 'damo/cv_unet_skin-retouching'), + Tasks.faq_question_answering: + (Pipelines.faq_question_answering, + 'damo/nlp_structbert_faq-question-answering_chinese-base'), Tasks.crowd_counting: (Pipelines.crowd_counting, 'damo/cv_hrnet_crowd-counting_dcanet'), Tasks.video_single_object_tracking: (Pipelines.video_single_object_tracking, 'damo/cv_vitb_video-single-object-tracking_ostrack'), + Tasks.image_reid_person: (Pipelines.image_reid_person, + 'damo/cv_passvitb_image-reid-person_market'), + Tasks.movie_scene_segmentation: + 
(Pipelines.movie_scene_segmentation, + 'damo/cv_resnet50-bert_video-scene-segmentation_movienet') } @@ -216,7 +232,6 @@ def pipeline(task: str = None, f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}' model = normalize_model_input(model, model_revision) - if pipeline_name is None: # get default pipeline for this task if isinstance(model, str) \ diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index cee91c8e..01c69758 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -7,7 +7,9 @@ if TYPE_CHECKING: from .action_recognition_pipeline import ActionRecognitionPipeline from .animal_recognition_pipeline import AnimalRecognitionPipeline from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline + from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline + from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline from .crowd_counting_pipeline import CrowdCountingPipeline from .image_detection_pipeline import ImageDetectionPipeline from .image_salient_detection_pipeline import ImageSalientDetectionPipeline @@ -23,12 +25,16 @@ if TYPE_CHECKING: from .image_denoise_pipeline import ImageDenoisePipeline from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline from .image_matting_pipeline import ImageMattingPipeline + from .image_panoptic_segmentation_pipeline import ImagePanopticSegmentationPipeline from .image_portrait_enhancement_pipeline import ImagePortraitEnhancementPipeline + from .image_reid_person_pipeline import ImageReidPersonPipeline + from .image_semantic_segmentation_pipeline import ImageSemanticSegmentationPipeline from .image_style_transfer_pipeline import ImageStyleTransferPipeline from .image_super_resolution_pipeline import ImageSuperResolutionPipeline from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline + from .realtime_object_detection_pipeline import RealtimeObjectDetectionPipeline from .live_category_pipeline import LiveCategoryPipeline from .ocr_detection_pipeline import OCRDetectionPipeline from .ocr_recognition_pipeline import OCRRecognitionPipeline @@ -36,12 +42,17 @@ if TYPE_CHECKING: from .tinynas_classification_pipeline import TinynasClassificationPipeline from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline + from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline + from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline + else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], 'animal_recognition_pipeline': ['AnimalRecognitionPipeline'], 'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'], + 'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'], 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], + 'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'], 'crowd_counting_pipeline': ['CrowdCountingPipeline'], 'image_detection_pipeline': ['ImageDetectionPipeline'], 'image_salient_detection_pipeline': ['ImageSalientDetectionPipeline'], @@ -58,14 +69,21 @@ else: 'image_instance_segmentation_pipeline': ['ImageInstanceSegmentationPipeline'], 
'image_matting_pipeline': ['ImageMattingPipeline'], + 'image_panoptic_segmentation_pipeline': + ['ImagePanopticSegmentationPipeline'], 'image_portrait_enhancement_pipeline': ['ImagePortraitEnhancementPipeline'], + 'image_reid_person_pipeline': ['ImageReidPersonPipeline'], + 'image_semantic_segmentation_pipeline': + ['ImageSemanticSegmentationPipeline'], 'image_style_transfer_pipeline': ['ImageStyleTransferPipeline'], 'image_super_resolution_pipeline': ['ImageSuperResolutionPipeline'], 'image_to_image_translation_pipeline': ['Image2ImageTranslationPipeline'], 'product_retrieval_embedding_pipeline': ['ProductRetrievalEmbeddingPipeline'], + 'realtime_object_detection_pipeline': + ['RealtimeObjectDetectionPipeline'], 'live_category_pipeline': ['LiveCategoryPipeline'], 'image_to_image_generation_pipeline': ['Image2ImageGenerationPipeline'], @@ -75,6 +93,10 @@ else: 'tinynas_classification_pipeline': ['TinynasClassificationPipeline'], 'video_category_pipeline': ['VideoCategoryPipeline'], 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], + 'easycv_pipelines': + ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'], + 'movie_scene_segmentation_pipeline': + ['MovieSceneSegmentationPipeline'], } import sys diff --git a/modelscope/pipelines/cv/action_recognition_pipeline.py b/modelscope/pipelines/cv/action_recognition_pipeline.py index 087548f0..e3400ea7 100644 --- a/modelscope/pipelines/cv/action_recognition_pipeline.py +++ b/modelscope/pipelines/cv/action_recognition_pipeline.py @@ -33,6 +33,7 @@ class ActionRecognitionPipeline(Pipeline): config_path = osp.join(self.model, ModelFile.CONFIGURATION) logger.info(f'loading config from {config_path}') self.cfg = Config.from_file(config_path) + self.infer_model = BaseVideoModel(cfg=self.cfg).to(self.device) self.infer_model.eval() self.infer_model.load_state_dict( diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py new file mode 100644 index 00000000..e9e4e9e8 --- /dev/null +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -0,0 +1,213 @@ +import os +import os.path as osp +from typing import Any, Dict, List, Union + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.body_3d_keypoints.body_3d_pose import ( + BodyKeypointsDetection3D, KeypointsTypes) +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Input, Model, Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def convert_2_h36m(joints, joints_nbr=15): + lst_mappings = [[0, 8], [1, 7], [2, 12], [3, 13], [4, 14], [5, 9], [6, 10], + [7, 11], [8, 1], [9, 2], [10, 3], [11, 4], [12, 5], + [13, 6], [14, 0]] + nbr, dim = joints.shape + h36m_joints = np.zeros((nbr, dim)) + for mapping in lst_mappings: + h36m_joints[mapping[1]] = joints[mapping[0]] + + if joints_nbr == 17: + lst_mappings_17 = np.array([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], + [5, 5], [6, 6], [7, 8], [8, 10], [9, 11], + [10, 12], [11, 13], [12, 14], [13, 15], + [14, 16]]) + h36m_joints_17 = np.zeros((17, 2)) + h36m_joints_17[lst_mappings_17[:, 1]] = h36m_joints[lst_mappings_17[:, + 0]] + h36m_joints_17[7] = (h36m_joints_17[0] + h36m_joints_17[8]) * 0.5 + h36m_joints_17[9] = (h36m_joints_17[8] + h36m_joints_17[10]) * 0.5 + h36m_joints = h36m_joints_17 + + return h36m_joints + + 
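+# A minimal usage sketch (hypothetical input, added for illustration):
+# convert_2_h36m re-indexes the 15 detector keypoints into Human3.6M joint
+# order; with joints_nbr=17 the two H36M joints absent from the 15-point
+# input (indices 7 and 9) are interpolated from their neighboring joints.
+#
+#     joints = np.zeros((15, 2))                       # 2D keypoints, detector order
+#     h36m_15 = convert_2_h36m(joints)                 # (15, 2), H36M order
+#     h36m_17 = convert_2_h36m(joints, joints_nbr=17)  # (17, 2), incl. interpolated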
+def smooth_pts(cur_pts, pre_pts, bbox, smooth_x=15.0, smooth_y=15.0): + if pre_pts is None: + return cur_pts + + w, h = bbox[1] - bbox[0] + if w == 0 or h == 0: + return cur_pts + + size_pre = len(pre_pts) + size_cur = len(cur_pts) + if (size_pre == 0 or size_cur == 0): + return cur_pts + + factor_x = -(smooth_x / w) + factor_y = -(smooth_y / h) # use the box height for the vertical factor + + for i in range(size_cur): + w_x = np.exp(factor_x * np.abs(cur_pts[i][0] - pre_pts[i][0])) + w_y = np.exp(factor_y * np.abs(cur_pts[i][1] - pre_pts[i][1])) + cur_pts[i][0] = (1.0 - w_x) * cur_pts[i][0] + w_x * pre_pts[i][0] + cur_pts[i][1] = (1.0 - w_y) * cur_pts[i][1] + w_y * pre_pts[i][1] + return cur_pts + + +def smoothing(lst_kps, lst_bboxes, smooth_x=15.0, smooth_y=15.0): + assert lst_kps.shape[0] == lst_bboxes.shape[0] + + lst_smoothed_kps = [] + prev_pts = None + for i in range(lst_kps.shape[0]): + smoothed_cur_kps = smooth_pts(lst_kps[i], prev_pts, + lst_bboxes[i][0:-1].reshape(2, 2), + smooth_x, smooth_y) + lst_smoothed_kps.append(smoothed_cur_kps) + prev_pts = smoothed_cur_kps + + return np.array(lst_smoothed_kps) + + +def convert_2_h36m_data(lst_kps, lst_bboxes, joints_nbr=15): + lst_kps = lst_kps.squeeze() + lst_bboxes = lst_bboxes.squeeze() + + assert lst_kps.shape[0] == lst_bboxes.shape[0] + + lst_kps = smoothing(lst_kps, lst_bboxes) + + keypoints = [] + for i in range(lst_kps.shape[0]): + h36m_joints_2d = convert_2_h36m(lst_kps[i], joints_nbr=joints_nbr) + keypoints.append(h36m_joints_2d) + return keypoints + + +@PIPELINES.register_module( + Tasks.body_3d_keypoints, module_name=Pipelines.body_3d_keypoints) +class Body3DKeypointsPipeline(Pipeline): + + def __init__(self, model: Union[str, BodyKeypointsDetection3D], **kwargs): + """Human body 3D pose estimation. + + Args: + model (Union[str, BodyKeypointsDetection3D]): model id on modelscope hub or a BodyKeypointsDetection3D instance. 
+ """ + super().__init__(model=model, **kwargs) + + self.keypoint_model_3d = model if isinstance( + model, BodyKeypointsDetection3D) else Model.from_pretrained(model) + self.keypoint_model_3d.eval() + + # init human body 2D keypoints detection pipeline + self.human_body_2d_kps_det_pipeline = 'damo/cv_hrnetv2w32_body-2d-keypoints_image' + self.human_body_2d_kps_detector = pipeline( + Tasks.body_2d_keypoints, + model=self.human_body_2d_kps_det_pipeline, + device='gpu' if torch.cuda.is_available() else 'cpu') + + def preprocess(self, input: Input) -> Dict[str, Any]: + video_frames = self.read_video_frames(input) + if 0 == len(video_frames): + res = {'success': False, 'msg': 'get video frame failed.'} + return res + + all_2d_poses = [] + all_boxes_with_socre = [] + max_frame = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME # max video frame number to be predicted 3D joints + for i, frame in enumerate(video_frames): + kps_2d = self.human_body_2d_kps_detector(frame) + box = kps_2d['boxes'][ + 0] # box: [[[x1, y1], [x2, y2]]], N human boxes per frame, [0] represent using first detected bbox + pose = kps_2d['poses'][0] # keypoints: [15, 2] + score = kps_2d['scores'][0] # keypoints: [15, 2] + all_2d_poses.append(pose) + all_boxes_with_socre.append( + list(np.array(box).reshape( + (-1))) + [score]) # construct to list with shape [5] + if (i + 1) >= max_frame: + break + + all_2d_poses_np = np.array(all_2d_poses).reshape( + (len(all_2d_poses), 15, + 2)) # 15: 2d keypoints number, 2: keypoint coordinate (x, y) + all_boxes_np = np.array(all_boxes_with_socre).reshape( + (len(all_boxes_with_socre), 5)) # [x1, y1, x2, y2, score] + + kps_2d_h36m_17 = convert_2_h36m_data( + all_2d_poses_np, + all_boxes_np, + joints_nbr=self.keypoint_model_3d.cfg.model.MODEL.IN_NUM_JOINTS) + kps_2d_h36m_17 = np.array(kps_2d_h36m_17) + res = {'success': True, 'input_2d_pts': kps_2d_h36m_17} + return res + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + if not input['success']: + res = {'success': False, 'msg': 'preprocess failed.'} + return res + + input_2d_pts = input['input_2d_pts'] + outputs = self.keypoint_model_3d.preprocess(input_2d_pts) + outputs = self.keypoint_model_3d.forward(outputs) + res = dict({'success': True}, **outputs) + return res + + def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + res = {OutputKeys.POSES: []} + + if not input['success']: + pass + else: + poses = input[KeypointsTypes.POSES_CAMERA] + res = {OutputKeys.POSES: poses.data.cpu().numpy()} + return res + + def read_video_frames(self, video_url: Union[str, cv2.VideoCapture]): + """Read video from local video file or from a video stream URL. + + Args: + video_url (str or cv2.VideoCapture): Video path or video stream. + + Raises: + Exception: Open video fail. + + Returns: + [nd.array]: List of video frames. + """ + frames = [] + if isinstance(video_url, str): + cap = cv2.VideoCapture(video_url) + if not cap.isOpened(): + raise Exception( + 'modelscope error: %s cannot be decoded by OpenCV.' 
% + (video_url)) + else: + cap = video_url + + max_frame_num = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME + frame_idx = 0 + while True: + ret, frame = cap.read() + if not ret: + break + frame_idx += 1 + frames.append(frame) + if frame_idx >= max_frame_num: + break + cap.release() + return frames diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py new file mode 100644 index 00000000..0984ff43 --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .detection_pipeline import EasyCVDetectionPipeline + from .segmentation_pipeline import EasyCVSegmentationPipeline +else: + _import_structure = { + 'detection_pipeline': ['EasyCVDetectionPipeline'], + 'segmentation_pipeline': ['EasyCVSegmentationPipeline'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py new file mode 100644 index 00000000..d6495f0a --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/base.py @@ -0,0 +1,95 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import os.path as osp +from typing import Any + +from easycv.utils.ms_utils import EasyCVMeta + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.pipelines.util import is_official_hub_path +from modelscope.utils.config import Config +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile + + +class EasyCVPipeline(object): + """Base pipeline for EasyCV. + Loads a configuration file in ModelScope style by default, + but actually uses the EasyCV predictor API for prediction, + so some adaptation work is done here for the configuration and predict APIs. + """ + + def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. + + """ + self.model_file_pattern = model_file_pattern + + assert isinstance(model, str) + if osp.exists(model): + model_dir = model + else: + assert is_official_hub_path( + model), 'Only local model paths and official hub paths are supported!' + model_dir = snapshot_download( + model_id=model, revision=DEFAULT_MODEL_REVISION) + + assert osp.isdir(model_dir) + model_files = glob.glob( + os.path.join(model_dir, self.model_file_pattern)) + assert len( + model_files + ) == 1, f'Expected exactly one model file, but found {len(model_files)}: {model_files}' + + model_path = model_files[0] + self.model_path = model_path + + # get configuration file from source model dir + self.config_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + assert os.path.exists( + self.config_file + ), f'Cannot find "{ModelFile.CONFIGURATION}" in the model directory!'
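+        # For illustration (hypothetical file names): with the default
+        # model_file_pattern='*.pt', the resolved model_dir is expected to
+        # hold exactly one matching weight file next to the configuration:
+        #   model_dir/
+        #     weights.pt          <- single file matched by model_file_pattern
+        #     configuration.json  <- ModelFile.CONFIGURATION, parsed below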
+ + self.cfg = Config.from_file(self.config_file) + self.predict_op = self._build_predict_op() + + def _build_predict_op(self): + """Build EasyCV predictor.""" + from easycv.predictors.builder import build_predictor + + easycv_config = self._to_easycv_config() + pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, { + 'model_path': self.model_path, + 'config_file': easycv_config + }) + return pipeline_op + + def _to_easycv_config(self): + """Adapt to EasyCV predictor.""" + # TODO: refine config compatibility problems + + easycv_arch = self.cfg.model.pop(EasyCVMeta.ARCH, None) + model_cfg = self.cfg.model + # Revert to the configuration of easycv + if easycv_arch is not None: + model_cfg.update(easycv_arch) + + easycv_config = Config(dict(model=model_cfg)) + + reserved_keys = [] + if hasattr(self.cfg, EasyCVMeta.META): + easycv_meta_cfg = getattr(self.cfg, EasyCVMeta.META) + reserved_keys = easycv_meta_cfg.get(EasyCVMeta.RESERVED_KEYS, []) + for key in reserved_keys: + easycv_config.merge_from_dict({key: getattr(self.cfg, key)}) + if 'test_pipeline' not in reserved_keys: + easycv_config.merge_from_dict( + {'test_pipeline': self.cfg.dataset.val.get('pipeline', [])}) + + return easycv_config + + def __call__(self, inputs) -> Any: + # TODO: support image url + return self.predict_op(inputs) diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py new file mode 100644 index 00000000..32365102 --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.metainfo import Pipelines +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from .base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.image_object_detection, module_name=Pipelines.easycv_detection) +class EasyCVDetectionPipeline(EasyCVPipeline): + """Pipeline for easycv detection task.""" + + def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. + """ + + super(EasyCVDetectionPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) diff --git a/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py new file mode 100644 index 00000000..2182e3b3 --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/segmentation_pipeline.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.metainfo import Pipelines +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from .base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.image_segmentation, module_name=Pipelines.easycv_segmentation) +class EasyCVSegmentationPipeline(EasyCVPipeline): + """Pipeline for easycv segmentation task.""" + + def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. 
+ """ + + super(EasyCVSegmentationPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) diff --git a/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py b/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py new file mode 100644 index 00000000..5e4cd4c6 --- /dev/null +++ b/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py @@ -0,0 +1,75 @@ +import math +import os.path as osp +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.action_recognition import BaseVideoModel +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import ReadVideoData +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_embedding, module_name=Pipelines.hicossl_video_embedding) +class HICOSSLVideoEmbeddingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a hicossl video embedding pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {model_path}') + config_path = osp.join(self.model, ModelFile.CONFIGURATION) + logger.info(f'loading config from {config_path}') + self.cfg = Config.from_file(config_path) + self.infer_model = BaseVideoModel(cfg=self.cfg).to(self.device) + self.infer_model.eval() + self.infer_model.load_state_dict( + torch.load(model_path, map_location=self.device)['model_state'], + strict=False) + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + video_input_data = ReadVideoData( + self.cfg, input, num_temporal_views_override=1).to(self.device) + else: + raise TypeError(f'input should be a str,' + f' but got {type(input)}') + result = {'video_data': video_input_data} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + feature = self.perform_inference(input['video_data']) + return {OutputKeys.VIDEO_EMBEDDING: feature.data.cpu().numpy()} + + @torch.no_grad() + def perform_inference(self, data, max_bsz=4): + """ Perform feature extracting for a given video + Args: + model (BaseVideoModel): video model with loadded state dict. + max_bsz (int): the maximum batch size, limited by GPU memory. + Returns: + pred (Tensor): the extracted features for input video clips. 
+ """ + iter_num = math.ceil(data.size(0) / max_bsz) + preds_list = [] + for i in range(iter_num): + preds_list.append( + self.infer_model(data[i * max_bsz:(i + 1) * max_bsz])[0]) + pred = torch.cat(preds_list, dim=0) + return pred + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py index 9c3c418e..eb669354 100644 --- a/modelscope/pipelines/cv/image_cartoon_pipeline.py +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -16,6 +16,7 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger +from ...utils.device import device_placement if tf.__version__ >= '2.0': tf = tf.compat.v1 @@ -36,11 +37,14 @@ class ImageCartoonPipeline(Pipeline): model: model id on modelscope hub. """ super().__init__(model=model, **kwargs) - self.facer = FaceAna(self.model) - self.sess_anime_head = self.load_sess( - os.path.join(self.model, 'cartoon_anime_h.pb'), 'model_anime_head') - self.sess_anime_bg = self.load_sess( - os.path.join(self.model, 'cartoon_anime_bg.pb'), 'model_anime_bg') + with device_placement(self.framework, self.device_name): + self.facer = FaceAna(self.model) + self.sess_anime_head = self.load_sess( + os.path.join(self.model, 'cartoon_anime_h.pb'), + 'model_anime_head') + self.sess_anime_bg = self.load_sess( + os.path.join(self.model, 'cartoon_anime_bg.pb'), + 'model_anime_bg') self.box_width = 288 global_mask = cv2.imread(os.path.join(self.model, 'alpha.jpg')) diff --git a/modelscope/pipelines/cv/image_matting_pipeline.py b/modelscope/pipelines/cv/image_matting_pipeline.py index d9e81959..d7b7fc3c 100644 --- a/modelscope/pipelines/cv/image_matting_pipeline.py +++ b/modelscope/pipelines/cv/image_matting_pipeline.py @@ -10,6 +10,7 @@ from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.device import device_placement from modelscope.utils.logger import get_logger logger = get_logger() @@ -31,19 +32,20 @@ class ImageMattingPipeline(Pipeline): tf = tf.compat.v1 model_path = osp.join(self.model, ModelFile.TF_GRAPH_FILE) - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.allow_growth = True - self._session = tf.Session(config=config) - with self._session.as_default(): - logger.info(f'loading model from {model_path}') - with tf.gfile.FastGFile(model_path, 'rb') as f: - graph_def = tf.GraphDef() - graph_def.ParseFromString(f.read()) - tf.import_graph_def(graph_def, name='') - self.output = self._session.graph.get_tensor_by_name( - 'output_png:0') - self.input_name = 'input_image:0' - logger.info('load model done') + with device_placement(self.framework, self.device_name): + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + self._session = tf.Session(config=config) + with self._session.as_default(): + logger.info(f'loading model from {model_path}') + with tf.gfile.FastGFile(model_path, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name='') + self.output = self._session.graph.get_tensor_by_name( + 'output_png:0') + self.input_name = 'input_image:0' + logger.info('load model done') def preprocess(self, input: Input) -> 
Dict[str, Any]: img = LoadImage.convert_to_ndarray(input) diff --git a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py new file mode 100644 index 00000000..9ffc2b03 --- /dev/null +++ b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py @@ -0,0 +1,103 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_segmentation, + module_name=Pipelines.image_panoptic_segmentation) +class ImagePanopticSegmentationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create an image panoptic segmentation pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + logger.info('panoptic segmentation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + from mmdet.datasets.pipelines import Compose + from mmcv.parallel import collate, scatter + from mmdet.datasets import replace_ImageToTensor + + cfg = self.model.cfg + # build the data pipeline + + if isinstance(input, str): + # input is a str file name; use the LoadImageFromFile pipeline + # collect data + data = dict(img_info=dict(filename=input), img_prefix=None) + elif isinstance(input, PIL.Image.Image): + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + img = np.array(input.convert('RGB')) + # collect data + data = dict(img=img) + elif isinstance(input, np.ndarray): + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + if len(input.shape) == 2: + img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR) + else: + img = input + img = img[:, :, ::-1] # in rgb order + # collect data + data = dict(img=img) + + else: + raise TypeError(f'input should be either str, PIL.Image,' + f' np.array, but got {type(input)}') + + cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) + test_pipeline = Compose(cfg.data.test.pipeline) + + data = test_pipeline(data) + # copy from mmdet_model collect data + data = collate([data], samples_per_gpu=1) + data['img_metas'] = [ + img_metas.data[0] for img_metas in data['img_metas'] + ] + data['img'] = [img.data[0] for img in data['img']] + if next(self.model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [next(self.model.parameters()).device])[0] + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + # batch size is 1 + pan_results = inputs[0]['pan_results'] + INSTANCE_OFFSET = 1000 + + ids = np.unique(pan_results)[::-1] + legal_indices = ids != self.model.num_classes # for VOID label + ids = ids[legal_indices] + labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64) + segms = (pan_results[None] == ids[:, None, None]) + masks = [it.astype(np.int) for it in segms] + labels_txt = np.array(self.model.CLASSES)[labels].tolist() + + outputs = { + OutputKeys.MASKS: masks, + OutputKeys.LABELS: labels_txt, + OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))] + } + return outputs 
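The postprocess step above decodes mmdet-style panoptic maps, where each pixel stores class_id + instance_index * INSTANCE_OFFSET and the VOID label equals the number of classes. A minimal standalone sketch of that decoding on toy data (the num_classes value here is an assumption for illustration, not the model's real setting):

import numpy as np

INSTANCE_OFFSET = 1000
num_classes = 133  # assumed VOID id; the pipeline uses self.model.num_classes

# toy 2x4 panoptic map: one 'thing' instance (class 0, encoded as 1000),
# one 'stuff' region (class 7) and a VOID pixel (133)
pan = np.array([[1000, 1000, 7, 7],
                [1000, 7, 7, 133]])

ids = np.unique(pan)[::-1]               # [1000, 133, 7]
ids = ids[ids != num_classes]            # drop VOID -> [1000, 7]
labels = ids % INSTANCE_OFFSET           # per-segment class ids -> [0, 7]
masks = pan[None] == ids[:, None, None]  # (2, 2, 4) boolean masks, one per segment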
diff --git a/modelscope/pipelines/cv/image_reid_person_pipeline.py b/modelscope/pipelines/cv/image_reid_person_pipeline.py new file mode 100644 index 00000000..a14666a1 --- /dev/null +++ b/modelscope/pipelines/cv/image_reid_person_pipeline.py @@ -0,0 +1,58 @@ +import math +import os +from typing import Any, Dict + +import torch +import torchvision.transforms as T +from PIL import Image + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.image import LoadImage +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_reid_person, module_name=Pipelines.image_reid_person) +class ImageReidPersonPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. + """ + assert isinstance(model, str), 'model must be a single str' + super().__init__(model=model, auto_collate=False, **kwargs) + logger.info(f'loading model config from dir {model}') + + cfg_path = os.path.join(model, ModelFile.CONFIGURATION) + cfg = Config.from_file(cfg_path) + cfg = cfg.model.cfg + self.model = self.model.to(self.device) + self.model.eval() + + self.val_transforms = T.Compose([ + T.Resize(cfg.INPUT.SIZE_TEST), + T.ToTensor(), + T.Normalize(mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD) + ]) + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_img(input) + img = self.val_transforms(img) + img = img.unsqueeze(0) + img = img.to(self.device) + return {'img': img} + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + img = input['img'] + img_embedding = self.model(img) + return {OutputKeys.IMG_EMBEDDING: img_embedding} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py new file mode 100644 index 00000000..e3e1fd6b --- /dev/null +++ b/modelscope/pipelines/cv/image_semantic_segmentation_pipeline.py @@ -0,0 +1,95 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_segmentation, + module_name=Pipelines.image_semantic_segmentation) +class ImageSemanticSegmentationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create an image semantic segmentation pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + + logger.info('semantic segmentation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + from mmdet.datasets.pipelines import Compose + from mmcv.parallel import collate, scatter + from mmdet.datasets import replace_ImageToTensor + + cfg = self.model.cfg + # build the data pipeline + + if isinstance(input, str): + # input is str, file names, pipeline loadimagefromfile + # collect data + data = dict(img_info=dict(filename=input), img_prefix=None) + elif isinstance(input, PIL.Image.Image): # BGR + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + img = np.array(input)[:, :, ::-1] + # collect data + data = dict(img=img) + elif isinstance(input, np.ndarray): + cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam' + if len(input.shape) == 2: + img = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR) + else: + img = input + # collect data + data = dict(img=img) + + else: + raise TypeError(f'input should be either str, PIL.Image,' + f' np.array, but got {type(input)}') + + # data = dict(img=input) + cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) + test_pipeline = Compose(cfg.data.test.pipeline) + + data = test_pipeline(data) + # copy from mmdet_model collect data + data = collate([data], samples_per_gpu=1) + data['img_metas'] = [ + img_metas.data[0] for img_metas in data['img_metas'] + ] + data['img'] = [img.data[0] for img in data['img']] + if next(self.model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [next(self.model.parameters()).device])[0] + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + + results = self.model.postprocess(inputs) + outputs = { + OutputKeys.MASKS: results[OutputKeys.MASKS], + OutputKeys.LABELS: results[OutputKeys.LABELS], + OutputKeys.SCORES: results[OutputKeys.SCORES] + } + + return outputs diff --git a/modelscope/pipelines/cv/image_style_transfer_pipeline.py b/modelscope/pipelines/cv/image_style_transfer_pipeline.py index a67aaec2..827a0d44 100644 --- a/modelscope/pipelines/cv/image_style_transfer_pipeline.py +++ b/modelscope/pipelines/cv/image_style_transfer_pipeline.py @@ -10,6 +10,7 @@ from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.device import device_placement from modelscope.utils.logger import get_logger logger = get_logger() @@ -31,30 +32,31 @@ class ImageStyleTransferPipeline(Pipeline): tf = tf.compat.v1 model_path = osp.join(self.model, ModelFile.TF_GRAPH_FILE) - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.allow_growth = True - self._session = tf.Session(config=config) - self.max_length = 800 - with self._session.as_default(): - logger.info(f'loading model from {model_path}') - with tf.gfile.FastGFile(model_path, 'rb') as f: - graph_def = tf.GraphDef() - graph_def.ParseFromString(f.read()) - tf.import_graph_def(graph_def, name='') - - self.content = tf.get_default_graph().get_tensor_by_name( - 'content:0') - self.style = tf.get_default_graph().get_tensor_by_name( - 'style:0') - self.output = tf.get_default_graph().get_tensor_by_name( - 'stylized_output:0') - self.attention = tf.get_default_graph().get_tensor_by_name( - 'attention_map:0') - self.inter_weight = 
tf.get_default_graph().get_tensor_by_name( - 'inter_weight:0') - self.centroids = tf.get_default_graph().get_tensor_by_name( - 'centroids:0') - logger.info('load model done') + with device_placement(self.framework, self.device_name): + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + self._session = tf.Session(config=config) + self.max_length = 800 + with self._session.as_default(): + logger.info(f'loading model from {model_path}') + with tf.gfile.FastGFile(model_path, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name='') + + self.content = tf.get_default_graph().get_tensor_by_name( + 'content:0') + self.style = tf.get_default_graph().get_tensor_by_name( + 'style:0') + self.output = tf.get_default_graph().get_tensor_by_name( + 'stylized_output:0') + self.attention = tf.get_default_graph().get_tensor_by_name( + 'attention_map:0') + self.inter_weight = tf.get_default_graph( + ).get_tensor_by_name('inter_weight:0') + self.centroids = tf.get_default_graph().get_tensor_by_name( + 'centroids:0') + logger.info('load model done') def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, {}, {} diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py new file mode 100644 index 00000000..0ef0261d --- /dev/null +++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py @@ -0,0 +1,67 @@ +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.movie_scene_segmentation, + module_name=Pipelines.movie_scene_segmentation) +class MovieSceneSegmentationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """use `model` to create a movie scene segmentation pipeline for prediction + + Args: + model: model id on modelscope hub + """ + _device = kwargs.pop('device', 'gpu') + if torch.cuda.is_available() and _device == 'gpu': + device = 'gpu' + else: + device = 'cpu' + super().__init__(model=model, device=device, **kwargs) + + logger.info('Load model done!') + + def preprocess(self, input: Input) -> Dict[str, Any]: + """ Use PySceneDetect to detect shots in the input video and generate the key-frame jpgs, anno.ndjson and shot-frame.txt, + then use the shot encoder to encode features of the detected key-frames + + Args: + input: path of the input video + + """ + self.input_video_pth = input + if isinstance(input, str): + shot_feat, sid = self.model.preprocess(input) + else: + raise TypeError(f'input should be a str,' + f' but got {type(input)}') + + result = {'sid': sid, 'shot_feat': shot_feat} + + return result + + def forward(self, input: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + output = self.model.inference(input) + return output + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + data = {'input_video_pth': self.input_video_pth, 'feat': inputs} + video_num, meta_dict = self.model.postprocess(data) + result = { + OutputKeys.SPLIT_VIDEO_NUM: video_num, + OutputKeys.SPLIT_META_DICT: meta_dict + } + return result diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py 
b/modelscope/pipelines/cv/ocr_detection_pipeline.py index 32209c1e..62248714 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -11,11 +11,17 @@ from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.device import device_placement from modelscope.utils.logger import get_logger from .ocr_utils import (SegLinkDetector, cal_width, combine_segments_python, decode_segments_links_python, nms_python, rboxes_to_polygons) +if tf.__version__ >= '2.0': + import tf_slim as slim +else: + from tensorflow.contrib import slim + if tf.__version__ >= '2.0': tf = tf.compat.v1 tf.compat.v1.disable_eager_execution() @@ -51,66 +57,67 @@ class OCRDetectionPipeline(Pipeline): osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), 'checkpoint-80000') - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.allow_growth = True - self._session = tf.Session(config=config) - self.input_images = tf.placeholder( - tf.float32, shape=[1, 1024, 1024, 3], name='input_images') - self.output = {} - - with tf.variable_scope('', reuse=tf.AUTO_REUSE): - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), - dtype=tf.int64, - trainable=False) - variable_averages = tf.train.ExponentialMovingAverage( - 0.997, global_step) - - # detector - detector = SegLinkDetector() - all_maps = detector.build_model( - self.input_images, is_training=False) - - # decode local predictions - all_nodes, all_links, all_reg = [], [], [] - for i, maps in enumerate(all_maps): - cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2] - reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) - - cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) - - lnk_prob_pos = tf.nn.softmax( - tf.reshape(lnk_maps, [-1, 4])[:, :2]) - lnk_prob_mut = tf.nn.softmax( - tf.reshape(lnk_maps, [-1, 4])[:, 2:]) - lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1) - - all_nodes.append(cls_prob) - all_links.append(lnk_prob) - all_reg.append(reg_maps) - - # decode segments and links - image_size = tf.shape(self.input_images)[1:3] - segments, group_indices, segment_counts, _ = decode_segments_links_python( - image_size, - all_nodes, - all_links, - all_reg, - anchor_sizes=list(detector.anchor_sizes)) - - # combine segments - combined_rboxes, combined_counts = combine_segments_python( - segments, group_indices, segment_counts) - self.output['combined_rboxes'] = combined_rboxes - self.output['combined_counts'] = combined_counts - - with self._session.as_default() as sess: - logger.info(f'loading model from {model_path}') - # load model - model_loader = tf.train.Saver( - variable_averages.variables_to_restore()) - model_loader.restore(sess, model_path) + with device_placement(self.framework, self.device_name): + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + self._session = tf.Session(config=config) + self.input_images = tf.placeholder( + tf.float32, shape=[1, 1024, 1024, 3], name='input_images') + self.output = {} + + with tf.variable_scope('', reuse=tf.AUTO_REUSE): + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), + dtype=tf.int64, + trainable=False) + variable_averages = tf.train.ExponentialMovingAverage( + 0.997, global_step) + + # detector + detector = SegLinkDetector() + all_maps = 
detector.build_model( + self.input_images, is_training=False) + + # decode local predictions + all_nodes, all_links, all_reg = [], [], [] + for i, maps in enumerate(all_maps): + cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2] + reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) + + cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) + + lnk_prob_pos = tf.nn.softmax( + tf.reshape(lnk_maps, [-1, 4])[:, :2]) + lnk_prob_mut = tf.nn.softmax( + tf.reshape(lnk_maps, [-1, 4])[:, 2:]) + lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1) + + all_nodes.append(cls_prob) + all_links.append(lnk_prob) + all_reg.append(reg_maps) + + # decode segments and links + image_size = tf.shape(self.input_images)[1:3] + segments, group_indices, segment_counts, _ = decode_segments_links_python( + image_size, + all_nodes, + all_links, + all_reg, + anchor_sizes=list(detector.anchor_sizes)) + + # combine segments + combined_rboxes, combined_counts = combine_segments_python( + segments, group_indices, segment_counts) + self.output['combined_rboxes'] = combined_rboxes + self.output['combined_counts'] = combined_counts + + with self._session.as_default() as sess: + logger.info(f'loading model from {model_path}') + # load model + model_loader = tf.train.Saver( + variable_averages.variables_to_restore()) + model_loader.restore(sess, model_path) def preprocess(self, input: Input) -> Dict[str, Any]: img = LoadImage.convert_to_ndarray(input) diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py index 2bc8a8bf..09807b10 100644 --- a/modelscope/pipelines/cv/ocr_utils/ops.py +++ b/modelscope/pipelines/cv/ocr_utils/ops.py @@ -1,8 +1,10 @@ import math import os import shutil +import sys import uuid +import absl.flags as absl_flags import cv2 import numpy as np import tensorflow as tf @@ -12,6 +14,10 @@ from . 
import utils

 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
+# Parse only the flags tf already knows about and ignore the rest, so that
+# unrecognized command-line flags passed in by the test runner (e.g.
+# 'OCRDetectionPipeline') do not raise
+# absl.flags._exceptions.UnrecognizedFlagError.
+absl_flags.FLAGS(sys.argv, known_only=True)
 FLAGS = tf.app.flags.FLAGS
 tf.app.flags.DEFINE_string('weight_init_method', 'xavier',
                            'Weight initialization method')
@@ -88,7 +94,7 @@ def _nn_variable(name, shape, init_method, collection=None, **kwargs):
     else:
         raise 'Unsupported weight initialization method: ' + init_method

-    var = tf.get_variable(name, shape=shape, initializer=initializer, **kwargs)
+    var = tf.get_variable(name, shape=shape, initializer=initializer)

     if collection is not None:
         tf.add_to_collection(collection, var)
diff --git a/modelscope/pipelines/cv/realtime_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
new file mode 100644
index 00000000..629720d1
--- /dev/null
+++ b/modelscope/pipelines/cv/realtime_object_detection_pipeline.py
@@ -0,0 +1,50 @@
+import os.path as osp
+from typing import Any, Dict, List, Union
+
+import cv2
+import json
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.cv.realtime_object_detection import RealtimeDetector
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Input, Model, Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import load_image
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_object_detection,
+    module_name=Pipelines.realtime_object_detection)
+class RealtimeObjectDetectionPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        super().__init__(model=model, **kwargs)
+        self.model = RealtimeDetector(model)
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        output = self.model.preprocess(input)
+        return {'pre_output': output}
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        pre_output = input['pre_output']
+        forward_output = self.model(pre_output)
+        return {'forward_output': forward_output}
+
+    def postprocess(self, input: Dict[str, Any],
+                    **kwargs) -> Dict[str, Any]:
+        forward_output = input['forward_output']
+        bboxes, scores, labels = forward_output
+        return {
+            OutputKeys.BOXES: bboxes,
+            OutputKeys.SCORES: scores,
+            OutputKeys.LABELS: labels,
+        }
diff --git a/modelscope/pipelines/cv/skin_retouching_pipeline.py b/modelscope/pipelines/cv/skin_retouching_pipeline.py
index d9b49ff3..f8c9de60 100644
--- a/modelscope/pipelines/cv/skin_retouching_pipeline.py
+++ b/modelscope/pipelines/cv/skin_retouching_pipeline.py
@@ -23,6 +23,7 @@ from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.device import create_device, device_placement
 from modelscope.utils.logger import get_logger

 if tf.__version__ >= '2.0':
@@ -42,12 +43,9 @@ class SkinRetouchingPipeline(Pipeline):
         Args:
             model: model id on modelscope hub.
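+
+        A minimal usage sketch (the model id below is illustrative and not
+        guaranteed to exist on the hub):
+
+            >>> from modelscope.pipelines import pipeline
+            >>> from modelscope.utils.constant import Tasks
+            >>> retoucher = pipeline(
+            ...     Tasks.skin_retouching, model='damo/cv_unet_skin-retouching')
+            >>> result = retoucher('face.jpg')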
""" - super().__init__(model=model) + super().__init__(model=model, device=device) - if torch.cuda.is_available() and device == 'gpu': - device = 'cuda' - else: - device = 'cpu' + device = create_device(self.device_name) model_path = os.path.join(self.model, ModelFile.TORCH_MODEL_FILE) detector_model_path = os.path.join( self.model, 'retinaface_resnet50_2020-07-20_old_torch.pth') @@ -81,16 +79,17 @@ class SkinRetouchingPipeline(Pipeline): self.skin_model_path = skin_model_path if self.skin_model_path is not None: - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.per_process_gpu_memory_fraction = 0.3 - config.gpu_options.allow_growth = True - self.sess = tf.Session(config=config) - with tf.gfile.FastGFile(self.skin_model_path, 'rb') as f: - graph_def = tf.GraphDef() - graph_def.ParseFromString(f.read()) - self.sess.graph.as_default() - tf.import_graph_def(graph_def, name='') - self.sess.run(tf.global_variables_initializer()) + with device_placement(self.framework, self.device_name): + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.per_process_gpu_memory_fraction = 0.3 + config.gpu_options.allow_growth = True + self.sess = tf.Session(config=config) + with tf.gfile.FastGFile(self.skin_model_path, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + self.sess.graph.as_default() + tf.import_graph_def(graph_def, name='') + self.sess.run(tf.global_variables_initializer()) self.image_files_transforms = transforms.Compose([ transforms.ToTensor(), diff --git a/modelscope/pipelines/cv/video_summarization_pipeline.py b/modelscope/pipelines/cv/video_summarization_pipeline.py new file mode 100644 index 00000000..001780e1 --- /dev/null +++ b/modelscope/pipelines/cv/video_summarization_pipeline.py @@ -0,0 +1,109 @@ +import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import torch +from tqdm import tqdm + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_summarization import PGLVideoSummarization +from modelscope.models.cv.video_summarization.base_model import bvlc_googlenet +from modelscope.models.cv.video_summarization.summarizer import ( + generate_summary, get_change_points) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_summarization, module_name=Pipelines.video_summarization) +class VideoSummarizationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a video summarization pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, auto_collate=False, **kwargs) + logger.info(f'loading model from {model}') + googlenet_model_path = osp.join(model, 'bvlc_googlenet.pt') + config_path = osp.join(model, ModelFile.CONFIGURATION) + logger.info(f'loading config from {config_path}') + self.cfg = Config.from_file(config_path) + + self.googlenet_model = bvlc_googlenet() + self.googlenet_model.model.load_state_dict( + torch.load( + googlenet_model_path, map_location=torch.device(self.device))) + self.googlenet_model = self.googlenet_model.to(self.device).eval() + + self.pgl_model = PGLVideoSummarization(model) + self.pgl_model = self.pgl_model.to(self.device).eval() + + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if not isinstance(input, str): + raise TypeError(f'input should be a str,' + f' but got {type(input)}') + frames = [] + picks = [] + cap = cv2.VideoCapture(input) + frame_idx = 0 + while (cap.isOpened()): + ret, frame = cap.read() + if not ret: + break + if frame_idx % 15 == 0: + frames.append(frame) + picks.append(frame_idx) + frame_idx += 1 + n_frame = frame_idx + + result = { + 'video_name': input, + 'video_frames': np.array(frames), + 'n_frame': n_frame, + 'picks': np.array(picks) + } + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + frame_features = [] + for frame in tqdm(input['video_frames']): + feat = self.googlenet_model(frame) + frame_features.append(feat) + + change_points, n_frame_per_seg = get_change_points( + frame_features, input['n_frame']) + + summary = self.inference(frame_features, input['n_frame'], + input['picks'], change_points) + + return {OutputKeys.OUTPUT: summary} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs + + def inference(self, frame_features, n_frames, picks, change_points): + frame_features = torch.from_numpy(np.array(frame_features, np.float32)) + picks = np.array(picks, np.int32) + + with torch.no_grad(): + results = self.pgl_model(dict(frame_features=frame_features)) + scores = results['scores'] + if not scores.device.type == 'cpu': + scores = scores.cpu() + scores = scores.squeeze(0).numpy().tolist() + summary = generate_summary([change_points], [scores], [n_frames], + [picks])[0] + + return summary.tolist() diff --git a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py new file mode 100644 index 00000000..1ebcf526 --- /dev/null +++ b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import MPlugPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_text_retrieval, module_name=Pipelines.image_text_retrieval) +class ImageTextRetrievalPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """ + use `model` and `preprocessor` to create a + image text retrieval pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model) + assert isinstance(model, str) or isinstance(model, Model), \ + f'model must be a single str or Model, but got {type(model)}' + if isinstance(model, str): + pipe_model = Model.from_pretrained(model) + elif isinstance(model, Model): + pipe_model = model + else: + raise NotImplementedError + pipe_model.model.eval() + if preprocessor is None: + preprocessor = MPlugPreprocessor(pipe_model.model_dir) + super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return {OutputKeys.SCORES: inputs[0].tolist()} diff --git a/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py new file mode 100644 index 00000000..7d3ffed3 --- /dev/null +++ b/modelscope/pipelines/multi_modal/team_multi_modal_similarity_pipeline.py @@ -0,0 +1,31 @@ +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.multi_modal_similarity, module_name=Pipelines.multi_modal_similarity) +class TEAMMultiModalSimilarityPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a multimodal similarity pipeline + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + def preprocess(self, input: Input) -> Dict[str, Any]: + return input + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + return self.model(input) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py index 166d3f06..bc697b05 100644 --- a/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/video_multi_modal_embedding_pipeline.py @@ -4,6 +4,7 @@ from modelscope.metainfo import Pipelines from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.utils.constant import Tasks +from modelscope.utils.device import device_placement from modelscope.utils.logger import get_logger logger = get_logger() @@ -26,7 +27,7 @@ class VideoMultiModalEmbeddingPipeline(Pipeline): return input def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]: - with self.place_device(): + with device_placement(self.framework, self.device_name): out = self.forward(input) self._check_output(out) diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 0cdb633c..665e016d 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -8,7 +8,9 @@ if TYPE_CHECKING: from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline from .dialog_modeling_pipeline import DialogModelingPipeline from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline + from .document_segmentation_pipeline import DocumentSegmentationPipeline from .fill_mask_pipeline 
import FillMaskPipeline + from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline @@ -20,6 +22,8 @@ if TYPE_CHECKING: from .summarization_pipeline import SummarizationPipeline from .text_classification_pipeline import TextClassificationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline + from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline + from .relation_extraction_pipeline import RelationExtractionPipeline else: _import_structure = { @@ -29,7 +33,9 @@ else: ['DialogIntentPredictionPipeline'], 'dialog_modeling_pipeline': ['DialogModelingPipeline'], 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], + 'document_segmentation_pipeline': ['DocumentSegmentationPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], + 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'single_sentence_classification_pipeline': ['SingleSentenceClassificationPipeline'], 'pair_sentence_classification_pipeline': @@ -44,7 +50,9 @@ else: 'translation_pipeline': ['TranslationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], 'text_classification_pipeline': ['TextClassificationPipeline'], - 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'] + 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], + 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], + 'relation_extraction_pipeline': ['RelationExtractionPipeline'] } import sys diff --git a/modelscope/pipelines/nlp/document_segmentation_pipeline.py b/modelscope/pipelines/nlp/document_segmentation_pipeline.py new file mode 100644 index 00000000..00837bf3 --- /dev/null +++ b/modelscope/pipelines/nlp/document_segmentation_pipeline.py @@ -0,0 +1,175 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
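+#
+# This pipeline splits a document into paragraphs: the model tags every
+# sentence with B-EOP (the sentence ends a paragraph) or O, and postprocess()
+# re-joins the sentences, inserting a break after each predicted B-EOP.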
+ +import re +from typing import Any, Dict, List, Union + +import numpy as np +import torch +from datasets import Dataset +from transformers.models.bert.modeling_bert import BertConfig + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import DocumentSegmentationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['DocumentSegmentationPipeline'] + + +@PIPELINES.register_module( + Tasks.document_segmentation, module_name=Pipelines.document_segmentation) +class DocumentSegmentationPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: DocumentSegmentationPreprocessor = None, + **kwargs): + + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + + self.model_dir = model.model_dir + config = BertConfig.from_pretrained(model.model_dir, num_labels=2) + + self.document_segmentation_model = model.build_with_config( + config=config) + + if preprocessor is None: + preprocessor = DocumentSegmentationPreprocessor( + self.model_dir, config) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + self.preprocessor = preprocessor + + def __call__(self, documents: Union[List[str], str]) -> Dict[str, Any]: + output = self.predict(documents) + output = self.postprocess(output) + return output + + def predict(self, documents: Union[List[str], str]) -> Dict[str, Any]: + pred_samples = self.cut_documents(documents) + predict_examples = Dataset.from_dict(pred_samples) + + # Predict Feature Creation + predict_dataset = self.preprocessor(predict_examples) + num_examples = len( + predict_examples[self.preprocessor.context_column_name]) + num_samples = len( + predict_dataset[self.preprocessor.context_column_name]) + + predict_dataset.pop('segment_ids') + labels = predict_dataset.pop('labels') + sentences = predict_dataset.pop('sentences') + example_ids = predict_dataset.pop( + self.preprocessor.example_id_column_name) + + with torch.no_grad(): + input = { + key: torch.tensor(val) + for key, val in predict_dataset.items() + } + predictions = self.document_segmentation_model.forward( + **input).logits + + predictions = np.argmax(predictions, axis=2) + assert len(sentences) == len( + predictions), 'sample {} infer_sample {} prediction {}'.format( + num_samples, len(sentences), len(predictions)) + # Remove ignored index (special tokens) + true_predictions = [ + [ + self.preprocessor.label_list[p] + for (p, l) in zip(prediction, label) if l != -100 # noqa * + ] for prediction, label in zip(predictions, labels) + ] + + true_labels = [ + [ + self.preprocessor.label_list[l] + for (p, l) in zip(prediction, label) if l != -100 # noqa * + ] for prediction, label in zip(predictions, labels) + ] + + # Save predictions + out = [] + for i in range(num_examples): + out.append({'sentences': [], 'labels': [], 'predictions': []}) + + for prediction, sentence_list, label, example_id in zip( + true_predictions, sentences, true_labels, example_ids): + if len(label) < len(sentence_list): + label.append('B-EOP') + prediction.append('B-EOP') + assert len(sentence_list) == len(prediction), '{} {}'.format( + len(sentence_list), len(prediction)) + assert len(sentence_list) == len(label), '{} {}'.format( + len(sentence_list), len(label)) + 
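+                # map each max_seq_length window created by the preprocessor
+                # back onto its originating document via example_id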
+                out[example_id]['sentences'].extend(sentence_list)
+                out[example_id]['labels'].extend(label)
+                out[example_id]['predictions'].extend(prediction)
+
+        return out
+
+    def postprocess(self, inputs: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Assemble the predictions into readable documents.
+
+        Args:
+            inputs (List[Dict[str, Any]]): the per-document outputs of
+                self.predict, each holding sentences and predicted labels
+
+        Returns:
+            Dict[str, Any]: the segmented document(s) under OutputKeys.TEXT
+        """
+        result = []
+        list_count = len(inputs)
+        for num in range(list_count):
+            res = []
+            for s, p in zip(inputs[num]['sentences'],
+                            inputs[num]['predictions']):
+                s = s.strip()
+                if p == 'B-EOP':
+                    s = ''.join([s, '\n\t'])
+                res.append(s)
+
+            document = ('\t' + ''.join(res))
+            result.append(document)
+
+        if list_count == 1:
+            return {OutputKeys.TEXT: result[0]}
+        else:
+            return {OutputKeys.TEXT: result}
+
+    def cut_documents(self, para: Union[List[str], str]):
+        document_list = para
+        if isinstance(para, str):
+            document_list = [para]
+        sentences = []
+        labels = []
+        example_id = []
+        id = 0
+        for document in document_list:
+            sentence = self.cut_sentence(document)
+            label = ['O'] * (len(sentence) - 1) + ['B-EOP']
+            sentences.append(sentence)
+            labels.append(label)
+            example_id.append(id)
+            id += 1
+
+        return {
+            'example_id': example_id,
+            'sentences': sentences,
+            'labels': labels
+        }
+
+    def cut_sentence(self, para):
+        para = re.sub(r'([。!.!?\?])([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)  # noqa *
+        para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)  # noqa *
+        para = para.rstrip()
+        return [_ for _ in para.split('\n') if _]
diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
new file mode 100644
index 00000000..65831a17
--- /dev/null
+++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
@@ -0,0 +1,76 @@
+from typing import Any, Dict, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.models.nlp import SbertForFaqQuestionAnswering
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
+from modelscope.utils.constant import Tasks
+
+__all__ = ['FaqQuestionAnsweringPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.faq_question_answering, module_name=Pipelines.faq_question_answering)
+class FaqQuestionAnsweringPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[str, SbertForFaqQuestionAnswering],
+                 preprocessor: FaqQuestionAnsweringPreprocessor = None,
+                 **kwargs):
+        model = model if isinstance(
+            model,
+            SbertForFaqQuestionAnswering) else Model.from_pretrained(model)
+        model.eval()
+        if preprocessor is None:
+            preprocessor = FaqQuestionAnsweringPreprocessor(
+                model.model_dir, **kwargs)
+        self.preprocessor = preprocessor
+        super(FaqQuestionAnsweringPipeline, self).__init__(
+            model=model, preprocessor=preprocessor, **kwargs)
+
+    def _sanitize_parameters(self, **pipeline_parameters):
+        return pipeline_parameters, pipeline_parameters, pipeline_parameters
+
+    def get_sentence_embedding(self, inputs, max_len=None):
+        inputs = self.preprocessor.batch_encode(inputs, max_length=max_len)
+        sentence_vecs = self.model.forward_sentence_embedding(inputs)
+        sentence_vecs = sentence_vecs.detach().tolist()
+        return sentence_vecs
+
+    def forward(self, inputs: Union[list, Dict[str, Any]],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return self.model(inputs)
+
+    def postprocess(self, inputs: Union[list, Dict[str, Any]],
+                    **postprocess_params) -> Dict[str, Any]:
+        scores = inputs['scores']
+        labels = []
+        for item in scores:
+            tmplabels = [
+                self.preprocessor.get_label(label_id)
+                for label_id in range(len(item))
+            ]
+            labels.append(tmplabels)
+
+        predictions = []
+        for tmp_scores, tmp_labels in zip(scores.tolist(), labels):
+            prediction = []
+            for score, label in zip(tmp_scores, tmp_labels):
+                prediction.append({
+                    OutputKeys.LABEL: label,
+                    OutputKeys.SCORE: score
+                })
+            predictions.append(
+                list(
+                    sorted(
+                        prediction,
+                        key=lambda d: d[OutputKeys.SCORE],
+                        reverse=True)))
+
+        return {OutputKeys.OUTPUT: predictions}
diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py
new file mode 100644
index 00000000..4cb138d6
--- /dev/null
+++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py
@@ -0,0 +1,42 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models import Model
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import (Preprocessor,
+                                      RelationExtractionPreprocessor)
+from modelscope.utils.constant import Tasks
+
+__all__ = ['InformationExtractionPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.information_extraction, module_name=Pipelines.relation_extraction)
+class InformationExtractionPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+
+        model = model if isinstance(model,
+                                    Model) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = RelationExtractionPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return super().forward(inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Any],
+                    **postprocess_params) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index b0b06c88..8fbdde86 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -84,6 +84,9 @@ class NamedEntityRecognitionPipeline(Pipeline):
                 entity['span'] = text[entity['start']:entity['end']]
                 entities.append(entity)
                 entity = {}
+        if entity:
+            entity['span'] = text[entity['start']:entity['end']]
+            entities.append(entity)
         outputs = {OutputKeys.OUTPUT: entities}
         return outputs
diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py
index 909e3c6c..e4893577 100644
--- a/modelscope/pipelines/nlp/translation_pipeline.py
+++ b/modelscope/pipelines/nlp/translation_pipeline.py
@@ -1,8 +1,11 @@
 import os.path as osp
 from typing import Any, Dict

+import jieba
 import numpy as np
 import tensorflow as tf
+from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
+from subword_nmt import apply_bpe

 from modelscope.metainfo import Pipelines
 from modelscope.models.base import Model 
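A note on the preprocessing chain introduced above (Moses punctuation
normalization and tokenization followed by subword-nmt BPE): it can be
exercised in isolation. A minimal sketch, assuming an English source
language and an illustrative 'src.bpe' codes file:

    from sacremoses import MosesPunctNormalizer, MosesTokenizer
    from subword_nmt import apply_bpe

    norm = MosesPunctNormalizer(lang='en')
    tok = MosesTokenizer(lang='en')
    with open('src.bpe') as f:  # BPE codes file (illustrative path)
        bpe = apply_bpe.BPE(f)

    line = norm.normalize('Hello, world!')
    line = tok.tokenize(line, return_str=True, aggressive_dash_splits=True)
    print(bpe.process_line(line))  # subwords carry '@@ ' continuation marks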
@@ -31,7 +34,7 @@ class TranslationPipeline(Pipeline): @param model: A Model instance. """ - super().__init__(model=model) + super().__init__(model=model, **kwargs) model = self.model.model_dir tf.reset_default_graph() @@ -59,6 +62,21 @@ class TranslationPipeline(Pipeline): dtype=tf.int64, shape=[None, None], name='input_wids') self.output = {} + # preprocess + self._src_lang = self.cfg['preprocessor']['src_lang'] + self._tgt_lang = self.cfg['preprocessor']['tgt_lang'] + self._src_bpe_path = osp.join( + model, self.cfg['preprocessor']['src_bpe']['file']) + + if self._src_lang == 'zh': + self._tok = jieba + else: + self._punct_normalizer = MosesPunctNormalizer(lang=self._src_lang) + self._tok = MosesTokenizer(lang=self._src_lang) + self._detok = MosesDetokenizer(lang=self._tgt_lang) + + self._bpe = apply_bpe.BPE(open(self._src_bpe_path)) + # model output = self.model(self.input_wids) self.output.update(output) @@ -70,10 +88,19 @@ class TranslationPipeline(Pipeline): model_loader.restore(sess, model_path) def preprocess(self, input: str) -> Dict[str, Any]: + if self._src_lang == 'zh': + input_tok = self._tok.cut(input) + input_tok = ' '.join(list(input_tok)) + else: + input = self._punct_normalizer.normalize(input) + input_tok = self._tok.tokenize( + input, return_str=True, aggressive_dash_splits=True) + + input_bpe = self._bpe.process_line(input_tok) input_ids = np.array([[ self._src_vocab[w] if w in self._src_vocab else self.cfg['model']['src_vocab_size'] - for w in input.strip().split() + for w in input_bpe.strip().split() ]]) result = {'input_ids': input_ids} return result @@ -92,5 +119,6 @@ class TranslationPipeline(Pipeline): self._trg_rvocab[wid] if wid in self._trg_rvocab else '' for wid in wids ]).replace('@@ ', '').replace('@@', '') + translation_out = self._detok.detokenize(translation_out.split()) result = {OutputKeys.TRANSLATION: translation_out} return result diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 0328b91a..9f7d595e 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -21,11 +21,14 @@ if TYPE_CHECKING: SingleSentenceClassificationPreprocessor, PairSentenceClassificationPreprocessor, FillMaskPreprocessor, ZeroShotClassificationPreprocessor, - NERPreprocessor, TextErrorCorrectionPreprocessor) + NERPreprocessor, TextErrorCorrectionPreprocessor, + FaqQuestionAnsweringPreprocessor, + RelationExtractionPreprocessor) + from .slp import DocumentSegmentationPreprocessor from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor) - from .video import ReadVideoData + from .video import ReadVideoData, MovieSceneSegmentationPreprocessor from .star import ConversationalTextToSqlPreprocessor else: @@ -35,7 +38,7 @@ else: 'common': ['Compose', 'ToTensor', 'Filter'], 'audio': ['LinearAECAndFbank'], 'asr': ['WavToScp'], - 'video': ['ReadVideoData'], + 'video': ['ReadVideoData', 'MovieSceneSegmentationPreprocessor'], 'image': [ 'LoadImage', 'load_image', 'ImageColorEnhanceFinetunePreprocessor', 'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor' @@ -48,8 +51,11 @@ else: 'SingleSentenceClassificationPreprocessor', 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'TextErrorCorrectionPreprocessor' + 'TextErrorCorrectionPreprocessor', + 'FaqQuestionAnsweringPreprocessor', + 'RelationExtractionPreprocessor' ], + 'slp': ['DocumentSegmentationPreprocessor'], 
'space': [ 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', 'DialogStateTrackingPreprocessor', 'InputFeatures' diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py index 6932371d..60f6e0eb 100644 --- a/modelscope/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -264,3 +264,28 @@ class ImageInstanceSegmentationPreprocessor(Preprocessor): return None return results + + +@PREPROCESSORS.register_module( + Fields.cv, module_name=Preprocessors.video_summarization_preprocessor) +class VideoSummarizationPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """ + + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + self.model_dir: str = model_dir + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + """process the raw input data + + Args: + data Dict[str, Any] + + Returns: + Dict[str, Any]: the preprocessed data + """ + return data diff --git a/modelscope/preprocessors/movie_scene_segmentation/__init__.py b/modelscope/preprocessors/movie_scene_segmentation/__init__.py new file mode 100644 index 00000000..73da792d --- /dev/null +++ b/modelscope/preprocessors/movie_scene_segmentation/__init__.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .transforms import get_transform +else: + _import_structure = { + 'transforms': ['get_transform'], + } + + import sys + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/movie_scene_segmentation/transforms.py b/modelscope/preprocessors/movie_scene_segmentation/transforms.py new file mode 100644 index 00000000..b4e57420 --- /dev/null +++ b/modelscope/preprocessors/movie_scene_segmentation/transforms.py @@ -0,0 +1,312 @@ +# ------------------------------------------------------------------------------------ +# The codes below partially refer to the BaSSL +# Copyright (c) 2021 KakaoBrain. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# Github: https://github.com/kakaobrain/bassl +# ------------------------------------------------------------------------------------ +import numbers +import os.path as osp +import random +from typing import List + +import numpy as np +import torch +import torchvision.transforms as TF +import torchvision.transforms.functional as F +from PIL import Image, ImageFilter + + +def get_transform(lst): + assert len(lst) > 0 + transform_lst = [] + for item in lst: + transform_lst.append(build_transform(item)) + transform = TF.Compose(transform_lst) + return transform + + +def build_transform(cfg): + assert isinstance(cfg, dict) + cfg = cfg.copy() + type = cfg.pop('type') + + if type == 'VideoResizedCenterCrop': + return VideoResizedCenterCrop(**cfg) + elif type == 'VideoToTensor': + return VideoToTensor(**cfg) + elif type == 'VideoRandomResizedCrop': + return VideoRandomResizedCrop(**cfg) + elif type == 'VideoRandomHFlip': + return VideoRandomHFlip() + elif type == 'VideoRandomColorJitter': + return VideoRandomColorJitter(**cfg) + elif type == 'VideoRandomGaussianBlur': + return VideoRandomGaussianBlur(**cfg) + else: + raise NotImplementedError + + +class VideoResizedCenterCrop(torch.nn.Module): + + def __init__(self, image_size, crop_size): + self.tfm = TF.Compose([ + TF.Resize(size=image_size, interpolation=Image.BICUBIC), + TF.CenterCrop(crop_size), + ]) + + def __call__(self, imgmap): + assert isinstance(imgmap, list) + return [self.tfm(img) for img in imgmap] + + +class VideoToTensor(torch.nn.Module): + + def __init__(self, mean=None, std=None, inplace=False): + self.mean = mean + self.std = std + self.inplace = inplace + + assert self.mean is not None + assert self.std is not None + + def __to_tensor__(self, img): + return F.to_tensor(img) + + def __normalize__(self, img): + return F.normalize(img, self.mean, self.std, self.inplace) + + def __call__(self, imgmap): + assert isinstance(imgmap, list) + return [self.__normalize__(self.__to_tensor__(img)) for img in imgmap] + + +class VideoRandomResizedCrop(torch.nn.Module): + + def __init__(self, size, bottom_area=0.2): + self.p = 1.0 + self.interpolation = Image.BICUBIC + self.size = size + self.bottom_area = bottom_area + + def __call__(self, imgmap): + assert isinstance(imgmap, list) + if random.random() < self.p: # do RandomResizedCrop, consistent=True + top, left, height, width = TF.RandomResizedCrop.get_params( + imgmap[0], + scale=(self.bottom_area, 1.0), + ratio=(3 / 4.0, 4 / 3.0)) + return [ + F.resized_crop( + img=img, + top=top, + left=left, + height=height, + width=width, + size=(self.size, self.size), + ) for img in imgmap + ] + else: + return [ + F.resize(img=img, size=[self.size, self.size]) + for img in imgmap + ] + + +class VideoRandomHFlip(torch.nn.Module): + + def __init__(self, consistent=True, command=None, seq_len=0): + self.consistent = consistent + if seq_len != 0: + self.consistent = False + if command == 'left': + self.threshold = 0 + elif command == 'right': + self.threshold = 1 + else: + self.threshold = 0.5 + self.seq_len = seq_len + + def __call__(self, imgmap): + assert isinstance(imgmap, list) + if self.consistent: + if random.random() < self.threshold: + return [i.transpose(Image.FLIP_LEFT_RIGHT) for i in imgmap] + else: + return imgmap + else: + result = [] + for idx, i in enumerate(imgmap): + if idx % self.seq_len == 0: + th = random.random() + if th < self.threshold: + result.append(i.transpose(Image.FLIP_LEFT_RIGHT)) + else: + 
result.append(i)
+            assert len(result) == len(imgmap)
+            return result
+
+
+class VideoRandomColorJitter(torch.nn.Module):
+    """Randomly change the brightness, contrast and saturation of an image.
+
+    Args:
+        brightness (float or tuple of float (min, max)): How much to jitter brightness.
+            brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
+            or the given [min, max]. Should be non-negative numbers.
+        contrast (float or tuple of float (min, max)): How much to jitter contrast.
+            contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
+            or the given [min, max]. Should be non-negative numbers.
+        saturation (float or tuple of float (min, max)): How much to jitter saturation.
+            saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
+            or the given [min, max]. Should be non-negative numbers.
+        hue (float or tuple of float (min, max)): How much to jitter hue.
+            hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
+            Should have 0 <= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
+    """
+
+    def __init__(
+        self,
+        brightness=0,
+        contrast=0,
+        saturation=0,
+        hue=0,
+        consistent=True,
+        p=1.0,
+        seq_len=0,
+    ):
+        self.brightness = self._check_input(brightness, 'brightness')
+        self.contrast = self._check_input(contrast, 'contrast')
+        self.saturation = self._check_input(saturation, 'saturation')
+        self.hue = self._check_input(
+            hue, 'hue', center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+        self.consistent = consistent
+        self.threshold = p
+        self.seq_len = seq_len
+
+    def _check_input(self,
+                     value,
+                     name,
+                     center=1,
+                     bound=(0, float('inf')),
+                     clip_first_on_zero=True):
+        if isinstance(value, numbers.Number):
+            if value < 0:
+                raise ValueError(
+                    'If {} is a single number, it must be non-negative.'.
+                    format(name))
+            value = [center - value, center + value]
+            if clip_first_on_zero:
+                value[0] = max(value[0], 0)
+        elif isinstance(value, (tuple, list)) and len(value) == 2:
+            if not bound[0] <= value[0] <= value[1] <= bound[1]:
+                raise ValueError('{} values should be between {}'.format(
+                    name, bound))
+        else:
+            raise TypeError(
+                '{} should be a single number or a list/tuple with length 2.'.
+                format(name))
+
+        # if value is 0 or (1., 1.) for brightness/contrast/saturation
+        # or (0., 0.) for hue, do nothing
+        if value[0] == value[1] == center:
+            value = None
+        return value
+
+    @staticmethod
+    def get_params(brightness, contrast, saturation, hue):
+        """Get a randomized transform to be applied to the image.
+
+        Arguments are the same as those of __init__.
+
+        Returns:
+            Transform which randomly adjusts brightness, contrast and
+            saturation in a random order.
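+
+        A sketch of the intended use (as in __call__ below, one sampled
+        transform is applied to every frame of a clip):
+
+            >>> jitter = VideoRandomColorJitter(0.4, 0.4, 0.4, 0.1)
+            >>> tfm = jitter.get_params(jitter.brightness, jitter.contrast,
+            ...                         jitter.saturation, jitter.hue)
+            >>> out = [tfm(img) for img in imgmap]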
+ """ + transforms = [] + + if brightness is not None: + brightness_factor = random.uniform(brightness[0], brightness[1]) + transforms.append( + TF.Lambda( + lambda img: F.adjust_brightness(img, brightness_factor))) + + if contrast is not None: + contrast_factor = random.uniform(contrast[0], contrast[1]) + transforms.append( + TF.Lambda(lambda img: F.adjust_contrast(img, contrast_factor))) + + if saturation is not None: + saturation_factor = random.uniform(saturation[0], saturation[1]) + transforms.append( + TF.Lambda( + lambda img: F.adjust_saturation(img, saturation_factor))) + + if hue is not None: + hue_factor = random.uniform(hue[0], hue[1]) + transforms.append( + TF.Lambda(lambda img: F.adjust_hue(img, hue_factor))) + + random.shuffle(transforms) + transform = TF.Compose(transforms) + + return transform + + def __call__(self, imgmap): + assert isinstance(imgmap, list) + if random.random() < self.threshold: # do ColorJitter + if self.consistent: + transform = self.get_params(self.brightness, self.contrast, + self.saturation, self.hue) + + return [transform(i) for i in imgmap] + else: + if self.seq_len == 0: + return [ + self.get_params(self.brightness, self.contrast, + self.saturation, self.hue)(img) + for img in imgmap + ] + else: + result = [] + for idx, img in enumerate(imgmap): + if idx % self.seq_len == 0: + transform = self.get_params( + self.brightness, + self.contrast, + self.saturation, + self.hue, + ) + result.append(transform(img)) + return result + + else: + return imgmap + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + format_string += 'brightness={0}'.format(self.brightness) + format_string += ', contrast={0}'.format(self.contrast) + format_string += ', saturation={0}'.format(self.saturation) + format_string += ', hue={0})'.format(self.hue) + return format_string + + +class VideoRandomGaussianBlur(torch.nn.Module): + + def __init__(self, radius_min=0.1, radius_max=2.0, p=0.5): + self.radius_min = radius_min + self.radius_max = radius_max + self.p = p + + def __call__(self, imgmap): + assert isinstance(imgmap, list) + if random.random() < self.p: + result = [] + for _, img in enumerate(imgmap): + _radius = random.uniform(self.radius_min, self.radius_max) + result.append( + img.filter(ImageFilter.GaussianBlur(radius=_radius))) + return result + else: + return imgmap + + +def apply_transform(images, trans): + return torch.stack(trans(images), dim=0) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 7a7b5854..17d61ae3 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from io import BytesIO -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Tuple, Union import torch from PIL import Image @@ -110,12 +110,20 @@ class OfaPreprocessor(Preprocessor): Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor) class MPlugPreprocessor(Preprocessor): - def __init__(self, model_dir: str, *args, **kwargs): + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + tokenizer_max_length: int = 25, + *args, + **kwargs): super().__init__(*args, **kwargs) self.model_dir = model_dir + self.mode = mode + self.tokenizer_max_length = tokenizer_max_length self._tokenizer = None self._patch_resize_transform = None + self._image_map = {} @property def tokenizer(self): @@ -145,42 +153,56 @@ class MPlugPreprocessor(Preprocessor): ]) return self._patch_resize_transform - def __call__(self, *args, **kwargs): - call_mapping = { - Tasks.visual_question_answering: self.vqa_call, - Tasks.image_captioning: self.caption_call - } + def image_open(self, path: str) -> Tuple[Image.Image, int]: + if path not in self._image_map: + index = len(self._image_map) + self._image_map[path] = (Image.open(path), index) + return self._image_map[path] + def __call__( + self, data: Union[Image.Image, tuple, + Dict[str, Any]]) -> Dict[str, Any]: self.cfg = Config.from_file( osp.join(self.model_dir, ModelFile.CONFIGURATION)) - return call_mapping[self.cfg.task](*args, **kwargs) - def vqa_call(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]: - image: Image.Image = data[0] if isinstance(data, - tuple) else data['image'] - question: str = data[1] if isinstance(data, - tuple) else data['question'] - image = image.convert('RGB') - image = self.patch_resize_transform(image) - image = torch.stack([image], dim=0) - question = self.tokenizer([question.lower()], - padding='longest', - return_tensors='pt') - - return {'image': image, 'question': question, 'train': False} - - def caption_call( - self, data: Union[Image.Image, tuple, - Dict[str, Any]]) -> Dict[str, Any]: - if isinstance(data, Image.Image): + if isinstance(data, (Image.Image, str)): image = data elif isinstance(data, tuple): image = data[0] else: image = data['image'] + index = 0 + if isinstance(image, str): + image, index = self.image_open(image) image = image.convert('RGB') image = self.patch_resize_transform(image) - image = torch.stack([image], dim=0) - question = self.tokenizer('', return_tensors='pt') - - return {'image': image, 'question': question, 'train': False} + question = '' if self.cfg.task == Tasks.image_captioning \ + else data[1 if isinstance(data, tuple) else 'question'] + question = self.tokenizer( + question.lower(), + padding='max_length', + truncation=True, + max_length=self.tokenizer_max_length, + return_tensors='pt') + + if self.mode == ModeKeys.INFERENCE: + image = torch.stack([image], dim=0) + return {'image': image, 'question': question} + else: + answer = data['answer'] + answer = self.tokenizer( + answer, + padding='max_length', + truncation=True, + max_length=self.tokenizer_max_length, + return_tensors='pt') + output = { + 'image': image, + 'question_input_ids': question.input_ids.squeeze(), + 'question_attention_mask': question.attention_mask.squeeze(), + 'answer_input_ids': answer.input_ids.squeeze(), + 'answer_attention_mask': answer.attention_mask.squeeze(), + } + if self.cfg.task == Tasks.image_text_retrieval: + output['index'] = index + return output diff --git a/modelscope/preprocessors/nlp.py 
b/modelscope/preprocessors/nlp.py index 25576667..4882c477 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -5,10 +5,11 @@ import uuid from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np -from transformers import AutoTokenizer +from transformers import AutoTokenizer, BertTokenizerFast from modelscope.metainfo import Models, Preprocessors from modelscope.outputs import OutputKeys +from modelscope.utils.config import ConfigFields from modelscope.utils.constant import Fields, InputFields, ModeKeys from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.type_assert import type_assert @@ -21,7 +22,8 @@ __all__ = [ 'PairSentenceClassificationPreprocessor', 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', - 'TextErrorCorrectionPreprocessor' + 'TextErrorCorrectionPreprocessor', 'FaqQuestionAnsweringPreprocessor', + 'RelationExtractionPreprocessor' ] @@ -366,15 +368,20 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: if self._mode == ModeKeys.INFERENCE: return super().__call__(data) - src_txt = data['src_txt'] - tgt_txt = data['tgt_txt'] - src_rst = super().__call__(src_txt) - tgt_rst = super().__call__(tgt_txt) + src_rst = super().__call__(data['src_txt']) + src_input_ids = src_rst['input_ids'] + src_attention_mask = src_rst['attention_mask'] + if 'tgt_txt' in data: + labels = super().__call__(data['tgt_txt'])['input_ids'] + else: + labels = src_input_ids[1:] + src_input_ids = src_input_ids[:-1] + src_attention_mask = src_attention_mask[:-1] return { - 'src': src_rst['input_ids'], - 'tgt': tgt_rst['input_ids'], - 'mask_src': src_rst['attention_mask'] + 'input_ids': src_input_ids, + 'attention_mask': src_attention_mask, + 'labels': labels, } @@ -532,8 +539,13 @@ class NERPreprocessor(Preprocessor): self.model_dir: str = model_dir self.sequence_length = kwargs.pop('sequence_length', 512) - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=False) + self.is_transformer_based_model = 'lstm' not in model_dir + if self.is_transformer_based_model: + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=True) + else: + self.tokenizer = BertTokenizerFast.from_pretrained( + model_dir, use_fast=True) self.is_split_into_words = self.tokenizer.init_kwargs.get( 'is_split_into_words', False) @@ -597,6 +609,11 @@ class NERPreprocessor(Preprocessor): else: label_mask.append(1) offset_mapping.append(encodings['offset_mapping'][i]) + + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] return { 'text': text, 'input_ids': input_ids, @@ -606,6 +623,52 @@ class NERPreprocessor(Preprocessor): } +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.re_tokenizer) +class RelationExtractionPreprocessor(Preprocessor): + """The tokenizer preprocessor used in normal RE task. + + NOTE: This preprocessor may be merged with the TokenClassificationPreprocessor in the next edition. 
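+
+    Illustrative usage (model_dir stands for a directory containing the
+    tokenizer files):
+
+        >>> preprocessor = RelationExtractionPreprocessor(model_dir)
+        >>> features = preprocessor('you are so handsome.')
+        >>> sorted(features.keys())
+        ['attention_mask', 'input_ids', 'offsets', 'text']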
+ """ + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(*args, **kwargs) + + self.model_dir: str = model_dir + self.sequence_length = kwargs.pop('sequence_length', 512) + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=True) + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + + # preprocess the data for the model input + text = data + output = self.tokenizer([text], return_tensors='pt') + return { + 'text': text, + 'input_ids': output['input_ids'], + 'attention_mask': output['attention_mask'], + 'offsets': output[0].offsets + } + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_error_correction) class TextErrorCorrectionPreprocessor(Preprocessor): @@ -645,3 +708,86 @@ class TextErrorCorrectionPreprocessor(Preprocessor): sample = dict() sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths} return sample + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) +class FaqQuestionAnsweringPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + super(FaqQuestionAnsweringPreprocessor, self).__init__( + model_dir, pair=False, mode=ModeKeys.INFERENCE, **kwargs) + import os + from transformers import BertTokenizer + + from modelscope.utils.config import Config + from modelscope.utils.constant import ModelFile + self.tokenizer = BertTokenizer.from_pretrained(model_dir) + preprocessor_config = Config.from_file( + os.path.join(model_dir, ModelFile.CONFIGURATION)).get( + ConfigFields.preprocessor, {}) + self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) + self.label_dict = None + + def pad(self, samples, max_len): + result = [] + for sample in samples: + pad_len = max_len - len(sample[:max_len]) + result.append(sample[:max_len] + + [self.tokenizer.pad_token_id] * pad_len) + return result + + def set_label_dict(self, label_dict): + self.label_dict = label_dict + + def get_label(self, label_id): + assert self.label_dict is not None and label_id < len(self.label_dict) + return self.label_dict[label_id] + + def encode_plus(self, text): + return [ + self.tokenizer.cls_token_id + ] + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id] + + @type_assert(object, Dict) + def __call__(self, data: Dict[str, Any], + **preprocessor_param) -> Dict[str, Any]: + TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) + queryset = data['query_set'] + if not isinstance(queryset, list): + queryset = [queryset] + supportset = data['support_set'] + supportset = sorted(supportset, key=lambda d: d['label']) + + queryset_tokenized = [self.encode_plus(text) for text in queryset] + supportset_tokenized = [ + self.encode_plus(item['text']) for item in supportset + ] + + max_len = max( + [len(seq) for seq in queryset_tokenized + supportset_tokenized]) + max_len = min(TMP_MAX_LEN, max_len) + queryset_padded = self.pad(queryset_tokenized, max_len) + supportset_padded = self.pad(supportset_tokenized, max_len) + + supportset_labels_ori = [item['label'] for item in supportset] + label_dict = [] + for label in supportset_labels_ori: + if label not in label_dict: + label_dict.append(label) + self.set_label_dict(label_dict) + 
supportset_labels_ids = [ + label_dict.index(label) for label in supportset_labels_ori + ] + return { + 'query': queryset_padded, + 'support': supportset_padded, + 'support_labels': supportset_labels_ids + } + + def batch_encode(self, sentence_list: list, max_length=None): + if not max_length: + max_length = self.MAX_LEN + return self.tokenizer.batch_encode_plus( + sentence_list, padding=True, max_length=max_length) diff --git a/modelscope/preprocessors/ofa/text_to_image_synthesis.py b/modelscope/preprocessors/ofa/text_to_image_synthesis.py index ebedd6fc..83c4e28a 100644 --- a/modelscope/preprocessors/ofa/text_to_image_synthesis.py +++ b/modelscope/preprocessors/ofa/text_to_image_synthesis.py @@ -27,7 +27,8 @@ class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor): self.max_src_length = 64 def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - source = data['text'].lower().strip().split()[:self.max_src_length] + source = ' '.join( + data['text'].lower().strip().split()[:self.max_src_length]) source = 'what is the complete image? caption: {}'.format(source) inputs = self.get_inputs(source) sample = { diff --git a/modelscope/preprocessors/slp.py b/modelscope/preprocessors/slp.py new file mode 100644 index 00000000..d9c2d9b7 --- /dev/null +++ b/modelscope/preprocessors/slp.py @@ -0,0 +1,223 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +from transformers import BertTokenizerFast + +from modelscope.metainfo import Preprocessors +from modelscope.utils.constant import Fields +from modelscope.utils.hub import get_model_type, parse_label_mapping +from modelscope.utils.type_assert import type_assert +from .base import Preprocessor +from .builder import PREPROCESSORS + +__all__ = ['DocumentSegmentationPreprocessor'] + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.document_segmentation) +class DocumentSegmentationPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, config, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(*args, **kwargs) + + self.tokenizer = BertTokenizerFast.from_pretrained( + model_dir, + use_fast=True, + ) + self.question_column_name = 'labels' + self.context_column_name = 'sentences' + self.example_id_column_name = 'example_id' + self.label_to_id = {'B-EOP': 0, 'O': 1} + self.target_specical_ids = set() + self.target_specical_ids.add(self.tokenizer.eos_token_id) + self.max_seq_length = config.max_position_embeddings + self.label_list = ['B-EOP', 'O'] + + def __call__(self, examples) -> Dict[str, Any]: + questions = examples[self.question_column_name] + contexts = examples[self.context_column_name] + example_ids = examples[self.example_id_column_name] + num_examples = len(questions) + + sentences = [] + for sentence_list in contexts: + sentence_list = [_ + '[EOS]' for _ in sentence_list] + sentences.append(sentence_list) + + try: + tokenized_examples = self.tokenizer( + sentences, + is_split_into_words=True, + add_special_tokens=False, + return_token_type_ids=True, + return_attention_mask=True, + ) + except Exception as e: + print(str(e)) + return {} + + segment_ids = [] + token_seq_labels = [] + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_labels = questions[example_index] + example_labels = [ + self.label_to_id[_] if _ in self.label_to_id else -100 + for _ in example_labels + ] + example_token_labels = [] + segment_id = [] + cur_seg_id = 1 + for 
token_index in range(len(example_input_ids)): + if example_input_ids[token_index] in self.target_specical_ids: + example_token_labels.append(example_labels[cur_seg_id - 1]) + segment_id.append(cur_seg_id) + cur_seg_id += 1 + else: + example_token_labels.append(-100) + segment_id.append(cur_seg_id) + + segment_ids.append(segment_id) + token_seq_labels.append(example_token_labels) + + tokenized_examples['segment_ids'] = segment_ids + tokenized_examples['token_seq_labels'] = token_seq_labels + + new_segment_ids = [] + new_token_seq_labels = [] + new_input_ids = [] + new_token_type_ids = [] + new_attention_mask = [] + new_example_ids = [] + new_sentences = [] + + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_token_type_ids = tokenized_examples['token_type_ids'][ + example_index] + example_attention_mask = tokenized_examples['attention_mask'][ + example_index] + example_segment_ids = tokenized_examples['segment_ids'][ + example_index] + example_token_seq_labels = tokenized_examples['token_seq_labels'][ + example_index] + example_sentences = contexts[example_index] + example_id = example_ids[example_index] + example_total_num_sentences = len(questions[example_index]) + example_total_num_tokens = len( + tokenized_examples['input_ids'][example_index]) + accumulate_length = [ + i for i, x in enumerate(tokenized_examples['input_ids'] + [example_index]) + if x == self.tokenizer.eos_token_id + ] + samples_boundary = [] + left_index = 0 + sent_left_index = 0 + sent_i = 0 + + # for sent_i, length in enumerate(accumulate_length): + while sent_i < len(accumulate_length): + length = accumulate_length[sent_i] + right_index = length + 1 + sent_right_index = sent_i + 1 + if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: + samples_boundary.append([left_index, right_index]) + + sample_input_ids = [ + self.tokenizer.cls_token_id + ] + example_input_ids[left_index:right_index] + sample_input_ids = sample_input_ids[:self.max_seq_length] + + sample_token_type_ids = [ + 0 + ] + example_token_type_ids[left_index:right_index] + sample_token_type_ids = sample_token_type_ids[:self. + max_seq_length] + + sample_attention_mask = [ + 1 + ] + example_attention_mask[left_index:right_index] + sample_attention_mask = sample_attention_mask[:self. + max_seq_length] + + sample_segment_ids = [ + 0 + ] + example_segment_ids[left_index:right_index] + sample_segment_ids = sample_segment_ids[:self. + max_seq_length] + + sample_token_seq_labels = [ + -100 + ] + example_token_seq_labels[left_index:right_index] + sample_token_seq_labels = sample_token_seq_labels[:self. 
+ max_seq_length] + + if sent_right_index - 1 == sent_left_index: + left_index = right_index + sample_input_ids[-1] = self.tokenizer.eos_token_id + sample_token_seq_labels[-1] = -100 + else: + left_index = accumulate_length[sent_i - 1] + 1 + if sample_token_seq_labels[-1] != -100: + sample_token_seq_labels[-1] = -100 + + if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index] + sent_left_index = sent_right_index + sent_i += 1 + else: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index - 1] + sent_left_index = sent_right_index - 1 + + if (len([_ for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences) - 1 and (len([ + _ + for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences): + tmp = [] + for w_i, w, l in zip( + sample_input_ids, + self.tokenizer.decode(sample_input_ids).split( + ' '), sample_token_seq_labels): + tmp.append((w_i, w, l)) + while len(sample_input_ids) < self.max_seq_length: + sample_input_ids.append(self.tokenizer.pad_token_id) + sample_token_type_ids.append(0) + sample_attention_mask.append(0) + sample_segment_ids.append(example_total_num_sentences + + 1) + sample_token_seq_labels.append(-100) + + new_input_ids.append(sample_input_ids) + new_token_type_ids.append(sample_token_type_ids) + new_attention_mask.append(sample_attention_mask) + new_segment_ids.append(sample_segment_ids) + new_token_seq_labels.append(sample_token_seq_labels) + new_example_ids.append(example_id) + new_sentences.append(sample_sentences) + else: + sent_i += 1 + continue + + output_samples = {} + + output_samples['input_ids'] = new_input_ids + output_samples['token_type_ids'] = new_token_type_ids + output_samples['attention_mask'] = new_attention_mask + + output_samples['segment_ids'] = new_segment_ids + output_samples['example_id'] = new_example_ids + output_samples['labels'] = new_token_seq_labels + output_samples['sentences'] = new_sentences + + return output_samples diff --git a/modelscope/preprocessors/video.py b/modelscope/preprocessors/video.py index 36110d1b..f693cd9e 100644 --- a/modelscope/preprocessors/video.py +++ b/modelscope/preprocessors/video.py @@ -9,35 +9,56 @@ import torchvision.transforms._transforms_video as transforms from decord import VideoReader from torchvision.transforms import Compose +from modelscope.metainfo import Preprocessors +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .base import Preprocessor +from .builder import PREPROCESSORS -def ReadVideoData(cfg, video_path): + +def ReadVideoData(cfg, + video_path, + num_spatial_crops_override=None, + num_temporal_views_override=None): """ simple interface to load video frames from file Args: cfg (Config): The global config object. 
video_path (str): video file path + num_spatial_crops_override (int): the spatial crops per clip + num_temporal_views_override (int): the temporal clips per video + Returns: + data (Tensor): the normalized video clips for model inputs """ - data = _decode_video(cfg, video_path) - transform = kinetics400_tranform(cfg) + data = _decode_video(cfg, video_path, num_temporal_views_override) + if num_spatial_crops_override is not None: + num_spatial_crops = num_spatial_crops_override + transform = kinetics400_tranform(cfg, num_spatial_crops_override) + else: + num_spatial_crops = cfg.TEST.NUM_SPATIAL_CROPS + transform = kinetics400_tranform(cfg, cfg.TEST.NUM_SPATIAL_CROPS) data_list = [] for i in range(data.size(0)): - for j in range(cfg.TEST.NUM_SPATIAL_CROPS): + for j in range(num_spatial_crops): transform.transforms[1].set_spatial_index(j) data_list.append(transform(data[i])) return torch.stack(data_list, dim=0) -def kinetics400_tranform(cfg): +def kinetics400_tranform(cfg, num_spatial_crops): """ Configs the transform for the kinetics-400 dataset. We apply controlled spatial cropping and normalization. Args: cfg (Config): The global config object. + num_spatial_crops (int): the spatial crops per clip + Returns: + transform_function (Compose): the transform function for input clips """ resize_video = KineticsResizedCrop( short_side_range=[cfg.DATA.TEST_SCALE, cfg.DATA.TEST_SCALE], crop_size=cfg.DATA.TEST_CROP_SIZE, - num_spatial_crops=cfg.TEST.NUM_SPATIAL_CROPS) + num_spatial_crops=num_spatial_crops) std_transform_list = [ transforms.ToTensorVideo(), resize_video, transforms.NormalizeVideo( @@ -54,17 +75,17 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx, vid_length (int): the length of the whole video (valid selection range). vid_fps (int): the original video fps target_fps (int): the normalized video fps - clip_idx (int): -1 for random temporal sampling, and positive values for - sampling specific clip from the video + clip_idx (int): -1 for random temporal sampling, and positive values for sampling specific + clip from the video num_clips (int): the total clips to be sampled from each video. - combined with clip_idx, the sampled video is the "clip_idx-th" - video from "num_clips" videos. + combined with clip_idx, the sampled video is the "clip_idx-th" video from + "num_clips" videos. num_frames (int): number of frames in each sampled clips. interval (int): the interval to sample each frame. minus_interval (bool): control the end index Returns: index (tensor): the sampled frame indexes - """ + """ if num_frames == 1: index = [random.randint(0, vid_length - 1)] else: @@ -72,7 +93,10 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx, clip_length = num_frames * interval * vid_fps / target_fps max_idx = max(vid_length - clip_length, 0) - start_idx = clip_idx * math.floor(max_idx / (num_clips - 1)) + if num_clips == 1: + start_idx = max_idx / 2 + else: + start_idx = clip_idx * math.floor(max_idx / (num_clips - 1)) if minus_interval: end_idx = start_idx + clip_length - interval else: @@ -84,59 +108,79 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx, return index -def _decode_video_frames_list(cfg, frames_list, vid_fps): +def _decode_video_frames_list(cfg, + frames_list, + vid_fps, + num_temporal_views_override=None): """ Decodes the video given the numpy frames. Args: cfg (Config): The global config object. frames_list (list): all frames for a video, the frames should be numpy array. vid_fps (int): the fps of this video. 
+        num_temporal_views_override (int): the temporal clips per video
     Returns:
         frames (Tensor): video tensor data
     """
     assert isinstance(frames_list, list)
-    num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
+    if num_temporal_views_override is not None:
+        num_clips_per_video = num_temporal_views_override
+    else:
+        num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
     frame_list = []
     for clip_idx in range(num_clips_per_video):
         # for each clip in the video,
         # a list is generated before decoding the specified frames from the video
         list_ = _interval_based_sampling(
-            len(frames_list), vid_fps, cfg.DATA.TARGET_FPS, clip_idx,
-            num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES,
-            cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL)
+            len(frames_list),
+            vid_fps,
+            cfg.DATA.TARGET_FPS,
+            clip_idx,
+            num_clips_per_video,
+            cfg.DATA.NUM_INPUT_FRAMES,
+            cfg.DATA.SAMPLING_RATE,
+            cfg.DATA.MINUS_INTERVAL,
+        )
         frames = None
         frames = torch.from_numpy(
-            np.stack([frames_list[l_index] for l_index in list_.tolist()],
-                     axis=0))
+            np.stack([frames_list[index] for index in list_.tolist()], axis=0))
         frame_list.append(frames)
     frames = torch.stack(frame_list)
-    if num_clips_per_video == 1:
-        frames = frames.squeeze(0)
-
     return frames


-def _decode_video(cfg, path):
+def _decode_video(cfg, path, num_temporal_views_override=None):
     """ Decodes the video given the numpy frames.
     Args:
+        cfg (Config): The global config object.
         path (str): video file path.
+        num_temporal_views_override (int): the temporal clips per video
     Returns:
         frames (Tensor): video tensor data
     """
     vr = VideoReader(path)
-
-    num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
+    if num_temporal_views_override is not None:
+        num_clips_per_video = num_temporal_views_override
+    else:
+        num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
     frame_list = []
     for clip_idx in range(num_clips_per_video):
         # for each clip in the video,
         # a list is generated before decoding the specified frames from the video
         list_ = _interval_based_sampling(
-            len(vr), vr.get_avg_fps(), cfg.DATA.TARGET_FPS, clip_idx,
-            num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES,
-            cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL)
+            len(vr),
+            vr.get_avg_fps(),
+            cfg.DATA.TARGET_FPS,
+            clip_idx,
+            num_clips_per_video,
+            cfg.DATA.NUM_INPUT_FRAMES,
+            cfg.DATA.SAMPLING_RATE,
+            cfg.DATA.MINUS_INTERVAL,
+        )
         frames = None
         if path.endswith('.avi'):
             append_list = torch.arange(0, list_[0], 4)
@@ -149,8 +193,6 @@ def _decode_video(cfg, path):
             vr.get_batch(list_).to_dlpack()).clone()
         frame_list.append(frames)
     frames = torch.stack(frame_list)
-    if num_clips_per_video == 1:
-        frames = frames.squeeze(0)
     del vr
     return frames
@@ -218,6 +260,29 @@ class KineticsResizedCrop(object):
             y = y_max // 2
         return new_clip[:, :, y:y + self.crop_size, x:x + self.crop_size]
+
+    def _get_random_crop(self, clip):
+        _, _, clip_height, clip_width = clip.shape
+
+        short_side = min(clip_height, clip_width)
+        long_side = max(clip_height, clip_width)
+        new_short_side = int(random.uniform(*self.short_side_range))
+        new_long_side = int(long_side / short_side * new_short_side)
+        if clip_height < clip_width:
+            new_clip_height = new_short_side
+            new_clip_width = new_long_side
+        else:
+            new_clip_height = new_long_side
+            new_clip_width = new_short_side
+
+        new_clip = torch.nn.functional.interpolate(
+            clip, size=(new_clip_height, new_clip_width), mode='bilinear')
+
+        x_max = int(new_clip_width - self.crop_size)
+        y_max = int(new_clip_height - self.crop_size)
+        x = int(random.uniform(0, x_max))
+        y = int(random.uniform(0, y_max))
+        return new_clip[:,
:, y:y + self.crop_size, x:x + self.crop_size] + def set_spatial_index(self, idx): """Set the spatial cropping index for controlled cropping.. Args: @@ -227,3 +292,42 @@ class KineticsResizedCrop(object): def __call__(self, clip): return self._get_controlled_crop(clip) + + +@PREPROCESSORS.register_module( + Fields.cv, module_name=Preprocessors.movie_scene_segmentation_preprocessor) +class MovieSceneSegmentationPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + """ + movie scene segmentation preprocessor + """ + super().__init__(*args, **kwargs) + + self.is_train = kwargs.pop('is_train', True) + self.preprocessor_train_cfg = kwargs.pop(ModeKeys.TRAIN, None) + self.preprocessor_test_cfg = kwargs.pop(ModeKeys.EVAL, None) + self.num_keyframe = kwargs.pop('num_keyframe', 3) + + from .movie_scene_segmentation import get_transform + self.train_transform = get_transform(self.preprocessor_train_cfg) + self.test_transform = get_transform(self.preprocessor_test_cfg) + + def train(self): + self.is_train = True + return + + def eval(self): + self.is_train = False + return + + @type_assert(object, object) + def __call__(self, results): + if self.is_train: + transforms = self.train_transform + else: + transforms = self.test_transform + + results = torch.stack(transforms(results), dim=0) + results = results.view(-1, self.num_keyframe, 3, 224, 224) + return results diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index 17ed7f3c..8f8938c8 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -4,10 +4,12 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: + from .audio.ans_trainer import ANSTrainer from .base import DummyTrainer from .builder import build_trainer from .cv import (ImageInstanceSegmentationTrainer, - ImagePortraitEnhancementTrainer) + ImagePortraitEnhancementTrainer, + MovieSceneSegmentationTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer @@ -15,11 +17,12 @@ if TYPE_CHECKING: else: _import_structure = { + 'audio.ans_trainer': ['ANSTrainer'], 'base': ['DummyTrainer'], 'builder': ['build_trainer'], 'cv': [ 'ImageInstanceSegmentationTrainer', - 'ImagePortraitEnhancementTrainer' + 'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer' ], 'multi_modal': ['CLIPTrainer'], 'nlp': ['SequenceClassificationTrainer'], diff --git a/modelscope/trainers/audio/__init__.py b/modelscope/trainers/audio/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/trainers/audio/ans_trainer.py b/modelscope/trainers/audio/ans_trainer.py new file mode 100644 index 00000000..f782b836 --- /dev/null +++ b/modelscope/trainers/audio/ans_trainer.py @@ -0,0 +1,57 @@ +import time +from typing import List, Optional, Union + +from datasets import Dataset + +from modelscope.metainfo import Trainers +from modelscope.preprocessors import Preprocessor +from modelscope.trainers import EpochBasedTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.constant import TrainerStages +from modelscope.utils.data_utils import to_device +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@TRAINERS.register_module(module_name=Trainers.speech_frcrn_ans_cirm_16k) +class ANSTrainer(EpochBasedTrainer): + """ + A trainer is used for acoustic noise suppression. + Override train_loop() to use dataset just one time. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def train_loop(self, data_loader): + """ + Update epoch by step number, based on super method. + """ + self.invoke_hook(TrainerStages.before_run) + self._epoch = 0 + kwargs = {} + self.model.train() + enumerated = enumerate(data_loader) + for _ in range(self._epoch, self._max_epochs): + self.invoke_hook(TrainerStages.before_train_epoch) + self._inner_iter = 0 + for i, data_batch in enumerated: + data_batch = to_device(data_batch, self.device) + self.data_batch = data_batch + self._inner_iter += 1 + self.invoke_hook(TrainerStages.before_train_iter) + self.train_step(self.model, data_batch, **kwargs) + self.invoke_hook(TrainerStages.after_train_iter) + del self.data_batch + self._iter += 1 + if self._inner_iter >= self.iters_per_epoch: + break + + self.invoke_hook(TrainerStages.after_train_epoch) + self._epoch += 1 + + self.invoke_hook(TrainerStages.after_run) + + def prediction_step(self, model, inputs): + pass diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py index 99c2aea5..4c65870e 100644 --- a/modelscope/trainers/cv/__init__.py +++ b/modelscope/trainers/cv/__init__.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: from .image_instance_segmentation_trainer import \ ImageInstanceSegmentationTrainer from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer + from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer else: _import_structure = { @@ -14,6 +15,7 @@ else: ['ImageInstanceSegmentationTrainer'], 'image_portrait_enhancement_trainer': ['ImagePortraitEnhancementTrainer'], + 'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'] } import sys diff --git a/modelscope/trainers/cv/movie_scene_segmentation_trainer.py b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py new file mode 100644 index 00000000..ee4dd849 --- /dev/null +++ b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py @@ -0,0 +1,20 @@ +from modelscope.metainfo import Trainers +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.trainer import EpochBasedTrainer + + +@TRAINERS.register_module(module_name=Trainers.movie_scene_segmentation) +class MovieSceneSegmentationTrainer(EpochBasedTrainer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def train(self, *args, **kwargs): + super().train(*args, **kwargs) + + def evaluate(self, *args, **kwargs): + metric_values = super().evaluate(*args, **kwargs) + return metric_values + + def prediction_step(self, model, inputs): + pass diff --git a/modelscope/trainers/easycv/__init__.py b/modelscope/trainers/easycv/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/trainers/easycv/trainer.py b/modelscope/trainers/easycv/trainer.py new file mode 100644 index 00000000..dee06a41 --- /dev/null +++ b/modelscope/trainers/easycv/trainer.py @@ -0,0 +1,175 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from functools import partial +from typing import Callable, Optional, Tuple, Union + +import torch +from torch import nn +from torch.utils.data import Dataset + +from modelscope.metainfo import Trainers +from modelscope.models.base import TorchModel +from modelscope.msdatasets import MsDataset +from modelscope.preprocessors import Preprocessor +from modelscope.trainers import EpochBasedTrainer +from modelscope.trainers.base import TRAINERS +from modelscope.trainers.easycv.utils import register_util +from modelscope.trainers.hooks import HOOKS +from modelscope.trainers.parallel.builder import build_parallel +from modelscope.trainers.parallel.utils import is_parallel +from modelscope.utils.config import Config +from modelscope.utils.constant import DEFAULT_MODEL_REVISION +from modelscope.utils.import_utils import LazyImportModule +from modelscope.utils.registry import default_group + + +@TRAINERS.register_module(module_name=Trainers.easycv) +class EasyCVEpochBasedTrainer(EpochBasedTrainer): + """Epoch based Trainer for EasyCV. + + Args: + task: Task name. + cfg_file(str): The config file of EasyCV. + model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir + or a model id. If model is None, build_model method will be called. + train_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*): + The dataset to use for training. + Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a + distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a + `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will + manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally + sets the seed of the RNGs used. + eval_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation. + preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor. + NOTE: If the preprocessor has been called before the dataset fed into this trainer by user's custom code, + this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file. + Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and + this preprocessing action will be executed every time the dataset's __getitem__ is called. + optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple + containing the optimizer and the scheduler to use. + max_epochs: (int, optional): Total training epochs. 
+ """ + + def __init__( + self, + task: str, + cfg_file: Optional[str] = None, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + arg_parse_fn: Optional[Callable] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Preprocessor] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + **kwargs): + + self.task = task + register_util.register_parallel() + register_util.register_part_mmcv_hooks_to_ms() + + super(EasyCVEpochBasedTrainer, self).__init__( + model=model, + cfg_file=cfg_file, + arg_parse_fn=arg_parse_fn, + preprocessor=preprocessor, + optimizers=optimizers, + model_revision=model_revision, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + **kwargs) + + # reset data_collator + from mmcv.parallel import collate + + self.train_data_collator = partial( + collate, + samples_per_gpu=self.cfg.train.dataloader.batch_size_per_gpu) + self.eval_data_collator = partial( + collate, + samples_per_gpu=self.cfg.evaluation.dataloader.batch_size_per_gpu) + + # Register easycv hooks dynamicly. If the hook already exists in modelscope, + # the hook in modelscope will be used, otherwise register easycv hook into ms. + # We must manually trigger lazy import to detect whether the hook is in modelscope. + # TODO: use ast index to detect whether the hook is in modelscope + for h_i in self.cfg.train.get('hooks', []): + sig = ('HOOKS', default_group, h_i['type']) + LazyImportModule.import_module(sig) + if h_i['type'] not in HOOKS._modules[default_group]: + if h_i['type'] in [ + 'TensorboardLoggerHookV2', 'WandbLoggerHookV2' + ]: + raise ValueError( + 'Not support hook %s now, we will support it in the future!' + % h_i['type']) + register_util.register_hook_to_ms(h_i['type'], self.logger) + + # reset parallel + if not self._dist: + assert not is_parallel( + self.model + ), 'Not support model wrapped by custom parallel if not in distributed mode!' + dp_cfg = dict( + type='MMDataParallel', + module=self.model, + device_ids=[torch.cuda.current_device()]) + self.model = build_parallel(dp_cfg) + + def create_optimizer_and_scheduler(self): + """ Create optimizer and lr scheduler + """ + optimizer, lr_scheduler = self.optimizers + if optimizer is None: + optimizer_cfg = self.cfg.train.get('optimizer', None) + else: + optimizer_cfg = None + + optim_options = {} + if optimizer_cfg is not None: + optim_options = optimizer_cfg.pop('options', {}) + from easycv.apis.train import build_optimizer + optimizer = build_optimizer(self.model, optimizer_cfg) + + if lr_scheduler is None: + lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None) + else: + lr_scheduler_cfg = None + + lr_options = {} + # Adapt to mmcv lr scheduler hook. 
+ # Please refer to: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py + if lr_scheduler_cfg is not None: + assert optimizer is not None + lr_options = lr_scheduler_cfg.pop('options', {}) + assert 'policy' in lr_scheduler_cfg + policy_type = lr_scheduler_cfg.pop('policy') + if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'LrUpdaterHook' + lr_scheduler_cfg['type'] = hook_type + + self.cfg.train.lr_scheduler_hook = lr_scheduler_cfg + + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + + return self.optimizer, self.lr_scheduler, optim_options, lr_options + + def to_parallel(self, model) -> Union[nn.Module, TorchModel]: + if self.cfg.get('parallel', None) is not None: + self.cfg.parallel.update( + dict(module=model, device_ids=[torch.cuda.current_device()])) + return build_parallel(self.cfg.parallel) + + dp_cfg = dict( + type='MMDistributedDataParallel', + module=model, + device_ids=[torch.cuda.current_device()]) + + return build_parallel(dp_cfg) + + def rebuild_config(self, cfg: Config): + cfg.task = self.task + + return cfg diff --git a/modelscope/trainers/easycv/utils/__init__.py b/modelscope/trainers/easycv/utils/__init__.py new file mode 100644 index 00000000..23cfa36a --- /dev/null +++ b/modelscope/trainers/easycv/utils/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hooks import AddLrLogHook + from .metric import EasyCVMetric + +else: + _import_structure = {'hooks': ['AddLrLogHook'], 'metric': ['EasyCVMetric']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/trainers/easycv/utils/hooks.py b/modelscope/trainers/easycv/utils/hooks.py new file mode 100644 index 00000000..62bc6d1e --- /dev/null +++ b/modelscope/trainers/easycv/utils/hooks.py @@ -0,0 +1,29 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.trainers.hooks import HOOKS, Priority +from modelscope.trainers.hooks.lr_scheduler_hook import LrSchedulerHook +from modelscope.utils.constant import LogKeys + + +@HOOKS.register_module(module_name='AddLrLogHook') +class AddLrLogHook(LrSchedulerHook): + """For EasyCV to adapt to ModelScope, the lr log of EasyCV is added in the trainer, + but the trainer of ModelScope does not and it is added in the lr scheduler hook. + But The lr scheduler hook used by EasyCV is the hook of mmcv, and there is no lr log. + It will be deleted in the future. + """ + PRIORITY = Priority.NORMAL + + def __init__(self): + pass + + def before_run(self, trainer): + pass + + def before_train_iter(self, trainer): + trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) + + def before_train_epoch(self, trainer): + trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) + + def after_train_epoch(self, trainer): + pass diff --git a/modelscope/trainers/easycv/utils/metric.py b/modelscope/trainers/easycv/utils/metric.py new file mode 100644 index 00000000..53937b67 --- /dev/null +++ b/modelscope/trainers/easycv/utils/metric.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
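
One detail of create_optimizer_and_scheduler above worth making explicit: the lr 'policy' string from the config is turned into an mmcv LrUpdaterHook type name purely by string manipulation, which is why register_part_mmcv_hooks_to_ms below must have those hooks registered by name. A small sketch of the mapping (the sample policies are illustrative):

def policy_to_hook_type(policy_type: str) -> str:
    # all-lowercase policies get title-cased: 'cosine' -> 'Cosine'
    if policy_type == policy_type.lower():
        policy_type = policy_type.title()
    return policy_type + 'LrUpdaterHook'

assert policy_to_hook_type('cosine') == 'CosineLrUpdaterHook'
assert policy_to_hook_type('YOLOX') == 'YOLOXLrUpdaterHook'
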
+import itertools
+from typing import Dict
+
+import numpy as np
+import torch
+
+from modelscope.metrics.base import Metric
+from modelscope.metrics.builder import METRICS
+
+
+@METRICS.register_module(module_name='EasyCVMetric')
+class EasyCVMetric(Metric):
+    """Adapt the EasyCV evaluator to the ModelScope Metric interface.
+    """
+
+    def __init__(self, trainer=None, evaluators=None, *args, **kwargs):
+        from easycv.core.evaluation.builder import build_evaluator
+
+        self.trainer = trainer
+        self.evaluators = build_evaluator(evaluators)
+        self.preds = []
+        self.groundtruths = []
+
+    def add(self, outputs: Dict, inputs: Dict):
+        self.preds.append(outputs)
+        del inputs
+
+    def evaluate(self):
+        results = {}
+        for _, batch in enumerate(self.preds):
+            for k, v in batch.items():
+                if k not in results:
+                    results[k] = []
+                results[k].append(v)
+
+        for k, v in results.items():
+            if len(v) == 0:
+                raise ValueError(f'empty result for {k}')
+
+            if isinstance(v[0], torch.Tensor):
+                results[k] = torch.cat(v, 0)
+            elif isinstance(v[0], (list, np.ndarray)):
+                results[k] = list(itertools.chain.from_iterable(v))
+            else:
+                raise ValueError(
+                    f'value of batch prediction dict should only be tensor or list, but {k} is of type {type(v[0])}'
+                )
+
+        metric_values = self.trainer.eval_dataset.evaluate(
+            results, self.evaluators)
+        return metric_values
diff --git a/modelscope/trainers/easycv/utils/register_util.py b/modelscope/trainers/easycv/utils/register_util.py
new file mode 100644
index 00000000..f80eaace
--- /dev/null
+++ b/modelscope/trainers/easycv/utils/register_util.py
@@ -0,0 +1,59 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import inspect
+import logging
+
+from modelscope.trainers.hooks import HOOKS
+from modelscope.trainers.parallel.builder import PARALLEL
+
+
+def register_parallel():
+    from mmcv.parallel import MMDistributedDataParallel, MMDataParallel
+
+    PARALLEL.register_module(
+        module_name='MMDistributedDataParallel',
+        module_cls=MMDistributedDataParallel)
+    PARALLEL.register_module(
+        module_name='MMDataParallel', module_cls=MMDataParallel)
+
+
+def register_hook_to_ms(hook_name, logger=None):
+    """Register an EasyCV hook to ModelScope."""
+    from easycv.hooks import HOOKS as _EV_HOOKS
+
+    if hook_name not in _EV_HOOKS._module_dict:
+        raise ValueError(
+            f'Hook "{hook_name}" not found in the EasyCV hook registries!')
+
+    obj = _EV_HOOKS._module_dict[hook_name]
+    HOOKS.register_module(module_name=hook_name, module_cls=obj)
+
+    log_str = f'Register hook "{hook_name}" to modelscope hooks.'
+    logger.info(log_str) if logger is not None else logging.info(log_str)
+
+
+def register_part_mmcv_hooks_to_ms():
+    """Register required mmcv hooks to ModelScope.
+    Currently we only register the lr scheduler hooks from EasyCV and mmcv.
+ Please refer to: + EasyCV: https://github.com/alibaba/EasyCV/blob/master/easycv/hooks/lr_update_hook.py + mmcv: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py + """ + from mmcv.runner.hooks import lr_updater + from mmcv.runner.hooks import HOOKS as _MMCV_HOOKS + from easycv.hooks import StepFixCosineAnnealingLrUpdaterHook, YOLOXLrUpdaterHook + from easycv.hooks.logger import PreLoggerHook + + mmcv_hooks_in_easycv = [('StepFixCosineAnnealingLrUpdaterHook', + StepFixCosineAnnealingLrUpdaterHook), + ('YOLOXLrUpdaterHook', YOLOXLrUpdaterHook), + ('PreLoggerHook', PreLoggerHook)] + + members = inspect.getmembers(lr_updater) + members.extend(mmcv_hooks_in_easycv) + + for name, obj in members: + if name in _MMCV_HOOKS._module_dict: + HOOKS.register_module( + module_name=name, + module_cls=obj, + ) diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index fc0281a1..cf7a0f7a 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -1,10 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import json + from modelscope import __version__ from modelscope.metainfo import Hooks from modelscope.utils.checkpoint import save_checkpoint -from modelscope.utils.constant import LogKeys +from modelscope.utils.constant import LogKeys, ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.torch_utils import is_master from .builder import HOOKS @@ -73,6 +75,25 @@ class CheckpointHook(Hook): self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth') save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) + self._save_pretrained(trainer) + + def _save_pretrained(self, trainer): + if self.is_last_epoch(trainer) and self.by_epoch: + output_dir = os.path.join(self.save_dir, + ModelFile.TRAIN_OUTPUT_DIR) + from modelscope.trainers.parallel.utils import is_parallel + + if is_parallel(trainer.model): + model = trainer.model.module + else: + model = trainer.model + + if hasattr(model, 'save_pretrained'): + model.save_pretrained( + output_dir, + ModelFile.TORCH_MODEL_BIN_FILE, + save_function=save_checkpoint, + config=trainer.cfg.to_dict()) def after_train_iter(self, trainer): if self.by_epoch: @@ -166,3 +187,4 @@ class BestCkptSaverHook(CheckpointHook): ) save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) self._best_ckpt_file = cur_save_name + self._save_pretrained(trainer) diff --git a/modelscope/trainers/hooks/logger/base.py b/modelscope/trainers/hooks/logger/base.py index e1da251f..684c4a8c 100644 --- a/modelscope/trainers/hooks/logger/base.py +++ b/modelscope/trainers/hooks/logger/base.py @@ -60,6 +60,18 @@ class LoggerHook(Hook): else: return False + def fetch_tensor(self, trainer, n=0): + """Fetch latest n values or all values, process tensor type, convert to numpy for dump logs.""" + assert n >= 0 + for key in trainer.log_buffer.val_history: + values = trainer.log_buffer.val_history[key][-n:] + + for i, v in enumerate(values): + if isinstance(v, torch.Tensor): + values[i] = v.clone().detach().cpu().numpy() + + trainer.log_buffer.val_history[key][-n:] = values + def get_epoch(self, trainer): if trainer.mode in [ModeKeys.TRAIN, ModeKeys.EVAL]: epoch = trainer.epoch + 1 @@ -88,11 +100,14 @@ class LoggerHook(Hook): def after_train_iter(self, trainer): if self.by_epoch and self.every_n_inner_iters(trainer, self.interval): + self.fetch_tensor(trainer, self.interval) trainer.log_buffer.average(self.interval) elif not 
self.by_epoch and self.every_n_iters(trainer, self.interval): + self.fetch_tensor(trainer, self.interval) trainer.log_buffer.average(self.interval) elif self.end_of_epoch(trainer) and not self.ignore_last: # not precise but more stable + self.fetch_tensor(trainer, self.interval) trainer.log_buffer.average(self.interval) if trainer.log_buffer.ready: @@ -107,6 +122,7 @@ class LoggerHook(Hook): trainer.log_buffer.clear_output() def after_val_epoch(self, trainer): + self.fetch_tensor(trainer) trainer.log_buffer.average() self.log(trainer) if self.reset_flag: diff --git a/modelscope/trainers/nlp/csanmt_translation_trainer.py b/modelscope/trainers/nlp/csanmt_translation_trainer.py index 067c1d83..62ae91a8 100644 --- a/modelscope/trainers/nlp/csanmt_translation_trainer.py +++ b/modelscope/trainers/nlp/csanmt_translation_trainer.py @@ -241,8 +241,10 @@ def input_fn(src_file, trg_dataset = tf.data.TextLineDataset(trg_file) src_trg_dataset = tf.data.Dataset.zip((src_dataset, trg_dataset)) src_trg_dataset = src_trg_dataset.map( - lambda src, trg: - (tf.string_split([src]).values, tf.string_split([trg]).values), + lambda src, trg: (tf.string_split([src]), tf.string_split([trg])), + num_parallel_calls=10).prefetch(1000000) + src_trg_dataset = src_trg_dataset.map( + lambda src, trg: (src.values, trg.values), num_parallel_calls=10).prefetch(1000000) src_trg_dataset = src_trg_dataset.map( lambda src, trg: (src_vocab.lookup(src), trg_vocab.lookup(trg)), diff --git a/modelscope/trainers/optimizer/__init__.py b/modelscope/trainers/optimizer/__init__.py index 884f3043..9962c2c2 100644 --- a/modelscope/trainers/optimizer/__init__.py +++ b/modelscope/trainers/optimizer/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from .builder import OPTIMIZERS, build_optimizer +from .child_tuning_adamw_optimizer import ChildTuningAdamW -__all__ = ['OPTIMIZERS', 'build_optimizer'] +__all__ = ['OPTIMIZERS', 'build_optimizer', 'ChildTuningAdamW'] diff --git a/modelscope/trainers/optimizer/builder.py b/modelscope/trainers/optimizer/builder.py index 4d772dd9..f43768d6 100644 --- a/modelscope/trainers/optimizer/builder.py +++ b/modelscope/trainers/optimizer/builder.py @@ -20,7 +20,10 @@ def build_optimizer(model: torch.nn.Module, """ if hasattr(model, 'module'): model = model.module - cfg.params = model.parameters() + + if default_args is None: + default_args = {} + default_args['params'] = model.parameters() return build_from_cfg( cfg, OPTIMIZERS, group_key=default_group, default_args=default_args) diff --git a/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py b/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py new file mode 100644 index 00000000..d004071f --- /dev/null +++ b/modelscope/trainers/optimizer/child_tuning_adamw_optimizer.py @@ -0,0 +1,188 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
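
For context on the build_optimizer change above: the model parameters are now injected through default_args instead of being written into the shared cfg, so the same cfg can be reused across builds. A hedged sketch of the resulting call flow (build_from_cfg effectively merges cfg and default_args into constructor kwargs; the SGD settings are made-up values):

import torch

cfg = {'type': 'SGD', 'lr': 0.01}  # stays untouched and reusable
model = torch.nn.Linear(4, 2)

kwargs = {k: v for k, v in cfg.items() if k != 'type'}
kwargs['params'] = model.parameters()  # the role default_args now plays
optimizer = torch.optim.SGD(**kwargs)
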
+import math +import types +from typing import Callable, Iterable, Tuple + +import numpy as np +import torch +from torch.distributions.bernoulli import Bernoulli +from torch.optim import Optimizer + +from modelscope.utils.logger import get_logger +from .builder import OPTIMIZERS, default_group + +logger = get_logger(__name__) + +__all__ = ['calculate_fisher', 'ChildTuningAdamW'] + + +def calculate_fisher(model: torch.nn.Module, + data_loader, + forward_step, + reserve_p, + grad_clip=None): + + gradient_mask = dict() + model.train() + for name, params in model.named_parameters(): + if 'layer' in name: + gradient_mask[params] = params.new_zeros(params.size()) + + iters = len(data_loader) + for inputs in data_loader: + loss = forward_step(model, inputs) + loss.backward() + for name, params in model.named_parameters(): + if 'layer' in name: + if grad_clip is not None: + torch.nn.utils.clip_grad_norm_(params, **grad_clip) + gradient_mask[params] += (params.grad**2) / iters + model.zero_grad() + + logger.info('Calculate Fisher Information...') + + # Numpy + r = None + for k, v in gradient_mask.items(): + v = v.view(-1).cpu().numpy() + if r is None: + r = v + else: + r = np.append(r, v) + polar = np.percentile(r, (1 - reserve_p) * 100) + for k in gradient_mask: + gradient_mask[k] = gradient_mask[k] >= polar + print('Polar => {}'.format(polar)) + + # TODO: pytorch: torch.kthvalue + + return gradient_mask + + +@OPTIMIZERS.register_module( + group_key=default_group, module_name='ChildTuningAdamW') +class ChildTuningAdamW(Optimizer): + + def __init__(self, + params: Iterable[torch.nn.parameter.Parameter], + lr: float = 1e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-6, + weight_decay: float = 0.0, + correct_bias: bool = True, + reserve_p=1.0, + mode=None): + if lr < 0.0: + raise ValueError( + 'Invalid learning rate: {} - should be >= 0.0'.format(lr)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError( + 'Invalid beta parameter: {} - should be in [0.0, 1.0['.format( + betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError( + 'Invalid beta parameter: {} - should be in [0.0, 1.0['.format( + betas[1])) + if not 0.0 <= eps: + raise ValueError( + 'Invalid epsilon value: {} - should be >= 0.0'.format(eps)) + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + correct_bias=correct_bias) + super().__init__(params, defaults) + + self.gradient_mask = None + self.reserve_p = reserve_p + self.mode = mode + + def set_gradient_mask(self, gradient_mask): + self.gradient_mask = gradient_mask + + def step(self, closure: Callable = None): + """ + Performs a single optimization step. + Arguments: + closure (:obj:`Callable`, `optional`): A closure that reevaluates the model and returns the loss. 
+ """ + loss = None + if closure is not None: + loss = closure() + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'Adam does not support sparse gradients, please consider SparseAdam instead' + ) + + # ChildTuning code + if self.mode is not None: + if self.mode == 'ChildTuning-D': + if p in self.gradient_mask: + grad *= self.gradient_mask[p] + else: + # ChildTuning-F + grad_mask = Bernoulli( + grad.new_full( + size=grad.size(), fill_value=self.reserve_p)) + grad *= grad_mask.sample() / self.reserve_p + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) + denom = exp_avg_sq.sqrt().add_(group['eps']) + + step_size = group['lr'] + if group['correct_bias']: # No bias correction for Bert + bias_correction1 = 1.0 - beta1**state['step'] + bias_correction2 = 1.0 - beta2**state['step'] + step_size = step_size * math.sqrt( + bias_correction2) / bias_correction1 + + p.data.addcdiv_(exp_avg, denom, value=-step_size) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
+ # Add weight decay at the end (fixed version) + p.data.add_(p.data, alpha=-group['lr'] * group['weight_decay']) + + return loss diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 0916495c..614b728a 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -26,7 +26,6 @@ from modelscope.msdatasets.task_datasets.torch_base_dataset import \ TorchTaskDataset from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import build_preprocessor -from modelscope.preprocessors.common import Compose from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler @@ -36,11 +35,11 @@ from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, ConfigKeys, Hubs, ModeKeys, ModelFile, Tasks, TrainerStages) from modelscope.utils.data_utils import to_device +from modelscope.utils.device import create_device, verify_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg -from modelscope.utils.torch_utils import (create_device, get_dist_info, - init_dist) +from modelscope.utils.torch_utils import get_dist_info, init_dist from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -83,7 +82,8 @@ class EpochBasedTrainer(BaseTrainer): model: Optional[Union[TorchModel, nn.Module, str]] = None, cfg_file: Optional[str] = None, arg_parse_fn: Optional[Callable] = None, - data_collator: Optional[Callable] = None, + data_collator: Optional[Union[Callable, Dict[str, + Callable]]] = None, train_dataset: Optional[Union[MsDataset, Dataset]] = None, eval_dataset: Optional[Union[MsDataset, Dataset]] = None, preprocessor: Optional[Union[Preprocessor, @@ -104,21 +104,24 @@ class EpochBasedTrainer(BaseTrainer): if cfg_file is None: cfg_file = os.path.join(self.model_dir, ModelFile.CONFIGURATION) - self.model = self.build_model() else: - assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' - assert isinstance( - model, - (TorchModel, nn.Module - )), 'model should be either str, TorchMode or nn.Module.' + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' self.model_dir = os.path.dirname(cfg_file) - self.model = model super().__init__(cfg_file, arg_parse_fn) + # add default config self.cfg.merge_from_dict(self._get_default_config(), force=False) self.cfg = self.rebuild_config(self.cfg) + if 'cfg_options' in kwargs: + self.cfg.merge_from_dict(kwargs['cfg_options']) + + if isinstance(model, (TorchModel, nn.Module)): + self.model = model + else: + self.model = self.build_model() + if 'work_dir' in kwargs: self.work_dir = kwargs['work_dir'] else: @@ -150,9 +153,8 @@ class EpochBasedTrainer(BaseTrainer): self.eval_preprocessor.mode = ModeKeys.EVAL device_name = kwargs.get('device', 'gpu') - assert device_name in ['gpu', - 'cpu'], 'device should be either cpu or gpu.' 
- self.device = create_device(device_name == 'cpu') + verify_device(device_name) + self.device = create_device(device_name) self.train_dataset = self.to_task_dataset( train_dataset, @@ -163,7 +165,24 @@ class EpochBasedTrainer(BaseTrainer): mode=ModeKeys.EVAL, preprocessor=self.eval_preprocessor) - self.data_collator = data_collator if data_collator is not None else default_collate + self.train_data_collator, self.eval_default_collate = None, None + if isinstance(data_collator, Mapping): + if not (ConfigKeys.train in data_collator + or ConfigKeys.val in data_collator): + raise ValueError( + f'data_collator must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' + ) + if ConfigKeys.train in data_collator: + assert isinstance(data_collator[ConfigKeys.train], Callable) + self.train_data_collator = data_collator[ConfigKeys.train] + if ConfigKeys.val in data_collator: + assert isinstance(data_collator[ConfigKeys.val], Callable) + self.eval_data_collator = data_collator[ConfigKeys.val] + else: + collate_fn = default_collate if data_collator is None else data_collator + self.train_data_collator = collate_fn + self.eval_data_collator = collate_fn + self.metrics = self.get_metrics() self._metric_values = None self.optimizers = optimizers @@ -293,7 +312,8 @@ class EpochBasedTrainer(BaseTrainer): else ConfigDict(type=None, mode=mode) return datasets.to_torch_dataset( task_data_config=cfg, - task_name=self.cfg.task, + task_name=self.cfg.task + if hasattr(self.cfg, ConfigFields.task) else None, preprocessors=preprocessor) elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): @@ -365,7 +385,7 @@ class EpochBasedTrainer(BaseTrainer): return train_preprocessor, eval_preprocessor - def get_metrics(self) -> List[str]: + def get_metrics(self) -> List[Union[str, Dict]]: """Get the metric class types. The first choice will be the metrics configured in the config file, if not found, the default metrics will be @@ -385,7 +405,7 @@ class EpochBasedTrainer(BaseTrainer): f'Metrics are needed in evaluation, please try to either ' f'add metrics in configuration.json or add the default metric for {self.cfg.task}.' ) - if isinstance(metrics, str): + if isinstance(metrics, (str, Mapping)): metrics = [metrics] return metrics @@ -400,6 +420,7 @@ class EpochBasedTrainer(BaseTrainer): self.train_dataset, dist=self._dist, seed=self._seed, + collate_fn=self.train_data_collator, **self.cfg.train.get('dataloader', {})) self.data_loader = self.train_dataloader @@ -419,6 +440,7 @@ class EpochBasedTrainer(BaseTrainer): self.eval_dataset, dist=self._dist, seed=self._seed, + collate_fn=self.eval_data_collator, **self.cfg.evaluation.get('dataloader', {})) self.data_loader = self.eval_dataloader metric_classes = [build_metric(metric) for metric in self.metrics] @@ -441,7 +463,7 @@ class EpochBasedTrainer(BaseTrainer): override this method in a subclass. 
""" - model = Model.from_pretrained(self.model_dir) + model = Model.from_pretrained(self.model_dir, cfg_dict=self.cfg) if not isinstance(model, nn.Module) and hasattr(model, 'model'): return model.model elif isinstance(model, nn.Module): @@ -553,6 +575,7 @@ class EpochBasedTrainer(BaseTrainer): self.train_dataset, dist=self._dist, seed=self._seed, + collate_fn=self.train_data_collator, **self.cfg.train.get('dataloader', {})) return data_loader @@ -570,9 +593,9 @@ class EpochBasedTrainer(BaseTrainer): mode=ModeKeys.EVAL, preprocessor=self.eval_preprocessor) - batch_size = self.cfg.evaluation.batch_size - workers = self.cfg.evaluation.workers - shuffle = self.cfg.evaluation.get('shuffle', False) + batch_size = self.cfg.evaluation.dataloader.batch_size_per_gpu + workers = self.cfg.evaluation.dataloader.workers_per_gpu + shuffle = self.cfg.evaluation.dataloader.get('shuffle', False) data_loader = self._build_dataloader_with_dataset( self.eval_dataset, batch_size_per_gpu=batch_size, @@ -581,25 +604,31 @@ class EpochBasedTrainer(BaseTrainer): dist=self._dist, seed=self._seed, persistent_workers=True, + collate_fn=self.eval_data_collator, ) return data_loader def build_dataset(self, data_cfg, mode, preprocessor=None): """ Build torch dataset object using data config """ - dataset = MsDataset.load( - dataset_name=data_cfg.name, - split=data_cfg.split, - subset_name=data_cfg.subset_name if hasattr( - data_cfg, 'subset_name') else None, - hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, - **data_cfg, - ) - cfg = ConfigDict(type=self.cfg.model.type, mode=mode) - torch_dataset = dataset.to_torch_dataset( - task_data_config=cfg, - task_name=self.cfg.task, - preprocessors=self.preprocessor) + # TODO: support MsDataset load for cv + if hasattr(data_cfg, 'name'): + dataset = MsDataset.load( + dataset_name=data_cfg.name, + split=data_cfg.split, + subset_name=data_cfg.subset_name if hasattr( + data_cfg, 'subset_name') else None, + hub=data_cfg.hub + if hasattr(data_cfg, 'hub') else Hubs.modelscope, + **data_cfg, + ) + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) + torch_dataset = dataset.to_torch_dataset( + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=self.preprocessor) + else: + torch_dataset = build_task_dataset(data_cfg, self.cfg.task) dataset = self.to_task_dataset(torch_dataset, mode) return dataset @@ -747,7 +776,6 @@ class EpochBasedTrainer(BaseTrainer): sampler=sampler, num_workers=num_workers, batch_sampler=batch_sampler, - collate_fn=self.data_collator, pin_memory=kwargs.pop('pin_memory', False), worker_init_fn=init_fn, **kwargs) @@ -773,6 +801,7 @@ class EpochBasedTrainer(BaseTrainer): self.invoke_hook(TrainerStages.after_train_iter) del self.data_batch self._iter += 1 + self._mode = ModeKeys.TRAIN if i + 1 >= self.iters_per_epoch: break @@ -821,12 +850,14 @@ class EpochBasedTrainer(BaseTrainer): Args: hook (:obj:`Hook`): The hook to be registered. 
""" - assert isinstance(hook, Hook) # insert the hook to a sorted list inserted = False for i in range(len(self._hooks) - 1, -1, -1): - if get_priority(hook.PRIORITY) > get_priority( - self._hooks[i].PRIORITY): + p = hook.PRIORITY if hasattr(hook, 'PRIORITY') else Priority.NORMAL + p_i = self._hooks[i].PRIORITY if hasattr( + self._hooks[i], 'PRIORITY') else Priority.NORMAL + + if get_priority(p) > get_priority(p_i): self._hooks.insert(i + 1, hook) inserted = True break diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index 759bd447..990a9571 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -7,6 +7,7 @@ import os.path as osp import time import traceback from functools import reduce +from pathlib import Path from typing import Generator, Union import gast @@ -14,9 +15,9 @@ import json from modelscope import __version__ from modelscope.fileio.file import LocalStorage -from modelscope.metainfo import (Heads, Hooks, LR_Schedulers, Metrics, Models, - Optimizers, Pipelines, Preprocessors, - TaskModels, Trainers) +from modelscope.metainfo import (Datasets, Heads, Hooks, LR_Schedulers, + Metrics, Models, Optimizers, Pipelines, + Preprocessors, TaskModels, Trainers) from modelscope.utils.constant import Fields, Tasks from modelscope.utils.file_utils import get_default_cache_dir from modelscope.utils.logger import get_logger @@ -24,14 +25,14 @@ from modelscope.utils.registry import default_group logger = get_logger() storage = LocalStorage() +p = Path(__file__) # get the path of package 'modelscope' -MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1]) +MODELSCOPE_PATH = p.resolve().parents[1] REGISTER_MODULE = 'register_module' IGNORED_PACKAGES = ['modelscope', '.'] SCAN_SUB_FOLDERS = [ - 'models', 'metrics', 'pipelines', 'preprocessors', - 'msdatasets/task_datasets', 'trainers' + 'models', 'metrics', 'pipelines', 'preprocessors', 'trainers', 'msdatasets' ] INDEXER_FILE = 'ast_indexer' DECORATOR_KEY = 'decorators' diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py new file mode 100644 index 00000000..14374c65 --- /dev/null +++ b/modelscope/utils/audio/audio_utils.py @@ -0,0 +1,35 @@ +import numpy as np + +SEGMENT_LENGTH_TRAIN = 16000 + + +def to_segment(batch, segment_length=SEGMENT_LENGTH_TRAIN): + """ + Dataset mapping function to split one audio into segments. + It only works in batch mode. + """ + noisy_arrays = [] + for x in batch['noisy']: + length = len(x['array']) + noisy = np.array(x['array']) + for offset in range(segment_length, length, segment_length): + noisy_arrays.append(noisy[offset - segment_length:offset]) + clean_arrays = [] + for x in batch['clean']: + length = len(x['array']) + clean = np.array(x['array']) + for offset in range(segment_length, length, segment_length): + clean_arrays.append(clean[offset - segment_length:offset]) + return {'noisy': noisy_arrays, 'clean': clean_arrays} + + +def audio_norm(x): + rms = (x**2).mean()**0.5 + scalar = 10**(-25 / 20) / rms + x = x * scalar + pow_x = x**2 + avg_pow_x = pow_x.mean() + rmsx = pow_x[pow_x > avg_pow_x].mean()**0.5 + scalarx = 10**(-25 / 20) / rmsx + x = x * scalarx + return x diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index 76fb2a19..8b9d027a 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -1,15 +1,23 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
 import io
+import os
 import time
 from collections import OrderedDict
-from typing import Optional
+from shutil import copytree, ignore_patterns, rmtree
+from typing import Callable, List, Optional, Union
+import json
+import numpy as np
 import torch
 from torch.optim import Optimizer

 from modelscope import __version__
-from modelscope.fileio import File
+from modelscope.fileio import File, LocalStorage
+from modelscope.utils.config import JSONIteratorEncoder
+from modelscope.utils.constant import ConfigFields, ModelFile
+
+storage = LocalStorage()


 def weights_to_cpu(state_dict):
@@ -72,3 +80,76 @@ def save_checkpoint(model: torch.nn.Module,
     with io.BytesIO() as f:
         torch.save(checkpoint, f)
         File.write(f.getvalue(), filename)
+
+
+def save_pretrained(model,
+                    target_folder: Union[str, os.PathLike],
+                    save_checkpoint_name: str = None,
+                    save_function: Callable = None,
+                    config: Optional[dict] = None,
+                    **kwargs):
+    """Save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded
+
+    Args:
+        model (Model): Model whose params are to be saved.
+
+        target_folder (Union[str, os.PathLike]):
+            Directory to which to save. Will be created if it doesn't exist.
+
+        save_checkpoint_name (str):
+            The checkpoint name to be saved in the target_folder
+
+        save_function (Callable, optional):
+            The function to use to save the state dictionary.
+
+        config (Optional[dict], optional):
+            The config for configuration.json, which may not be identical to model.config
+    """
+
+    if save_function is None or not isinstance(save_function, Callable):
+        raise Exception('A valid save function must be passed in')
+
+    if target_folder is None or os.path.isfile(target_folder):
+        raise ValueError(
+            f'Provided path ({target_folder}) should be a directory, not a file'
+        )
+
+    if save_checkpoint_name is None:
+        raise Exception(
+            'A checkpoint name must be passed in for the saving method')
+
+    if config is None:
+        raise ValueError('Configuration is not valid')
+
+    # Remove any previous save from the target folder
+    if os.path.exists(target_folder):
+        rmtree(target_folder)
+
+    # Single ckpt path, sharded ckpt logic will be added later
+    output_ckpt_path = os.path.join(target_folder, save_checkpoint_name)
+
+    # Copy the model's other files to the save directory, ignoring the original ckpts and configuration
+    origin_file_to_be_ignored = [save_checkpoint_name]
+    ignore_file_set = set(origin_file_to_be_ignored)
+    ignore_file_set.add(ModelFile.CONFIGURATION)
+    ignore_file_set.add('.*')
+    if hasattr(model, 'model_dir') and model.model_dir is not None:
+        copytree(
+            model.model_dir,
+            target_folder,
+            ignore=ignore_patterns(*ignore_file_set))
+
+    # Save the ckpt to the save directory
+    try:
+        save_function(model, output_ckpt_path)
+    except Exception as e:
+        raise Exception(
+            f'Error "{type(e).__name__}" with message "{e}" was raised '
+            f'while saving checkpoints')
+
+    # Dump the config to configuration.json
+    if ConfigFields.pipeline not in config:
+        config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]}
+    cfg_str = json.dumps(config, cls=JSONIteratorEncoder)
+    config_file = os.path.join(target_folder, ModelFile.CONFIGURATION)
+    storage.write(cfg_str.encode(), config_file)
diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py
index a28ac1ab..42985db6 100644
--- a/modelscope/utils/config.py
+++ b/modelscope/utils/config.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 from typing import Dict, Union

 import addict
+import json
 from yapf.yapflib.yapf_api import FormatCode

 from
modelscope.utils.constant import ConfigFields, ModelFile @@ -627,3 +628,20 @@ def check_config(cfg: Union[str, ConfigDict]): check_attr(ConfigFields.model) check_attr(ConfigFields.preprocessor) check_attr(ConfigFields.evaluation) + + +class JSONIteratorEncoder(json.JSONEncoder): + """Implement this method in order that supporting arbitrary iterators, it returns + a serializable object for ``obj``, or calls the base implementation + (to raise a ``TypeError``). + + """ + + def default(self, obj): + try: + iterable = iter(obj) + except TypeError: + pass + else: + return list(iterable) + return json.JSONEncoder.default(self, obj) diff --git a/modelscope/msdatasets/config.py b/modelscope/utils/config_ds.py similarity index 87% rename from modelscope/msdatasets/config.py rename to modelscope/utils/config_ds.py index 0357e823..bafe3f99 100644 --- a/modelscope/msdatasets/config.py +++ b/modelscope/utils/config_ds.py @@ -4,9 +4,9 @@ from pathlib import Path # Cache location from modelscope.hub.constants import DEFAULT_MODELSCOPE_DATA_ENDPOINT -DEFAULT_CACHE_HOME = '~/.cache' +DEFAULT_CACHE_HOME = Path.home().joinpath('.cache') CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME) -DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub') +DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope', 'hub') MS_CACHE_HOME = os.path.expanduser( os.getenv('MS_CACHE_HOME', DEFAULT_MS_CACHE_HOME)) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 1a3fb7c3..960e9600 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -24,6 +24,7 @@ class CVTasks(object): human_object_interaction = 'human-object-interaction' face_image_generation = 'face-image-generation' body_2d_keypoints = 'body-2d-keypoints' + body_3d_keypoints = 'body-3d-keypoints' general_recognition = 'general-recognition' image_classification = 'image-classification' @@ -61,9 +62,12 @@ class CVTasks(object): video_embedding = 'video-embedding' virtual_try_on = 'virtual-try-on' crowd_counting = 'crowd-counting' + movie_scene_segmentation = 'movie-scene-segmentation' - # video related + # reid and tracking video_single_object_tracking = 'video-single-object-tracking' + video_summarization = 'video-summarization' + image_reid_person = 'image-reid-person' class NLPTasks(object): @@ -93,7 +97,10 @@ class NLPTasks(object): zero_shot_classification = 'zero-shot-classification' backbone = 'backbone' text_error_correction = 'text-error-correction' + faq_question_answering = 'faq-question-answering' conversational_text_to_sql = 'conversational-text-to-sql' + information_extraction = 'information-extraction' + document_segmentation = 'document-segmentation' class AudioTasks(object): @@ -113,9 +120,11 @@ class MultiModalTasks(object): text_to_image_synthesis = 'text-to-image-synthesis' multi_modal_embedding = 'multi-modal-embedding' generative_multi_modal_embedding = 'generative-multi-modal-embedding' + multi_modal_similarity = 'multi-modal-similarity' visual_question_answering = 'visual-question-answering' visual_entailment = 'visual-entailment' video_multi_modal_embedding = 'video-multi-modal-embedding' + image_text_retrieval = 'image-text-retrieval' class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks): @@ -209,6 +218,7 @@ class ModelFile(object): VOCAB_FILE = 'vocab.txt' ONNX_MODEL_FILE = 'model.onnx' LABEL_MAPPING = 'label_mapping.json' + TRAIN_OUTPUT_DIR = 'output' class ConfigFields(object): @@ -251,6 +261,7 @@ class Frameworks(object): DEFAULT_MODEL_REVISION = 'master' 
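Worth noting: `configuration.json` is now dumped through `JSONIteratorEncoder`, so iterable config values (generators, sets, dict views) survive serialization instead of raising. A minimal, self-contained sketch of the behaviour:

```python
import json

from modelscope.utils.config import JSONIteratorEncoder

cfg = {
    'task': 'text-classification',
    'labels': iter(['neg', 'pos']),  # an arbitrary iterator
    'ids': {0, 1},                   # a set is iterable, too
}
# Iterators are materialized into lists; plain types use the base encoder.
print(json.dumps(cfg, cls=JSONIteratorEncoder))
# -> {"task": "text-classification", "labels": ["neg", "pos"], "ids": [0, 1]}
```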
DEFAULT_DATASET_REVISION = 'master'
+DEFAULT_DATASET_NAMESPACE = 'modelscope'
class ModeKeys: @@ -290,3 +301,9 @@ class ColorCodes: GREEN = '\033[92m' RED = '\033[91m' END = '\033[0m'
+
+
+class Devices:
+    """device used for training and inference"""
+    cpu = 'cpu'
+    gpu = 'gpu'
diff --git a/modelscope/utils/cv/heatmap.py b/modelscope/utils/cv/heatmap.py deleted file mode 100644 index 4d248a92..00000000 --- a/modelscope/utils/cv/heatmap.py +++ /dev/null @@ -1,18 +0,0 @@
-import cv2
-import numpy as np
-
-
-def numpy_to_cv2img(vis_img):
-    """to convert a np.array Hotmap with shape(h, w) to cv2 img
-
-    Args:
-        vis_img (np.array): input data
-
-    Returns:
-        cv2 img
-    """
-    vis_img = (vis_img - vis_img.min()) / (
-        vis_img.max() - vis_img.min() + 1e-5)
-    vis_img = (vis_img * 255).astype(np.uint8)
-    vis_img = cv2.applyColorMap(vis_img, cv2.COLORMAP_JET)
-    return vis_img
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py new file mode 100644 index 00000000..ea1d95b5 --- /dev/null +++ b/modelscope/utils/cv/image_utils.py @@ -0,0 +1,196 @@
+import cv2
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.preprocessors.image import load_image
+
+
+def numpy_to_cv2img(img_array):
+    """Convert a np.array heatmap with shape (h, w) to a cv2 img.
+
+    Args:
+        img_array (np.array): input data
+
+    Returns:
+        cv2 img
+    """
+    img_array = (img_array - img_array.min()) / (
+        img_array.max() - img_array.min() + 1e-5)
+    img_array = (img_array * 255).astype(np.uint8)
+    img_array = cv2.applyColorMap(img_array, cv2.COLORMAP_JET)
+    return img_array
+
+
+def draw_joints(image, np_kps, score, threshold=0.2):
+    lst_parent_ids_17 = [0, 0, 0, 1, 2, 0, 0, 5, 6, 7, 8, 5, 6, 11, 12, 13, 14]
+    lst_left_ids_17 = [1, 3, 5, 7, 9, 11, 13, 15]
+    lst_right_ids_17 = [2, 4, 6, 8, 10, 12, 14, 16]
+
+    lst_parent_ids_15 = [0, 0, 1, 2, 3, 1, 5, 6, 14, 8, 9, 14, 11, 12, 1]
+    lst_left_ids_15 = [2, 3, 4, 8, 9, 10]
+    lst_right_ids_15 = [5, 6, 7, 11, 12, 13]
+
+    if np_kps.shape[0] == 17:
+        lst_parent_ids = lst_parent_ids_17
+        lst_left_ids = lst_left_ids_17
+        lst_right_ids = lst_right_ids_17
+
+    elif np_kps.shape[0] == 15:
+        lst_parent_ids = lst_parent_ids_15
+        lst_left_ids = lst_left_ids_15
+        lst_right_ids = lst_right_ids_15
+
+    for i in range(len(lst_parent_ids)):
+        pid = lst_parent_ids[i]
+        if i == pid:
+            continue
+
+        if (score[i] < threshold or score[pid] < threshold):
+            continue
+
+        if i in lst_left_ids and pid in lst_left_ids:
+            color = (0, 255, 0)
+        elif i in lst_right_ids and pid in lst_right_ids:
+            color = (255, 0, 0)
+        else:
+            color = (0, 255, 255)
+
+        cv2.line(image, (int(np_kps[i, 0]), int(np_kps[i, 1])),
+                 (int(np_kps[pid, 0]), int(np_kps[pid, 1])), color, 3)
+
+    for i in range(np_kps.shape[0]):
+        if score[i] < threshold:
+            continue
+        cv2.circle(image, (int(np_kps[i, 0]), int(np_kps[i, 1])), 5,
+                   (0, 0, 255), -1)
+
+
+def draw_box(image, box):
+    cv2.rectangle(image, (int(box[0][0]), int(box[0][1])),
+                  (int(box[1][0]), int(box[1][1])), (0, 0, 255), 2)
+
+
+def realtime_object_detection_bbox_vis(image, bboxes):
+    for bbox in bboxes:
+        cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
+                      (255, 0, 0), 2)
+    return image
+
+
+def draw_keypoints(output, original_image):
+    poses = np.array(output[OutputKeys.POSES])
+    scores = np.array(output[OutputKeys.SCORES])
+    boxes = np.array(output[OutputKeys.BOXES])
+    assert len(poses) == len(scores) and len(poses) == len(boxes)
+    image = cv2.imread(original_image, -1)
+    for i in range(len(poses)):
+        draw_box(image, np.array(boxes[i]))
+        draw_joints(image, np.array(poses[i]), np.array(scores[i]))
+    return image
+
+
+def draw_face_detection_result(img_path, detection_result):
+    bboxes = np.array(detection_result[OutputKeys.BOXES])
+    kpss = np.array(detection_result[OutputKeys.KEYPOINTS])
+    scores = np.array(detection_result[OutputKeys.SCORES])
+    img = cv2.imread(img_path)
+    assert img is not None, f"Can't read img: {img_path}"
+    for i in range(len(scores)):
+        bbox = bboxes[i].astype(np.int32)
+        kps = kpss[i].reshape(-1, 2).astype(np.int32)
+        score = scores[i]
+        x1, y1, x2, y2 = bbox
+        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
+        for kp in kps:
+            cv2.circle(img, tuple(kp), 1, (0, 0, 255), 1)
+        cv2.putText(
+            img,
+            f'{score:.2f}', (x1, y2),
+            1,
+            1.0, (0, 255, 0),
+            thickness=1,
+            lineType=8)
+    print(f'Found {len(scores)} faces')
+    return img
+
+
+def created_boxed_image(image_in, box):
+    image = load_image(image_in)
+    img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
+    cv2.rectangle(img, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])),
+                  (0, 255, 0), 3)
+    return img
+
+
+def show_video_tracking_result(video_in_path, bboxes, video_save_path):
+    cap = cv2.VideoCapture(video_in_path)
+    for i in range(len(bboxes)):
+        box = bboxes[i]
+        success, frame = cap.read()
+        if success is False:
+            raise Exception(
+                f'{video_in_path} cannot be correctly decoded by OpenCV.')
+        if i == 0:
+            size = (frame.shape[1], frame.shape[0])
+            fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
+            video_writer = cv2.VideoWriter(video_save_path, fourcc,
+                                           cap.get(cv2.CAP_PROP_FPS), size,
+                                           True)
+        cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0),
+                      5)
+        video_writer.write(frame)
+    video_writer.release()
+    cap.release()
+
+
+def panoptic_seg_masks_to_image(masks):
+    draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3])
+    from mmdet.core.visualization.palette import get_palette
+    mask_palette = get_palette('coco', 133)
+
+    from mmdet.core.visualization.image import _get_bias_color
+    taken_colors = {(0, 0, 0)}
+    for i, mask in enumerate(masks):
+        color_mask = mask_palette[i]
+        while tuple(color_mask) in taken_colors:
+            color_mask = _get_bias_color(color_mask)
+        taken_colors.add(tuple(color_mask))
+
+        mask = mask.astype(bool)
+        draw_img[mask] = color_mask
+
+    return draw_img
+
+
+def semantic_seg_masks_to_image(masks):
+    from mmdet.core.visualization.palette import get_palette
+    mask_palette = get_palette('coco', 133)
+
+    draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3])
+
+    for i, mask in enumerate(masks):
+        color_mask = mask_palette[i]
+        mask = mask.astype(bool)
+        draw_img[mask] = color_mask
+    return draw_img
+
+
+def show_video_summarization_result(video_in_path, result, video_save_path):
+    frame_indexes = result[OutputKeys.OUTPUT]
+    cap = cv2.VideoCapture(video_in_path)
+    for i in range(len(frame_indexes)):
+        idx = frame_indexes[i]
+        success, frame = cap.read()
+        if success is False:
+            raise Exception(
+                f'{video_in_path} cannot be correctly decoded by OpenCV.')
+        if i == 0:
+            size = (frame.shape[1], frame.shape[0])
+            fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
+            video_writer = cv2.VideoWriter(video_save_path, fourcc,
+                                           cap.get(cv2.CAP_PROP_FPS), size,
+                                           True)
+        if idx == 1:
+            video_writer.write(frame)
+    video_writer.release()
+    cap.release()
diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py new file mode 100644 index 00000000..77e23122 --- /dev/null +++ b/modelscope/utils/device.py @@ -0,0 +1,107 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
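As a usage sketch for the visualization helpers above (the model id and test asset are assumptions, not taken from this diff; any face-detection model applies):

```python
import cv2

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.cv.image_utils import draw_face_detection_result

img_path = 'data/test/images/face_detection.jpg'  # hypothetical test asset
face_detection = pipeline(
    Tasks.face_detection,
    model='damo/cv_resnet_facedetection_scrfd10gkps')  # hypothetical model id
result = face_detection(img_path)
# Overlay boxes, keypoints and scores on the original image and save it.
cv2.imwrite('face_detection_result.jpg',
            draw_face_detection_result(img_path, result))
```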
+
+from contextlib import contextmanager
+
+from modelscope.utils.constant import Devices, Frameworks
+from modelscope.utils.import_utils import is_tf_available, is_torch_available
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def verify_device(device_name):
+    """ Verify that a device name is valid; the device should be either cpu, cuda, gpu, cuda:X or gpu:X.
+
+    Args:
+        device_name (str): device str, should be either cpu, cuda, gpu, gpu:X or cuda:X
+            where X is the ordinal for the gpu device.
+
+    Return:
+        device info (tuple): device_type and device_id; if device_id is not set for gpu, 0 is used as default.
+    """
+    device_name = device_name.lower()
+    eles = device_name.split(':')
+    err_msg = 'device should be either cpu, cuda, gpu, gpu:X or cuda:X where X is the ordinal for gpu device.'
+    assert len(eles) <= 2, err_msg
+    assert eles[0] in ['cpu', 'cuda', 'gpu'], err_msg
+    device_type = eles[0]
+    device_id = None
+    if len(eles) > 1:
+        device_id = int(eles[1])
+    if device_type == 'cuda':
+        device_type = Devices.gpu
+    if device_type == Devices.gpu and device_id is None:
+        device_id = 0
+    return device_type, device_id
+
+
+@contextmanager
+def device_placement(framework, device_name='gpu:0'):
+    """ Device placement function, allowing the user to specify on which device to place a model or tensor.
+    Args:
+        framework (str): tensorflow or pytorch.
+        device_name (str): gpu or cpu to use; if you want to specify a certain gpu,
+            use gpu:$gpu_id or cuda:$gpu_id.
+
+    Returns:
+        Context manager
+
+    Examples:
+
+    ```python
+    # Request to run the model on cuda:0 as gpu
+    with device_placement('pytorch', device_name='gpu:0'):
+        model = Model.from_pretrained(...)
+    ```
+    """
+    device_type, device_id = verify_device(device_name)
+
+    if framework == Frameworks.tf:
+        import tensorflow as tf
+        if device_type == Devices.gpu and not tf.test.is_gpu_available():
+            logger.warning(
+                'tensorflow cuda is not available, using cpu instead.')
+            device_type = Devices.cpu
+        if device_type == Devices.cpu:
+            with tf.device('/CPU:0'):
+                yield
+        else:
+            if device_type == Devices.gpu:
+                with tf.device(f'/device:gpu:{device_id}'):
+                    yield
+
+    elif framework == Frameworks.torch:
+        import torch
+        if device_type == Devices.gpu:
+            if torch.cuda.is_available():
+                torch.cuda.set_device(f'cuda:{device_id}')
+            else:
+                logger.warning('cuda is not available, using cpu instead.')
+            yield
+        else:
+            yield
+
+
+def create_device(device_name):
+    """ Create a torch device.
+
+    Args:
+        device_name (str): cpu, gpu, gpu:0, cuda:0 etc.
+    """
+    import torch
+    device_type, device_id = verify_device(device_name)
+    use_cuda = False
+    if device_type == Devices.gpu:
+        use_cuda = True
+        if not torch.cuda.is_available():
+            logger.warning(
+                'cuda is not available, create gpu device failed, using cpu instead.'
+            )
+            use_cuda = False
+
+    if use_cuda:
+        device = torch.device(f'cuda:{device_id}')
+    else:
+        device = torch.device('cpu')
+
+    return device
diff --git a/modelscope/utils/file_utils.py b/modelscope/utils/file_utils.py index a04d890f..9b82f8d2 100644 --- a/modelscope/utils/file_utils.py +++ b/modelscope/utils/file_utils.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates.
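A brief sketch of how the three device helpers compose; it assumes torch is installed, and `create_device` falls back to cpu (with a warning) when cuda is absent:

```python
import torch

from modelscope.utils.constant import Frameworks
from modelscope.utils.device import (create_device, device_placement,
                                     verify_device)

# 'cuda' is normalized to 'gpu'; a missing ordinal defaults to 0 for gpu.
assert verify_device('cuda:1') == ('gpu', 1)
assert verify_device('cpu') == ('cpu', None)

device = create_device('gpu:0')  # cpu fallback if cuda is unavailable
with device_placement(Frameworks.torch, 'gpu:0'):
    x = torch.ones(2, 2, device=device)
```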
import inspect -import os +from pathlib import Path # TODO: remove this api, unify to flattened args @@ -33,6 +33,5 @@ def get_default_cache_dir(): """ default base dir: '~/.cache/modelscope' """ - default_cache_dir = os.path.expanduser( - os.path.join('~/.cache', 'modelscope')) + default_cache_dir = Path.home().joinpath('.cache', 'modelscope') return default_cache_dir diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 6d685b87..f79097fe 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -10,7 +10,8 @@ from modelscope.hub.constants import Licenses, ModelVisibility from modelscope.hub.file_download import model_file_download from modelscope.hub.snapshot_download import snapshot_download from modelscope.utils.config import Config -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, + ModelFile) from .logger import get_logger logger = get_logger(__name__) @@ -119,8 +120,13 @@ def parse_label_mapping(model_dir): if label2id is None: config_path = os.path.join(model_dir, ModelFile.CONFIGURATION) config = Config.from_file(config_path) - if hasattr(config, 'model') and hasattr(config.model, 'label2id'): - label2id = config.model.label2id + if hasattr(config, ConfigFields.model) and hasattr( + config[ConfigFields.model], 'label2id'): + label2id = config[ConfigFields.model].label2id + elif hasattr(config, ConfigFields.preprocessor) and hasattr( + config[ConfigFields.preprocessor], 'label2id'): + label2id = config[ConfigFields.preprocessor].label2id + if label2id is None: config_path = os.path.join(model_dir, 'config.json') config = Config.from_file(config_path) diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index 85f442a7..c9bea020 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -10,6 +10,7 @@ from collections import OrderedDict from functools import wraps from importlib import import_module from itertools import chain +from pathlib import Path from types import ModuleType from typing import Any @@ -43,7 +44,7 @@ def import_modules_from_file(py_file: str): """ dirname, basefile = os.path.split(py_file) if dirname == '': - dirname == './' + dirname = Path.cwd() module_name = osp.splitext(basefile)[0] sys.path.insert(0, dirname) validate_py_syntax(py_file) diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py new file mode 100644 index 00000000..35b374f2 --- /dev/null +++ b/modelscope/utils/nlp/nlp_utils.py @@ -0,0 +1,43 @@ +from typing import List + +from modelscope.outputs import OutputKeys +from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline, + DialogStateTrackingPipeline) + + +def text2sql_tracking_and_print_results( + test_case, pipelines: List[ConversationalTextToSqlPipeline]): + for p in pipelines: + last_sql, history = '', [] + for item in test_case['utterance']: + case = { + 'utterance': item, + 'history': history, + 'last_sql': last_sql, + 'database_id': test_case['database_id'], + 'local_db_path': test_case['local_db_path'] + } + results = p(case) + print({'question': item}) + print(results) + last_sql = results['text'] + history.append(item) + + +def tracking_and_print_dialog_states( + test_case, pipelines: List[DialogStateTrackingPipeline]): + import json + pipelines_len = len(pipelines) + history_states = [{}] + utter = {} + for step, item in enumerate(test_case): + utter.update(item) + result = pipelines[step % pipelines_len]({ + 
'utter':
+        utter,
+        'history_states':
+        history_states
+    })
+    print(json.dumps(result))
+
+    history_states.extend([result[OutputKeys.OUTPUT], {}])
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py new file mode 100644 index 00000000..ca50d579 --- /dev/null +++ b/modelscope/utils/regress_test_utils.py @@ -0,0 +1,703 @@
+import contextlib
+import hashlib
+import os
+import pickle
+import random
+import shutil
+import tempfile
+from collections.abc import Mapping
+from pathlib import Path
+from types import FunctionType
+from typing import Any, Dict, Union
+
+import json
+import numpy as np
+import torch.optim
+from torch import nn
+
+
+class RegressTool:
+    """This class is used to keep inference/training results from being changed unnoticed by unittests.
+
+    First, run a baseline test to create a result file; afterwards, changes can be observed between
+    the latest version and the baseline file.
+    """
+
+    def __init__(self,
+                 baseline: bool = None,
+                 store_func: FunctionType = None,
+                 load_func: FunctionType = None):
+        """Accepts a func to store the baseline file and a func to load the baseline file.
+        """
+        self.baseline = baseline
+        self.store_func = store_func
+        self.load_func = load_func
+        print(f'Current working dir is: {Path.cwd()}')
+
+    def store(self, local, remote):
+        if self.store_func is not None:
+            self.store_func(local, remote)
+        else:
+            path = os.path.abspath(
+                os.path.join(Path.cwd(), 'data', 'test', 'regression'))
+            os.makedirs(path, exist_ok=True)
+            shutil.copy(local, os.path.join(path, remote))
+
+    def load(self, local, remote):
+        if self.load_func is not None:
+            self.load_func(local, remote)
+        else:
+            path = os.path.abspath(
+                os.path.join(Path.cwd(), 'data', 'test', 'regression'))
+            baseline = os.path.join(path, remote)
+            if not os.path.exists(baseline):
+                raise ValueError(f'baseline file {baseline} does not exist')
+            print(
+                f'local file found:{baseline}, md5:{hashlib.md5(open(baseline,"rb").read()).hexdigest()}'
+            )
+            if os.path.exists(local):
+                os.remove(local)
+            os.symlink(baseline, local, target_is_directory=False)
+
+    @contextlib.contextmanager
+    def monitor_module_single_forward(self,
+                                      module: nn.Module,
+                                      file_name: str,
+                                      compare_fn=None):
+        """Monitor a pytorch module in a single forward.
+
+        @param module: A torch module
+        @param file_name: The file name used to store or load the baseline file
+        @param compare_fn: A custom fn used to compare the results manually.
+
+        >>> def compare_fn(v1, v2, key, type):
+        >>>     return None
+
+        v1 is the baseline value
+        v2 is the value of the current version
+        key is the key of the submodules
+        type is one of 'input', 'output'
+        """
+        baseline = os.getenv('REGRESSION_BASELINE')
+        if baseline is None or self.baseline is None:
+            yield
+            return
+
+        baseline = self.baseline
+        io_json = {}
+        absolute_path = f'./{file_name}.bin'
+        if not isinstance(module, nn.Module):
+            assert hasattr(module, 'model')
+            module = module.model
+
+        hack_forward(module, file_name, io_json)
+        intercept_module(module, io_json)
+        yield
+        hack_forward(module, None, None, restore=True)
+        intercept_module(module, None, restore=True)
+        if baseline:
+            with open(absolute_path, 'wb') as f:
+                pickle.dump(io_json, f)
+            self.store(absolute_path, f'{file_name}.bin')
+            os.remove(absolute_path)
+        else:
+            name = os.path.basename(absolute_path)
+            baseline = os.path.join(tempfile.gettempdir(), name)
+            self.load(baseline, name)
+            with open(baseline, 'rb') as f:
+                baseline_json = pickle.load(f)
+
+            class NumpyEncoder(json.JSONEncoder):
+                """Special json encoder for numpy types
+                """
+
+                def default(self, obj):
+                    if isinstance(obj, np.integer):
+                        return int(obj)
+                    elif isinstance(obj, np.floating):
+                        return float(obj)
+                    elif isinstance(obj, np.ndarray):
+                        return obj.tolist()
+                    return json.JSONEncoder.default(self, obj)
+
+            print(f'baseline: {json.dumps(baseline_json, cls=NumpyEncoder)}')
+            print(f'latest : {json.dumps(io_json, cls=NumpyEncoder)}')
+            if not compare_io_and_print(baseline_json, io_json, compare_fn):
+                raise ValueError('Results do not match!')
+
+    @contextlib.contextmanager
+    def monitor_module_train(self,
+                             trainer: Union[Dict, Any],
+                             file_name,
+                             level='config',
+                             compare_fn=None,
+                             ignore_keys=None,
+                             compare_random=True,
+                             lazy_stop_callback=None):
+        """Monitor a pytorch module's backward data and cfg data within a step of the optimizer.
+
+        This is usually useful when you try to change some dangerous code
+        which has the risk of affecting the training loop.
+
+        @param trainer: A dict or an object containing the model/optimizer/lr_scheduler
+        @param file_name: The file name used to store or load the baseline file
+        @param level: The regression level.
+                      'strict' for matching every single tensor.
+                      Please make sure the parameters of the head are fixed
+                      and the drop-out rate is zero.
+                      'config' for matching the initial config, like the cfg file, optimizer param_groups,
+                      lr_scheduler params and the random seed.
+                      'metric' for comparing the best metrics in the evaluation loop.
+        @param compare_fn: A custom fn used to compare the results manually.
+        @param ignore_keys: The keys of the named_parameters to ignore.
+        @param compare_random: Whether to compare random settings, default True.
+        @param lazy_stop_callback: A callback passed in; when the monitoring is over, this callback will be called.
+ + >>> def compare_fn(v1, v2, key, type): + >>> return None + + v1 is the baseline value + v2 is the value of current version + key is the key of modules/parameters + type is in one of 'input', 'output', 'backward', 'optimizer', 'lr_scheduler', 'cfg', 'state' + """ + baseline = os.getenv('REGRESSION_BASELINE') + if baseline is None or self.baseline is None: + yield + return + + baseline = self.baseline + + io_json = {} + bw_json = {} + absolute_path = f'./{file_name}.bin' + + if level == 'strict': + print( + "[Important] The level of regression is 'strict', please make sure your model's parameters are " + 'fixed and all drop-out rates have been set to zero.') + + assert hasattr( + trainer, 'model') or 'model' in trainer, 'model must be in trainer' + module = trainer['model'] if isinstance(trainer, + dict) else trainer.model + if not isinstance(module, nn.Module): + assert hasattr(module, 'model') + module = module.model + + assert hasattr( + trainer, 'optimizer' + ) or 'optimizer' in trainer, 'optimizer must be in trainer' + assert hasattr( + trainer, 'lr_scheduler' + ) or 'lr_scheduler' in trainer, 'lr_scheduler must be in trainer' + optimizer: torch.optim.Optimizer = trainer['optimizer'] if isinstance( + trainer, dict) else trainer.optimizer + lr_scheduler: torch.optim.lr_scheduler._LRScheduler = trainer['lr_scheduler'] if isinstance(trainer, dict) \ + else trainer.lr_scheduler + torch_state = numpify_tensor_nested(torch.get_rng_state()) + np_state = np.random.get_state() + random_seed = random.getstate() + seed = trainer._seed if hasattr( + trainer, + '_seed') else trainer.seed if hasattr(trainer, 'seed') else None + + if level == 'strict': + hack_forward(module, file_name, io_json) + intercept_module(module, io_json) + hack_backward( + module, optimizer, bw_json, lazy_stop_callback=lazy_stop_callback) + yield + hack_backward(module, optimizer, None, restore=True) + if level == 'strict': + hack_forward(module, None, None, restore=True) + intercept_module(module, None, restore=True) + + optimizer_dict = optimizer.state_dict() + optimizer_dict.pop('state', None) + summary = { + 'forward': io_json, + 'backward': bw_json, + 'optimizer': { + 'type': optimizer.__class__.__name__, + 'defaults': optimizer.defaults, + 'state_dict': optimizer_dict + }, + 'lr_scheduler': { + 'type': lr_scheduler.__class__.__name__, + 'state_dict': lr_scheduler.state_dict() + }, + 'cfg': trainer.cfg.to_dict() if hasattr(trainer, 'cfg') else None, + 'state': { + 'torch_state': torch_state, + 'np_state': np_state, + 'random_seed': random_seed, + 'seed': seed, + } + } + + if baseline: + with open(absolute_path, 'wb') as f: + pickle.dump(summary, f) + self.store(absolute_path, f'{file_name}.bin') + os.remove(absolute_path) + else: + name = os.path.basename(absolute_path) + baseline = os.path.join(tempfile.gettempdir(), name) + self.load(baseline, name) + with open(baseline, 'rb') as f: + baseline_json = pickle.load(f) + + if level == 'strict' and not compare_io_and_print( + baseline_json['forward'], io_json, compare_fn): + raise RuntimeError('Forward not match!') + if not compare_backward_and_print( + baseline_json['backward'], + bw_json, + compare_fn=compare_fn, + ignore_keys=ignore_keys, + level=level): + raise RuntimeError('Backward not match!') + cfg_opt1 = { + 'optimizer': baseline_json['optimizer'], + 'lr_scheduler': baseline_json['lr_scheduler'], + 'cfg': baseline_json['cfg'], + 'state': None if not compare_random else baseline_json['state'] + } + cfg_opt2 = { + 'optimizer': summary['optimizer'], + 
'lr_scheduler': summary['lr_scheduler'], + 'cfg': summary['cfg'], + 'state': None if not compare_random else summary['state'] + } + if not compare_cfg_and_optimizers(cfg_opt1, cfg_opt2, compare_fn): + raise RuntimeError('Cfg or optimizers not match!') + + +class MsRegressTool(RegressTool): + + class EarlyStopError(Exception): + pass + + @contextlib.contextmanager + def monitor_ms_train(self, + trainer, + file_name, + level='config', + compare_fn=None, + ignore_keys=None): + + def lazy_stop_callback(): + + from modelscope.trainers.hooks.hook import Hook, Priority + + class EarlyStopHook(Hook): + PRIORITY = Priority.VERY_LOW + + def after_iter(self, trainer): + raise MsRegressTool.EarlyStopError('Test finished.') + + trainer.register_hook(EarlyStopHook()) + + def _train_loop(trainer, *args, **kwargs): + with self.monitor_module_train( + trainer, + file_name, + level, + compare_fn=compare_fn, + ignore_keys=ignore_keys, + lazy_stop_callback=lazy_stop_callback): + try: + return trainer.train_loop_origin(*args, **kwargs) + except MsRegressTool.EarlyStopError: + pass + + trainer.train_loop_origin, trainer.train_loop = \ + trainer.train_loop, type(trainer.train_loop)(_train_loop, trainer) + yield + + +def compare_module(module1: nn.Module, module2: nn.Module): + for p1, p2 in zip(module1.parameters(), module2.parameters()): + if p1.data.ne(p2.data).sum() > 0: + return False + return True + + +def numpify_tensor_nested(tensors, reduction=None, clip_value=10000): + import torch + "Numpify `tensors` (even if it's a nested list/tuple of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)( + numpify_tensor_nested(t, reduction, clip_value) for t in tensors) + if isinstance(tensors, Mapping): + return type(tensors)({ + k: numpify_tensor_nested(t, reduction, clip_value) + for k, t in tensors.items() + }) + if isinstance(tensors, torch.Tensor): + t: np.ndarray = tensors.cpu().numpy() + if clip_value is not None: + t = np.where(t > clip_value, clip_value, t) + t = np.where(t < -clip_value, -clip_value, t) + if reduction == 'sum': + return t.sum(dtype=np.float) + elif reduction == 'mean': + return t.mean(dtype=np.float) + return t + return tensors + + +def detach_tensor_nested(tensors): + import torch + "Detach `tensors` (even if it's a nested list/tuple of tensors)." 
+ if isinstance(tensors, (list, tuple)): + return type(tensors)(detach_tensor_nested(t) for t in tensors) + if isinstance(tensors, Mapping): + return type(tensors)( + {k: detach_tensor_nested(t) + for k, t in tensors.items()}) + if isinstance(tensors, torch.Tensor): + return tensors.detach() + return tensors + + +def hack_forward(module: nn.Module, + name, + io_json, + restore=False, + keep_tensors=False): + + def _forward(self, *args, **kwargs): + ret = self.forward_origin(*args, **kwargs) + if keep_tensors: + args = numpify_tensor_nested(detach_tensor_nested(args)) + kwargs = numpify_tensor_nested(detach_tensor_nested(kwargs)) + output = numpify_tensor_nested(detach_tensor_nested(ret)) + else: + args = { + 'sum': + numpify_tensor_nested( + detach_tensor_nested(args), reduction='sum'), + 'mean': + numpify_tensor_nested( + detach_tensor_nested(args), reduction='mean'), + } + kwargs = { + 'sum': + numpify_tensor_nested( + detach_tensor_nested(kwargs), reduction='sum'), + 'mean': + numpify_tensor_nested( + detach_tensor_nested(kwargs), reduction='mean'), + } + output = { + 'sum': + numpify_tensor_nested( + detach_tensor_nested(ret), reduction='sum'), + 'mean': + numpify_tensor_nested( + detach_tensor_nested(ret), reduction='mean'), + } + + io_json[name] = { + 'input': { + 'args': args, + 'kwargs': kwargs, + }, + 'output': output, + } + return ret + + if not restore and not hasattr(module, 'forward_origin'): + module.forward_origin, module.forward = module.forward, type( + module.forward)(_forward, module) + if restore and hasattr(module, 'forward_origin'): + module.forward = module.forward_origin + del module.forward_origin + + +def hack_backward(module: nn.Module, + optimizer, + io_json, + restore=False, + lazy_stop_callback=None): + + def _step(self, *args, **kwargs): + for name, param in module.named_parameters(): + io_json[name] = { + 'data': { + 'sum': + numpify_tensor_nested( + detach_tensor_nested(param.data), reduction='sum'), + 'mean': + numpify_tensor_nested( + detach_tensor_nested(param.data), reduction='mean'), + }, + 'grad': { + 'sum': + numpify_tensor_nested( + detach_tensor_nested(param.grad), reduction='sum'), + 'mean': + numpify_tensor_nested( + detach_tensor_nested(param.grad), reduction='mean'), + } + } + ret = self.step_origin(*args, **kwargs) + for name, param in module.named_parameters(): + io_json[name]['data_after'] = { + 'sum': + numpify_tensor_nested( + detach_tensor_nested(param.data), reduction='sum'), + 'mean': + numpify_tensor_nested( + detach_tensor_nested(param.data), reduction='mean'), + } + if lazy_stop_callback is not None: + lazy_stop_callback() + return ret + + if not restore and not hasattr(optimizer, 'step_origin'): + optimizer.step_origin, optimizer.step = optimizer.step, type( + optimizer.state_dict)(_step, optimizer) + if restore and hasattr(optimizer, 'step_origin'): + optimizer.step = optimizer.step_origin + del optimizer.step_origin + + +def intercept_module(module: nn.Module, + io_json, + parent_name=None, + restore=False): + for name, module in module.named_children(): + full_name = parent_name + '.' 
+ name if parent_name is not None else name + hack_forward(module, full_name, io_json, restore) + intercept_module(module, io_json, full_name, restore) + + +def compare_arguments_nested(print_content, arg1, arg2): + type1 = type(arg1) + type2 = type(arg2) + if type1.__name__ != type2.__name__: + if print_content is not None: + print( + f'{print_content}, type not equal:{type1.__name__} and {type2.__name__}' + ) + return False + + if arg1 is None: + return True + elif isinstance(arg1, (int, str, bool, np.bool, np.integer, np.str)): + if arg1 != arg2: + if print_content is not None: + print(f'{print_content}, arg1:{arg1}, arg2:{arg2}') + return False + return True + elif isinstance(arg1, (float, np.floating)): + if not np.isclose(arg1, arg2, rtol=1.e-3, atol=1.e-8, equal_nan=True): + if print_content is not None: + print(f'{print_content}, arg1:{arg1}, arg2:{arg2}') + return False + return True + elif isinstance(arg1, (tuple, list)): + if len(arg1) != len(arg2): + if print_content is not None: + print( + f'{print_content}, length is not equal:{len(arg1)}, {len(arg2)}' + ) + return False + if not all([ + compare_arguments_nested(None, sub_arg1, sub_arg2) + for sub_arg1, sub_arg2 in zip(arg1, arg2) + ]): + if print_content is not None: + print(f'{print_content}') + return False + return True + elif isinstance(arg1, Mapping): + keys1 = arg1.keys() + keys2 = arg2.keys() + if len(keys1) != len(keys2): + if print_content is not None: + print( + f'{print_content}, key length is not equal:{len(keys1)}, {len(keys2)}' + ) + return False + if len(set(keys1) - set(keys2)) > 0: + if print_content is not None: + print(f'{print_content}, key diff:{set(keys1) - set(keys2)}') + return False + if not all([ + compare_arguments_nested(None, arg1[key], arg2[key]) + for key in keys1 + ]): + if print_content is not None: + print(f'{print_content}') + return False + return True + elif isinstance(arg1, np.ndarray): + arg1 = np.where(np.equal(arg1, None), np.NaN, + arg1).astype(dtype=np.float) + arg2 = np.where(np.equal(arg2, None), np.NaN, + arg2).astype(dtype=np.float) + if not all( + np.isclose(arg1, arg2, rtol=1.e-3, atol=1.e-8, + equal_nan=True).flatten()): + if print_content is not None: + print(f'{print_content}') + return False + return True + else: + raise ValueError(f'type not supported: {type1}') + + +def compare_io_and_print(baseline_json, io_json, compare_fn=None): + if compare_fn is None: + + def compare_fn(*args, **kwargs): + return None + + keys1 = set(baseline_json.keys()) + keys2 = set(io_json.keys()) + added = keys1 - keys2 + removed = keys2 - keys1 + print(f'unmatched keys: {added}, {removed}') + shared_keys = keys1.intersection(keys2) + match = True + for key in shared_keys: + v1 = baseline_json[key] + v2 = io_json[key] + + v1input = numpify_tensor_nested(v1['input']) + v2input = numpify_tensor_nested(v2['input']) + res = compare_fn(v1input, v2input, key, 'input') + if res is not None: + print( + f'input of {key} compared with user compare_fn with result:{res}\n' + ) + match = match and res + else: + match = compare_arguments_nested( + f'unmatched module {key} input args', v1input['args'], + v2input['args']) and match + match = compare_arguments_nested( + f'unmatched module {key} input kwargs', v1input['kwargs'], + v2input['kwargs']) and match + v1output = numpify_tensor_nested(v1['output']) + v2output = numpify_tensor_nested(v2['output']) + res = compare_fn(v1output, v2output, key, 'output') + if res is not None: + print( + f'output of {key} compared with user compare_fn with result:{res}\n' + ) 
+            match = match and res
+        else:
+            match = compare_arguments_nested(f'unmatched module {key} outputs',
+                                             v1output, v2output) and match
+    return match
+
+
+def compare_backward_and_print(baseline_json,
+                               bw_json,
+                               level,
+                               ignore_keys=None,
+                               compare_fn=None):
+    if compare_fn is None:
+
+        def compare_fn(*args, **kwargs):
+            return None
+
+    keys1 = set(baseline_json.keys())
+    keys2 = set(bw_json.keys())
+    added = keys1 - keys2
+    removed = keys2 - keys1
+    print(f'unmatched backward keys: {added}, {removed}')
+    shared_keys = keys1.intersection(keys2)
+    match = True
+    for key in shared_keys:
+        if ignore_keys is not None and key in ignore_keys:
+            continue
+
+        res = compare_fn(baseline_json[key], bw_json[key], key, 'backward')
+        if res is not None:
+            print(f'backward data of {key} compared with '
+                  f'user compare_fn with result:{res}\n')
+            match = match and res
+        else:
+            data1, grad1, data_after1 = baseline_json[key][
+                'data'], baseline_json[key]['grad'], baseline_json[key][
+                    'data_after']
+            data2, grad2, data_after2 = bw_json[key]['data'], bw_json[key][
+                'grad'], bw_json[key]['data_after']
+            match = compare_arguments_nested(
+                f'unmatched module {key} tensor data', data1, data2) and match
+            if level == 'strict':
+                match = compare_arguments_nested(
+                    f'unmatched module {key} grad data', grad1,
+                    grad2) and match
+            match = compare_arguments_nested(
+                f'unmatched module {key} data after step', data_after1,
+                data_after2) and match
+    return match
+
+
+def compare_cfg_and_optimizers(baseline_json, cfg_json, compare_fn=None):
+    if compare_fn is None:
+
+        def compare_fn(*args, **kwargs):
+            return None
+
+    optimizer1, lr_scheduler1, cfg1, state1 = baseline_json[
+        'optimizer'], baseline_json['lr_scheduler'], baseline_json[
+            'cfg'], baseline_json['state']
+    optimizer2, lr_scheduler2, cfg2, state2 = cfg_json['optimizer'], cfg_json[
+        'lr_scheduler'], cfg_json['cfg'], cfg_json['state']
+
+    match = True
+    res = compare_fn(optimizer1, optimizer2, None, 'optimizer')
+    if res is not None:
+        print(f'optimizer compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        if optimizer1['type'] != optimizer2['type']:
+            print(
+                f"Optimizer type not equal:{optimizer1['type']} and {optimizer2['type']}"
+            )
+        match = compare_arguments_nested('unmatched optimizer defaults',
+                                         optimizer1['defaults'],
+                                         optimizer2['defaults']) and match
+        match = compare_arguments_nested('unmatched optimizer state_dict',
+                                         optimizer1['state_dict'],
+                                         optimizer2['state_dict']) and match
+
+    res = compare_fn(lr_scheduler1, lr_scheduler2, None, 'lr_scheduler')
+    if res is not None:
+        print(
+            f'lr_scheduler compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        if lr_scheduler1['type'] != lr_scheduler2['type']:
+            print(
+                f"Lr_scheduler type not equal:{lr_scheduler1['type']} and {lr_scheduler2['type']}"
+            )
+        match = compare_arguments_nested('unmatched lr_scheduler state_dict',
+                                         lr_scheduler1['state_dict'],
+                                         lr_scheduler2['state_dict']) and match
+
+    res = compare_fn(cfg1, cfg2, None, 'cfg')
+    if res is not None:
+        print(f'cfg compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        match = compare_arguments_nested('unmatched cfg', cfg1, cfg2) and match
+
+    res = compare_fn(state1, state2, None, 'state')
+    if res is not None:
+        print(
+            f'random state compared with user compare_fn with result:{res}\n')
+        match = match and res
+    else:
+        match = compare_arguments_nested('unmatched random state', state1,
+                                         state2) and match
+
+    return match
diff --git
a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py index 1f157f9a..45e33c3e 100644 --- a/modelscope/utils/torch_utils.py +++ b/modelscope/utils/torch_utils.py @@ -132,17 +132,6 @@ def master_only(func: Callable) -> Callable: return wrapper
-
-def create_device(cpu: bool = False) -> torch.DeviceObjType:
-    use_cuda = torch.cuda.is_available() and not cpu
-    if use_cuda:
-        local_rank = os.environ.get('LOCAL_RANK', 0)
-        device = torch.device(f'cuda:{local_rank}')
-    else:
-        device = torch.device('cpu')
-
-    return device
-
-
def make_tmp_dir(): """Make sure each rank has the same temporary directory on the distributed mode. """
diff --git a/modelscope/version.py b/modelscope/version.py index 40ed83d9..d93912ee 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@
-__version__ = '0.3.5'
+__version__ = '0.3.7'
diff --git a/requirements/cv.txt b/requirements/cv.txt index 8dcf6791..ebb61851 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -14,13 +14,14 @@ mmcls>=0.21.0 mmdet>=2.25.0 networkx>=2.5 onnxruntime>=1.10
-pai-easycv>=0.5
+pai-easycv>=0.6.0
pandas psutil regex scikit-image>=0.19.3 scikit-learn>=0.20.1 shapely
+shotdetect_scenedetect_lgss
tensorflow-estimator>=1.15.1 tf_slim timm>=0.4.9
diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 6bd56aff..ada4fc50 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,11 +1,14 @@ en_core_web_sm>=2.3.5 fairseq>=0.10.2
+jieba>=0.42.1
pai-easynlp
# rouge-score was just recently updated from 0.0.4 to 0.0.7
# which introduced compatibility issues that are being investigated
rouge_score<=0.0.4
+sacremoses>=0.0.41
seqeval spacy>=2.3.5
+subword_nmt>=0.3.8
text2sql_lgesql tokenizers transformers>=4.12.0
diff --git a/requirements/runtime.txt b/requirements/runtime.txt index e2b78f06..b51faeda 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -4,10 +4,13 @@ easydict einops filelock>=3.3.0 gast>=0.2.2
+jsonplus
numpy opencv-python oss2 Pillow>=6.2.0
+# pyarrow 9.0.0 is excluded because of an event_loop core dump
+pyarrow>=6.0.0,!=9.0.0
pyyaml requests scipy
diff --git a/tests/isolated_cases.txt b/tests/isolated_cases.txt new file mode 100644 index 00000000..be85142a --- /dev/null +++ b/tests/isolated_cases.txt @@ -0,0 +1,6 @@
+ test_text_to_speech.py
+ test_multi_modal_embedding.py
+ test_ofa_tasks.py
+ test_video_summarization.py
+ test_dialog_modeling.py
+ test_csanmt_translation.py
diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py new file mode 100644 index 00000000..61b1c6a4 --- /dev/null +++ b/tests/msdatasets/test_dataset_upload.py @@ -0,0 +1,95 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os +import shutil +import tempfile +import unittest +import zipfile + +from modelscope.msdatasets import MsDataset +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import test_level + +KEY_EXTRACTED = 'extracted' + + +class DatasetUploadTest(unittest.TestCase): + + def setUp(self): + self.old_dir = os.getcwd() + self.dataset_name = 'small_coco_for_test' + self.dataset_file_name = self.dataset_name + self.prepared_dataset_name = 'pets_small' + self.token = os.getenv('TEST_UPLOAD_MS_TOKEN') + error_msg = 'The modelscope token can not be empty, please set env variable: TEST_UPLOAD_MS_TOKEN' + self.assertIsNotNone(self.token, msg=error_msg) + from modelscope.hub.api import HubApi + from modelscope.hub.api import ModelScopeConfig + self.api = HubApi() + self.api.login(self.token) + + # get user info + self.namespace, _ = ModelScopeConfig.get_user_info() + + self.temp_dir = tempfile.mkdtemp() + self.test_work_dir = os.path.join(self.temp_dir, self.dataset_name) + self.test_meta_dir = os.path.join(self.test_work_dir, 'meta') + if not os.path.exists(self.test_work_dir): + os.makedirs(self.test_work_dir) + + def tearDown(self): + os.chdir(self.old_dir) + shutil.rmtree(self.temp_dir, ignore_errors=True) + print('The test dir successfully removed!') + + @staticmethod + def get_raw_downloaded_file_path(extracted_path): + raw_downloaded_file_path = '' + raw_data_dir = os.path.abspath( + os.path.join(extracted_path, '../../..')) + for root, dirs, files in os.walk(raw_data_dir): + if KEY_EXTRACTED in dirs: + for file in files: + curr_file_path = os.path.join(root, file) + if zipfile.is_zipfile(curr_file_path): + raw_downloaded_file_path = curr_file_path + return raw_downloaded_file_path + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_upload(self): + # Get the prepared data from hub, using default modelscope namespace + ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train') + config_res = ms_ds_train._hf_ds.config_kwargs + extracted_path = config_res.get('split_config').get('train') + raw_zipfile_path = self.get_raw_downloaded_file_path(extracted_path) + + MsDataset.upload( + object_name=self.dataset_file_name + '.zip', + local_file_path=raw_zipfile_path, + dataset_name=self.dataset_name, + namespace=self.namespace) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_clone_meta(self): + MsDataset.clone_meta( + dataset_work_dir=self.test_meta_dir, + dataset_id=os.path.join(self.namespace, self.dataset_name)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_upload_meta(self): + # Clone dataset meta repo first. 
+ MsDataset.clone_meta( + dataset_work_dir=self.test_meta_dir, + dataset_id=os.path.join(self.namespace, self.dataset_name)) + + with open(os.path.join(self.test_meta_dir, ModelFile.README), + 'a') as f: + f.write('\nThis is a line for unit test.') + + MsDataset.upload_meta( + dataset_work_dir=self.test_meta_dir, + dataset_id=os.path.join(self.namespace, self.dataset_name), + commit_message='Update for unit test.') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index f9118353..9780ac4b 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -4,7 +4,7 @@ from modelscope.models import Model from modelscope.msdatasets import MsDataset from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.preprocessors.base import Preprocessor -from modelscope.utils.constant import DownloadMode +from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE, DownloadMode from modelscope.utils.test_utils import require_tf, require_torch, test_level @@ -31,15 +31,21 @@ class ImgPreprocessor(Preprocessor): class MsDatasetTest(unittest.TestCase): + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_movie_scene_seg_toydata(self): + ms_ds_train = MsDataset.load('movie_scene_seg_toydata', split='train') + print(ms_ds_train._hf_ds.config_kwargs) + assert next(iter(ms_ds_train.config_kwargs['split_config'].values())) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_coco(self): ms_ds_train = MsDataset.load( 'pets_small', - namespace='modelscope', - split='train', + namespace=DEFAULT_DATASET_NAMESPACE, download_mode=DownloadMode.FORCE_REDOWNLOAD, - classes=('1', '2')) - print(ms_ds_train._hf_ds.config_kwargs) + split='train') + print(ms_ds_train.config_kwargs) + assert next(iter(ms_ds_train.config_kwargs['split_config'].values())) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ms_csv_basic(self): diff --git a/tests/pipelines/easycv_pipelines/__init__.py b/tests/pipelines/easycv_pipelines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py new file mode 100644 index 00000000..6cfdacc6 --- /dev/null +++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
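Outside the unittest, the same clone/edit/upload round-trip looks roughly like the sketch below; it assumes a prior `HubApi().login(...)` as in `setUp` above, and the namespace, dataset name and work dir are placeholders:

```python
import os

from modelscope.msdatasets import MsDataset
from modelscope.utils.constant import ModelFile

work_dir = '/tmp/small_coco_for_test/meta'       # placeholder work dir
dataset_id = 'my_namespace/small_coco_for_test'  # placeholder namespace/name

# Clone the dataset meta repo, append to its README, then push the change.
MsDataset.clone_meta(dataset_work_dir=work_dir, dataset_id=dataset_id)
with open(os.path.join(work_dir, ModelFile.README), 'a') as f:
    f.write('\nUpdated outside of CI.')
MsDataset.upload_meta(
    dataset_work_dir=work_dir,
    dataset_id=dataset_id,
    commit_message='Update README.')
```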
+import unittest + +import numpy as np +from PIL import Image + +from modelscope.metainfo import Pipelines +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class EasyCVSegmentationPipelineTest(unittest.TestCase): + + img_path = 'data/test/images/image_segmentation.jpg' + + def _internal_test__(self, model_id): + img = np.asarray(Image.open(self.img_path)) + + semantic_seg = pipeline(task=Tasks.image_segmentation, model=model_id) + outputs = semantic_seg(self.img_path) + + self.assertEqual(len(outputs), 1) + + results = outputs[0] + self.assertListEqual( + list(img.shape)[:2], list(results['seg_pred'][0].shape)) + self.assertListEqual(results['seg_pred'][0][1, 4:10].tolist(), + [161 for i in range(6)]) + self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(), + [133 for i in range(10)]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_segformer_b0(self): + model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k' + self._internal_test__(model_id) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_segformer_b1(self): + model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k' + self._internal_test__(model_id) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_segformer_b2(self): + model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k' + self._internal_test__(model_id) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_segformer_b3(self): + model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k' + self._internal_test__(model_id) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_segformer_b4(self): + model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k' + self._internal_test__(model_id) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_segformer_b5(self): + model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k' + self._internal_test__(model_id) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_action_recognition.py b/tests/pipelines/test_action_recognition.py index 7453f136..e955eb60 100644 --- a/tests/pipelines/test_action_recognition.py +++ b/tests/pipelines/test_action_recognition.py @@ -15,23 +15,6 @@ class ActionRecognitionTest(unittest.TestCase): def setUp(self) -> None: self.model_id = 'damo/cv_TAdaConv_action-recognition' - @unittest.skip('deprecated, download model from model hub instead') - def test_run_with_direct_file_download(self): - model_path = 'https://aquila2-online-models.oss-cn-shanghai.aliyuncs.com/maas_test/pytorch_model.pt' - config_path = 'https://aquila2-online-models.oss-cn-shanghai.aliyuncs.com/maas_test/configuration.json' - with tempfile.TemporaryDirectory() as tmp_dir: - model_file = osp.join(tmp_dir, ModelFile.TORCH_MODEL_FILE) - with open(model_file, 'wb') as ofile1: - ofile1.write(File.read(model_path)) - config_file = osp.join(tmp_dir, ModelFile.CONFIGURATION) - with open(config_file, 'wb') as ofile2: - ofile2.write(File.read(config_path)) - recognition_pipeline = pipeline( - Tasks.action_recognition, model=tmp_dir) - result = recognition_pipeline( - 'data/test/videos/action_recognition_test_video.mp4') - print(f'recognition output: {result}.') - 
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): recognition_pipeline = pipeline( diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py index 88ebcdbd..a83f5031 100644 --- a/tests/pipelines/test_automatic_speech_recognition.py +++ b/tests/pipelines/test_automatic_speech_recognition.py @@ -53,14 +53,6 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): 'checking_item': OutputKeys.TEXT, 'example': 'dataset_example' }, - 'test_run_with_ark_dataset': { - 'checking_item': OutputKeys.TEXT, - 'example': 'dataset_example' - }, - 'test_run_with_tfrecord_dataset': { - 'checking_item': OutputKeys.TEXT, - 'example': 'dataset_example' - }, 'dataset_example': { 'Wrd': 49532, # the number of words 'Snt': 5000, # the number of sentences @@ -252,60 +244,6 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): model_id=self.am_tf_model_id, audio_in=dataset_path) self.check_result('test_run_with_wav_dataset_tf', rec_result) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run_with_ark_dataset(self): - '''run with datasets, and audio format is kaldi_ark - datasets directory: - - test # testsets - data.ark - data.scp - data.text - dev # devsets - data.ark - data.scp - data.text - train # trainsets - data.ark - data.scp - data.text - ''' - - logger.info('Run ASR test with ark dataset (pytorch)...') - logger.info('Downloading ark testsets file ...') - - dataset_path = download_and_untar( - os.path.join(self.workspace, AISHELL1_TESTSETS_FILE), - AISHELL1_TESTSETS_URL, self.workspace) - dataset_path = os.path.join(dataset_path, 'test') - - rec_result = self.run_pipeline( - model_id=self.am_pytorch_model_id, audio_in=dataset_path) - self.check_result('test_run_with_ark_dataset', rec_result) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run_with_tfrecord_dataset(self): - '''run with datasets, and audio format is tfrecord - datasets directory: - - test # testsets - data.records - data.idx - data.text - ''' - - logger.info('Run ASR test with tfrecord dataset (tensorflow)...') - logger.info('Downloading tfrecord testsets file ...') - - dataset_path = download_and_untar( - os.path.join(self.workspace, TFRECORD_TESTSETS_FILE), - TFRECORD_TESTSETS_URL, self.workspace) - dataset_path = os.path.join(dataset_path, 'test') - - rec_result = self.run_pipeline( - model_id=self.am_tf_model_id, audio_in=dataset_path) - self.check_result('test_run_with_tfrecord_dataset', rec_result) - if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_body_2d_keypoints.py b/tests/pipelines/test_body_2d_keypoints.py index eca5e961..d010adc5 100644 --- a/tests/pipelines/test_body_2d_keypoints.py +++ b/tests/pipelines/test_body_2d_keypoints.py @@ -9,59 +9,9 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.base import Pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_keypoints from modelscope.utils.test_utils import test_level -lst_parent_ids_17 = [0, 0, 0, 1, 2, 0, 0, 5, 6, 7, 8, 5, 6, 11, 12, 13, 14] -lst_left_ids_17 = [1, 3, 5, 7, 9, 11, 13, 15] -lst_right_ids_17 = [2, 4, 6, 8, 10, 12, 14, 16] -lst_spine_ids_17 = [0] - -lst_parent_ids_15 = [0, 0, 1, 2, 3, 1, 5, 6, 14, 8, 9, 14, 11, 12, 1] -lst_left_ids_15 = [2, 3, 4, 8, 9, 10] -lst_right_ids_15 = [5, 6, 7, 11, 12, 13] -lst_spine_ids_15 = [0, 1, 
14]
-
-
-def draw_joints(image, np_kps, score, threshold=0.2):
-    if np_kps.shape[0] == 17:
-        lst_parent_ids = lst_parent_ids_17
-        lst_left_ids = lst_left_ids_17
-        lst_right_ids = lst_right_ids_17
-
-    elif np_kps.shape[0] == 15:
-        lst_parent_ids = lst_parent_ids_15
-        lst_left_ids = lst_left_ids_15
-        lst_right_ids = lst_right_ids_15
-
-    for i in range(len(lst_parent_ids)):
-        pid = lst_parent_ids[i]
-        if i == pid:
-            continue
-
-        if (score[i] < threshold or score[1] < threshold):
-            continue
-
-        if i in lst_left_ids and pid in lst_left_ids:
-            color = (0, 255, 0)
-        elif i in lst_right_ids and pid in lst_right_ids:
-            color = (255, 0, 0)
-        else:
-            color = (0, 255, 255)
-
-        cv2.line(image, (int(np_kps[i, 0]), int(np_kps[i, 1])),
-                 (int(np_kps[pid][0]), int(np_kps[pid, 1])), color, 3)
-
-    for i in range(np_kps.shape[0]):
-        if score[i] < threshold:
-            continue
-        cv2.circle(image, (int(np_kps[i, 0]), int(np_kps[i, 1])), 5,
-                   (0, 0, 255), -1)
-
-
-def draw_box(image, box):
-    cv2.rectangle(image, (int(box[0][0]), int(box[0][1])),
-                  (int(box[1][0]), int(box[1][1])), (0, 0, 255), 2)
-
class Body2DKeypointsTest(unittest.TestCase): @@ -71,14 +21,7 @@ class Body2DKeypointsTest(unittest.TestCase):
    def pipeline_inference(self, pipeline: Pipeline, pipeline_input): output = pipeline(pipeline_input)
-        poses = np.array(output[OutputKeys.POSES])
-        scores = np.array(output[OutputKeys.SCORES])
-        boxes = np.array(output[OutputKeys.BOXES])
-        assert len(poses) == len(scores) and len(poses) == len(boxes)
-        image = cv2.imread(self.test_image, -1)
-        for i in range(len(poses)):
-            draw_box(image, np.array(boxes[i]))
-            draw_joints(image, np.array(poses[i]), np.array(scores[i]))
+        image = draw_keypoints(output, self.test_image)
        cv2.imwrite('pose_keypoint.jpg', image)
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py new file mode 100644 index 00000000..50426414 --- /dev/null +++ b/tests/pipelines/test_body_3d_keypoints.py @@ -0,0 +1,48 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+import cv2
+import numpy as np
+from PIL import Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.pipelines.base import Pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class Body3DKeypointsTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_canonical_body-3d-keypoints_video'
+        self.test_video = 'data/test/videos/Walking.54138969.mp4'
+
+    def pipeline_inference(self, pipeline: Pipeline, pipeline_input):
+        output = pipeline(pipeline_input)
+        poses = np.array(output[OutputKeys.POSES])
+        print(f'result 3d points shape {poses.shape}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_with_video_file(self):
+        body_3d_keypoints = pipeline(
+            Tasks.body_3d_keypoints, model=self.model_id)
+        self.pipeline_inference(body_3d_keypoints, self.test_video)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub_with_video_stream(self):
+        body_3d_keypoints = pipeline(Tasks.body_3d_keypoints)
+        cap = cv2.VideoCapture(self.test_video)
+        if not cap.isOpened():
+            raise Exception('modelscope error: %s cannot be decoded by OpenCV.'
+ % (self.test_video)) + self.pipeline_inference(body_3d_keypoints, cap) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + body_3d_keypoints = pipeline(Tasks.body_3d_keypoints) + self.pipeline_inference(body_3d_keypoints, self.test_video) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_conversational_text_to_sql.py b/tests/pipelines/test_conversational_text_to_sql.py index 67a4ce7b..0504cb7c 100644 --- a/tests/pipelines/test_conversational_text_to_sql.py +++ b/tests/pipelines/test_conversational_text_to_sql.py @@ -9,6 +9,7 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline from modelscope.preprocessors import ConversationalTextToSqlPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.nlp.nlp_utils import text2sql_tracking_and_print_results from modelscope.utils.test_utils import test_level @@ -25,24 +26,6 @@ class ConversationalTextToSql(unittest.TestCase): ] } - def tracking_and_print_results( - self, pipelines: List[ConversationalTextToSqlPipeline]): - for my_pipeline in pipelines: - last_sql, history = '', [] - for item in self.test_case['utterance']: - case = { - 'utterance': item, - 'history': history, - 'last_sql': last_sql, - 'database_id': self.test_case['database_id'], - 'local_db_path': self.test_case['local_db_path'] - } - results = my_pipeline(case) - print({'question': item}) - print(results) - last_sql = results['text'] - history.append(item) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) @@ -61,7 +44,7 @@ class ConversationalTextToSql(unittest.TestCase): model=model, preprocessor=preprocessor) ] - self.tracking_and_print_results(pipelines) + text2sql_tracking_and_print_results(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): @@ -77,7 +60,7 @@ class ConversationalTextToSql(unittest.TestCase): model=model, preprocessor=preprocessor) ] - self.tracking_and_print_results(pipelines) + text2sql_tracking_and_print_results(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): @@ -85,12 +68,12 @@ class ConversationalTextToSql(unittest.TestCase): pipeline( task=Tasks.conversational_text_to_sql, model=self.model_id) ] - self.tracking_and_print_results(pipelines) + text2sql_tracking_and_print_results(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipelines = [pipeline(task=Tasks.conversational_text_to_sql)] - self.tracking_and_print_results(pipelines) + text2sql_tracking_and_print_results(self.test_case, pipelines) if __name__ == '__main__': diff --git a/tests/pipelines/test_crowd_counting.py b/tests/pipelines/test_crowd_counting.py index 1bd5a0dd..99f5ffd2 100644 --- a/tests/pipelines/test_crowd_counting.py +++ b/tests/pipelines/test_crowd_counting.py @@ -2,13 +2,12 @@ import unittest import cv2 -import numpy as np from PIL import Image from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks -from modelscope.utils.cv.heatmap import numpy_to_cv2img +from modelscope.utils.cv.image_utils import numpy_to_cv2img from 
modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py index c852b1ff..bb6022ec 100644 --- a/tests/pipelines/test_csanmt_translation.py +++ b/tests/pipelines/test_csanmt_translation.py @@ -7,18 +7,26 @@ from modelscope.utils.test_utils import test_level class TranslationTest(unittest.TestCase): - model_id = 'damo/nlp_csanmt_translation_zh2en' - inputs = '声明 补充 说 , 沃伦 的 同事 都 深感 震惊 , 并且 希望 他 能够 投@@ 案@@ 自@@ 首 。' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_model_name(self): - pipeline_ins = pipeline(task=Tasks.translation, model=self.model_id) - print(pipeline_ins(input=self.inputs)) + def test_run_with_model_name_for_zh2en(self): + model_id = 'damo/nlp_csanmt_translation_zh2en' + inputs = '声明补充说,沃伦的同事都深感震惊,并且希望他能够投案自首。' + pipeline_ins = pipeline(task=Tasks.translation, model=model_id) + print(pipeline_ins(input=inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_en2zh(self): + model_id = 'damo/nlp_csanmt_translation_en2zh' + inputs = 'Elon Musk, co-founder and chief executive officer of Tesla Motors.' + pipeline_ins = pipeline(task=Tasks.translation, model=model_id) + print(pipeline_ins(input=inputs)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): + inputs = '声明补充说,沃伦的同事都深感震惊,并且希望他能够投案自首。' pipeline_ins = pipeline(task=Tasks.translation) - print(pipeline_ins(input=self.inputs)) + print(pipeline_ins(input=inputs)) if __name__ == '__main__': diff --git a/tests/pipelines/test_dialog_state_tracking.py b/tests/pipelines/test_dialog_state_tracking.py index 2710ec0d..843aade9 100644 --- a/tests/pipelines/test_dialog_state_tracking.py +++ b/tests/pipelines/test_dialog_state_tracking.py @@ -1,15 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import unittest -from typing import List from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SpaceForDialogStateTracking -from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import DialogStateTrackingPipeline from modelscope.preprocessors import DialogStateTrackingPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.nlp.nlp_utils import tracking_and_print_dialog_states from modelscope.utils.test_utils import test_level @@ -79,24 +78,6 @@ class DialogStateTrackingTest(unittest.TestCase): 'User-8': 'Thank you, goodbye', }] - def tracking_and_print_dialog_states( - self, pipelines: List[DialogStateTrackingPipeline]): - import json - pipelines_len = len(pipelines) - history_states = [{}] - utter = {} - for step, item in enumerate(self.test_case): - utter.update(item) - result = pipelines[step % pipelines_len]({ - 'utter': - utter, - 'history_states': - history_states - }) - print(json.dumps(result)) - - history_states.extend([result[OutputKeys.OUTPUT], {}]) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id, revision='update') @@ -111,7 +92,7 @@ class DialogStateTrackingTest(unittest.TestCase): model=model, preprocessor=preprocessor) ] - self.tracking_and_print_dialog_states(pipelines) + tracking_and_print_dialog_states(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): @@ -128,7 +109,7 @@ class DialogStateTrackingTest(unittest.TestCase): preprocessor=preprocessor) ] - self.tracking_and_print_dialog_states(pipelines) + tracking_and_print_dialog_states(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): @@ -138,7 +119,7 @@ class DialogStateTrackingTest(unittest.TestCase): model=self.model_id, model_revision='update') ] - self.tracking_and_print_dialog_states(pipelines) + tracking_and_print_dialog_states(self.test_case, pipelines) if __name__ == '__main__': diff --git a/tests/pipelines/test_document_segmentation.py b/tests/pipelines/test_document_segmentation.py new file mode 100644 index 00000000..39609be8 --- /dev/null +++ b/tests/pipelines/test_document_segmentation.py @@ -0,0 +1,56 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
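+# A reading aid, inferred from the tests below rather than the pipeline docs:
+# `documents` accepts a single string or a list of strings, and
+# OutputKeys.TEXT then holds one segmented text per input document.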
+ +import unittest +from typing import Any, Dict + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class DocumentSegmentationTest(unittest.TestCase): + + model_id = 'damo/nlp_bert_document-segmentation_chinese-base' + eng_model_id = 'damo/nlp_bert_document-segmentation_english-base' + sentences = '近年来,随着端到端语音识别的流行,基于Transformer结构的语音识别系统逐渐成为了主流。然而,由于Transformer是一种自回归模型,需要逐个生成目标文字,计算复杂度随着目标文字数量线性增加,限制了其在工业生产中的应用。针对Transoformer模型自回归生成文字的低计算效率缺陷,学术界提出了非自回归模型来并行的输出目标文字。根据生成目标文字时,迭代轮数,非自回归模型分为:多轮迭代式与单轮迭代非自回归模型。其中实用的是基于单轮迭代的非自回归模型。对于单轮非自回归模型,现有工作往往聚焦于如何更加准确的预测目标文字个数,如CTC-enhanced采用CTC预测输出文字个数,尽管如此,考虑到现实应用中,语速、口音、静音以及噪声等因素的影响,如何准确的预测目标文字个数以及抽取目标文字对应的声学隐变量仍然是一个比较大的挑战;另外一方面,我们通过对比自回归模型与单轮非自回归模型在工业大数据上的错误类型(如下图所示,AR与vanilla NAR),发现,相比于自回归模型,非自回归模型,在预测目标文字个数方面差距较小,但是替换错误显著的增加,我们认为这是由于单轮非自回归模型中条件独立假设导致的语义信息丢失。于此同时,目前非自回归模型主要停留在学术验证阶段,还没有工业大数据上的相关实验与结论。' # noqa * + sentences_1 = '移动端语音唤醒模型,检测关键词为“小云小云”。模型主体为4层FSMN结构,使用CTC训练准则,参数量750K,适用于移动端设备运行。模型输入为Fbank特征,输出为基于char建模的中文全集token预测,测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式,basetrain过程使用大量内部移动端数据,在此基础上,使用1万条设备端录制安静场景“小云小云”数据进行微调,得到最终面向业务的模型。后续用户可在basetrain模型基础上,使用其他关键词数据进行微调,得到新的语音唤醒模型,但暂时未开放模型finetune功能。' # noqa * + eng_sentences = 'The Saint Alexander Nevsky Church was established in 1936 by Archbishop Vitaly (Maximenko) () on a tract of land donated by Yulia Martinovna Plavskaya.The initial chapel, dedicated to the memory of the great prince St. Alexander Nevsky (1220–1263), was blessed in May, 1936.The church building was subsequently expanded three times.In 1987, ground was cleared for the construction of the new church and on September 12, 1989, on the Feast Day of St. Alexander Nevsky, the cornerstone was laid and the relics of St. Herman of Alaska placed in the foundation.The imposing edifice, completed in 1997, is the work of Nikolaus Karsanov, architect and Protopresbyter Valery Lukianov, engineer.Funds were raised through donations.The Great blessing of the cathedral took place on October 18, 1997 with seven bishops, headed by Metropolitan Vitaly Ustinov, and 36 priests and deacons officiating, some 800 faithful attended the festivity.The old church was rededicated to Our Lady of Tikhvin.Metropolitan Hilarion (Kapral) announced, that cathedral will officially become the episcopal See of the Ruling Bishop of the Eastern American Diocese and the administrative center of the Diocese on September 12, 2014.At present the parish serves the spiritual needs of 300 members.The parochial school instructs over 90 boys and girls in religion, Russian language and history.The school meets every Saturday.The choir is directed by Andrew Burbelo.The sisterhood attends to the needs of the church and a church council acts in the administration of the community.The cathedral is decorated by frescoes in the Byzantine style.The iconography project was fulfilled by Father Andrew Erastov and his students from 1995 until 2001.' 
# noqa * + + def run_pipeline(self, model_id: str, documents: str) -> Dict[str, Any]: + p = pipeline(task=Tasks.document_segmentation, model=model_id) + + result = p(documents=documents) + + return result + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_document(self): + logger.info('Run document segmentation with one document ...') + + result = self.run_pipeline( + model_id=self.model_id, documents=self.sentences) + print(result[OutputKeys.TEXT]) + + result = self.run_pipeline( + model_id=self.eng_model_id, documents=self.eng_sentences) + print(result[OutputKeys.TEXT]) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_documents(self): + logger.info('Run document segmentation with many documents ...') + + result = self.run_pipeline( + model_id=self.model_id, + documents=[self.sentences, self.sentences_1]) + + documents_list = result[OutputKeys.TEXT] + for document in documents_list: + print(document) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_face_detection.py b/tests/pipelines/test_face_detection.py index d4872e0a..03dd75a6 100644 --- a/tests/pipelines/test_face_detection.py +++ b/tests/pipelines/test_face_detection.py @@ -9,6 +9,7 @@ from modelscope.msdatasets import MsDataset from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_face_detection_result from modelscope.utils.test_utils import test_level @@ -17,46 +18,21 @@ class FaceDetectionTest(unittest.TestCase): def setUp(self) -> None: self.model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' - def show_result(self, img_path, bboxes, kpss, scores): - bboxes = np.array(bboxes) - kpss = np.array(kpss) - scores = np.array(scores) - img = cv2.imread(img_path) - assert img is not None, f"Can't read img: {img_path}" - for i in range(len(scores)): - bbox = bboxes[i].astype(np.int32) - kps = kpss[i].reshape(-1, 2).astype(np.int32) - score = scores[i] - x1, y1, x2, y2 = bbox - cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2) - for kp in kps: - cv2.circle(img, tuple(kp), 1, (0, 0, 255), 1) - cv2.putText( - img, - f'{score:.2f}', (x1, y2), - 1, - 1.0, (0, 255, 0), - thickness=1, - lineType=8) + def show_result(self, img_path, detection_result): + img = draw_face_detection_result(img_path, detection_result) cv2.imwrite('result.png', img) - print( - f'Found {len(scores)} faces, output written to {osp.abspath("result.png")}' - ) + print(f'output written to {osp.abspath("result.png")}') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_dataset(self): input_location = ['data/test/images/face_detection.png'] - # alternatively: - # input_location = '/dir/to/images' dataset = MsDataset.load(input_location, target='image') face_detection = pipeline(Tasks.face_detection, model=self.model_id) # note that for dataset output, the inference-output is a Generator that can be iterated. 
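+        # For example (a hypothetical sketch mirroring the call below), the
+        # generator could also be consumed lazily:
+        #   for res in face_detection(dataset):
+        #       self.show_result(input_location[0], res)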
result = face_detection(dataset) result = next(result) - self.show_result(input_location[0], result[OutputKeys.BOXES], - result[OutputKeys.KEYPOINTS], - result[OutputKeys.SCORES]) + self.show_result(input_location[0], result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): @@ -64,18 +40,14 @@ class FaceDetectionTest(unittest.TestCase): img_path = 'data/test/images/face_detection.png' result = face_detection(img_path) - self.show_result(img_path, result[OutputKeys.BOXES], - result[OutputKeys.KEYPOINTS], - result[OutputKeys.SCORES]) + self.show_result(img_path, result) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): face_detection = pipeline(Tasks.face_detection) img_path = 'data/test/images/face_detection.png' result = face_detection(img_path) - self.show_result(img_path, result[OutputKeys.BOXES], - result[OutputKeys.KEYPOINTS], - result[OutputKeys.SCORES]) + self.show_result(img_path, result) if __name__ == '__main__': diff --git a/tests/pipelines/test_face_image_generation.py b/tests/pipelines/test_face_image_generation.py index fc2c58cc..c758ea3a 100644 --- a/tests/pipelines/test_face_image_generation.py +++ b/tests/pipelines/test_face_image_generation.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os import os.path as osp import unittest diff --git a/tests/pipelines/test_face_recognition.py b/tests/pipelines/test_face_recognition.py index 20e05f65..015205d6 100644 --- a/tests/pipelines/test_face_recognition.py +++ b/tests/pipelines/test_face_recognition.py @@ -21,7 +21,6 @@ class FaceRecognitionTest(unittest.TestCase): face_recognition = pipeline( Tasks.face_recognition, model=self.model_id) - # note that for dataset output, the inference-output is a Generator that can be iterated. emb1 = face_recognition(img1)[OutputKeys.IMG_EMBEDDING] emb2 = face_recognition(img2)[OutputKeys.IMG_EMBEDDING] sim = np.dot(emb1[0], emb2[0]) diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py new file mode 100644 index 00000000..3a87643c --- /dev/null +++ b/tests/pipelines/test_faq_question_answering.py @@ -0,0 +1,85 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
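+# Note (inferred from the payload below, not from the pipeline docs): this is
+# a few-shot FAQ task where each query in `query_set` is scored against the
+# labeled `support_set` examples and the best-matching labels are returned
+# with scores.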
+import unittest + +import numpy as np + +from modelscope.hub.api import HubApi +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SbertForFaqQuestionAnswering +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FaqQuestionAnsweringPipeline +from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class FaqQuestionAnsweringTest(unittest.TestCase): + model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base' + param = { + 'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'], + 'support_set': [{ + 'text': '卖品代金券怎么用', + 'label': '6527856' + }, { + 'text': '怎么使用优惠券', + 'label': '6527856' + }, { + 'text': '这个可以一起领吗', + 'label': '1000012000' + }, { + 'text': '付款时送的优惠券哪里领', + 'label': '1000012000' + }, { + 'text': '购物等级怎么长', + 'label': '13421097' + }, { + 'text': '购物等级二心', + 'label': '13421097' + }] + } + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + preprocessor = FaqQuestionAnsweringPreprocessor(cache_path) + model = SbertForFaqQuestionAnswering(cache_path) + model.load_checkpoint(cache_path) + pipeline_ins = FaqQuestionAnsweringPipeline( + model, preprocessor=preprocessor) + result = pipeline_ins(self.param) + print(result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = FaqQuestionAnsweringPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.faq_question_answering, + model=model, + preprocessor=preprocessor) + result = pipeline_ins(self.param) + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.faq_question_answering, model=self.model_id) + result = pipeline_ins(self.param) + print(result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.faq_question_answering) + print(pipeline_ins(self.param, max_seq_length=20)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_sentence_embedding(self): + pipeline_ins = pipeline(task=Tasks.faq_question_answering) + sentence_vec = pipeline_ins.get_sentence_embedding( + ['今天星期六', '明天星期几明天星期几']) + print(np.shape(sentence_vec)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 2f57b2d8..1b709e27 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -9,6 +9,7 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline from modelscope.preprocessors import FillMaskPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level @@ -37,6 +38,7 @@ class FillMaskTest(unittest.TestCase): 'Everything in [MASK] you call reality is really [MASK] a reflection of your ' '[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.' 
} + regress_tool = MsRegressTool(baseline=False) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): @@ -98,9 +100,11 @@ class FillMaskTest(unittest.TestCase): second_sequence=None) pipeline_ins = pipeline( task=Tasks.fill_mask, model=model, preprocessor=preprocessor) - print( - f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' - f'{pipeline_ins(self.test_inputs[language])}\n') + with self.regress_tool.monitor_module_single_forward( + pipeline_ins.model, f'fill_mask_sbert_{language}'): + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') # veco model = Model.from_pretrained(self.model_id_veco) @@ -111,8 +115,11 @@ class FillMaskTest(unittest.TestCase): for language in ['zh', 'en']: ori_text = self.ori_texts[language] test_input = self.test_inputs[language].replace('[MASK]', '') - print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' - f'{pipeline_ins(test_input)}\n') + with self.regress_tool.monitor_module_single_forward( + pipeline_ins.model, f'fill_mask_veco_{language}'): + print( + f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + f'{pipeline_ins(test_input)}\n') # zh bert model = Model.from_pretrained(self.model_id_bert) @@ -123,8 +130,10 @@ class FillMaskTest(unittest.TestCase): language = 'zh' ori_text = self.ori_texts[language] test_input = self.test_inputs[language] - print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' - f'{pipeline_ins(test_input)}\n') + with self.regress_tool.monitor_module_single_forward( + pipeline_ins.model, 'fill_mask_bert_zh'): + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + f'{pipeline_ins(test_input)}\n') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): diff --git a/tests/pipelines/test_hicossl_video_embedding.py b/tests/pipelines/test_hicossl_video_embedding.py new file mode 100644 index 00000000..5615cef2 --- /dev/null +++ b/tests/pipelines/test_hicossl_video_embedding.py @@ -0,0 +1,26 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class HICOSSLVideoEmbeddingTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_s3dg_video-embedding'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        videossl_pipeline = pipeline(
+            Tasks.video_embedding, model=self.model_id)
+        result = videossl_pipeline(
+            'data/test/videos/action_recognition_test_video.mp4')
+
+        print(f'video embedding output: {result}.')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image2image_generation.py b/tests/pipelines/test_image2image_generation.py
index 81aae81e..116cef76 100644
--- a/tests/pipelines/test_image2image_generation.py
+++ b/tests/pipelines/test_image2image_generation.py
@@ -3,6 +3,7 @@ import unittest

 from torchvision.utils import save_image

+from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
@@ -10,7 +11,7 @@ from modelscope.utils.test_utils import test_level

 class Image2ImageGenerationTest(unittest.TestCase):

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub(self):
         r"""We provide two generation modes, i.e., Similar Image Generation and
         Interpolation. You can pass the following parameters for different mode.
@@ -27,13 +28,13 @@ class Image2ImageGenerationTest(unittest.TestCase):
         result2 = img2img_gen_pipeline(('data/test/images/img2img_input.jpg',
                                         'data/test/images/img2img_style.jpg'))
         save_image(
-            result1['output_img'].clamp(-1, 1),
+            result1[OutputKeys.OUTPUT_IMG].clamp(-1, 1),
             'result1.jpg',
             range=(-1, 1),
             normalize=True,
             nrow=4)
         save_image(
-            result2['output_img'].clamp(-1, 1),
+            result2[OutputKeys.OUTPUT_IMG].clamp(-1, 1),
             'result2.jpg',
             range=(-1, 1),
             normalize=True,
diff --git a/tests/pipelines/test_image2image_translation.py b/tests/pipelines/test_image2image_translation.py
index fd2f8063..a1cdb957 100644
--- a/tests/pipelines/test_image2image_translation.py
+++ b/tests/pipelines/test_image2image_translation.py
@@ -8,7 +8,7 @@ from modelscope.utils.test_utils import test_level

 class Image2ImageTranslationTest(unittest.TestCase):

-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub(self):
         r"""We provide three translation modes, i.e., uncropping,
         colorization and combination. You can pass the following parameters for different mode.
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index 1bebf3df..83b7fee2 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -18,19 +18,6 @@ class ImageMattingTest(unittest.TestCase):
     def setUp(self) -> None:
         self.model_id = 'damo/cv_unet_image-matting'

-    @unittest.skip('deprecated, download model from model hub instead')
-    def test_run_with_direct_file_download(self):
-        model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs' \
-            '.com/data/test/maas/image_matting/matting_person.pb'
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model_file = osp.join(tmp_dir, ModelFile.TF_GRAPH_FILE)
-            with open(model_file, 'wb') as ofile:
-                ofile.write(File.read(model_path))
-            img_matting = pipeline(Tasks.portrait_matting, model=tmp_dir)
-
-            result = img_matting('data/test/images/image_matting.png')
-            cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG])
-
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_dataset(self):
         input_location = ['data/test/images/image_matting.png']
diff --git a/tests/pipelines/test_image_panoptic_segmentation.py b/tests/pipelines/test_image_panoptic_segmentation.py
new file mode 100644
index 00000000..3f07adf5
--- /dev/null
+++ b/tests/pipelines/test_image_panoptic_segmentation.py
@@ -0,0 +1,40 @@
+import unittest
+
+import cv2
+import PIL.Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import panoptic_seg_masks_to_image
+from modelscope.utils.test_utils import test_level
+
+
+class ImagePanopticSegmentationTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_panoptic_segmentation(self):
+        input_location = 'data/test/images/image_panoptic_segmentation.jpg'
+        model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan'
+        pan_segmentor = pipeline(Tasks.image_segmentation, model=model_id)
+        result = pan_segmentor(input_location)
+
+        draw_img = panoptic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_panoptic_segmentation run success')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_panoptic_segmentation_from_PIL(self):
+        input_location = 'data/test/images/image_panoptic_segmentation.jpg'
+        model_id = 'damo/cv_swinL_panoptic-segmentation_cocopan'
+        pan_segmentor = pipeline(Tasks.image_segmentation, model=model_id)
+        PIL_array = PIL.Image.open(input_location)
+        result = pan_segmentor(PIL_array)
+
+        draw_img = panoptic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_panoptic_segmentation_from_PIL run success')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_reid_person.py b/tests/pipelines/test_image_reid_person.py
new file mode 100644
index 00000000..c3e8d487
--- /dev/null
+++ b/tests/pipelines/test_image_reid_person.py
@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
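+# The pipeline exposes a person feature vector under OutputKeys.IMG_EMBEDDING.
+# Two embeddings could be compared the way the face recognition test above
+# does, e.g. sim = np.dot(emb1[0], emb2[0]) (assuming normalized embeddings);
+# the tests below only assert that the embedding is present.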
+import unittest
+
+from PIL import Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class ImageReidPersonTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.input_location = 'data/test/images/image_reid_person.jpg'
+        self.model_id = 'damo/cv_passvitb_image-reid-person_market'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_reid_person(self):
+        image_reid_person = pipeline(
+            Tasks.image_reid_person, model=self.model_id)
+        result = image_reid_person(self.input_location)
+        assert result and OutputKeys.IMG_EMBEDDING in result
+        print(
+            f'The shape of img embedding is: {result[OutputKeys.IMG_EMBEDDING].shape}'
+        )
+        print(f'The img embedding is: {result[OutputKeys.IMG_EMBEDDING]}')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_image_reid_person_with_image(self):
+        image_reid_person = pipeline(
+            Tasks.image_reid_person, model=self.model_id)
+        img = Image.open(self.input_location)
+        result = image_reid_person(img)
+        assert result and OutputKeys.IMG_EMBEDDING in result
+        print(
+            f'The shape of img embedding is: {result[OutputKeys.IMG_EMBEDDING].shape}'
+        )
+        print(f'The img embedding is: {result[OutputKeys.IMG_EMBEDDING]}')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_image_reid_person_with_default_model(self):
+        image_reid_person = pipeline(Tasks.image_reid_person)
+        result = image_reid_person(self.input_location)
+        assert result and OutputKeys.IMG_EMBEDDING in result
+        print(
+            f'The shape of img embedding is: {result[OutputKeys.IMG_EMBEDDING].shape}'
+        )
+        print(f'The img embedding is: {result[OutputKeys.IMG_EMBEDDING]}')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_image_semantic_segmentation.py b/tests/pipelines/test_image_semantic_segmentation.py
new file mode 100644
index 00000000..6738976c
--- /dev/null
+++ b/tests/pipelines/test_image_semantic_segmentation.py
@@ -0,0 +1,54 @@
+import unittest
+
+import cv2
+import PIL.Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import semantic_seg_masks_to_image
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+
+class ImageSemanticSegmentationTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_semantic_segmentation_panmerge(self):
+        input_location = 'data/test/images/image_semantic_segmentation.jpg'
+        model_id = 'damo/cv_swinL_semantic-segmentation_cocopanmerge'
+        segmenter = pipeline(Tasks.image_segmentation, model=model_id)
+        result = segmenter(input_location)
+
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_panmerge DONE')
+
+        PIL_array = PIL.Image.open(input_location)
+        result = segmenter(PIL_array)
+
+        draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS])
+        cv2.imwrite('result.jpg', draw_img)
+        print('test_image_semantic_segmentation_panmerge_from_PIL DONE')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_semantic_segmentation_vitadapter(self):
+        input_location = 'data/test/images/image_semantic_segmentation.jpg'
+        
model_id = 'damo/cv_vitadapter_semantic-segmentation_cocostuff164k' + segmenter = pipeline(Tasks.image_segmentation, model=model_id) + result = segmenter(input_location) + + draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS]) + cv2.imwrite('result.jpg', draw_img) + print('test_image_semantic_segmentation_vitadapter DONE') + + PIL_array = PIL.Image.open(input_location) + result = segmenter(PIL_array) + + draw_img = semantic_seg_masks_to_image(result[OutputKeys.MASKS]) + cv2.imwrite('result.jpg', draw_img) + print('test_image_semantic_segmentation_vitadapter_from_PIL DONE') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_style_transfer.py b/tests/pipelines/test_image_style_transfer.py index 964e47ac..4e5bb69b 100644 --- a/tests/pipelines/test_image_style_transfer.py +++ b/tests/pipelines/test_image_style_transfer.py @@ -15,7 +15,7 @@ class ImageStyleTransferTest(unittest.TestCase): def setUp(self) -> None: self.model_id = 'damo/cv_aams_style-transfer_damo' - @unittest.skip('deprecated, download model from model hub instead') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): snapshot_path = snapshot_download(self.model_id) print('snapshot_path: {}'.format(snapshot_path)) diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py index e7967edc..4a732950 100644 --- a/tests/pipelines/test_key_word_spotting_farfield.py +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -1,7 +1,6 @@ import os.path import unittest -from modelscope.fileio import File from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -41,3 +40,7 @@ class KWSFarfieldTest(unittest.TestCase): result = kws(data) self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_movie_scene_segmentation.py b/tests/pipelines/test_movie_scene_segmentation.py new file mode 100644 index 00000000..5993c634 --- /dev/null +++ b/tests/pipelines/test_movie_scene_segmentation.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class MovieSceneSegmentationTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_movie_scene_segmentation(self): + input_location = 'data/test/videos/movie_scene_segmentation_test_video.mp4' + model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet' + movie_scene_segmentation_pipeline = pipeline( + Tasks.movie_scene_segmentation, model=model_id) + result = movie_scene_segmentation_pipeline(input_location) + if result: + print(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_movie_scene_segmentation_with_default_task(self): + input_location = 'data/test/videos/movie_scene_segmentation_test_video.mp4' + movie_scene_segmentation_pipeline = pipeline( + Tasks.movie_scene_segmentation) + result = movie_scene_segmentation_pipeline(input_location) + if result: + print(result) + else: + raise ValueError('process error') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py index 4b8a813a..642ac11d 100644 --- a/tests/pipelines/test_mplug_tasks.py +++ b/tests/pipelines/test_mplug_tasks.py @@ -54,6 +54,27 @@ class MplugTasksTest(unittest.TestCase): result = pipeline_vqa(input) print(result) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_image_text_retrieval_with_model(self): + model = Model.from_pretrained( + 'damo/mplug_image-text-retrieval_flickr30k_large_en') + pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model) + image = Image.open('data/test/images/image-text-retrieval.jpg') + question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.' + input = {'image': image, 'question': question} + result = pipeline_retrieval(input) + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_image_text_retrieval_with_name(self): + model = 'damo/mplug_image-text-retrieval_flickr30k_large_en' + pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model) + image = Image.open('data/test/images/image-text-retrieval.jpg') + question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.' 
+ input = {'image': image, 'question': question} + result = pipeline_retrieval(input) + print(result) + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py index 6152f279..f94e31fa 100644 --- a/tests/pipelines/test_multi_modal_embedding.py +++ b/tests/pipelines/test_multi_modal_embedding.py @@ -31,11 +31,10 @@ class MultiModalEmbeddingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id) + model = Model.from_pretrained( + self.model_id, revision=self.model_version) pipeline_multi_modal_embedding = pipeline( - task=Tasks.multi_modal_embedding, - model=model, - model_revision=self.model_version) + task=Tasks.multi_modal_embedding, model=model) text_embedding = pipeline_multi_modal_embedding( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( diff --git a/tests/pipelines/test_multi_modal_similarity.py b/tests/pipelines/test_multi_modal_similarity.py new file mode 100644 index 00000000..d1d6a7a8 --- /dev/null +++ b/tests/pipelines/test_multi_modal_similarity.py @@ -0,0 +1,42 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class MultiModalSimilarityTest(unittest.TestCase): + model_id = 'damo/multi-modal_team-vit-large-patch14_multi-modal-similarity' + test_input = { + 'img': 'data/test/images/generative_multimodal.jpg', + 'text': '起居室照片' + } + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run(self): + multi_modal_similarity_pipeline = pipeline( + Tasks.multi_modal_similarity, model=self.model_id) + output = multi_modal_similarity_pipeline(self.test_input) + print(output) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + multi_modal_similarity_pipeline = pipeline( + task=Tasks.multi_modal_similarity) + output = multi_modal_similarity_pipeline(self.test_input) + print(output) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + multi_modal_similarity_pipeline = pipeline( + task=Tasks.multi_modal_similarity, model=model) + output = multi_modal_similarity_pipeline(self.test_input) + print(output) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 5ba93f49..ad0fa228 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -3,7 +3,8 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import TransformerCRFForNamedEntityRecognition +from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, + TransformerCRFForNamedEntityRecognition) from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline from modelscope.preprocessors import NERPreprocessor @@ -12,12 +13,13 @@ from modelscope.utils.test_utils import test_level class NamedEntityRecognitionTest(unittest.TestCase): - model_id = 
'damo/nlp_raner_named-entity-recognition_chinese-base-news' + tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' + lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' sentence = '这与温岭市新河镇的一个神秘的传说有关。' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run_by_direct_model_download(self): - cache_path = snapshot_download(self.model_id) + def test_run_tcrf_by_direct_model_download(self): + cache_path = snapshot_download(self.tcrf_model_id) tokenizer = NERPreprocessor(cache_path) model = TransformerCRFForNamedEntityRecognition( cache_path, tokenizer=tokenizer) @@ -32,9 +34,36 @@ class NamedEntityRecognitionTest(unittest.TestCase): print() print(f'pipeline2: {pipeline2(input=self.sentence)}') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_lcrf_by_direct_model_download(self): + cache_path = snapshot_download(self.lcrf_model_id) + tokenizer = NERPreprocessor(cache_path) + model = LSTMCRFForNamedEntityRecognition( + cache_path, tokenizer=tokenizer) + pipeline1 = NamedEntityRecognitionPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.named_entity_recognition, + model=model, + preprocessor=tokenizer) + print(f'sentence: {self.sentence}\n' + f'pipeline1:{pipeline1(input=self.sentence)}') + print() + print(f'pipeline2: {pipeline2(input=self.sentence)}') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id) + def test_run_tcrf_with_model_from_modelhub(self): + model = Model.from_pretrained(self.tcrf_model_id) + tokenizer = NERPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_lcrf_with_model_from_modelhub(self): + model = Model.from_pretrained(self.lcrf_model_id) tokenizer = NERPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.named_entity_recognition, @@ -43,9 +72,15 @@ class NamedEntityRecognitionTest(unittest.TestCase): print(pipeline_ins(input=self.sentence)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_run_with_model_name(self): + def test_run_tcrf_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.tcrf_model_id) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_lcrf_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.named_entity_recognition, model=self.model_id) + task=Tasks.named_entity_recognition, model=self.lcrf_model_id) print(pipeline_ins(input=self.sentence)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 1e259a2e..1d3fba12 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -8,6 +8,7 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import PairSentenceClassificationPipeline from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level @@ -15,6 +16,7 @@ class NLITest(unittest.TestCase): 
model_id = 'damo/nlp_structbert_nli_chinese-base' sentence1 = '四川商务职业学院和四川财经职业学院哪个好?' sentence2 = '四川商务职业学院商务管理在哪个校区?' + regress_tool = MsRegressTool(baseline=False) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): @@ -26,7 +28,6 @@ class NLITest(unittest.TestCase): pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}') - print() print( f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' f'pipeline1: {pipeline2(input=(self.sentence1, self.sentence2))}') @@ -42,7 +43,9 @@ class NLITest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline(task=Tasks.nli, model=self.model_id) - print(pipeline_ins(input=(self.sentence1, self.sentence2))) + with self.regress_tool.monitor_module_single_forward( + pipeline_ins.model, 'sbert_nli'): + print(pipeline_ins(input=(self.sentence1, self.sentence2))) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index 8ee5f2ef..9044e41a 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -4,14 +4,13 @@ import unittest from os import path as osp import cv2 -import numpy as np from PIL import Image from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline -from modelscope.preprocessors.image import load_image from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import created_boxed_image from modelscope.utils.test_utils import test_level @@ -22,11 +21,9 @@ class OfaTasksTest(unittest.TestCase): os.makedirs(self.output_dir, exist_ok=True) def save_img(self, image_in, box, image_out): - image = load_image(image_in) - img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) - cv2.rectangle(img, (int(box[0]), int(box[1])), - (int(box[2]), int(box[3])), (0, 255, 0), 3) - cv2.imwrite(osp.join(self.output_dir, image_out), img) + cv2.imwrite( + osp.join(self.output_dir, image_out), + created_boxed_image(image_in, box)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_image_captioning_with_model(self): diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py index 8b5384ee..bdbf8b61 100644 --- a/tests/pipelines/test_person_image_cartoon.py +++ b/tests/pipelines/test_person_image_cartoon.py @@ -24,19 +24,6 @@ class ImageCartoonTest(unittest.TestCase): cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG]) print(f'Output written to {osp.abspath("result.png")}') - @unittest.skip('deprecated, download model from model hub instead') - def test_run_by_direct_model_download(self): - model_dir = './assets' - if not os.path.exists(model_dir): - os.system( - 'wget https://invi-label.oss-cn-shanghai.aliyuncs.com/label/model/cartoon/assets.zip' - ) - os.system('unzip assets.zip') - - img_cartoon = pipeline( - Tasks.image_portrait_stylization, model=model_dir) - self.pipeline_inference(img_cartoon, self.test_image) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): img_cartoon = pipeline( diff --git 
a/tests/pipelines/test_realtime_object_detection.py b/tests/pipelines/test_realtime_object_detection.py new file mode 100644 index 00000000..03ddacf4 --- /dev/null +++ b/tests/pipelines/test_realtime_object_detection.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import cv2 +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import realtime_object_detection_bbox_vis +from modelscope.utils.test_utils import test_level + + +class RealtimeObjectDetectionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_cspnet_image-object-detection_yolox' + self.model_nano_id = 'damo/cv_cspnet_image-object-detection_yolox_nano_coco' + self.test_image = 'data/test/images/keypoints_detect/000000438862.jpg' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + realtime_object_detection = pipeline( + Tasks.image_object_detection, model=self.model_id) + + image = cv2.imread(self.test_image) + result = realtime_object_detection(image) + if result: + bboxes = result[OutputKeys.BOXES].astype(int) + image = realtime_object_detection_bbox_vis(image, bboxes) + cv2.imwrite('rt_obj_out.jpg', image) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_nano(self): + realtime_object_detection = pipeline( + Tasks.image_object_detection, model=self.model_nano_id) + + image = cv2.imread(self.test_image) + result = realtime_object_detection(image) + if result: + bboxes = result[OutputKeys.BOXES].astype(int) + image = realtime_object_detection_bbox_vis(image, bboxes) + cv2.imwrite('rtnano_obj_out.jpg', image) + else: + raise ValueError('process error') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py new file mode 100644 index 00000000..20502a19 --- /dev/null +++ b/tests/pipelines/test_relation_extraction.py @@ -0,0 +1,57 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import InformationExtractionModel +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import InformationExtractionPipeline +from modelscope.preprocessors import RelationExtractionPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class RelationExtractionTest(unittest.TestCase): + model_id = 'damo/nlp_bert_relation-extraction_chinese-base' + sentence = '高捷,祖籍江苏,本科毕业于东南大学' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = RelationExtractionPreprocessor(cache_path) + model = InformationExtractionModel.from_pretrained(cache_path) + pipeline1 = InformationExtractionPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.information_extraction, model=model, preprocessor=tokenizer) + print(f'sentence: {self.sentence}\n' + f'pipeline1:{pipeline1(input=self.sentence)}') + print() + print(f'pipeline2: {pipeline2(input=self.sentence)}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = RelationExtractionPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.information_extraction, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.information_extraction, model=self.model_id) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.information_extraction) + print(pipeline_ins(input=self.sentence)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index d39f6783..6990bf75 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -8,6 +8,7 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import PairSentenceClassificationPipeline from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level @@ -15,6 +16,7 @@ class SentenceSimilarityTest(unittest.TestCase): model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' sentence1 = '今天气温比昨天高么?' sentence2 = '今天湿度比昨天高么?' 
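+    # MsRegressTool monitors a single module forward pass; baseline=False
+    # means the captured tensors are compared against a previously recorded
+    # baseline rather than written as a new one (behaviour inferred from how
+    # the tool is used across these tests).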
+ regress_tool = MsRegressTool(baseline=False) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run(self): @@ -47,7 +49,9 @@ class SentenceSimilarityTest(unittest.TestCase): def test_run_with_model_name(self): pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=self.model_id) - print(pipeline_ins(input=(self.sentence1, self.sentence2))) + with self.regress_tool.monitor_module_single_forward( + pipeline_ins.model, 'sbert_sen_sim'): + print(pipeline_ins(input=(self.sentence1, self.sentence2))) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index f3bc6981..35c96282 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -30,7 +30,6 @@ class SentimentClassificationTaskModelTest(unittest.TestCase): preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' f'pipeline1:{pipeline1(input=self.sentence1)}') - print() print(f'sentence1: {self.sentence1}\n' f'pipeline1: {pipeline2(input=self.sentence1)}') diff --git a/tests/pipelines/test_skin_retouching.py b/tests/pipelines/test_skin_retouching.py index a10af416..c6dbee2c 100644 --- a/tests/pipelines/test_skin_retouching.py +++ b/tests/pipelines/test_skin_retouching.py @@ -23,10 +23,9 @@ class SkinRetouchingTest(unittest.TestCase): cv2.imwrite('result_skinretouching.png', result[OutputKeys.OUTPUT_IMG]) print(f'Output written to {osp.abspath("result_skinretouching.png")}') - @unittest.skip('deprecated, download model from model hub instead') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir = snapshot_download(self.model_id) - skin_retouching = pipeline(Tasks.skin_retouching, model=model_dir) self.pipeline_inference(skin_retouching, self.test_image) diff --git a/tests/pipelines/test_video_single_object_tracking.py b/tests/pipelines/test_video_single_object_tracking.py index f5d4714c..fc228cd8 100644 --- a/tests/pipelines/test_video_single_object_tracking.py +++ b/tests/pipelines/test_video_single_object_tracking.py @@ -1,11 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import unittest -from modelscope.models.cv.video_single_object_tracking.utils.utils import \ - show_tracking_result from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import show_video_tracking_result from modelscope.utils.test_utils import test_level @@ -22,8 +21,8 @@ class SingleObjectTracking(unittest.TestCase): init_bbox = [414, 343, 514, 449] # [x1, y1, x2, y2] result = video_single_object_tracking((video_path, init_bbox)) print('result is : ', result[OutputKeys.BOXES]) - show_tracking_result(video_path, result[OutputKeys.BOXES], - './tracking_result.avi') + show_video_tracking_result(video_path, result[OutputKeys.BOXES], + './tracking_result.avi') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_modelhub_default_model(self): diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py new file mode 100644 index 00000000..12a0ee07 --- /dev/null +++ b/tests/pipelines/test_video_summarization.py @@ -0,0 +1,34 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import show_video_summarization_result +from modelscope.utils.test_utils import test_level + + +class VideoSummarizationTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + model_id = 'damo/cv_googlenet_pgl-video-summarization' + video_path = 'data/test/videos/video_category_test_video.mp4' + summarization_pipeline = pipeline( + Tasks.video_summarization, model=model_id) + result = summarization_pipeline(video_path) + + print(f'video summarization output: \n{result}.') + show_video_summarization_result(video_path, result, + './summarization_result.avi') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_modelhub_default_model(self): + video_path = 'data/test/videos/video_category_test_video.mp4' + summarization_pipeline = pipeline(Tasks.video_summarization) + result = summarization_pipeline(video_path) + + print(f'video summarization output:\n {result}.') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py index c332d987..87006f96 100644 --- a/tests/pipelines/test_word_segmentation.py +++ b/tests/pipelines/test_word_segmentation.py @@ -9,6 +9,7 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import WordSegmentationPipeline from modelscope.preprocessors import TokenClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level @@ -16,6 +17,7 @@ class WordSegmentationTest(unittest.TestCase): model_id = 'damo/nlp_structbert_word-segmentation_chinese-base' sentence = '今天天气不错,适合出去游玩' sentence_eng = 'I am a program.' 
+ regress_tool = MsRegressTool(baseline=False) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): @@ -27,7 +29,6 @@ class WordSegmentationTest(unittest.TestCase): Tasks.word_segmentation, model=model, preprocessor=tokenizer) print(f'sentence: {self.sentence}\n' f'pipeline1:{pipeline1(input=self.sentence)}') - print() print(f'pipeline2: {pipeline2(input=self.sentence)}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -42,8 +43,12 @@ class WordSegmentationTest(unittest.TestCase): def test_run_with_model_name(self): pipeline_ins = pipeline( task=Tasks.word_segmentation, model=self.model_id) - print(pipeline_ins(input=self.sentence)) - print(pipeline_ins(input=self.sentence_eng)) + with self.regress_tool.monitor_module_single_forward( + pipeline_ins.model, 'sbert_ws_zh'): + print(pipeline_ins(input=self.sentence)) + with self.regress_tool.monitor_module_single_forward( + pipeline_ins.model, 'sbert_ws_en'): + print(pipeline_ins(input=self.sentence_eng)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py index 7620a0ed..f0f2a481 100644 --- a/tests/pipelines/test_zero_shot_classification.py +++ b/tests/pipelines/test_zero_shot_classification.py @@ -8,6 +8,7 @@ from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import ZeroShotClassificationPipeline from modelscope.preprocessors import ZeroShotClassificationPreprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level @@ -16,6 +17,7 @@ class ZeroShotClassificationTest(unittest.TestCase): sentence = '全新突破 解放军运20版空中加油机曝光' labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事'] template = '这篇文章的标题是{}' + regress_tool = MsRegressTool(baseline=False) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): @@ -33,7 +35,6 @@ class ZeroShotClassificationTest(unittest.TestCase): f'sentence: {self.sentence}\n' f'pipeline1:{pipeline1(input=self.sentence,candidate_labels=self.labels)}' ) - print() print( f'sentence: {self.sentence}\n' f'pipeline2: {pipeline2(self.sentence,candidate_labels=self.labels,hypothesis_template=self.template)}' @@ -53,7 +54,11 @@ class ZeroShotClassificationTest(unittest.TestCase): def test_run_with_model_name(self): pipeline_ins = pipeline( task=Tasks.zero_shot_classification, model=self.model_id) - print(pipeline_ins(input=self.sentence, candidate_labels=self.labels)) + with self.regress_tool.monitor_module_single_forward( + pipeline_ins.model, 'sbert_zero_shot'): + print( + pipeline_ins( + input=self.sentence, candidate_labels=self.labels)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): diff --git a/tests/run.py b/tests/run.py index 27af7fe5..79509745 100644 --- a/tests/run.py +++ b/tests/run.py @@ -2,11 +2,20 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
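For reference, the zero-shot calls being monitored above reduce to the following shape; the sentence, labels and template are the fixtures from the test class, and the hub is left to pick the default model, which `test_run_with_default_model` already relies on:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipe = pipeline(task=Tasks.zero_shot_classification)  # default hub model
print(
    pipe('全新突破 解放军运20版空中加油机曝光',
         candidate_labels=['文化', '体育', '军事'],
         hypothesis_template='这篇文章的标题是{}'))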
import argparse
+import datetime
+import multiprocessing
import os
+import subprocess
import sys
+import tempfile
import unittest
from fnmatch import fnmatch
+from multiprocessing.managers import BaseManager
+from pathlib import Path
+from unittest import TestResult, TextTestResult
+import pandas
# NOTICE: Tensorflow 1.15 seems not so compatible with pytorch.
# A segmentation fault may be raise by pytorch cpp library
# if 'import tensorflow' in front of 'import torch'.
@@ -19,6 +28,227 @@ from modelscope.utils.test_utils import set_test_level, test_level
logger = get_logger()
+def test_cases_result_to_df(result_list):
+    table_header = [
+        'Name', 'Result', 'Info', 'Start time', 'Stop time',
+        'Time cost(seconds)'
+    ]
+    df = pandas.DataFrame(
+        result_list, columns=table_header).sort_values(
+            by=['Start time'], ascending=True)
+    return df
+
+
+def statistics_test_result(df):
+    total_cases = df.shape[0]
+    # yapf: disable
+    success_cases = df.loc[df['Result'] == 'Success'].shape[0]
+    error_cases = df.loc[df['Result'] == 'Error'].shape[0]
+    failures_cases = df.loc[df['Result'] == 'Failures'].shape[0]
+    expected_failure_cases = df.loc[df['Result'] == 'ExpectedFailures'].shape[0]
+    unexpected_success_cases = df.loc[df['Result'] == 'UnexpectedSuccesses'].shape[0]
+    skipped_cases = df.loc[df['Result'] == 'Skipped'].shape[0]
+    # yapf: enable
+
+    if failures_cases > 0 or \
+       error_cases > 0 or \
+       unexpected_success_cases > 0:
+        result = 'FAILED'
+    else:
+        result = 'SUCCESS'
+    result_msg = '%s (Runs=%s,success=%s,failures=%s,errors=%s,\
+        skipped=%s,expected failures=%s,unexpected successes=%s)' % (
+        result, total_cases, success_cases, failures_cases, error_cases,
+        skipped_cases, expected_failure_cases, unexpected_success_cases)
+
+    print(result_msg)
+    if result == 'FAILED':
+        sys.exit(1)
+
+
+def gather_test_suites_in_files(test_dir, case_file_list, list_tests):
+    test_suite = unittest.TestSuite()
+    for case in case_file_list:
+        test_case = unittest.defaultTestLoader.discover(
+            start_dir=test_dir, pattern=case)
+        test_suite.addTest(test_case)
+        if hasattr(test_case, '__iter__'):
+            for subcase in test_case:
+                if list_tests:
+                    print(subcase)
+        else:
+            if list_tests:
+                print(test_case)
+    return test_suite
+
+
+def gather_test_suites_files(test_dir, pattern):
+    case_file_list = []
+    for dirpath, dirnames, filenames in os.walk(test_dir):
+        for file in filenames:
+            if fnmatch(file, pattern):
+                case_file_list.append(file)
+    return case_file_list
+
+
+def collect_test_results(case_results):
+    result_list = [
+    ]  # each item is (Name, Result, Info, Start time, Stop time, Time cost)
+    for case_result in case_results.successes:
+        result_list.append(
+            (case_result.test_full_name, 'Success', '', case_result.start_time,
+             case_result.stop_time, case_result.time_cost))
+    for case_result in case_results.errors:
+        result_list.append(
+            (case_result[0].test_full_name, 'Error', case_result[1],
+             case_result[0].start_time, case_result[0].stop_time,
+             case_result[0].time_cost))
+    for case_result in case_results.skipped:
+        result_list.append(
+            (case_result[0].test_full_name, 'Skipped', case_result[1],
+             case_result[0].start_time, case_result[0].stop_time,
+             case_result[0].time_cost))
+    for case_result in case_results.expectedFailures:
+        result_list.append(
+            (case_result[0].test_full_name, 'ExpectedFailures', case_result[1],
+             case_result[0].start_time, case_result[0].stop_time,
+             case_result[0].time_cost))
+    for case_result in case_results.failures:
+        result_list.append(
+            (case_result[0].test_full_name, 'Failures', case_result[1],
+             case_result[0].start_time, case_result[0].stop_time,
+             case_result[0].time_cost))
+    for case_result in case_results.unexpectedSuccesses:
+        result_list.append((case_result.test_full_name, 'UnexpectedSuccesses',
+                            '', case_result.start_time, case_result.stop_time,
+                            case_result.time_cost))
+    return result_list
+
+
+class TestSuiteRunner:
+
+    def run(self, msg_queue, test_dir, test_suite_file):
+        test_suite = unittest.TestSuite()
+        test_case = unittest.defaultTestLoader.discover(
+            start_dir=test_dir, pattern=test_suite_file)
+        test_suite.addTest(test_case)
+        runner = TimeCostTextTestRunner()
+        test_suite_result = runner.run(test_suite)
+        msg_queue.put(collect_test_results(test_suite_result))
+
+
+def run_command_with_popen(cmd):
+    with subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            bufsize=1,
+            encoding='utf8') as sub_process:
+        for line in iter(sub_process.stdout.readline, ''):
+            sys.stdout.write(line)
+
+
+def run_in_subprocess(args):
+    # only the cases listed in args.isolated_cases run in a subprocess;
+    # all the other cases run in the current process.
+    test_suite_files = gather_test_suites_files(
+        os.path.abspath(args.test_dir), args.pattern)
+
+    if args.subprocess:  # run all cases in subprocesses
+        isolated_cases = test_suite_files
+    else:
+        isolated_cases = []
+        with open(args.isolated_cases, 'r') as f:
+            for line in f:
+                if line.strip() in test_suite_files:
+                    isolated_cases.append(line.strip())
+
+    if not args.list_tests:
+        with tempfile.TemporaryDirectory() as temp_result_dir:
+            for test_suite_file in isolated_cases:  # run case in subprocess
+                cmd = [
+                    'python', 'tests/run.py', '--pattern', test_suite_file,
+                    '--result_dir', temp_result_dir
+                ]
+                run_command_with_popen(cmd)
+            result_dfs = []
+            # run the remaining cases in the current process.
+            remain_suite_files = [
+                item for item in test_suite_files if item not in isolated_cases
+            ]
+            test_suite = gather_test_suites_in_files(args.test_dir,
+                                                     remain_suite_files,
+                                                     args.list_tests)
+            if test_suite.countTestCases() > 0:
+                runner = TimeCostTextTestRunner()
+                result = runner.run(test_suite)
+                result = collect_test_results(result)
+                df = test_cases_result_to_df(result)
+                result_dfs.append(df)
+
+            # collect the results pickled by the subprocesses
+            result_path = Path(temp_result_dir)
+            for result in result_path.iterdir():
+                if Path.is_file(result):
+                    df = pandas.read_pickle(result)
+                    result_dfs.append(df)
+
+            result_pd = pandas.concat(
+                result_dfs)  # merge result of every test suite.
+            print_table_result(result_pd)
+            print_abnormal_case_info(result_pd)
+            statistics_test_result(result_pd)
+
+
+def get_object_full_name(obj):
+    klass = obj.__class__
+    module = klass.__module__
+    if module == 'builtins':
+        return klass.__qualname__
+    return module + '.' + klass.__qualname__
+
+
+class TimeCostTextTestResult(TextTestResult):
+    """Record the wall-clock time cost of each test case."""
+
+    def __init__(self, stream, descriptions, verbosity):
+        self.successes = []
+        return super(TimeCostTextTestResult,
+                     self).__init__(stream, descriptions, verbosity)
+
+    def startTest(self, test):
+        test.start_time = datetime.datetime.now()
+        test.test_full_name = get_object_full_name(
+            test) + '.' + test._testMethodName
+        self.stream.writeln('Test case: %s start at: %s' %
+                            (test.test_full_name, test.start_time))
+
+        return super(TimeCostTextTestResult, self).startTest(test)
+
+    def stopTest(self, test):
+        TextTestResult.stopTest(self, test)
+        test.stop_time = datetime.datetime.now()
+        test.time_cost = (test.stop_time - test.start_time).total_seconds()
+        self.stream.writeln(
+            'Test case: %s stop at: %s, cost time: %s(seconds)' %
+            (test.test_full_name, test.stop_time, test.time_cost))
+        super(TimeCostTextTestResult, self).stopTest(test)
+
+    def addSuccess(self, test):
+        self.successes.append(test)
+        super(TextTestResult, self).addSuccess(test)
+
+
+class TimeCostTextTestRunner(unittest.runner.TextTestRunner):
+    resultclass = TimeCostTextTestResult
+
+    def run(self, test):
+        return super(TimeCostTextTestRunner, self).run(test)
+
+    def _makeResult(self):
+        result = super(TimeCostTextTestRunner, self)._makeResult()
+        return result
+
+
def gather_test_cases(test_dir, pattern, list_tests):
    case_list = []
    for dirpath, dirnames, filenames in os.walk(test_dir):
@@ -42,16 +272,40 @@ def gather_test_cases(test_dir, pattern, list_tests):
    return test_suite
+
+def print_abnormal_case_info(df):
+    df = df.loc[(df['Result'] == 'Error') | (df['Result'] == 'Failures')]
+    for _, row in df.iterrows():
+        print('Case %s run result: %s, msg:\n%s' %
+              (row['Name'], row['Result'], row['Info']))
+
+
+def print_table_result(df):
+    df = df.loc[df['Result'] != 'Skipped']
+    df = df.drop('Info', axis=1)
+    formatters = {
+        'Name': '{{:<{}s}}'.format(df['Name'].str.len().max()).format,
+        'Result': '{{:<{}s}}'.format(df['Result'].str.len().max()).format,
+    }
+    with pandas.option_context('display.max_rows', None, 'display.max_columns',
+                               None, 'display.width', None):
+        print(df.to_string(justify='left', formatters=formatters, index=False))
+
+
def main(args):
-    runner = unittest.TextTestRunner()
+    runner = TimeCostTextTestRunner()
    test_suite = gather_test_cases(
        os.path.abspath(args.test_dir), args.pattern, args.list_tests)
    if not args.list_tests:
        result = runner.run(test_suite)
-        if len(result.failures) > 0:
-            sys.exit(len(result.failures))
-        if len(result.errors) > 0:
-            sys.exit(len(result.errors))
+        result = collect_test_results(result)
+        df = test_cases_result_to_df(result)
+        if args.result_dir is not None:
+            file_name = str(int(datetime.datetime.now().timestamp() * 1000))
+            df.to_pickle(os.path.join(args.result_dir, file_name))
+        else:
+            print_table_result(df)
+            print_abnormal_case_info(df)
+            statistics_test_result(df)

if __name__ == '__main__':
@@ -66,11 +320,30 @@ if __name__ == '__main__':
        '--level', default=0, type=int, help='2 -- all, 1 -- p1, 0 -- p0')
    parser.add_argument(
        '--disable_profile', action='store_true', help='disable profiling')
+    parser.add_argument(
+        '--isolated_cases',
+        default=None,
+        help='config file listing test suites to run in isolated subprocesses')
+    parser.add_argument(
+        '--subprocess',
+        action='store_true',
+        help='run every test suite in its own subprocess')
+    parser.add_argument(
+        '--result_dir',
+        default=None,
+        help='save results to this directory, internal use only')
    args = parser.parse_args()
    set_test_level(args.level)
+    os.environ['REGRESSION_BASELINE'] = '1'
    logger.info(f'TEST LEVEL: {test_level()}')
    if not args.disable_profile:
        from utils import profiler
        logger.info('enable profile ...')
        profiler.enable()
-    main(args)
+    if args.isolated_cases is not None and args.subprocess:
+        print('--isolated_cases and --subprocess conflict, specify only one')
+        sys.exit(1)
+    elif args.isolated_cases is not None or args.subprocess:
+        run_in_subprocess(args)
+    else:
+        main(args)
diff --git a/tests/trainers/audio/__init__.py b/tests/trainers/audio/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py
new file mode 100644
index 00000000..176c811f
--- /dev/null
+++ b/tests/trainers/audio/test_ans_trainer.py
@@ -0,0 +1,56 @@
+import os
+import shutil
+import tempfile
+import unittest
+from functools import partial
+
+from modelscope.metainfo import Trainers
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.audio.audio_utils import to_segment
+from modelscope.utils.test_utils import test_level
+
+SEGMENT_LENGTH_TEST = 640
+
+
+class TestANSTrainer(unittest.TestCase):
+
+    def setUp(self):
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+        self.model_id = 'damo/speech_frcrn_ans_cirm_16k'
+
+        hf_ds = MsDataset.load(
+            'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset()
+        mapped_ds = hf_ds.map(
+            partial(to_segment, segment_length=SEGMENT_LENGTH_TEST),
+            remove_columns=['duration'],
+            batched=True,
+            batch_size=2)
+        self.dataset = MsDataset.from_hf_dataset(mapped_ds)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer(self):
+        kwargs = dict(
+            model=self.model_id,
+            model_revision='beta',
+            train_dataset=self.dataset,
+            eval_dataset=self.dataset,
+            max_epochs=2,
+            train_iters_per_epoch=2,
+            val_iters_per_epoch=1,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            Trainers.speech_frcrn_ans_cirm_16k, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(2):
+            self.assertIn(f'epoch_{i + 1}.pth', results_files)
diff --git a/tests/trainers/easycv/__init__.py b/tests/trainers/easycv/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/trainers/easycv/test_easycv_trainer.py b/tests/trainers/easycv/test_easycv_trainer.py
new file mode 100644
index 00000000..6d1d7ec4
--- /dev/null
+++ b/tests/trainers/easycv/test_easycv_trainer.py
@@ -0,0 +1,244 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
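The new runner's cross-process result exchange, reduced to its core: each isolated suite is re-invoked as `python tests/run.py --pattern <suite> --result_dir <dir>`, the child pickles one pandas DataFrame into that directory, and the parent concatenates whatever it finds there. A sketch under those assumptions, using a suite name from this patch:

import os
import subprocess
import tempfile

import pandas

with tempfile.TemporaryDirectory() as result_dir:
    # Child process: runs one suite and pickles its result frame into result_dir.
    subprocess.run([
        'python', 'tests/run.py', '--pattern', 'test_word_segmentation.py',
        '--result_dir', result_dir
    ])
    # Parent process: merge every per-suite frame into a single report.
    frames = [
        pandas.read_pickle(os.path.join(result_dir, f))
        for f in os.listdir(result_dir)
    ]
    report = pandas.concat(frames)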
+import glob +import os +import shutil +import tempfile +import unittest + +import json +import requests +import torch + +from modelscope.metainfo import Models, Pipelines, Trainers +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import LogKeys, ModeKeys, Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import DistributedTestCase, test_level +from modelscope.utils.torch_utils import is_master + + +def _download_data(url, save_dir): + r = requests.get(url, verify=True) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + zip_name = os.path.split(url)[-1] + save_path = os.path.join(save_dir, zip_name) + with open(save_path, 'wb') as f: + f.write(r.content) + + unpack_dir = os.path.join(save_dir, os.path.splitext(zip_name)[0]) + shutil.unpack_archive(save_path, unpack_dir) + + +def train_func(work_dir, dist=False, log_config=3, imgs_per_gpu=4): + import easycv + config_path = os.path.join( + os.path.dirname(easycv.__file__), + 'configs/detection/yolox/yolox_s_8xb16_300e_coco.py') + + data_dir = os.path.join(work_dir, 'small_coco_test') + url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/datasets/small_coco.zip' + if is_master(): + _download_data(url, data_dir) + + import time + time.sleep(1) + cfg = Config.from_file(config_path) + + cfg.work_dir = work_dir + cfg.total_epochs = 2 + cfg.checkpoint_config.interval = 1 + cfg.eval_config.interval = 1 + cfg.log_config = dict( + interval=log_config, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + cfg.data.train.data_source.ann_file = os.path.join( + data_dir, 'small_coco/small_coco/instances_train2017_20.json') + cfg.data.train.data_source.img_prefix = os.path.join( + data_dir, 'small_coco/small_coco/train2017') + cfg.data.val.data_source.ann_file = os.path.join( + data_dir, 'small_coco/small_coco/instances_val2017_20.json') + cfg.data.val.data_source.img_prefix = os.path.join( + data_dir, 'small_coco/small_coco/val2017') + cfg.data.imgs_per_gpu = imgs_per_gpu + cfg.data.workers_per_gpu = 2 + cfg.data.val.imgs_per_gpu = 2 + + ms_cfg_file = os.path.join(work_dir, 'ms_yolox_s_8xb16_300e_coco.json') + from easycv.utils.ms_utils import to_ms_config + + if is_master(): + to_ms_config( + cfg, + dump=True, + task=Tasks.image_object_detection, + ms_model_name=Models.yolox, + pipeline_name=Pipelines.easycv_detection, + save_path=ms_cfg_file) + + trainer_name = Trainers.easycv + kwargs = dict( + task=Tasks.image_object_detection, + cfg_file=ms_cfg_file, + launcher='pytorch' if dist else None) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + + +@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') +class EasyCVTrainerTestSingleGpu(unittest.TestCase): + + def setUp(self): + self.logger = get_logger() + self.logger.info(('Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + @unittest.skipIf( + True, 'The test cases are all run in the master process, ' + 'cause registry conflicts, and it should run in the subprocess.') + def test_single_gpu(self): + # TODO: run in subprocess + train_func(self.tmp_dir) + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + 
self.assertEqual(len(json_files), 1) + + with open(json_files[0], 'r') as f: + lines = [i.strip() for i in f.readlines()] + + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 1, + LogKeys.ITER: 3, + LogKeys.LR: 0.00013 + }, json.loads(lines[0])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 1, + LogKeys.ITER: 10 + }, json.loads(lines[1])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 2, + LogKeys.ITER: 3, + LogKeys.LR: 0.00157 + }, json.loads(lines[2])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 2, + LogKeys.ITER: 10 + }, json.loads(lines[3])) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + for i in [0, 2]: + self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i]) + self.assertIn(LogKeys.ITER_TIME, lines[i]) + self.assertIn(LogKeys.MEMORY, lines[i]) + self.assertIn('total_loss', lines[i]) + for i in [1, 3]: + self.assertIn( + 'CocoDetectionEvaluator_DetectionBoxes_Precision/mAP', + lines[i]) + self.assertIn('DetectionBoxes_Precision/mAP', lines[i]) + self.assertIn('DetectionBoxes_Precision/mAP@.50IOU', lines[i]) + self.assertIn('DetectionBoxes_Precision/mAP@.75IOU', lines[i]) + self.assertIn('DetectionBoxes_Precision/mAP (small)', lines[i]) + + +@unittest.skipIf(not torch.cuda.is_available() + or torch.cuda.device_count() <= 1, 'distributed unittest') +class EasyCVTrainerTestMultiGpus(DistributedTestCase): + + def setUp(self): + self.logger = get_logger() + self.logger.info(('Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_multi_gpus(self): + self.start( + train_func, + num_gpus=2, + work_dir=self.tmp_dir, + dist=True, + log_config=2, + imgs_per_gpu=5) + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + + with open(json_files[0], 'r') as f: + lines = [i.strip() for i in f.readlines()] + + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 1, + LogKeys.ITER: 2, + LogKeys.LR: 0.0002 + }, json.loads(lines[0])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 1, + LogKeys.ITER: 5 + }, json.loads(lines[1])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 2, + LogKeys.ITER: 2, + LogKeys.LR: 0.0018 + }, json.loads(lines[2])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 2, + LogKeys.ITER: 5 + }, json.loads(lines[3])) + + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + + for i in [0, 2]: + self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i]) + self.assertIn(LogKeys.ITER_TIME, lines[i]) + self.assertIn(LogKeys.MEMORY, lines[i]) + self.assertIn('total_loss', lines[i]) + for i in [1, 3]: + self.assertIn( + 'CocoDetectionEvaluator_DetectionBoxes_Precision/mAP', + lines[i]) + self.assertIn('DetectionBoxes_Precision/mAP', lines[i]) + self.assertIn('DetectionBoxes_Precision/mAP@.50IOU', lines[i]) + self.assertIn('DetectionBoxes_Precision/mAP@.75IOU', lines[i]) + 
            self.assertIn('DetectionBoxes_Precision/mAP (small)', lines[i])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/easycv/test_segformer.py b/tests/trainers/easycv/test_segformer.py
new file mode 100644
index 00000000..0da47ef6
--- /dev/null
+++ b/tests/trainers/easycv/test_segformer.py
@@ -0,0 +1,99 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os
+import shutil
+import tempfile
+import unittest
+
+import requests
+import torch
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import LogKeys, Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+from modelscope.utils.torch_utils import is_master
+
+
+def _download_data(url, save_dir):
+    r = requests.get(url, verify=True)
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    zip_name = os.path.split(url)[-1]
+    save_path = os.path.join(save_dir, zip_name)
+    with open(save_path, 'wb') as f:
+        f.write(r.content)
+
+    unpack_dir = os.path.join(save_dir, os.path.splitext(zip_name)[0])
+    shutil.unpack_archive(save_path, unpack_dir)
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class EasyCVTrainerTestSegformer(unittest.TestCase):
+
+    def setUp(self):
+        self.logger = get_logger()
+        self.logger.info(('Testing %s.%s' %
+                          (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+    def _train(self):
+        from modelscope.trainers.easycv.trainer import EasyCVEpochBasedTrainer
+
+        url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/datasets/small_coco_stuff164k.zip'
+        data_dir = os.path.join(self.tmp_dir, 'data')
+        if is_master():
+            _download_data(url, data_dir)
+
+        # adapt to distributed mode
+        from easycv.utils.test_util import pseudo_dist_init
+        pseudo_dist_init()
+
+        root_path = os.path.join(data_dir, 'small_coco_stuff164k')
+        cfg_options = {
+            'train.max_epochs':
+            2,
+            'dataset.train.data_source.img_root':
+            os.path.join(root_path, 'train2017'),
+            'dataset.train.data_source.label_root':
+            os.path.join(root_path, 'annotations/train2017'),
+            'dataset.train.data_source.split':
+            os.path.join(root_path, 'train.txt'),
+            'dataset.val.data_source.img_root':
+            os.path.join(root_path, 'val2017'),
+            'dataset.val.data_source.label_root':
+            os.path.join(root_path, 'annotations/val2017'),
+            'dataset.val.data_source.split':
+            os.path.join(root_path, 'val.txt'),
+        }
+
+        trainer_name = Trainers.easycv
+        kwargs = dict(
+            task=Tasks.image_segmentation,
+            model='EasyCV/EasyCV-Segformer-b0',
+            work_dir=self.tmp_dir,
+            cfg_options=cfg_options)
+
+        trainer = build_trainer(trainer_name, kwargs)
+        trainer.train()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_single_gpu_segformer(self):
+        self._train()
+
+        results_files = os.listdir(self.tmp_dir)
+        json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
+        self.assertEqual(len(json_files), 1)
+        self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
+        self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/hooks/logger/test_tensorboard_hook.py b/tests/trainers/hooks/logger/test_tensorboard_hook.py
index 54c31056..67b1aa63 100644
---
a/tests/trainers/hooks/logger/test_tensorboard_hook.py +++ b/tests/trainers/hooks/logger/test_tensorboard_hook.py @@ -11,6 +11,7 @@ import torch from torch import nn from modelscope.metainfo import Trainers +from modelscope.models.base import Model from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModelFile from modelscope.utils.test_utils import create_dummy_test_dataset @@ -19,7 +20,7 @@ dummy_dataset = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) -class DummyModel(nn.Module): +class DummyModel(nn.Module, Model): def __init__(self): super().__init__() diff --git a/tests/trainers/hooks/test_checkpoint_hook.py b/tests/trainers/hooks/test_checkpoint_hook.py index 1c81d057..c694ece6 100644 --- a/tests/trainers/hooks/test_checkpoint_hook.py +++ b/tests/trainers/hooks/test_checkpoint_hook.py @@ -11,11 +11,14 @@ from torch import nn from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.models.base import Model from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModelFile from modelscope.utils.registry import default_group from modelscope.utils.test_utils import create_dummy_test_dataset +SRC_DIR = os.path.dirname(__file__) + def create_dummy_metric(): _global_iter = 0 @@ -39,12 +42,13 @@ dummy_dataset = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) -class DummyModel(nn.Module): +class DummyModel(nn.Module, Model): def __init__(self): super().__init__() self.linear = nn.Linear(5, 4) self.bn = nn.BatchNorm1d(4) + self.model_dir = SRC_DIR def forward(self, feat, labels): x = self.linear(feat) @@ -123,6 +127,14 @@ class CheckpointHookTest(unittest.TestCase): self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + output_files = os.listdir( + os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)) + self.assertIn(ModelFile.CONFIGURATION, output_files) + self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files) + copy_src_files = os.listdir(SRC_DIR) + self.assertIn(copy_src_files[0], output_files) + self.assertIn(copy_src_files[-1], output_files) + class BestCkptSaverHookTest(unittest.TestCase): @@ -198,6 +210,14 @@ class BestCkptSaverHookTest(unittest.TestCase): self.assertIn(f'best_{LogKeys.EPOCH}1_{MetricKeys.ACCURACY}0.1.pth', results_files) + output_files = os.listdir( + os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)) + self.assertIn(ModelFile.CONFIGURATION, output_files) + self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files) + copy_src_files = os.listdir(SRC_DIR) + self.assertIn(copy_src_files[0], output_files) + self.assertIn(copy_src_files[-1], output_files) + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py index 1338bb2c..2c71e790 100644 --- a/tests/trainers/hooks/test_evaluation_hook.py +++ b/tests/trainers/hooks/test_evaluation_hook.py @@ -11,6 +11,7 @@ from torch import nn from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.models.base import Model from modelscope.trainers import build_trainer from modelscope.utils.constant import ModelFile from modelscope.utils.registry import default_group @@ -34,7 +35,7 @@ dummy_dataset = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) 
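The `SRC_DIR`/`model_dir` additions above suggest the checkpoint hooks now publish a deployable model directory by copying the contents of the model's `model_dir` next to the saved weights; the tests point `model_dir` at their own source directory so the copy can be asserted. A sketch of the check these tests repeat, using only constants this file already imports:

import os

from modelscope.utils.constant import ModelFile


def assert_train_output(work_dir, src_dir):
    output_files = os.listdir(
        os.path.join(work_dir, ModelFile.TRAIN_OUTPUT_DIR))
    assert ModelFile.CONFIGURATION in output_files  # configuration file
    assert ModelFile.TORCH_MODEL_BIN_FILE in output_files  # saved weights
    # every non-hidden file from model_dir should have been copied over
    for item in os.listdir(src_dir):
        if not item.startswith('.'):
            assert item in output_files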
-class DummyModel(nn.Module): +class DummyModel(nn.Module, Model): def __init__(self): super().__init__() diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py index 86d53ecc..7a1ff220 100644 --- a/tests/trainers/hooks/test_lr_scheduler_hook.py +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -13,6 +13,7 @@ from torch.optim.lr_scheduler import MultiStepLR from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.models.base import Model from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages from modelscope.utils.registry import default_group @@ -40,7 +41,7 @@ def create_dummy_metric(): return {MetricKeys.ACCURACY: self._fake_acc_by_epoch[_global_iter]} -class DummyModel(nn.Module): +class DummyModel(nn.Module, Model): def __init__(self): super().__init__() diff --git a/tests/trainers/hooks/test_optimizer_hook.py b/tests/trainers/hooks/test_optimizer_hook.py index 25457c1c..84c783b5 100644 --- a/tests/trainers/hooks/test_optimizer_hook.py +++ b/tests/trainers/hooks/test_optimizer_hook.py @@ -12,6 +12,7 @@ from torch.optim import SGD from torch.optim.lr_scheduler import MultiStepLR from modelscope.metainfo import Trainers +from modelscope.models.base import Model from modelscope.trainers import build_trainer from modelscope.utils.constant import ModelFile, TrainerStages from modelscope.utils.test_utils import create_dummy_test_dataset @@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset( np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10) -class DummyModel(nn.Module): +class DummyModel(nn.Module, Model): def __init__(self): super().__init__() diff --git a/tests/trainers/hooks/test_timer_hook.py b/tests/trainers/hooks/test_timer_hook.py index 614f7688..9fb79c77 100644 --- a/tests/trainers/hooks/test_timer_hook.py +++ b/tests/trainers/hooks/test_timer_hook.py @@ -12,6 +12,7 @@ from torch.optim import SGD from torch.optim.lr_scheduler import MultiStepLR from modelscope.metainfo import Trainers +from modelscope.models.base import Model from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModelFile, TrainerStages from modelscope.utils.test_utils import create_dummy_test_dataset @@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10) -class DummyModel(nn.Module): +class DummyModel(nn.Module, Model): def __init__(self): super().__init__() @@ -83,8 +84,8 @@ class IterTimerHookTest(unittest.TestCase): trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) trainer.register_optimizers_hook() trainer.register_hook_from_cfg(trainer.cfg.train.hooks) - trainer.data_loader = train_dataloader trainer.train_dataloader = train_dataloader + trainer.data_loader = train_dataloader trainer.invoke_hook(TrainerStages.before_run) for i in range(trainer._epoch, trainer._max_epochs): trainer.invoke_hook(TrainerStages.before_train_epoch) diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py new file mode 100644 index 00000000..351600c6 --- /dev/null +++ b/tests/trainers/test_finetune_mplug.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.models.multi_modal import MPlugForAllTasks +from modelscope.msdatasets import MsDataset +from modelscope.trainers import EpochBasedTrainer, build_trainer +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import test_level + + +class TestFinetuneMPlug(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + from modelscope.utils.constant import DownloadMode + datadict = MsDataset.load( + 'coco_captions_small_slice', + download_mode=DownloadMode.FORCE_REDOWNLOAD) + self.train_dataset = MsDataset(datadict['train'].to_hf_dataset().map( + lambda _: { + 'question': 'what the picture describes?' + }).rename_column('image:FILE', + 'image').rename_column('answer:Value', 'answer')) + self.test_dataset = MsDataset(datadict['test'].to_hf_dataset().map( + lambda _: { + 'question': 'what the picture describes?' + }).rename_column('image:FILE', + 'image').rename_column('answer:Value', 'answer')) + + self.max_epochs = 3 + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_with_caption(self): + kwargs = dict( + model='damo/mplug_image-captioning_coco_base_en', + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset, + max_epochs=self.max_epochs, + work_dir=self.tmp_dir) + + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_with_caption_with_model_and_args(self): + cache_path = snapshot_download( + 'damo/mplug_image-captioning_coco_base_en') + model = MPlugForAllTasks.from_pretrained(cache_path) + kwargs = dict( + cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), + model=model, + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset, + max_epochs=self.max_epochs, + work_dir=self.tmp_dir) + + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_with_vqa(self): + kwargs = dict( + model='damo/mplug_visual-question-answering_coco_large_en', + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset, + max_epochs=self.max_epochs, + work_dir=self.tmp_dir) + + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def 
test_trainer_with_vqa_with_model_and_args(self): + cache_path = snapshot_download( + 'damo/mplug_visual-question-answering_coco_large_en') + model = MPlugForAllTasks.from_pretrained(cache_path) + kwargs = dict( + cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), + model=model, + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset, + max_epochs=self.max_epochs, + work_dir=self.tmp_dir) + + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_with_retrieval(self): + kwargs = dict( + model='damo/mplug_image-text-retrieval_flickr30k_large_en', + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset, + max_epochs=self.max_epochs, + work_dir=self.tmp_dir) + + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_with_retrieval_with_model_and_args(self): + cache_path = snapshot_download( + 'damo/mplug_image-text-retrieval_flickr30k_large_en') + model = MPlugForAllTasks.from_pretrained(cache_path) + kwargs = dict( + cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), + model=model, + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset, + max_epochs=self.max_epochs, + work_dir=self.tmp_dir) + + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py index 12c7da77..24f1a2fd 100644 --- a/tests/trainers/test_finetune_sequence_classification.py +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -4,11 +4,24 @@ import shutil import tempfile import unittest -from modelscope.metainfo import Trainers +from modelscope.metainfo import Preprocessors, Trainers +from modelscope.models import Model +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline from modelscope.trainers import build_trainer +from modelscope.trainers.hooks import Hook +from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer +from modelscope.trainers.optimizer.child_tuning_adamw_optimizer import \ + calculate_fisher +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.data_utils import to_device class TestFinetuneSequenceClassification(unittest.TestCase): + epoch_num = 1 + + sentence1 = '今天气温比昨天高么?' + sentence2 = '今天湿度比昨天高么?' 
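Every finetune variant in this file reduces to the same call shape; a condensed sketch (the model id and trainer name are the ones this test already uses, the datasets stand for any prepared `MsDataset`, and `cfg_modify_fn` is the per-test configuration hook):

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer


def finetune(train_dataset, eval_dataset, cfg_modify_fn, work_dir):
    kwargs = dict(
        model='damo/nlp_structbert_backbone_base_std',
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        work_dir=work_dir,
        cfg_modify_fn=cfg_modify_fn)  # patches the model config before training
    trainer = build_trainer(
        name=Trainers.nlp_base_trainer, default_args=kwargs)
    trainer.train()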
def setUp(self): print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) @@ -40,15 +53,36 @@ class TestFinetuneSequenceClassification(unittest.TestCase): trainer.train() results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(10): + for i in range(self.epoch_num): self.assertIn(f'epoch_{i+1}.pth', results_files) + output_files = os.listdir( + os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)) + self.assertIn(ModelFile.CONFIGURATION, output_files) + self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files) + copy_src_files = os.listdir(trainer.model_dir) + + print(f'copy_src_files are {copy_src_files}') + print(f'output_files are {output_files}') + for item in copy_src_files: + if not item.startswith('.'): + self.assertIn(item, output_files) + + def pipeline_sentence_similarity(self, model_dir): + model = Model.from_pretrained(model_dir) + pipeline_ins = pipeline(task=Tasks.sentence_similarity, model=model) + print(pipeline_ins(input=(self.sentence1, self.sentence2))) + @unittest.skip def test_finetune_afqmc(self): + """This unittest is used to reproduce the clue:afqmc dataset + structbert model training results. + + User can train a custom dataset by modifying this piece of code and comment the @unittest.skip. + """ def cfg_modify_fn(cfg): - cfg.task = 'sentence-similarity' - cfg['preprocessor'] = {'type': 'sen-sim-tokenizer'} + cfg.task = Tasks.sentence_similarity + cfg['preprocessor'] = {'type': Preprocessors.sen_sim_tokenizer} cfg.train.optimizer.lr = 2e-5 cfg['dataset'] = { 'train': { @@ -58,7 +92,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'label': 'label', } } - cfg.train.max_epochs = 10 + cfg.train.max_epochs = self.epoch_num cfg.train.lr_scheduler = { 'type': 'LinearLR', 'start_factor': 1.0, @@ -90,13 +124,20 @@ class TestFinetuneSequenceClassification(unittest.TestCase): dc.local_files_only = True dataset = load_dataset('clue', 'afqmc', download_config=dc) self.finetune( - model_id='damo/nlp_structbert_backbone_tiny_std', + model_id='damo/nlp_structbert_backbone_base_std', train_dataset=dataset['train'], eval_dataset=dataset['validation'], cfg_modify_fn=cfg_modify_fn) + output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) + self.pipeline_sentence_similarity(output_dir) + @unittest.skip def test_finetune_tnews(self): + """This unittest is used to reproduce the clue:tnews dataset + structbert model training results. + + User can train a custom dataset by modifying this piece of code and comment the @unittest.skip. + """ def cfg_modify_fn(cfg): # TODO no proper task for tnews @@ -148,13 +189,21 @@ class TestFinetuneSequenceClassification(unittest.TestCase): dataset = load_dataset('clue', 'tnews', download_config=dc) self.finetune( - model_id='damo/nlp_structbert_backbone_tiny_std', + model_id='damo/nlp_structbert_backbone_base_std', train_dataset=dataset['train'], eval_dataset=dataset['validation'], cfg_modify_fn=cfg_modify_fn) @unittest.skip def test_veco_xnli(self): + """This unittest is used to reproduce the xnli dataset + veco model training results. + + Here we follow the training scenario listed in the Alicemind open source project: + https://github.com/alibaba/AliceMind/tree/main/VECO + by training the english language subset. + User can train a custom dataset by modifying this piece of code and comment the @unittest.skip. 
+ """ + from datasets import load_dataset langs = ['en'] langs_eval = ['en'] @@ -240,6 +289,112 @@ class TestFinetuneSequenceClassification(unittest.TestCase): name=Trainers.nlp_veco_trainer, cfg_modify_fn=cfg_modify_fn) + @unittest.skip + def test_finetune_cluewsc(self): + """This unittest is used to reproduce the clue:wsc dataset + structbert model training results. + + A runnable sample of child-tuning is also showed here. + + User can train a custom dataset by modifying this piece of code and comment the @unittest.skip. + """ + + child_tuning_type = 'ChildTuning-F' + mode = {} + if child_tuning_type is not None: + mode = {'mode': child_tuning_type, 'reserve_p': 0.2} + + def cfg_modify_fn(cfg): + cfg.task = 'nli' + cfg['preprocessor'] = {'type': 'nli-tokenizer'} + cfg['dataset'] = { + 'train': { + 'labels': ['0', '1'], + 'first_sequence': 'text', + 'second_sequence': 'text2', + 'label': 'label', + } + } + cfg.train.dataloader.batch_size_per_gpu = 16 + cfg.train.max_epochs = 30 + cfg.train.optimizer = { + 'type': + 'AdamW' if child_tuning_type is None else 'ChildTuningAdamW', + 'lr': 1e-5, + 'options': {}, + **mode, + } + cfg.train.lr_scheduler = { + 'type': + 'LinearLR', + 'start_factor': + 1.0, + 'end_factor': + 0.0, + 'total_iters': + int( + len(dataset['train']) + / cfg.train.dataloader.batch_size_per_gpu) + * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 30 + }] + return cfg + + def add_sentence2(features): + return { + 'text2': + features['target']['span2_text'] + '指代' + + features['target']['span1_text'] + } + + dataset = MsDataset.load('clue', subset_name='cluewsc2020') + dataset = { + k: v.to_hf_dataset().map(add_sentence2) + for k, v in dataset.items() + } + + kwargs = dict( + model='damo/nlp_structbert_backbone_base_std', + train_dataset=dataset['train'], + eval_dataset=dataset['validation'], + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn) + + os.environ['LOCAL_RANK'] = '0' + trainer: NlpEpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + + class CalculateFisherHook(Hook): + + @staticmethod + def forward_step(model, inputs): + inputs = to_device(inputs, trainer.device) + trainer.train_step(model, inputs) + return trainer.train_outputs['loss'] + + def before_run(self, trainer: NlpEpochBasedTrainer): + v = calculate_fisher(trainer.model, trainer.train_dataloader, + self.forward_step, 0.2) + trainer.optimizer.set_gradient_mask(v) + + if child_tuning_type == 'ChildTuning-D': + trainer.register_hook(CalculateFisherHook()) + trainer.train() + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/test_text_generation_trainer.py b/tests/trainers/test_finetune_text_generation.py similarity index 56% rename from tests/trainers/test_text_generation_trainer.py rename to tests/trainers/test_finetune_text_generation.py index a60bc903..8cdfdf01 100644 --- a/tests/trainers/test_text_generation_trainer.py +++ b/tests/trainers/test_finetune_text_generation.py @@ -6,14 +6,14 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers -from modelscope.models.nlp.palm_v2 import PalmForTextGeneration +from modelscope.models.nlp import GPT3ForTextGeneration, PalmForTextGeneration from modelscope.msdatasets import MsDataset from 
modelscope.trainers import build_trainer from modelscope.utils.constant import ModelFile from modelscope.utils.test_utils import test_level -class TestTextGenerationTrainer(unittest.TestCase): +class TestFinetuneTextGeneration(unittest.TestCase): def setUp(self): print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) @@ -21,40 +21,41 @@ class TestTextGenerationTrainer(unittest.TestCase): if not os.path.exists(self.tmp_dir): os.makedirs(self.tmp_dir) - self.model_id = 'damo/nlp_palm2.0_text-generation_english-base' - - # todo: Replace below scripts with MsDataset.load when the formal dataset service is ready from datasets import Dataset - dataset_dict = { + + src_dataset_dict = { 'src_txt': [ 'This is test sentence1-1', 'This is test sentence2-1', 'This is test sentence3-1' - ], + ] + } + src_tgt_dataset_dict = { + 'src_txt': + src_dataset_dict['src_txt'], 'tgt_txt': [ 'This is test sentence1-2', 'This is test sentence2-2', 'This is test sentence3-2' ] } - dataset = Dataset.from_dict(dataset_dict) - class MsDatasetDummy(MsDataset): + self.src_dataset = MsDataset(Dataset.from_dict(src_dataset_dict)) + self.src_tgt_dataset = MsDataset( + Dataset.from_dict(src_tgt_dataset_dict)) - def __len__(self): - return len(self._hf_ds) - - self.dataset = MsDatasetDummy(dataset) + self.max_epochs = 3 def tearDown(self): shutil.rmtree(self.tmp_dir) super().tearDown() @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_trainer(self): + def test_trainer_with_palm(self): kwargs = dict( - model=self.model_id, - train_dataset=self.dataset, - eval_dataset=self.dataset, + model='damo/nlp_palm2.0_text-generation_english-base', + train_dataset=self.src_tgt_dataset, + eval_dataset=self.src_tgt_dataset, + max_epochs=self.max_epochs, work_dir=self.tmp_dir) trainer = build_trainer( @@ -62,30 +63,67 @@ class TestTextGenerationTrainer(unittest.TestCase): trainer.train() results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(3): + for i in range(self.max_epochs): self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_trainer_with_model_and_args(self): - tmp_dir = tempfile.TemporaryDirectory().name - if not os.path.exists(tmp_dir): - os.makedirs(tmp_dir) + def test_trainer_with_palm_with_model_and_args(self): - cache_path = snapshot_download(self.model_id) + cache_path = snapshot_download( + 'damo/nlp_palm2.0_text-generation_english-base') model = PalmForTextGeneration.from_pretrained(cache_path) kwargs = dict( cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), model=model, - train_dataset=self.dataset, - eval_dataset=self.dataset, - max_epochs=2, + train_dataset=self.src_tgt_dataset, + eval_dataset=self.src_tgt_dataset, + max_epochs=self.max_epochs, + work_dir=self.tmp_dir) + + trainer = build_trainer(default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_with_gpt3(self): + + kwargs = dict( + model='damo/nlp_gpt3_text-generation_chinese-base', + train_dataset=self.src_dataset, + eval_dataset=self.src_dataset, + max_epochs=self.max_epochs, + work_dir=self.tmp_dir) + + trainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + 
trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_with_gpt3_with_model_and_args(self): + + cache_path = snapshot_download( + 'damo/nlp_gpt3_text-generation_chinese-base') + model = GPT3ForTextGeneration.from_pretrained(cache_path) + kwargs = dict( + cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), + model=model, + train_dataset=self.src_dataset, + eval_dataset=self.src_dataset, + max_epochs=self.max_epochs, work_dir=self.tmp_dir) trainer = build_trainer(default_args=kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(2): + for i in range(self.max_epochs): self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skip diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py index 520d1a3c..c34410be 100644 --- a/tests/trainers/test_finetune_token_classificatin.py +++ b/tests/trainers/test_finetune_token_classificatin.py @@ -47,6 +47,11 @@ class TestFinetuneTokenClassification(unittest.TestCase): @unittest.skip def test_word_segmentation(self): + """This unittest is used to reproduce the icwb2:pku dataset + structbert model training results. + + User can train a custom dataset by modifying this piece of code and comment the @unittest.skip. + """ + os.system( f'curl http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip > {self.tmp_dir}/icwb2-data.zip' ) @@ -114,7 +119,7 @@ class TestFinetuneTokenClassification(unittest.TestCase): return cfg self.finetune( - 'damo/nlp_structbert_backbone_tiny_std', + 'damo/nlp_structbert_backbone_base_std', train_dataset, dev_dataset, cfg_modify_fn=cfg_modify_fn) diff --git a/tests/trainers/test_image_color_enhance_trainer.py b/tests/trainers/test_image_color_enhance_trainer.py index f1dcbe51..34d84cd2 100644 --- a/tests/trainers/test_image_color_enhance_trainer.py +++ b/tests/trainers/test_image_color_enhance_trainer.py @@ -17,6 +17,41 @@ from modelscope.utils.constant import ModelFile from modelscope.utils.test_utils import test_level +class PairedImageDataset(data.Dataset): + + def __init__(self, root): + super(PairedImageDataset, self).__init__() + gt_dir = osp.join(root, 'gt') + lq_dir = osp.join(root, 'lq') + self.gt_filelist = os.listdir(gt_dir) + self.gt_filelist = sorted(self.gt_filelist, key=lambda x: int(x[:-4])) + self.gt_filelist = [osp.join(gt_dir, f) for f in self.gt_filelist] + self.lq_filelist = os.listdir(lq_dir) + self.lq_filelist = sorted(self.lq_filelist, key=lambda x: int(x[:-4])) + self.lq_filelist = [osp.join(lq_dir, f) for f in self.lq_filelist] + + def _img_to_tensor(self, img): + return torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1).type( + torch.float32) / 255. 
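+    # (On the return above: cv2.imread yields HWC uint8 in BGR channel order;
+    #  [2, 1, 0] flips BGR to RGB, permute(2, 0, 1) reorders HWC -> CHW, and
+    #  the division maps pixels into float32 values in [0, 1].)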
+ + def __getitem__(self, index): + lq = cv2.imread(self.lq_filelist[index]) + gt = cv2.imread(self.gt_filelist[index]) + lq = cv2.resize(lq, (256, 256), interpolation=cv2.INTER_CUBIC) + gt = cv2.resize(gt, (256, 256), interpolation=cv2.INTER_CUBIC) + return \ + {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)} + + def __len__(self): + return len(self.gt_filelist) + + def to_torch_dataset(self, + columns: Union[str, List[str]] = None, + preprocessors: Union[Callable, List[Callable]] = None, + **format_kwargs): + return self + + class TestImageColorEnhanceTrainer(unittest.TestCase): def setUp(self): @@ -27,47 +62,6 @@ class TestImageColorEnhanceTrainer(unittest.TestCase): self.model_id = 'damo/cv_csrnet_image-color-enhance-models' - class PairedImageDataset(data.Dataset): - - def __init__(self, root): - super(PairedImageDataset, self).__init__() - gt_dir = osp.join(root, 'gt') - lq_dir = osp.join(root, 'lq') - self.gt_filelist = os.listdir(gt_dir) - self.gt_filelist = sorted( - self.gt_filelist, key=lambda x: int(x[:-4])) - self.gt_filelist = [ - osp.join(gt_dir, f) for f in self.gt_filelist - ] - self.lq_filelist = os.listdir(lq_dir) - self.lq_filelist = sorted( - self.lq_filelist, key=lambda x: int(x[:-4])) - self.lq_filelist = [ - osp.join(lq_dir, f) for f in self.lq_filelist - ] - - def _img_to_tensor(self, img): - return torch.from_numpy(img[:, :, [2, 1, 0]]).permute( - 2, 0, 1).type(torch.float32) / 255. - - def __getitem__(self, index): - lq = cv2.imread(self.lq_filelist[index]) - gt = cv2.imread(self.gt_filelist[index]) - lq = cv2.resize(lq, (256, 256), interpolation=cv2.INTER_CUBIC) - gt = cv2.resize(gt, (256, 256), interpolation=cv2.INTER_CUBIC) - return \ - {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)} - - def __len__(self): - return len(self.gt_filelist) - - def to_torch_dataset(self, - columns: Union[str, List[str]] = None, - preprocessors: Union[Callable, - List[Callable]] = None, - **format_kwargs): - return self - self.dataset = PairedImageDataset( './data/test/images/image_color_enhance/') diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py index c8557ff5..03f7eea3 100644 --- a/tests/trainers/test_image_instance_segmentation_trainer.py +++ b/tests/trainers/test_image_instance_segmentation_trainer.py @@ -15,7 +15,7 @@ from modelscope.msdatasets.task_datasets import \ ImageInstanceSegmentationCocoDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config, ConfigDict -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import DownloadMode, ModelFile from modelscope.utils.test_utils import test_level @@ -41,34 +41,26 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): if train_data_cfg is None: # use default toy data train_data_cfg = ConfigDict( - name='pets_small', - split='train', - classes=('Cat', 'Dog'), - test_mode=False) + name='pets_small', split='train', test_mode=False) if val_data_cfg is None: val_data_cfg = ConfigDict( - name='pets_small', - split='validation', - classes=('Cat', 'Dog'), - test_mode=True) + name='pets_small', split='validation', test_mode=True) self.train_dataset = MsDataset.load( dataset_name=train_data_cfg.name, split=train_data_cfg.split, - classes=train_data_cfg.classes, - test_mode=train_data_cfg.test_mode) - assert self.train_dataset.config_kwargs[ - 'classes'] == train_data_cfg.classes + test_mode=train_data_cfg.test_mode, + 
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        assert self.train_dataset.config_kwargs['classes']
         assert next(
             iter(self.train_dataset.config_kwargs['split_config'].values()))

         self.eval_dataset = MsDataset.load(
             dataset_name=val_data_cfg.name,
             split=val_data_cfg.split,
-            classes=val_data_cfg.classes,
-            test_mode=val_data_cfg.test_mode)
-        assert self.eval_dataset.config_kwargs[
-            'classes'] == val_data_cfg.classes
+            test_mode=val_data_cfg.test_mode,
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        assert self.eval_dataset.config_kwargs['classes']
         assert next(
             iter(self.eval_dataset.config_kwargs['split_config'].values()))
diff --git a/tests/trainers/test_image_portrait_enhancement_trainer.py b/tests/trainers/test_image_portrait_enhancement_trainer.py
index dc450ff0..049adf7e 100644
--- a/tests/trainers/test_image_portrait_enhancement_trainer.py
+++ b/tests/trainers/test_image_portrait_enhancement_trainer.py
@@ -19,6 +19,47 @@ from modelscope.utils.constant import ModelFile
 from modelscope.utils.test_utils import test_level


+class PairedImageDataset(data.Dataset):
+
+    def __init__(self, root, size=512):
+        super(PairedImageDataset, self).__init__()
+        self.size = size
+        gt_dir = osp.join(root, 'gt')
+        lq_dir = osp.join(root, 'lq')
+        self.gt_filelist = os.listdir(gt_dir)
+        self.gt_filelist = sorted(self.gt_filelist, key=lambda x: int(x[:-4]))
+        self.gt_filelist = [osp.join(gt_dir, f) for f in self.gt_filelist]
+        self.lq_filelist = os.listdir(lq_dir)
+        self.lq_filelist = sorted(self.lq_filelist, key=lambda x: int(x[:-4]))
+        self.lq_filelist = [osp.join(lq_dir, f) for f in self.lq_filelist]
+
+    def _img_to_tensor(self, img):
+        img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1).type(
+            torch.float32) / 255.
+        return (img - 0.5) / 0.5
+
+    def __getitem__(self, index):
+        lq = cv2.imread(self.lq_filelist[index])
+        gt = cv2.imread(self.gt_filelist[index])
+        lq = cv2.resize(
+            lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
+        gt = cv2.resize(
+            gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
+
+        return \
+            {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
+
+    def __len__(self):
+        return len(self.gt_filelist)
+
+    def to_torch_dataset(self,
+                         columns: Union[str, List[str]] = None,
+                         preprocessors: Union[Callable, List[Callable]] = None,
+                         **format_kwargs):
+        # self.preprocessor = preprocessors
+        return self
+
+
 class TestImagePortraitEnhancementTrainer(unittest.TestCase):

     def setUp(self):
@@ -29,53 +70,6 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):

         self.model_id = 'damo/cv_gpen_image-portrait-enhancement'

-        class PairedImageDataset(data.Dataset):
-
-            def __init__(self, root, size=512):
-                super(PairedImageDataset, self).__init__()
-                self.size = size
-                gt_dir = osp.join(root, 'gt')
-                lq_dir = osp.join(root, 'lq')
-                self.gt_filelist = os.listdir(gt_dir)
-                self.gt_filelist = sorted(
-                    self.gt_filelist, key=lambda x: int(x[:-4]))
-                self.gt_filelist = [
-                    osp.join(gt_dir, f) for f in self.gt_filelist
-                ]
-                self.lq_filelist = os.listdir(lq_dir)
-                self.lq_filelist = sorted(
-                    self.lq_filelist, key=lambda x: int(x[:-4]))
-                self.lq_filelist = [
-                    osp.join(lq_dir, f) for f in self.lq_filelist
-                ]
-
-            def _img_to_tensor(self, img):
-                img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(
-                    2, 0, 1).type(torch.float32) / 255.
-                return (img - 0.5) / 0.5
-
-            def __getitem__(self, index):
-                lq = cv2.imread(self.lq_filelist[index])
-                gt = cv2.imread(self.gt_filelist[index])
-                lq = cv2.resize(
-                    lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-                gt = cv2.resize(
-                    gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-
-                return \
-                    {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
-
-            def __len__(self):
-                return len(self.gt_filelist)
-
-            def to_torch_dataset(self,
-                                 columns: Union[str, List[str]] = None,
-                                 preprocessors: Union[Callable,
-                                                      List[Callable]] = None,
-                                 **format_kwargs):
-                # self.preprocessor = preprocessors
-                return self
-
         self.dataset = PairedImageDataset(
             './data/test/images/face_enhancement/')
diff --git a/tests/trainers/test_movie_scene_segmentation_trainer.py b/tests/trainers/test_movie_scene_segmentation_trainer.py
new file mode 100644
index 00000000..f25dc92a
--- /dev/null
+++ b/tests/trainers/test_movie_scene_segmentation_trainer.py
@@ -0,0 +1,109 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+import zipfile
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.models.cv.movie_scene_segmentation import \
+    MovieSceneSegmentationModel
+from modelscope.msdatasets import MsDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.test_utils import test_level
+
+
+class TestMovieSceneSegmentationTrainer(unittest.TestCase):
+
+    model_id = 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+        cache_path = snapshot_download(self.model_id)
+        config_path = os.path.join(cache_path, ModelFile.CONFIGURATION)
+        cfg = Config.from_file(config_path)
+
+        max_epochs = cfg.train.max_epochs
+
+        train_data_cfg = ConfigDict(
+            name='movie_scene_seg_toydata',
+            split='train',
+            cfg=cfg.preprocessor,
+            test_mode=False)
+
+        test_data_cfg = ConfigDict(
+            name='movie_scene_seg_toydata',
+            split='test',
+            cfg=cfg.preprocessor,
+            test_mode=True)
+
+        # the data cfgs define no 'namespace' key, so the datasets are loaded
+        # under MsDataset.load's default namespace
+        self.train_dataset = MsDataset.load(
+            dataset_name=train_data_cfg.name,
+            split=train_data_cfg.split,
+            cfg=train_data_cfg.cfg,
+            test_mode=train_data_cfg.test_mode)
+        assert next(
+            iter(self.train_dataset.config_kwargs['split_config'].values()))
+
+        self.test_dataset = MsDataset.load(
+            dataset_name=test_data_cfg.name,
+            split=test_data_cfg.split,
+            cfg=test_data_cfg.cfg,
+            test_mode=test_data_cfg.test_mode)
+        assert next(
+            iter(self.test_dataset.config_kwargs['split_config'].values()))
+
+        self.max_epochs = max_epochs
+
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer(self):
+        kwargs = dict(
+            model=self.model_id,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            work_dir=self.tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.movie_scene_segmentation, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(trainer.work_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(tmp_dir):
+            os.makedirs(tmp_dir)
+
+        cache_path = snapshot_download(self.model_id)
+        model = MovieSceneSegmentationModel.from_pretrained(cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.test_dataset,
+            work_dir=tmp_dir)
+
+        trainer = build_trainer(
+            name=Trainers.movie_scene_segmentation, default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(trainer.work_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py
index 0259f804..17fa97f9 100644
--- a/tests/trainers/test_trainer.py
+++ b/tests/trainers/test_trainer.py
@@ -14,7 +14,9 @@ from torch.utils.data import IterableDataset

 from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import build_trainer
+from modelscope.trainers.base import DummyTrainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
 from modelscope.utils.test_utils import create_dummy_test_dataset, test_level

@@ -35,7 +37,7 @@ dummy_dataset_big = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)


-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):

     def __init__(self):
         super().__init__()
@@ -263,7 +265,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 1,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[2]))
         self.assertDictContainsSubset(
             {
@@ -283,7 +285,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 2,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[5]))
         self.assertDictContainsSubset(
             {
@@ -303,7 +305,7 @@ class TrainerTest(unittest.TestCase):
             {
                 LogKeys.MODE: ModeKeys.EVAL,
                 LogKeys.EPOCH: 3,
-                LogKeys.ITER: 20
+                LogKeys.ITER: 10
             }, json.loads(lines[8]))
         self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
         self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py
index 9781816d..3777772d 100644
--- a/tests/trainers/test_trainer_gpu.py
+++ b/tests/trainers/test_trainer_gpu.py
@@ -15,6 +15,7 @@ from torch.utils.data import IterableDataset

 from modelscope.metainfo import Metrics, Trainers
 from modelscope.metrics.builder import MetricKeys
+from modelscope.models.base import Model
 from modelscope.trainers import EpochBasedTrainer, build_trainer
 from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
 from modelscope.utils.test_utils import (DistributedTestCase,
@@ -37,7 +38,7 @@ dummy_dataset_big = create_dummy_test_dataset(
     np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 40)


-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):

     def __init__(self):
         super().__init__()
diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py
index 213b6b4f..2cf1c152 100644
--- a/tests/trainers/test_trainer_with_nlp.py
+++ b/tests/trainers/test_trainer_with_nlp.py
@@ -6,16 +6,20 @@ import unittest

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Metrics
+from modelscope.models.base import Model
 from modelscope.models.nlp.sequence_classification import \
     SbertForSequenceClassification
 from modelscope.msdatasets import MsDataset
+from modelscope.pipelines import pipeline
 from modelscope.trainers import build_trainer
-from modelscope.utils.constant import ModelFile
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.hub import read_config
 from modelscope.utils.test_utils import test_level


 class TestTrainerWithNlp(unittest.TestCase):
+    sentence1 = '今天气温比昨天高么?'
+    sentence2 = '今天湿度比昨天高么?'

     def setUp(self):
         print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
@@ -30,7 +34,7 @@ class TestTrainerWithNlp(unittest.TestCase):
         shutil.rmtree(self.tmp_dir)
         super().tearDown()

-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer(self):
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
         kwargs = dict(
@@ -47,6 +51,27 @@ class TestTrainerWithNlp(unittest.TestCase):
         for i in range(10):
             self.assertIn(f'epoch_{i+1}.pth', results_files)

+        output_files = os.listdir(
+            os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR))
+        self.assertIn(ModelFile.CONFIGURATION, output_files)
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, output_files)
+        copy_src_files = os.listdir(trainer.model_dir)
+
+        print(f'copy_src_files are {copy_src_files}')
+        print(f'output_files are {output_files}')
+        for item in copy_src_files:
+            if not item.startswith('.'):
+                self.assertIn(item, output_files)
+
+        def pipeline_sentence_similarity(model_dir):
+            model = Model.from_pretrained(model_dir)
+            pipeline_ins = pipeline(
+                task=Tasks.sentence_similarity, model=model)
+            print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+        output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR)
+        pipeline_sentence_similarity(output_dir)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_trainer_with_backbone_head(self):
         model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
diff --git a/tests/trainers/test_video_summarization_trainer.py b/tests/trainers/test_video_summarization_trainer.py
new file mode 100644
index 00000000..1cea1eea
--- /dev/null
+++ b/tests/trainers/test_video_summarization_trainer.py
@@ -0,0 +1,75 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
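+# Smoke tests for the PGL video summarization trainer: the model and its
+# configuration are fetched from the hub via snapshot_download, and the
+# train/eval datasets are built from the splits bundled with the model's
+# cache directory.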
+import os
+import shutil
+import tempfile
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models.cv.video_summarization import PGLVideoSummarization
+from modelscope.msdatasets.task_datasets import VideoSummarizationDataset
+from modelscope.trainers import build_trainer
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class VideoSummarizationTrainerTest(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+        self.model_id = 'damo/cv_googlenet_pgl-video-summarization'
+        self.cache_path = snapshot_download(self.model_id)
+        self.config = Config.from_file(
+            os.path.join(self.cache_path, ModelFile.CONFIGURATION))
+        self.dataset_train = VideoSummarizationDataset('train',
+                                                       self.config.dataset,
+                                                       self.cache_path)
+        self.dataset_val = VideoSummarizationDataset('test',
+                                                     self.config.dataset,
+                                                     self.cache_path)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmp_dir, ignore_errors=True)
+        super().tearDown()
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_trainer(self):
+        kwargs = dict(
+            model=self.model_id,
+            train_dataset=self.dataset_train,
+            eval_dataset=self.dataset_val,
+            work_dir=self.tmp_dir)
+        trainer = build_trainer(default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(2):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer_with_model_and_args(self):
+        model = PGLVideoSummarization.from_pretrained(self.cache_path)
+        kwargs = dict(
+            cfg_file=os.path.join(self.cache_path, ModelFile.CONFIGURATION),
+            model=model,
+            train_dataset=self.dataset_train,
+            eval_dataset=self.dataset_val,
+            max_epochs=2,
+            work_dir=self.tmp_dir)
+        trainer = build_trainer(default_args=kwargs)
+        trainer.train()
+        results_files = os.listdir(self.tmp_dir)
+        self.assertIn(f'{trainer.timestamp}.log.json', results_files)
+        for i in range(2):
+            self.assertIn(f'epoch_{i+1}.pth', results_files)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/utils/test_inference.py b/tests/trainers/utils/test_inference.py
index 87e5320e..23561734 100644
--- a/tests/trainers/utils/test_inference.py
+++ b/tests/trainers/utils/test_inference.py
@@ -11,6 +11,7 @@ from torch.utils.data import DataLoader
 from modelscope.metrics.builder import MetricKeys
 from modelscope.metrics.sequence_classification_metric import \
     SequenceClassificationMetric
+from modelscope.models.base import Model
 from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
 from modelscope.utils.test_utils import (DistributedTestCase,
                                          create_dummy_test_dataset, test_level)
@@ -20,7 +21,7 @@ dummy_dataset = create_dummy_test_dataset(
     torch.rand((5, )), torch.randint(0, 4, (1, )), 20)


-class DummyModel(nn.Module):
+class DummyModel(nn.Module, Model):

     def __init__(self):
         super().__init__()
diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py
index c144c4fe..de99a7b8 100644
--- a/tests/utils/test_ast.py
+++ b/tests/utils/test_ast.py
@@ -5,13 +5,13 @@ import shutil
 import tempfile
 import time
 import unittest
-
-import gast
+from pathlib import Path

 from modelscope.utils.ast_utils import AstScaning, FilesAstScaning, load_index

-MODELSCOPE_PATH = '/'.join(
-    os.path.dirname(__file__).split('/')[:-2]) + '/modelscope'
+p = Path(__file__)
+
+MODELSCOPE_PATH = p.resolve().parents[2].joinpath('modelscope')


 class AstScaningTest(unittest.TestCase):
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index d934a86c..8b89fa68 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -4,6 +4,8 @@ import copy
 import tempfile
 import unittest

+import json
+
 from modelscope.utils.config import Config, check_config

 obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
@@ -43,7 +45,8 @@ class ConfigTest(unittest.TestCase):
             self.assertEqual(pretty_text, cfg.dump())
             cfg.dump(ofile.name)
             with open(ofile.name, 'r') as infile:
-                self.assertEqual(json_str, infile.read())
+                self.assertDictEqual(
+                    json.loads(json_str), json.loads(infile.read()))

         with tempfile.NamedTemporaryFile(suffix='.yaml') as ofile:
             cfg.dump(ofile.name)
diff --git a/tests/utils/test_device.py b/tests/utils/test_device.py
new file mode 100644
index 00000000..4def9915
--- /dev/null
+++ b/tests/utils/test_device.py
@@ -0,0 +1,102 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import shutil
+import tempfile
+import time
+import unittest
+
+import torch
+
+from modelscope.utils.constant import Frameworks
+from modelscope.utils.device import (create_device, device_placement,
+                                     verify_device)
+
+# tensorflow must be imported after torch when using tf1.15
+import tensorflow as tf  # isort:skip
+
+
+class DeviceTest(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+
+    def tearDown(self):
+        super().tearDown()
+
+    def test_verify(self):
+        device_name, device_id = verify_device('cpu')
+        self.assertEqual(device_name, 'cpu')
+        self.assertTrue(device_id is None)
+        device_name, device_id = verify_device('CPU')
+        self.assertEqual(device_name, 'cpu')
+
+        device_name, device_id = verify_device('gpu')
+        self.assertEqual(device_name, 'gpu')
+        self.assertTrue(device_id == 0)
+
+        device_name, device_id = verify_device('cuda')
+        self.assertEqual(device_name, 'gpu')
+        self.assertTrue(device_id == 0)
+
+        device_name, device_id = verify_device('cuda:0')
+        self.assertEqual(device_name, 'gpu')
+        self.assertTrue(device_id == 0)
+
+        device_name, device_id = verify_device('gpu:1')
+        self.assertEqual(device_name, 'gpu')
+        self.assertTrue(device_id == 1)
+
+        with self.assertRaises(AssertionError):
+            verify_device('xgu')
+
+    def test_create_device_torch(self):
+        # on CPU-only machines the test expects every GPU spec to fall back
+        # to a cpu device with no index
+        if torch.cuda.is_available():
+            target_device_type = 'cuda'
+            target_device_index = 0
+        else:
+            target_device_type = 'cpu'
+            target_device_index = None
+        device = create_device('gpu')
+        self.assertTrue(isinstance(device, torch.device))
+        self.assertTrue(device.type == target_device_type)
+        self.assertTrue(device.index == target_device_index)
+
+        device = create_device('gpu:0')
+        self.assertTrue(isinstance(device, torch.device))
+        self.assertTrue(device.type == target_device_type)
+        self.assertTrue(device.index == target_device_index)
+
+        device = create_device('cuda')
+        self.assertTrue(device.type == target_device_type)
+        self.assertTrue(isinstance(device, torch.device))
+        self.assertTrue(device.index == target_device_index)
+
+        device = create_device('cuda:0')
+        self.assertTrue(isinstance(device, torch.device))
+        self.assertTrue(device.type == target_device_type)
+        self.assertTrue(device.index == target_device_index)
+
+    def test_device_placement_cpu(self):
+        with device_placement(Frameworks.torch, 'cpu'):
+            pass
+
+    @unittest.skip('skip this test to avoid debug logging.')
+    def test_device_placement_tf_gpu(self):
+        tf.debugging.set_log_device_placement(True)
+        with device_placement(Frameworks.tf, 'gpu:0'):
+            a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+            b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
+            c = tf.matmul(a, b)
+            s = tf.Session()
+            s.run(c)
+        tf.debugging.set_log_device_placement(False)
+
+    def test_device_placement_torch_gpu(self):
+        with device_placement(Frameworks.torch, 'gpu:0'):
+            if torch.cuda.is_available():
+                self.assertEqual(torch.cuda.current_device(), 0)
+
+
+if __name__ == '__main__':
+    unittest.main()