diff --git a/data/test/audios/1ch_nihaomiya.wav b/data/test/audios/1ch_nihaomiya.wav new file mode 100644 index 00000000..4618d412 --- /dev/null +++ b/data/test/audios/1ch_nihaomiya.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f7f5a0a4efca1e83463cb44460c66b56fb7cd673eb6da37924637bc05ef758d +size 1440044 diff --git a/modelscope/metrics/image_instance_segmentation_metric.py b/modelscope/metrics/image_instance_segmentation_metric.py index 7deafbce..86a19d13 100644 --- a/modelscope/metrics/image_instance_segmentation_metric.py +++ b/modelscope/metrics/image_instance_segmentation_metric.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from MMDetection, publicly available at +# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py import os.path as osp import tempfile from collections import OrderedDict diff --git a/modelscope/metrics/movie_scene_segmentation_metric.py b/modelscope/metrics/movie_scene_segmentation_metric.py index 56bdbd1c..65725b6f 100644 --- a/modelscope/metrics/movie_scene_segmentation_metric.py +++ b/modelscope/metrics/movie_scene_segmentation_metric.py @@ -1,3 +1,5 @@ +# The implementation here is modified based on BaSSL, +# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl from typing import Dict import numpy as np diff --git a/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py b/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py index 3e7609e1..2007688d 100644 --- a/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py +++ b/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py @@ -1,5 +1,5 @@ -# Modified from: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py - +# The implementation is adopted from Swin Transformer, made publicly available under the MIT License at +# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py import numpy as np import torch import torch.nn as nn diff --git a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py index 30e70f82..ff83271e 100644 --- a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py +++ b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from MMDetection, publicly available at +# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py import os from collections import OrderedDict diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py index cca1432f..1b096fb3 100644 --- a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py @@ -1 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from .transforms import build_preprocess_transform diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py b/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py index c2c11286..f0dde759 100644 --- a/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py +++ b/modelscope/models/cv/image_instance_segmentation/datasets/transforms.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp import numpy as np @@ -51,9 +52,9 @@ class LoadImageFromFile: """Load an image from file. Required keys are "img_prefix" and "img_info" (a dict that must contain the - key "filename"). Added or updated keys are "filename", "img", "img_shape", - "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), - "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + key "filename", "ann_file", and "classes"). Added or updated keys are + "filename", "ori_filename", "img", "img_shape", "ori_shape" (same as `img_shape`), + "img_fields", "ann_file" (path to annotation file) and "classes". Args: to_float32 (bool): Whether to convert the loaded image to a float32 @@ -73,7 +74,7 @@ class LoadImageFromFile: """Call functions to load image and get image meta information. Args: - results (dict): Result dict from :obj:`ImageInstanceSegmentationDataset`. + results (dict): Result dict from :obj:`ImageInstanceSegmentationCocoDataset`. Returns: dict: The dict contains loaded image and meta information. diff --git a/modelscope/models/cv/image_instance_segmentation/model.py b/modelscope/models/cv/image_instance_segmentation/model.py index 2be59623..a56a1608 100644 --- a/modelscope/models/cv/image_instance_segmentation/model.py +++ b/modelscope/models/cv/image_instance_segmentation/model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Any, Dict diff --git a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py index 531e2efd..6058cd73 100644 --- a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py +++ b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from MMDetection, publicly available at +# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/visualization/image.py import itertools import cv2 diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py index e9576963..676b5ac1 100644 --- a/modelscope/models/cv/movie_scene_segmentation/model.py +++ b/modelscope/models/cv/movie_scene_segmentation/model.py @@ -1,3 +1,6 @@ +# The implementation here is modified based on BaSSL, +# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl + import os import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py index 3682726f..e5a929aa 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from .save_op import get_pred_boundary, pred2scene, scene2video from .shot_encoder import resnet50 from .trn import TransformerCRN diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/head.py b/modelscope/models/cv/movie_scene_segmentation/utils/head.py index 20a87e66..d6468c53 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/head.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/head.py @@ -1,9 +1,5 @@ -# ------------------------------------------------------------------------------------ -# BaSSL -# Copyright (c) 2021 KakaoBrain. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# Github: https://github.com/kakaobrain/bassl -# ------------------------------------------------------------------------------------ +# The implementation here is modified based on BaSSL, +# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py index d7c8c0ed..cf26d21a 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py @@ -1,7 +1,5 @@ -# ---------------------------------------------------------------------------------- -# The codes below partially refer to the SceneSeg LGSS. -# Github: https://github.com/AnyiRao/SceneSeg -# ---------------------------------------------------------------------------------- +# The implementation here is modified based on SceneSeg, +# originally Apache 2.0 License and publicly avaialbe at https://github.com/AnyiRao/SceneSeg import os import os.path as osp import subprocess diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py index 7ad1907f..11d20b13 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py @@ -1,6 +1,4 @@ -""" -Modified from original implementation in torchvision -""" +# The implementation is adopted from torchvision from typing import Any, Callable, List, Optional, Type, Union diff --git a/modelscope/models/cv/object_detection/mmdet_model.py b/modelscope/models/cv/object_detection/mmdet_model.py index 7bf81349..485d440a 100644 --- a/modelscope/models/cv/object_detection/mmdet_model.py +++ b/modelscope/models/cv/object_detection/mmdet_model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp import numpy as np diff --git a/modelscope/models/cv/object_detection/mmdet_ms/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/__init__.py index 2e47ce76..3a1fdd0b 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/__init__.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/__init__.py @@ -1,3 +1,5 @@ +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from .backbones import ViT from .dense_heads import AnchorNHead, RPNNHead from .necks import FPNF diff --git a/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py index 3b34dad6..c0697d48 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py @@ -1,3 +1,5 @@ +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from .vit import ViT __all__ = ['ViT'] diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py index 0fba8c00..0d34e996 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py @@ -1,3 +1,5 @@ +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from .anchor_head import AnchorNHead from .rpn_head import RPNNHead diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py index b4114652..d4ea5282 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from mmdet.models.builder import HEADS from mmdet.models.dense_heads import AnchorHead diff --git a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py index f53368ce..8e934a5c 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet import copy import torch diff --git a/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py index 5b0b6210..d164987e 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py @@ -1,3 +1,5 @@ +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from .fpn import FPNF __all__ = ['FPNF'] diff --git a/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py b/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py index 52529b28..5f8648ce 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet import torch.nn as nn import torch.nn.functional as F from mmcv.runner import BaseModule, auto_fp16 diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py index a6be3775..658280df 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py @@ -1,3 +1,5 @@ +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from .bbox_heads import (ConvFCBBoxNHead, Shared2FCBBoxNHead, Shared4Conv1FCBBoxNHead) from .mask_heads import FCNMaskNHead diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py index 0d4d5b6b..61d93503 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py @@ -1,3 +1,5 @@ +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from .convfc_bbox_head import (ConvFCBBoxNHead, Shared2FCBBoxNHead, Shared4Conv1FCBBoxNHead) diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py index d2e04b80..726329a1 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet import torch.nn as nn from mmdet.models.builder import HEADS from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py index 8f816850..043e62a0 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py @@ -1,3 +1,5 @@ +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from .fcn_mask_head import FCNMaskNHead __all__ = ['FCNMaskNHead'] diff --git a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py index e5aedc98..335f6b8f 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from warnings import warn import numpy as np diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py index 971a0232..34f240c6 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py @@ -1,3 +1,5 @@ +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from .checkpoint import load_checkpoint from .convModule_norm import ConvModule_Norm diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py index 593af1cc..7833f592 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py @@ -1,5 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. 
-# Implementation adopted from ViTAE-Transformer, source code avaiable via https://github.com/ViTAE-Transformer/ViTDet +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet import io import os import os.path as osp diff --git a/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py b/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py index d81c24e1..a15780f7 100644 --- a/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py +++ b/modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py @@ -1,5 +1,5 @@ -# Implementation adopted from ViTAE-Transformer, source code avaiable via https://github.com/ViTAE-Transformer/ViTDet - +# Implementation in this file is modified based on ViTAE-Transformer +# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet from mmcv.cnn import ConvModule diff --git a/modelscope/models/cv/salient_detection/models/__init__.py b/modelscope/models/cv/salient_detection/models/__init__.py index 0850c33d..8ea7a5d3 100644 --- a/modelscope/models/cv/salient_detection/models/__init__.py +++ b/modelscope/models/cv/salient_detection/models/__init__.py @@ -1 +1,3 @@ +# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License +# source code avaiable via https://github.com/xuebinqin/U-2-Net from .u2net import U2NET diff --git a/modelscope/models/cv/salient_detection/models/u2net.py b/modelscope/models/cv/salient_detection/models/u2net.py index 0a0a4511..05dbf7ad 100644 --- a/modelscope/models/cv/salient_detection/models/u2net.py +++ b/modelscope/models/cv/salient_detection/models/u2net.py @@ -1,4 +1,5 @@ -# Implementation in this file is modifed from source code avaiable via https://github.com/xuebinqin/U-2-Net +# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License +# source code avaiable via https://github.com/xuebinqin/U-2-Net import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/cv/salient_detection/salient_model.py b/modelscope/models/cv/salient_detection/salient_model.py index 539d1f24..6e617f58 100644 --- a/modelscope/models/cv/salient_detection/salient_model.py +++ b/modelscope/models/cv/salient_detection/salient_model.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp import cv2 diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index cb8d3826..ab9b0357 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -37,9 +37,7 @@ class OfaForAllTasks(TorchModel): def __init__(self, model_dir, *args, **kwargs): super().__init__(model_dir=model_dir, *args, **kwargs) - sd = torch.load(osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)) - sd = sd if 'meta' not in sd else sd['state_dict'] - model = OFAModel.from_pretrained(model_dir, state_dict=sd) + model = OFAModel.from_pretrained(model_dir) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) self.model = model.module if hasattr(model, 'module') else model diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index ca84db4f..361b8ae0 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -44,44 +44,40 @@ def format_list(para) -> List: return para -class MsIterableDataset(torch.utils.data.IterableDataset): +class MsMapDataset(torch.utils.data.Dataset): def __init__(self, dataset: Iterable, preprocessor_list, retained_columns, - columns): - super(MsIterableDataset).__init__() + columns, to_tensor): + super(MsDataset).__init__() self.dataset = dataset self.preprocessor_list = preprocessor_list + self.to_tensor = to_tensor self.retained_columns = retained_columns self.columns = columns def __len__(self): return len(self.dataset) - def __iter__(self): - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: # single-process data loading - iter_start = 0 - iter_end = len(self.dataset) - else: # in a worker process - per_worker = math.ceil( - len(self.dataset) / float(worker_info.num_workers)) - worker_id = worker_info.id - iter_start = worker_id * per_worker - iter_end = min(iter_start + per_worker, len(self.dataset)) - - for idx in range(iter_start, iter_end): - item_dict = self.dataset[idx] - res = { - k: torch.tensor(item_dict[k]) - for k in self.columns if k in self.retained_columns - } - for preprocessor in self.preprocessor_list: - res.update({ - k: v # k: torch.tensor(v) - for k, v in preprocessor(item_dict).items() - if k in self.retained_columns - }) - yield res + def type_converter(self, x): + if self.to_tensor: + return torch.tensor(x) + else: + return x + + def __getitem__(self, index): + item_dict = self.dataset[index] + res = { + k: self.type_converter(item_dict[k]) + for k in self.columns + if (not self.to_tensor) or k in self.retained_columns + } + for preprocessor in self.preprocessor_list: + res.update({ + k: self.type_converter(v) + for k, v in preprocessor(item_dict).items() + if (not self.to_tensor) or k in self.retained_columns + }) + return res class MsDataset: @@ -341,6 +337,7 @@ class MsDataset: self, preprocessors: Union[Callable, List[Callable]], columns: Union[str, List[str]] = None, + to_tensor: bool = True, ): preprocessor_list = preprocessors if isinstance( preprocessors, list) else [preprocessors] @@ -350,29 +347,29 @@ class MsDataset: columns = [ key for key in self._hf_ds.features.keys() if key in columns ] - sample = next(iter(self._hf_ds)) + retained_columns = [] + if to_tensor: + sample = next(iter(self._hf_ds)) - sample_res = {k: np.array(sample[k]) for k in columns} - for processor in preprocessor_list: - sample_res.update( - {k: np.array(v) - for k, v in processor(sample).items()}) + sample_res = {k: 
np.array(sample[k]) for k in columns} + for processor in preprocessor_list: + sample_res.update( + {k: np.array(v) + for k, v in processor(sample).items()}) - def is_numpy_number(value): - return np.issubdtype(value.dtype, np.integer) or np.issubdtype( - value.dtype, np.floating) or np.issubdtype( - value.dtype, np.bool) + def is_numpy_number(value): + return np.issubdtype(value.dtype, np.integer) or np.issubdtype( + value.dtype, np.floating) - retained_columns = [] - for k in sample_res.keys(): - if not is_numpy_number(sample_res[k]): - logger.warning( - f'Data of column {k} is non-numeric, will be removed') - # continue - retained_columns.append(k) + for k in sample_res.keys(): + if not is_numpy_number(sample_res[k]): + logger.warning( + f'Data of column {k} is non-numeric, will be removed') + continue + retained_columns.append(k) - return MsIterableDataset(self._hf_ds, preprocessor_list, - retained_columns, columns) + return MsMapDataset(self._hf_ds, preprocessor_list, retained_columns, + columns, to_tensor) def to_torch_dataset( self, @@ -380,6 +377,7 @@ class MsDataset: preprocessors: Union[Callable, List[Callable]] = None, task_name: str = None, task_data_config: ConfigDict = None, + to_tensor: bool = True, **format_kwargs, ): """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to @@ -387,13 +385,14 @@ class MsDataset: Args: preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process - every sample of the dataset. The output type of processors is dict, and each numeric field of the dict + every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict will be used as a field of torch.utils.data.Dataset. - columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the - preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, - the output fields of processors will also be added. + columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if + `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column. + If the `preprocessors` is not None, the output fields of processors will also be added. task_name (str, default None): task name, refer to :obj:`Tasks` for more details task_data_config (ConfigDict, default None): config dict for model object. + to_tensor (bool, default None): whether convert the data types of dataset column(s) to torch.tensor or not. format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. 
Returns: @@ -410,7 +409,7 @@ class MsDataset: return build_task_dataset(task_data_config, task_name) if preprocessors is not None: return self.to_torch_dataset_with_processors( - preprocessors, columns=columns) + preprocessors, columns=columns, to_tensor=to_tensor) else: self._hf_ds.reset_format() self._hf_ds.set_format( diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py index 10cf7bfb..1c7bc249 100644 --- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py +++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from MMDetection, publicly available at +# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py import os.path as osp import numpy as np diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py index e56039ac..b1bc40f8 100644 --- a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py +++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py @@ -1 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from .movie_scene_segmentation_dataset import MovieSceneSegmentationDataset diff --git a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py index 925d6281..68cbf918 100644 --- a/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py +++ b/modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py @@ -1,6 +1,5 @@ -# --------------------------------------------------------------------------------------------------- -# The implementation is built upon BaSSL, publicly available at https://github.com/kakaobrain/bassl -# --------------------------------------------------------------------------------------------------- +# The implementation here is modified based on BaSSL, +# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl import copy import os import os.path as osp diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py b/modelscope/pipelines/audio/kws_farfield_pipeline.py index 62f58fee..e2f618fa 100644 --- a/modelscope/pipelines/audio/kws_farfield_pipeline.py +++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py @@ -4,6 +4,9 @@ import io import wave from typing import Any, Dict +import numpy +import soundfile as sf + from modelscope.fileio import File from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys @@ -37,7 +40,6 @@ class KWSFarfieldPipeline(Pipeline): self.model.eval() frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH self._nframe = self.model.size_in // frame_size - self.frame_count = 0 def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: if isinstance(inputs, bytes): @@ -54,35 +56,36 @@ class KWSFarfieldPipeline(Pipeline): input_file = inputs['input_file'] if isinstance(input_file, str): input_file = File.read(input_file) - if isinstance(input_file, bytes): - input_file = io.BytesIO(input_file) - self.frame_count = 0 + frames, samplerate = sf.read(io.BytesIO(input_file), dtype='int16') + if len(frames.shape) == 1: + frames = 
numpy.stack((frames, frames, numpy.zeros_like(frames)), 1) + kws_list = [] - with wave.open(input_file, 'rb') as fin: - if 'output_file' in inputs: - with wave.open(inputs['output_file'], 'wb') as fout: - fout.setframerate(self.SAMPLE_RATE) - fout.setnchannels(self.OUTPUT_CHANNELS) - fout.setsampwidth(self.SAMPLE_WIDTH) - self._process(fin, kws_list, fout) - else: - self._process(fin, kws_list) + if 'output_file' in inputs: + with wave.open(inputs['output_file'], 'wb') as fout: + fout.setframerate(self.SAMPLE_RATE) + fout.setnchannels(self.OUTPUT_CHANNELS) + fout.setsampwidth(self.SAMPLE_WIDTH) + self._process(frames, kws_list, fout) + else: + self._process(frames, kws_list) return {OutputKeys.KWS_LIST: kws_list} def _process(self, - fin: wave.Wave_read, + frames: numpy.ndarray, kws_list, fout: wave.Wave_write = None): - data = fin.readframes(self._nframe) - while len(data) >= self.model.size_in: - self.frame_count += self._nframe + for start_index in range(0, frames.shape[0], self._nframe): + end_index = start_index + self._nframe + if end_index > frames.shape[0]: + end_index = frames.shape[0] + data = frames[start_index:end_index, :].tobytes() result = self.model.forward_decode(data) if fout: fout.writeframes(result['pcm']) if 'kws' in result: - result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE + result['kws']['offset'] += start_index / self.SAMPLE_RATE kws_list.append(result['kws']) - data = fin.readframes(self._nframe) def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/cv/action_detection_pipeline.py b/modelscope/pipelines/cv/action_detection_pipeline.py index 72335d5b..74d1862e 100644 --- a/modelscope/pipelines/cv/action_detection_pipeline.py +++ b/modelscope/pipelines/cv/action_detection_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import math import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py index d6495f0a..8aea1146 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/base.py +++ b/modelscope/pipelines/cv/easycv_pipelines/base.py @@ -10,6 +10,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.pipelines.util import is_official_hub_path from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.device import create_device class EasyCVPipeline(object): @@ -53,16 +54,19 @@ class EasyCVPipeline(object): ), f'Not find "{ModelFile.CONFIGURATION}" in model directory!' 
self.cfg = Config.from_file(self.config_file) - self.predict_op = self._build_predict_op() + if 'device' in kwargs: + kwargs['device'] = create_device(kwargs['device']) + self.predict_op = self._build_predict_op(**kwargs) - def _build_predict_op(self): + def _build_predict_op(self, **kwargs): """Build EasyCV predictor.""" from easycv.predictors.builder import build_predictor easycv_config = self._to_easycv_config() pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, { 'model_path': self.model_path, - 'config_file': easycv_config + 'config_file': easycv_config, + **kwargs }) return pipeline_op @@ -91,5 +95,4 @@ class EasyCVPipeline(object): return easycv_config def __call__(self, inputs) -> Any: - # TODO: support image url return self.predict_op(inputs) diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index eb4d6c15..7c32e0fc 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -4,7 +4,6 @@ from typing import Any from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from .base import EasyCVPipeline @@ -34,8 +33,11 @@ class Face2DKeypointsPipeline(EasyCVPipeline): return self.predict_op.show_result(img, points, scale, save_path) def __call__(self, inputs) -> Any: - output = self.predict_op(inputs)[0][0] - points = output['point'] - poses = output['pose'] + outputs = self.predict_op(inputs) - return {OutputKeys.KEYPOINTS: points, OutputKeys.POSES: poses} + results = [{ + OutputKeys.KEYPOINTS: output['point'], + OutputKeys.POSES: output['pose'] + } for output in outputs] + + return results diff --git a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py index db66f5d2..bad0c652 100644 --- a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py @@ -28,7 +28,7 @@ class Hand2DKeypointsPipeline(EasyCVPipeline): *args, **kwargs) - def _build_predict_op(self): + def _build_predict_op(self, **kwargs): """Build EasyCV predictor.""" from easycv.predictors.builder import build_predictor detection_predictor_type = self.cfg['DETECTION']['type'] @@ -46,6 +46,7 @@ class Hand2DKeypointsPipeline(EasyCVPipeline): easycv_config = self._to_easycv_config() pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, { 'model_path': self.model_path, - 'config_file': easycv_config + 'config_file': easycv_config, + **kwargs }) return pipeline_op diff --git a/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py b/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py index ce0bf907..5a0f0d7e 100644 --- a/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/image_instance_segmentation_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os from typing import Any, Dict, Optional, Union diff --git a/modelscope/pipelines/cv/image_style_transfer_pipeline.py b/modelscope/pipelines/cv/image_style_transfer_pipeline.py index 827a0d44..64e67115 100644 --- a/modelscope/pipelines/cv/image_style_transfer_pipeline.py +++ b/modelscope/pipelines/cv/image_style_transfer_pipeline.py @@ -61,7 +61,13 @@ class ImageStyleTransferPipeline(Pipeline): def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, {}, {} - def preprocess(self, content: Input, style: Input) -> Dict[str, Any]: + def preprocess(self, + content: Input, + style: Input = None) -> Dict[str, Any]: + if type(content) is dict: # for demo service + style = content['style'] + content = content['content'] + content = LoadImage.convert_to_ndarray(content) if len(content.shape) == 2: content = cv2.cvtColor(content, cv2.COLOR_GRAY2BGR) diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py index 0ef0261d..b5acf17a 100644 --- a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict import torch diff --git a/modelscope/preprocessors/movie_scene_segmentation/__init__.py b/modelscope/preprocessors/movie_scene_segmentation/__init__.py index 73da792d..b28ccabc 100644 --- a/modelscope/preprocessors/movie_scene_segmentation/__init__.py +++ b/modelscope/preprocessors/movie_scene_segmentation/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/preprocessors/movie_scene_segmentation/transforms.py b/modelscope/preprocessors/movie_scene_segmentation/transforms.py index b4e57420..5b84003c 100644 --- a/modelscope/preprocessors/movie_scene_segmentation/transforms.py +++ b/modelscope/preprocessors/movie_scene_segmentation/transforms.py @@ -1,9 +1,5 @@ -# ------------------------------------------------------------------------------------ -# The codes below partially refer to the BaSSL -# Copyright (c) 2021 KakaoBrain. All Rights Reserved. 
-# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# Github: https://github.com/kakaobrain/bassl -# ------------------------------------------------------------------------------------ +# The implementation here is modified based on BaSSL, +# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl import numbers import os.path as osp import random diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 6cacb235..6d06bbb9 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -186,7 +186,8 @@ class MPlugPreprocessor(Preprocessor): image = image.convert('RGB') image = self.patch_resize_transform(image) question = '' if self.cfg.task == Tasks.image_captioning \ - else data[1 if isinstance(data, tuple) else 'question'] + else data[1 if isinstance(data, tuple) + else ('text' if 'text' in data else 'question')] question = self.tokenizer( question.lower(), padding='max_length', diff --git a/modelscope/trainers/cv/image_instance_segmentation_trainer.py b/modelscope/trainers/cv/image_instance_segmentation_trainer.py index 2e2415dc..a777bde1 100644 --- a/modelscope/trainers/cv/image_instance_segmentation_trainer.py +++ b/modelscope/trainers/cv/image_instance_segmentation_trainer.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.metainfo import Trainers from modelscope.trainers.builder import TRAINERS from modelscope.trainers.trainer import EpochBasedTrainer diff --git a/modelscope/trainers/cv/movie_scene_segmentation_trainer.py b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py index ee4dd849..7645f9f3 100644 --- a/modelscope/trainers/cv/movie_scene_segmentation_trainer.py +++ b/modelscope/trainers/cv/movie_scene_segmentation_trainer.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from modelscope.metainfo import Trainers from modelscope.trainers.builder import TRAINERS from modelscope.trainers.trainer import EpochBasedTrainer diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index d188ae6f..793092c8 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -37,8 +37,8 @@ from modelscope.utils.device import create_device, verify_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg -from modelscope.utils.torch_utils import (get_dist_info, init_dist, - set_random_seed) +from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, + init_dist, set_random_seed) from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -155,8 +155,17 @@ class EpochBasedTrainer(BaseTrainer): if self.eval_preprocessor is not None: self.eval_preprocessor.mode = ModeKeys.EVAL + if kwargs.get('launcher', None) is not None: + init_dist(kwargs['launcher']) + + _, world_size = get_dist_info() + self._dist = world_size > 1 + device_name = kwargs.get('device', 'gpu') - verify_device(device_name) + if self._dist: + local_rank = get_local_rank() + device_name = f'cuda:{local_rank}' + self.device = create_device(device_name) self.train_dataset = self.to_task_dataset( @@ -219,11 +228,6 @@ class EpochBasedTrainer(BaseTrainer): self.use_fp16 = kwargs.get('use_fp16', False) - if kwargs.get('launcher', None) is not None: - init_dist(kwargs['launcher']) - - self._dist = get_dist_info()[1] > 1 - # model placement if self.device.type == 'cuda': self.model.to(self.device) @@ -532,8 +536,14 @@ class EpochBasedTrainer(BaseTrainer): model.train() self._mode = ModeKeys.TRAIN # call model forward but not __call__ to skip postprocess - if isinstance(inputs, - Mapping) and not func_receive_dict_inputs(model.forward): + + if is_parallel(model): + receive_dict_inputs = func_receive_dict_inputs( + model.module.forward) + else: + receive_dict_inputs = func_receive_dict_inputs(model.forward) + + if isinstance(inputs, Mapping) and not receive_dict_inputs: train_outputs = model.forward(**inputs) else: train_outputs = model.forward(inputs) diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index c6a291d9..1f8f8ed0 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -9,9 +9,9 @@ from collections.abc import Mapping import torch from torch import distributed as dist -from torch.nn.parallel import DistributedDataParallel from tqdm import tqdm +from modelscope.trainers.parallel.utils import is_parallel from modelscope.utils.data_utils import to_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, @@ -138,7 +138,10 @@ def multi_gpu_test(model, data_len = data_loader_iters_per_gpu * world_size desc = 'Total test iterations with multi gpus' - time.sleep(2) # This line can prevent deadlock problem in some cases. 
+ if is_parallel(model): + receive_dict_inputs = func_receive_dict_inputs(model.module.forward) + else: + receive_dict_inputs = func_receive_dict_inputs(model.forward) count = 0 with tqdm(total=data_len, desc=desc) as pbar: @@ -146,10 +149,7 @@ def multi_gpu_test(model, data = to_device(data, device) data_list.append(data) with torch.no_grad(): - forward_func = model.module.forward if \ - isinstance(model, DistributedDataParallel) else model.forward - if isinstance(data, Mapping - ) and not func_receive_dict_inputs(forward_func): + if isinstance(data, Mapping) and not receive_dict_inputs: result = model.forward(**data) else: result = model.forward(data) diff --git a/modelscope/utils/demo_utils.py b/modelscope/utils/demo_utils.py index 41ac0bca..363ae950 100644 --- a/modelscope/utils/demo_utils.py +++ b/modelscope/utils/demo_utils.py @@ -123,7 +123,7 @@ INPUT_EXAMPLES = { 'urlPaths': { 'outUrls': [{ 'outputKey': OutputKeys.OUTPUT_PCM, - 'fileType': 'wav' + 'fileType': 'pcm' }] } }, @@ -134,7 +134,7 @@ INPUT_EXAMPLES = { 'urlPaths': { 'outUrls': [{ 'outputKey': OutputKeys.OUTPUT_PCM, - 'fileType': 'wav' + 'fileType': 'pcm' }] } }, @@ -147,7 +147,13 @@ INPUT_EXAMPLES = { 'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-grounding/visual_grounding.png', 'a blue turtle-like pokemon with round head' ], - 'urlPaths': {} + 'urlPaths': { + 'inUrls': [{ + 'name': 'image' + }, { + 'name': 'text' + }] + } }, TasksIODescriptions.visual_question_answering: { 'task': @@ -156,7 +162,16 @@ INPUT_EXAMPLES = { 'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/visual_question_answering.png', 'what is grown on the plant?' ], - 'urlPaths': {} + 'urlPaths': { + 'inUrls': [{ + 'name': 'image' + }, { + 'name': 'text' + }], + 'outUrls': [{ + 'outputKey': 'text' + }] + } }, TasksIODescriptions.visual_entailment: { 'task': @@ -165,7 +180,14 @@ INPUT_EXAMPLES = { 'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-entailment/visual_entailment.jpg', 'there are two birds.', 'test' ], - 'urlPaths': {} + 'urlPaths': { + 'inUrls': [{ + 'name': 'image' + }, { + 'name': 'text' + }], + 'outUrls': [{}] + } }, TasksIODescriptions.generative_multi_modal_embedding: { 'task': @@ -174,7 +196,14 @@ INPUT_EXAMPLES = { 'http://clip-multimodal.oss-cn-beijing.aliyuncs.com/lingchen/demo/dogs.jpg', 'dogs playing in the grass' ], - 'urlPaths': {} + 'urlPaths': { + 'inUrls': [{ + 'name': 'image' + }, { + 'name': 'text' + }], + 'outUrls': [{}] + } }, } @@ -192,7 +221,13 @@ class DemoCompatibilityCheck(object): print('testing demo: ', self.task, self.model_id) test_pipline = pipeline(self.task, self.model_id) req = INPUT_EXAMPLES[TASKS_INPUT_TEMPLATES[self.task]] - output = test_pipline(preprocess(req)) + inputs = preprocess(req) + params = req.get('parameters', {}) + # modelscope inference + if params != {}: + output = test_pipline(inputs, **params) + else: + output = test_pipline(inputs) json.dumps(output, cls=NumpyEncoder) result = postprocess(req, output) print(result) @@ -215,11 +250,21 @@ class NumpyEncoder(json.JSONEncoder): def preprocess(req): + in_urls = req.get('urlPaths').get('inUrls') if len(req['inputs']) == 1: inputs = req['inputs'][0] else: inputs = tuple(req['inputs']) - return inputs + if in_urls is None or len(in_urls) == 0: + return inputs + + inputs_dict = {} + for i, in_url in enumerate(in_urls): + input_name = in_url.get('name') + if input_name is None or input_name == '': + return inputs + inputs_dict[input_name] = req['inputs'][i] + return inputs_dict def postprocess(req, 
resp): @@ -242,4 +287,3 @@ def postprocess(req, resp): out_mem_file = io.BytesIO() out_mem_file.write(new_resp.get(output_key)) return type(out_mem_file) - # TODO(lingcai.wl): support more file type diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py index 6d4132f6..74d9bb7b 100644 --- a/modelscope/utils/torch_utils.py +++ b/modelscope/utils/torch_utils.py @@ -115,6 +115,10 @@ def get_dist_info() -> Tuple[int, int]: return rank, world_size +def get_local_rank(): + return int(os.environ.get('LOCAL_RANK', 0)) + + def is_master(): rank, _ = get_dist_info() return rank == 0 diff --git a/modelscope/version.py b/modelscope/version.py index 908c0bb7..9a8e054a 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.4.3' +__version__ = '0.4.4' diff --git a/requirements/cv.txt b/requirements/cv.txt index ebb61851..8c06242a 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -14,7 +14,7 @@ mmcls>=0.21.0 mmdet>=2.25.0 networkx>=2.5 onnxruntime>=1.10 -pai-easycv>=0.6.0 +pai-easycv>=0.6.3.4 pandas psutil regex diff --git a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py index 6cfdacc6..80ab36a6 100644 --- a/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py +++ b/tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py @@ -1,10 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import unittest +from distutils.version import LooseVersion +import easycv import numpy as np from PIL import Image -from modelscope.metainfo import Pipelines from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -14,7 +15,7 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase): img_path = 'data/test/images/image_segmentation.jpg' - def _internal_test__(self, model_id): + def _internal_test_(self, model_id): img = np.asarray(Image.open(self.img_path)) semantic_seg = pipeline(task=Tasks.image_segmentation, model=model_id) @@ -24,41 +25,61 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase): results = outputs[0] self.assertListEqual( - list(img.shape)[:2], list(results['seg_pred'][0].shape)) - self.assertListEqual(results['seg_pred'][0][1, 4:10].tolist(), - [161 for i in range(6)]) - self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(), - [133 for i in range(10)]) + list(img.shape)[:2], list(results['seg_pred'].shape)) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def _internal_test_batch_(self, model_id, num_samples=2, batch_size=2): + # TODO: support in the future + img = np.asarray(Image.open(self.img_path)) + num_samples = num_samples + batch_size = batch_size + semantic_seg = pipeline( + task=Tasks.image_segmentation, + model=model_id, + batch_size=batch_size) + outputs = semantic_seg([self.img_path] * num_samples) + + self.assertEqual(semantic_seg.predict_op.batch_size, batch_size) + self.assertEqual(len(outputs), num_samples) + + for output in outputs: + self.assertListEqual( + list(img.shape)[:2], list(output['seg_pred'].shape)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b0(self): model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k' - self._internal_test__(model_id) + self._internal_test_(model_id) + self._internal_test_batch_(model_id) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + 
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b1(self): model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k' - self._internal_test__(model_id) + self._internal_test_(model_id) + self._internal_test_batch_(model_id) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b2(self): model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k' - self._internal_test__(model_id) + self._internal_test_(model_id) + self._internal_test_batch_(model_id) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b3(self): model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k' - self._internal_test__(model_id) + self._internal_test_(model_id) + self._internal_test_batch_(model_id) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b4(self): model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k' - self._internal_test__(model_id) + self._internal_test_(model_id) + self._internal_test_batch_(model_id) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_segformer_b5(self): model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k' - self._internal_test__(model_id) + self._internal_test_(model_id) + self._internal_test_batch_(model_id) if __name__ == '__main__': diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py index a5e347e8..667ecddc 100644 --- a/tests/pipelines/test_face_2d_keypoints.py +++ b/tests/pipelines/test_face_2d_keypoints.py @@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): face_2d_keypoints_align = pipeline( task=Tasks.face_2d_keypoints, model=model_id) - output = face_2d_keypoints_align(img_path) + output = face_2d_keypoints_align(img_path)[0] output_keypoints = output[OutputKeys.KEYPOINTS] output_pose = output[OutputKeys.POSES] diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py index fea7afd7..f8c167de 100644 --- a/tests/pipelines/test_key_word_spotting_farfield.py +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -8,6 +8,7 @@ from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav' +TEST_SPEECH_FILE_MONO = 'data/test/audios/1ch_nihaomiya.wav' TEST_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \ 'speech_dfsmn_kws_char_farfield_16k_nihaomiya/repo' \ '?Revision=master&FilePath=examples/3ch_nihaomiya.wav' @@ -26,6 +27,16 @@ class KWSFarfieldTest(unittest.TestCase): self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_mono(self): + kws = pipeline(Tasks.keyword_spotting, model=self.model_id) + inputs = { + 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO) + } + result = kws(inputs) + self.assertEqual(len(result['kws_list']), 5) + print(result['kws_list'][-1]) + @unittest.skipUnless(test_level() >= 0, 'skip test in 
current test level') def test_url(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py index 273d3105..a3ace62d 100644 --- a/tests/pipelines/test_mplug_tasks.py +++ b/tests/pipelines/test_mplug_tasks.py @@ -44,8 +44,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): 'damo/mplug_visual-question-answering_coco_large_en') pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model) image = Image.open('data/test/images/image_mplug_vqa.jpg') - question = 'What is the woman doing?' - input = {'image': image, 'question': question} + text = 'What is the woman doing?' + input = {'image': image, 'text': text} result = pipeline_vqa(input) print(result) @@ -54,8 +54,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): model = 'damo/mplug_visual-question-answering_coco_large_en' pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model) image = Image.open('data/test/images/image_mplug_vqa.jpg') - question = 'What is the woman doing?' - input = {'image': image, 'question': question} + text = 'What is the woman doing?' + input = {'image': image, 'text': text} result = pipeline_vqa(input) print(result) @@ -65,8 +65,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): 'damo/mplug_image-text-retrieval_flickr30k_large_en') pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model) image = Image.open('data/test/images/image-text-retrieval.jpg') - question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.' - input = {'image': image, 'question': question} + text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.' + input = {'image': image, 'text': text} result = pipeline_retrieval(input) print(result) @@ -75,8 +75,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): model = 'damo/mplug_image-text-retrieval_flickr30k_large_en' pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model) image = Image.open('data/test/images/image-text-retrieval.jpg') - question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.' - input = {'image': image, 'question': question} + text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.' 
+ input = {'image': image, 'text': text} result = pipeline_retrieval(input) print(result) diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index 9a72d1ff..e6638dfa 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -147,8 +147,10 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = ofa_pipe(input) print(result) image_name = image.split('/')[-2] - self.save_img(image, result[OutputKeys.BOXES], - osp.join('large_en_model_' + image_name + '.png')) + self.save_img( + image, + result[OutputKeys.BOXES][0], # just one box + osp.join('large_en_model_' + image_name + '.png')) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_visual_grounding_with_name(self): @@ -161,7 +163,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = ofa_pipe(input) print(result) image_name = image.split('/')[-2] - self.save_img(image, result[OutputKeys.BOXES], + self.save_img(image, result[OutputKeys.BOXES][0], osp.join('large_en_name_' + image_name + '.png')) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -174,7 +176,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = ofa_pipe(input) print(result) image_name = image.split('/')[-1] - self.save_img(image, result[OutputKeys.BOXES], + self.save_img(image, result[OutputKeys.BOXES][0], osp.join('large_zh_name_' + image_name)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') diff --git a/tests/run_config.yaml b/tests/run_config.yaml index fc983023..4c571b7f 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -9,6 +9,7 @@ isolated: # test cases that may require excessive anmount of GPU memory, which - test_image_super_resolution.py - test_easycv_trainer.py - test_segformer.py + - test_segmentation_pipeline.py envs: default: # default env, case not in other env will in default, pytorch. 
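# A minimal usage sketch of the input-key change made in modelscope/preprocessors/multi_modal.py
# and mirrored in tests/pipelines/test_mplug_tasks.py above: MPlug pipelines now read the text
# field from the 'text' key, with 'question' kept as a fallback in MPlugPreprocessor. Model id and
# image path are taken from the updated tests; nothing below is a new API.
from PIL import Image
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipeline_vqa = pipeline(
    Tasks.visual_question_answering,
    model='damo/mplug_visual-question-answering_coco_large_en')
image = Image.open('data/test/images/image_mplug_vqa.jpg')
# the dict key 'text' replaces the previous 'question' key
result = pipeline_vqa({'image': image, 'text': 'What is the woman doing?'})
print(result)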
diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py index 1f622287..0176704a 100644 --- a/tests/trainers/test_trainer_gpu.py +++ b/tests/trainers/test_trainer_gpu.py @@ -53,7 +53,18 @@ class DummyModel(nn.Module, Model): return dict(logits=x, loss=loss) -def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs): +class DummyModelForwardInputs(DummyModel): + + def forward(self, inputs): + feat, labels = inputs['feat'], inputs['labels'] + return super().forward(feat, labels) + + +def train_func(work_dir, + dist=False, + iterable_dataset=False, + forward_inputs=False, + **kwargs): json_cfg = { 'task': Tasks.image_classification, 'train': { @@ -81,7 +92,10 @@ def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs): with open(config_path, 'w') as f: json.dump(json_cfg, f) - model = DummyModel() + if forward_inputs: + model = DummyModelForwardInputs() + else: + model = DummyModel() optimmizer = SGD(model.parameters(), lr=0.01) lr_scheduler = StepLR(optimmizer, 2) trainer_name = Trainers.default @@ -273,6 +287,22 @@ class TrainerTestMultiGpus(DistributedTestCase): for i in [1, 3, 5]: self.assertIn(MetricKeys.ACCURACY, lines[i]) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_multi_gpus_forward_inputs(self): + self.start( + train_func, + num_gpus=2, + work_dir=self.tmp_dir, + dist=True, + forward_inputs=True) + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) + # TODO: support iters_per_epoch for dist mode @unittest.skipIf(True, 'need to adapt to DistributedSampler') def test_multi_gpus_with_iters_per_epoch(self):
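# A minimal sketch of the dict-forward detection introduced in the trainer.py and inference.py
# hunks above and exercised by DummyModelForwardInputs: when the model is wrapped by a parallel
# wrapper, the real forward signature lives on model.module, so the check must look there.
# wants_dict_inputs is a hypothetical helper name for illustration; is_parallel and
# func_receive_dict_inputs are the utilities imported in the diff.
import torch.nn as nn

from modelscope.trainers.parallel.utils import is_parallel
from modelscope.utils.file_utils import func_receive_dict_inputs


def wants_dict_inputs(model: nn.Module) -> bool:
    # inspect the unwrapped module's forward, not the wrapper's generic *args signature
    target = model.module if is_parallel(model) else model
    return func_receive_dict_inputs(target.forward)


# callers then dispatch the way the trainer does for Mapping inputs:
#   outputs = model.forward(inputs) if wants_dict_inputs(model) else model.forward(**inputs)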