
Merge remote-tracking branch 'origin/master' into ofa/finetune

# Conflicts:
#	modelscope/models/multi_modal/ofa_for_all_tasks.py
#	modelscope/msdatasets/ms_dataset.py
#	modelscope/trainers/utils/inference.py
Branch: master
Author: 行嗔, 3 years ago
Commit: 537827e5a1
64 changed files with 363 additions and 185 deletions
1. data/test/audios/1ch_nihaomiya.wav (+3, -0)
2. modelscope/metrics/image_instance_segmentation_metric.py (+2, -0)
3. modelscope/metrics/movie_scene_segmentation_metric.py (+2, -0)
4. modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py (+2, -2)
5. modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py (+2, -0)
6. modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+1, -0)
7. modelscope/models/cv/image_instance_segmentation/datasets/transforms.py (+5, -4)
8. modelscope/models/cv/image_instance_segmentation/model.py (+1, -0)
9. modelscope/models/cv/image_instance_segmentation/postprocess_utils.py (+2, -0)
10. modelscope/models/cv/movie_scene_segmentation/model.py (+3, -0)
11. modelscope/models/cv/movie_scene_segmentation/utils/__init__.py (+1, -0)
12. modelscope/models/cv/movie_scene_segmentation/utils/head.py (+2, -6)
13. modelscope/models/cv/movie_scene_segmentation/utils/save_op.py (+2, -4)
14. modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py (+1, -3)
15. modelscope/models/cv/object_detection/mmdet_model.py (+1, -0)
16. modelscope/models/cv/object_detection/mmdet_ms/__init__.py (+2, -0)
17. modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py (+2, -0)
18. modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py (+2, -0)
19. modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py (+2, -1)
20. modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py (+2, -1)
21. modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py (+2, -0)
22. modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py (+2, -1)
23. modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py (+2, -0)
24. modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py (+2, -0)
25. modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py (+2, -1)
26. modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py (+2, -0)
27. modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py (+2, -1)
28. modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py (+2, -0)
29. modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py (+2, -1)
30. modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py (+2, -2)
31. modelscope/models/cv/salient_detection/models/__init__.py (+2, -0)
32. modelscope/models/cv/salient_detection/models/u2net.py (+2, -1)
33. modelscope/models/cv/salient_detection/salient_model.py (+1, -0)
34. modelscope/models/multi_modal/ofa_for_all_tasks.py (+1, -3)
35. modelscope/msdatasets/ms_dataset.py (+51, -52)
36. modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py (+2, -0)
37. modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py (+1, -0)
38. modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py (+2, -3)
39. modelscope/pipelines/audio/kws_farfield_pipeline.py (+22, -19)
40. modelscope/pipelines/cv/action_detection_pipeline.py (+2, -0)
41. modelscope/pipelines/cv/easycv_pipelines/base.py (+7, -4)
42. modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py (+7, -5)
43. modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py (+3, -2)
44. modelscope/pipelines/cv/image_instance_segmentation_pipeline.py (+1, -0)
45. modelscope/pipelines/cv/image_style_transfer_pipeline.py (+7, -1)
46. modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py (+1, -0)
47. modelscope/preprocessors/movie_scene_segmentation/__init__.py (+1, -0)
48. modelscope/preprocessors/movie_scene_segmentation/transforms.py (+2, -6)
49. modelscope/preprocessors/multi_modal.py (+2, -1)
50. modelscope/trainers/cv/image_instance_segmentation_trainer.py (+1, -0)
51. modelscope/trainers/cv/movie_scene_segmentation_trainer.py (+1, -0)
52. modelscope/trainers/trainer.py (+20, -10)
53. modelscope/trainers/utils/inference.py (+6, -6)
54. modelscope/utils/demo_utils.py (+53, -9)
55. modelscope/utils/torch_utils.py (+4, -0)
56. modelscope/version.py (+1, -1)
57. requirements/cv.txt (+1, -1)
58. tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py (+40, -19)
59. tests/pipelines/test_face_2d_keypoints.py (+1, -1)
60. tests/pipelines/test_key_word_spotting_farfield.py (+11, -0)
61. tests/pipelines/test_mplug_tasks.py (+8, -8)
62. tests/pipelines/test_ofa_tasks.py (+6, -4)
63. tests/run_config.yaml (+1, -0)
64. tests/trainers/test_trainer_gpu.py (+32, -2)

data/test/audios/1ch_nihaomiya.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4f7f5a0a4efca1e83463cb44460c66b56fb7cd673eb6da37924637bc05ef758d
size 1440044

modelscope/metrics/image_instance_segmentation_metric.py (+2, -0)

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from MMDetection, publicly available at
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py
import os.path as osp
import tempfile
from collections import OrderedDict


modelscope/metrics/movie_scene_segmentation_metric.py (+2, -0)

@@ -1,3 +1,5 @@
# The implementation here is modified based on BaSSL,
# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
from typing import Dict

import numpy as np


modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py (+2, -2)

@@ -1,5 +1,5 @@
# Modified from: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
# The implementation is adopted from Swin Transformer, made publicly available under the MIT License at
# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
import numpy as np
import torch
import torch.nn as nn


modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py (+2, -0)

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from MMDetection, publicly available at
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
import os
from collections import OrderedDict



modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+1, -0)

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .transforms import build_preprocess_transform

modelscope/models/cv/image_instance_segmentation/datasets/transforms.py (+5, -4)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp

import numpy as np
@@ -51,9 +52,9 @@ class LoadImageFromFile:
"""Load an image from file.

Required keys are "img_prefix" and "img_info" (a dict that must contain the
key "filename"). Added or updated keys are "filename", "img", "img_shape",
"ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`),
"scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1).
key "filename", "ann_file", and "classes"). Added or updated keys are
"filename", "ori_filename", "img", "img_shape", "ori_shape" (same as `img_shape`),
"img_fields", "ann_file" (path to annotation file) and "classes".

Args:
to_float32 (bool): Whether to convert the loaded image to a float32
@@ -73,7 +74,7 @@ class LoadImageFromFile:
"""Call functions to load image and get image meta information.

Args:
results (dict): Result dict from :obj:`ImageInstanceSegmentationDataset`.
results (dict): Result dict from :obj:`ImageInstanceSegmentationCocoDataset`.

Returns:
dict: The dict contains loaded image and meta information.


modelscope/models/cv/image_instance_segmentation/model.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict



modelscope/models/cv/image_instance_segmentation/postprocess_utils.py (+2, -0)

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from MMDetection, publicly available at
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/visualization/image.py
import itertools

import cv2


modelscope/models/cv/movie_scene_segmentation/model.py (+3, -0)

@@ -1,3 +1,6 @@
# The implementation here is modified based on BaSSL,
# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl

import os
import os.path as osp
from typing import Any, Dict


modelscope/models/cv/movie_scene_segmentation/utils/__init__.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .save_op import get_pred_boundary, pred2scene, scene2video
from .shot_encoder import resnet50
from .trn import TransformerCRN

modelscope/models/cv/movie_scene_segmentation/utils/head.py (+2, -6)

@@ -1,9 +1,5 @@
# ------------------------------------------------------------------------------------
# BaSSL
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# Github: https://github.com/kakaobrain/bassl
# ------------------------------------------------------------------------------------
# The implementation here is modified based on BaSSL,
# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl

import torch.nn as nn
import torch.nn.functional as F


modelscope/models/cv/movie_scene_segmentation/utils/save_op.py (+2, -4)

@@ -1,7 +1,5 @@
# ----------------------------------------------------------------------------------
# The codes below partially refer to the SceneSeg LGSS.
# Github: https://github.com/AnyiRao/SceneSeg
# ----------------------------------------------------------------------------------
# The implementation here is modified based on SceneSeg,
# originally Apache 2.0 License and publicly avaialbe at https://github.com/AnyiRao/SceneSeg
import os
import os.path as osp
import subprocess


modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py (+1, -3)

@@ -1,6 +1,4 @@
"""
Modified from original implementation in torchvision
"""
# The implementation is adopted from torchvision

from typing import Any, Callable, List, Optional, Type, Union



modelscope/models/cv/object_detection/mmdet_model.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp

import numpy as np


modelscope/models/cv/object_detection/mmdet_ms/__init__.py (+2, -0)

@@ -1,3 +1,5 @@
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from .backbones import ViT
from .dense_heads import AnchorNHead, RPNNHead
from .necks import FPNF


modelscope/models/cv/object_detection/mmdet_ms/backbones/__init__.py (+2, -0)

@@ -1,3 +1,5 @@
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from .vit import ViT

__all__ = ['ViT']

modelscope/models/cv/object_detection/mmdet_ms/dense_heads/__init__.py (+2, -0)

@@ -1,3 +1,5 @@
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from .anchor_head import AnchorNHead
from .rpn_head import RPNNHead



modelscope/models/cv/object_detection/mmdet_ms/dense_heads/anchor_head.py (+2, -1)

@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from mmdet.models.builder import HEADS
from mmdet.models.dense_heads import AnchorHead



modelscope/models/cv/object_detection/mmdet_ms/dense_heads/rpn_head.py (+2, -1)

@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
import copy

import torch


modelscope/models/cv/object_detection/mmdet_ms/necks/__init__.py (+2, -0)

@@ -1,3 +1,5 @@
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from .fpn import FPNF

__all__ = ['FPNF']

modelscope/models/cv/object_detection/mmdet_ms/necks/fpn.py (+2, -1)

@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
import torch.nn as nn
import torch.nn.functional as F
from mmcv.runner import BaseModule, auto_fp16


modelscope/models/cv/object_detection/mmdet_ms/roi_heads/__init__.py (+2, -0)

@@ -1,3 +1,5 @@
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from .bbox_heads import (ConvFCBBoxNHead, Shared2FCBBoxNHead,
Shared4Conv1FCBBoxNHead)
from .mask_heads import FCNMaskNHead


modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/__init__.py (+2, -0)

@@ -1,3 +1,5 @@
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from .convfc_bbox_head import (ConvFCBBoxNHead, Shared2FCBBoxNHead,
Shared4Conv1FCBBoxNHead)



modelscope/models/cv/object_detection/mmdet_ms/roi_heads/bbox_heads/convfc_bbox_head.py (+2, -1)

@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
import torch.nn as nn
from mmdet.models.builder import HEADS
from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead


modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/__init__.py (+2, -0)

@@ -1,3 +1,5 @@
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from .fcn_mask_head import FCNMaskNHead

__all__ = ['FCNMaskNHead']

modelscope/models/cv/object_detection/mmdet_ms/roi_heads/mask_heads/fcn_mask_head.py (+2, -1)

@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Implementation in this file is modifed from source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from warnings import warn

import numpy as np


modelscope/models/cv/object_detection/mmdet_ms/utils/__init__.py (+2, -0)

@@ -1,3 +1,5 @@
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from .checkpoint import load_checkpoint
from .convModule_norm import ConvModule_Norm



modelscope/models/cv/object_detection/mmdet_ms/utils/checkpoint.py (+2, -1)

@@ -1,5 +1,6 @@
# Copyright (c) Open-MMLab. All rights reserved.
# Implementation adopted from ViTAE-Transformer, source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
import io
import os
import os.path as osp


modelscope/models/cv/object_detection/mmdet_ms/utils/convModule_norm.py (+2, -2)

@@ -1,5 +1,5 @@
# Implementation adopted from ViTAE-Transformer, source code avaiable via https://github.com/ViTAE-Transformer/ViTDet
# Implementation in this file is modified based on ViTAE-Transformer
# Originally Apache 2.0 License and publicly avaialbe at https://github.com/ViTAE-Transformer/ViTDet
from mmcv.cnn import ConvModule




modelscope/models/cv/salient_detection/models/__init__.py (+2, -0)

@@ -1 +1,3 @@
# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License
# source code avaiable via https://github.com/xuebinqin/U-2-Net
from .u2net import U2NET

modelscope/models/cv/salient_detection/models/u2net.py (+2, -1)

@@ -1,4 +1,5 @@
# Implementation in this file is modifed from source code avaiable via https://github.com/xuebinqin/U-2-Net
# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License
# source code avaiable via https://github.com/xuebinqin/U-2-Net
import torch
import torch.nn as nn
import torch.nn.functional as F


modelscope/models/cv/salient_detection/salient_model.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp

import cv2


modelscope/models/multi_modal/ofa_for_all_tasks.py (+1, -3)

@@ -37,9 +37,7 @@ class OfaForAllTasks(TorchModel):

def __init__(self, model_dir, *args, **kwargs):
super().__init__(model_dir=model_dir, *args, **kwargs)
sd = torch.load(osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE))
sd = sd if 'meta' not in sd else sd['state_dict']
model = OFAModel.from_pretrained(model_dir, state_dict=sd)
model = OFAModel.from_pretrained(model_dir)
self.cfg = Config.from_file(
osp.join(model_dir, ModelFile.CONFIGURATION))
self.model = model.module if hasattr(model, 'module') else model


modelscope/msdatasets/ms_dataset.py (+51, -52)

@@ -44,44 +44,40 @@ def format_list(para) -> List:
return para


class MsIterableDataset(torch.utils.data.IterableDataset):
class MsMapDataset(torch.utils.data.Dataset):

def __init__(self, dataset: Iterable, preprocessor_list, retained_columns,
columns):
super(MsIterableDataset).__init__()
columns, to_tensor):
super(MsDataset).__init__()
self.dataset = dataset
self.preprocessor_list = preprocessor_list
self.to_tensor = to_tensor
self.retained_columns = retained_columns
self.columns = columns

def __len__(self):
return len(self.dataset)

def __iter__(self):
worker_info = torch.utils.data.get_worker_info()
if worker_info is None: # single-process data loading
iter_start = 0
iter_end = len(self.dataset)
else: # in a worker process
per_worker = math.ceil(
len(self.dataset) / float(worker_info.num_workers))
worker_id = worker_info.id
iter_start = worker_id * per_worker
iter_end = min(iter_start + per_worker, len(self.dataset))

for idx in range(iter_start, iter_end):
item_dict = self.dataset[idx]
res = {
k: torch.tensor(item_dict[k])
for k in self.columns if k in self.retained_columns
}
for preprocessor in self.preprocessor_list:
res.update({
k: v # k: torch.tensor(v)
for k, v in preprocessor(item_dict).items()
if k in self.retained_columns
})
yield res
def type_converter(self, x):
if self.to_tensor:
return torch.tensor(x)
else:
return x

def __getitem__(self, index):
item_dict = self.dataset[index]
res = {
k: self.type_converter(item_dict[k])
for k in self.columns
if (not self.to_tensor) or k in self.retained_columns
}
for preprocessor in self.preprocessor_list:
res.update({
k: self.type_converter(v)
for k, v in preprocessor(item_dict).items()
if (not self.to_tensor) or k in self.retained_columns
})
return res


class MsDataset:
@@ -341,6 +337,7 @@ class MsDataset:
self,
preprocessors: Union[Callable, List[Callable]],
columns: Union[str, List[str]] = None,
to_tensor: bool = True,
):
preprocessor_list = preprocessors if isinstance(
preprocessors, list) else [preprocessors]
@@ -350,29 +347,29 @@ class MsDataset:
columns = [
key for key in self._hf_ds.features.keys() if key in columns
]
sample = next(iter(self._hf_ds))
retained_columns = []
if to_tensor:
sample = next(iter(self._hf_ds))

sample_res = {k: np.array(sample[k]) for k in columns}
for processor in preprocessor_list:
sample_res.update(
{k: np.array(v)
for k, v in processor(sample).items()})
sample_res = {k: np.array(sample[k]) for k in columns}
for processor in preprocessor_list:
sample_res.update(
{k: np.array(v)
for k, v in processor(sample).items()})

def is_numpy_number(value):
return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
value.dtype, np.floating) or np.issubdtype(
value.dtype, np.bool)
def is_numpy_number(value):
return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
value.dtype, np.floating)

retained_columns = []
for k in sample_res.keys():
if not is_numpy_number(sample_res[k]):
logger.warning(
f'Data of column {k} is non-numeric, will be removed')
# continue
retained_columns.append(k)
for k in sample_res.keys():
if not is_numpy_number(sample_res[k]):
logger.warning(
f'Data of column {k} is non-numeric, will be removed')
continue
retained_columns.append(k)

return MsIterableDataset(self._hf_ds, preprocessor_list,
retained_columns, columns)
return MsMapDataset(self._hf_ds, preprocessor_list, retained_columns,
columns, to_tensor)

def to_torch_dataset(
self,
@@ -380,6 +377,7 @@ class MsDataset:
preprocessors: Union[Callable, List[Callable]] = None,
task_name: str = None,
task_data_config: ConfigDict = None,
to_tensor: bool = True,
**format_kwargs,
):
"""Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
@@ -387,13 +385,14 @@ class MsDataset:

Args:
preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
every sample of the dataset. The output type of processors is dict, and each numeric field of the dict
every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict
will be used as a field of torch.utils.data.Dataset.
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
the output fields of processors will also be added.
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if
`to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column.
If the `preprocessors` is not None, the output fields of processors will also be added.
task_name (str, default None): task name, refer to :obj:`Tasks` for more details
task_data_config (ConfigDict, default None): config dict for model object.
to_tensor (bool, default None): whether convert the data types of dataset column(s) to torch.tensor or not.
format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

Returns:
@@ -410,7 +409,7 @@ class MsDataset:
return build_task_dataset(task_data_config, task_name)
if preprocessors is not None:
return self.to_torch_dataset_with_processors(
preprocessors, columns=columns)
preprocessors, columns=columns, to_tensor=to_tensor)
else:
self._hf_ds.reset_format()
self._hf_ds.set_format(

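For context, a minimal usage sketch of the new `to_tensor` flag on `MsDataset.to_torch_dataset` and the map-style `MsMapDataset` it now returns. The dataset id, subset, and column names below are illustrative placeholders, not taken from this commit.

```python
from modelscope.msdatasets import MsDataset


def add_text_length(example):
    # toy preprocessor: the fields it returns become dataset columns
    # ('sentence1' is a placeholder field name)
    return {'text_len': len(example['sentence1'])}


# hypothetical dataset id / subset, for illustration only
ds = MsDataset.load('clue', subset_name='afqmc', split='train')
torch_ds = ds.to_torch_dataset(
    preprocessors=add_text_length,
    columns=['label'],
    to_tensor=False,  # keep raw Python values; the default True converts numeric fields to torch.Tensor
)
print(torch_ds[0])    # map-style indexing, since the result is now an MsMapDataset rather than an IterableDataset
```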

modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py (+2, -0)

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from MMDetection, publicly available at
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py
import os.path as osp

import numpy as np


modelscope/msdatasets/task_datasets/movie_scene_segmentation/__init__.py (+1, -0)

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .movie_scene_segmentation_dataset import MovieSceneSegmentationDataset

modelscope/msdatasets/task_datasets/movie_scene_segmentation/movie_scene_segmentation_dataset.py (+2, -3)

@@ -1,6 +1,5 @@
# ---------------------------------------------------------------------------------------------------
# The implementation is built upon BaSSL, publicly available at https://github.com/kakaobrain/bassl
# ---------------------------------------------------------------------------------------------------
# The implementation here is modified based on BaSSL,
# originally Apache 2.0 License and publicly available at https://github.com/kakaobrain/bassl
import copy
import os
import os.path as osp


modelscope/pipelines/audio/kws_farfield_pipeline.py (+22, -19)

@@ -4,6 +4,9 @@ import io
import wave
from typing import Any, Dict

import numpy
import soundfile as sf

from modelscope.fileio import File
from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
@@ -37,7 +40,6 @@ class KWSFarfieldPipeline(Pipeline):
self.model.eval()
frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH
self._nframe = self.model.size_in // frame_size
self.frame_count = 0

def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
if isinstance(inputs, bytes):
@@ -54,35 +56,36 @@ class KWSFarfieldPipeline(Pipeline):
input_file = inputs['input_file']
if isinstance(input_file, str):
input_file = File.read(input_file)
if isinstance(input_file, bytes):
input_file = io.BytesIO(input_file)
self.frame_count = 0
frames, samplerate = sf.read(io.BytesIO(input_file), dtype='int16')
if len(frames.shape) == 1:
frames = numpy.stack((frames, frames, numpy.zeros_like(frames)), 1)

kws_list = []
with wave.open(input_file, 'rb') as fin:
if 'output_file' in inputs:
with wave.open(inputs['output_file'], 'wb') as fout:
fout.setframerate(self.SAMPLE_RATE)
fout.setnchannels(self.OUTPUT_CHANNELS)
fout.setsampwidth(self.SAMPLE_WIDTH)
self._process(fin, kws_list, fout)
else:
self._process(fin, kws_list)
if 'output_file' in inputs:
with wave.open(inputs['output_file'], 'wb') as fout:
fout.setframerate(self.SAMPLE_RATE)
fout.setnchannels(self.OUTPUT_CHANNELS)
fout.setsampwidth(self.SAMPLE_WIDTH)
self._process(frames, kws_list, fout)
else:
self._process(frames, kws_list)
return {OutputKeys.KWS_LIST: kws_list}

def _process(self,
fin: wave.Wave_read,
frames: numpy.ndarray,
kws_list,
fout: wave.Wave_write = None):
data = fin.readframes(self._nframe)
while len(data) >= self.model.size_in:
self.frame_count += self._nframe
for start_index in range(0, frames.shape[0], self._nframe):
end_index = start_index + self._nframe
if end_index > frames.shape[0]:
end_index = frames.shape[0]
data = frames[start_index:end_index, :].tobytes()
result = self.model.forward_decode(data)
if fout:
fout.writeframes(result['pcm'])
if 'kws' in result:
result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE
result['kws']['offset'] += start_index / self.SAMPLE_RATE
kws_list.append(result['kws'])
data = fin.readframes(self._nframe)

def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
return inputs

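A short sketch of exercising the reworked pipeline on a single-channel file, which the soundfile-based reading above now supports by stacking the mono signal into the expected channel layout. The model id and file path are the ones used in this commit's test additions.

```python
import os

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

kws = pipeline(Tasks.keyword_spotting,
               model='damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya')
# mono wav added by this commit; it is expanded internally before decoding
result = kws({'input_file': os.path.join(os.getcwd(), 'data/test/audios/1ch_nihaomiya.wav')})
print(result['kws_list'])
```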
modelscope/pipelines/cv/action_detection_pipeline.py (+2, -0)

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import math
import os.path as osp
from typing import Any, Dict


modelscope/pipelines/cv/easycv_pipelines/base.py (+7, -4)

@@ -10,6 +10,7 @@ from modelscope.hub.snapshot_download import snapshot_download
from modelscope.pipelines.util import is_official_hub_path
from modelscope.utils.config import Config
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.device import create_device


class EasyCVPipeline(object):
@@ -53,16 +54,19 @@ class EasyCVPipeline(object):
), f'Not find "{ModelFile.CONFIGURATION}" in model directory!'

self.cfg = Config.from_file(self.config_file)
self.predict_op = self._build_predict_op()
if 'device' in kwargs:
kwargs['device'] = create_device(kwargs['device'])
self.predict_op = self._build_predict_op(**kwargs)

def _build_predict_op(self):
def _build_predict_op(self, **kwargs):
"""Build EasyCV predictor."""
from easycv.predictors.builder import build_predictor

easycv_config = self._to_easycv_config()
pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
'model_path': self.model_path,
'config_file': easycv_config
'config_file': easycv_config,
**kwargs
})
return pipeline_op

@@ -91,5 +95,4 @@ class EasyCVPipeline(object):
return easycv_config

def __call__(self, inputs) -> Any:
# TODO: support image url
return self.predict_op(inputs)

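A hedged sketch of the new `device` handling: extra keyword arguments now reach `_build_predict_op`, and a `device` string is normalized through `create_device` before being handed to the EasyCV predictor. That `pipeline()` forwards `device` this way for a given EasyCV-backed model is an assumption here; the model id is one used in this commit's tests.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

seg = pipeline(
    task=Tasks.image_segmentation,
    model='damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k',
    device='cpu',  # normalized via create_device() and passed into build_predictor()
)
result = seg('data/test/images/image_segmentation.jpg')
```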
modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py (+7, -5)

@@ -4,7 +4,6 @@ from typing import Any
from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from .base import EasyCVPipeline

@@ -34,8 +33,11 @@ class Face2DKeypointsPipeline(EasyCVPipeline):
return self.predict_op.show_result(img, points, scale, save_path)

def __call__(self, inputs) -> Any:
output = self.predict_op(inputs)[0][0]
points = output['point']
poses = output['pose']
outputs = self.predict_op(inputs)

return {OutputKeys.KEYPOINTS: points, OutputKeys.POSES: poses}
results = [{
OutputKeys.KEYPOINTS: output['point'],
OutputKeys.POSES: output['pose']
} for output in outputs]

return results

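Because the pipeline now returns one result dict per prediction instead of a single dict, callers index into the returned list, mirroring the `test_face_2d_keypoints.py` update further down. The model id and image path below are placeholder assumptions.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# assumed model id and image path, for illustration only
face_kps = pipeline(Tasks.face_2d_keypoints,
                    model='damo/cv_mobilenet_face-2d-keypoints_alignment')
outputs = face_kps('face.jpg')
first = outputs[0]  # previously the pipeline returned this dict directly
print(first[OutputKeys.KEYPOINTS], first[OutputKeys.POSES])
```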
modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py (+3, -2)

@@ -28,7 +28,7 @@ class Hand2DKeypointsPipeline(EasyCVPipeline):
*args,
**kwargs)

def _build_predict_op(self):
def _build_predict_op(self, **kwargs):
"""Build EasyCV predictor."""
from easycv.predictors.builder import build_predictor
detection_predictor_type = self.cfg['DETECTION']['type']
@@ -46,6 +46,7 @@ class Hand2DKeypointsPipeline(EasyCVPipeline):
easycv_config = self._to_easycv_config()
pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
'model_path': self.model_path,
'config_file': easycv_config
'config_file': easycv_config,
**kwargs
})
return pipeline_op

modelscope/pipelines/cv/image_instance_segmentation_pipeline.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, Optional, Union



modelscope/pipelines/cv/image_style_transfer_pipeline.py (+7, -1)

@@ -61,7 +61,13 @@ class ImageStyleTransferPipeline(Pipeline):
def _sanitize_parameters(self, **pipeline_parameters):
return pipeline_parameters, {}, {}

def preprocess(self, content: Input, style: Input) -> Dict[str, Any]:
def preprocess(self,
content: Input,
style: Input = None) -> Dict[str, Any]:
if type(content) is dict: # for demo service
style = content['style']
content = content['content']

content = LoadImage.convert_to_ndarray(content)
if len(content.shape) == 2:
content = cv2.cvtColor(content, cv2.COLOR_GRAY2BGR)

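With this preprocess change, demo-service callers can pass a single dict carrying both images. A minimal sketch; the model id is an assumption and the file paths are placeholders.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

style_transfer = pipeline(Tasks.image_style_transfer,
                          model='damo/cv_aams_style-transfer_damo')  # assumed model id
# a single dict with 'content' and 'style' keys is now accepted by preprocess()
result = style_transfer({'content': 'content.jpg', 'style': 'style.jpg'})
```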

modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict

import torch


modelscope/preprocessors/movie_scene_segmentation/__init__.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule


modelscope/preprocessors/movie_scene_segmentation/transforms.py (+2, -6)

@@ -1,9 +1,5 @@
# ------------------------------------------------------------------------------------
# The codes below partially refer to the BaSSL
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# Github: https://github.com/kakaobrain/bassl
# ------------------------------------------------------------------------------------
# The implementation here is modified based on BaSSL,
# originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl
import numbers
import os.path as osp
import random


modelscope/preprocessors/multi_modal.py (+2, -1)

@@ -186,7 +186,8 @@ class MPlugPreprocessor(Preprocessor):
image = image.convert('RGB')
image = self.patch_resize_transform(image)
question = '' if self.cfg.task == Tasks.image_captioning \
else data[1 if isinstance(data, tuple) else 'question']
else data[1 if isinstance(data, tuple)
else ('text' if 'text' in data else 'question')]
question = self.tokenizer(
question.lower(),
padding='max_length',

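With this change the MPlug preprocessor reads the question from a `text` key when present, falling back to `question`; the matching test updates appear further down. A short sketch using the model id and image from `test_mplug_tasks.py` in this commit:

```python
from PIL import Image

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipeline_vqa = pipeline(Tasks.visual_question_answering,
                        model='damo/mplug_visual-question-answering_coco_large_en')
image = Image.open('data/test/images/image_mplug_vqa.jpg')
# the 'text' key is now accepted in place of 'question'
result = pipeline_vqa({'image': image, 'text': 'What is the woman doing?'})
print(result)
```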

modelscope/trainers/cv/image_instance_segmentation_trainer.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.metainfo import Trainers
from modelscope.trainers.builder import TRAINERS
from modelscope.trainers.trainer import EpochBasedTrainer


modelscope/trainers/cv/movie_scene_segmentation_trainer.py (+1, -0)

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.metainfo import Trainers
from modelscope.trainers.builder import TRAINERS
from modelscope.trainers.trainer import EpochBasedTrainer


modelscope/trainers/trainer.py (+20, -10)

@@ -37,8 +37,8 @@ from modelscope.utils.device import create_device, verify_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.logger import get_logger
from modelscope.utils.registry import build_from_cfg
from modelscope.utils.torch_utils import (get_dist_info, init_dist,
set_random_seed)
from modelscope.utils.torch_utils import (get_dist_info, get_local_rank,
init_dist, set_random_seed)
from .base import BaseTrainer
from .builder import TRAINERS
from .default_config import DEFAULT_CONFIG
@@ -155,8 +155,17 @@ class EpochBasedTrainer(BaseTrainer):
if self.eval_preprocessor is not None:
self.eval_preprocessor.mode = ModeKeys.EVAL

if kwargs.get('launcher', None) is not None:
init_dist(kwargs['launcher'])

_, world_size = get_dist_info()
self._dist = world_size > 1

device_name = kwargs.get('device', 'gpu')
verify_device(device_name)
if self._dist:
local_rank = get_local_rank()
device_name = f'cuda:{local_rank}'

self.device = create_device(device_name)

self.train_dataset = self.to_task_dataset(
@@ -219,11 +228,6 @@ class EpochBasedTrainer(BaseTrainer):

self.use_fp16 = kwargs.get('use_fp16', False)

if kwargs.get('launcher', None) is not None:
init_dist(kwargs['launcher'])

self._dist = get_dist_info()[1] > 1

# model placement
if self.device.type == 'cuda':
self.model.to(self.device)
@@ -532,8 +536,14 @@ class EpochBasedTrainer(BaseTrainer):
model.train()
self._mode = ModeKeys.TRAIN
# call model forward but not __call__ to skip postprocess
if isinstance(inputs,
Mapping) and not func_receive_dict_inputs(model.forward):

if is_parallel(model):
receive_dict_inputs = func_receive_dict_inputs(
model.module.forward)
else:
receive_dict_inputs = func_receive_dict_inputs(model.forward)

if isinstance(inputs, Mapping) and not receive_dict_inputs:
train_outputs = model.forward(**inputs)
else:
train_outputs = model.forward(inputs)

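The same dispatch rule is applied in `multi_gpu_test` below; condensed into a standalone helper it looks roughly like this. The helper name `call_forward` is mine; the utilities are the ones imported in this diff.

```python
from collections.abc import Mapping

from modelscope.trainers.parallel.utils import is_parallel
from modelscope.utils.file_utils import func_receive_dict_inputs


def call_forward(model, inputs):
    # inspect the wrapped module's forward when the model is parallelized
    # (e.g. DistributedDataParallel); otherwise inspect the model's own forward
    forward = model.module.forward if is_parallel(model) else model.forward
    if isinstance(inputs, Mapping) and not func_receive_dict_inputs(forward):
        return model.forward(**inputs)  # unpack the dict into keyword arguments
    return model.forward(inputs)        # pass the whole mapping/batch as one argument
```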

modelscope/trainers/utils/inference.py (+6, -6)

@@ -9,9 +9,9 @@ from collections.abc import Mapping

import torch
from torch import distributed as dist
from torch.nn.parallel import DistributedDataParallel
from tqdm import tqdm

from modelscope.trainers.parallel.utils import is_parallel
from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
@@ -138,7 +138,10 @@ def multi_gpu_test(model,
data_len = data_loader_iters_per_gpu * world_size
desc = 'Total test iterations with multi gpus'

time.sleep(2) # This line can prevent deadlock problem in some cases.
if is_parallel(model):
receive_dict_inputs = func_receive_dict_inputs(model.module.forward)
else:
receive_dict_inputs = func_receive_dict_inputs(model.forward)

count = 0
with tqdm(total=data_len, desc=desc) as pbar:
@@ -146,10 +149,7 @@ def multi_gpu_test(model,
data = to_device(data, device)
data_list.append(data)
with torch.no_grad():
forward_func = model.module.forward if \
isinstance(model, DistributedDataParallel) else model.forward
if isinstance(data, Mapping
) and not func_receive_dict_inputs(forward_func):
if isinstance(data, Mapping) and not receive_dict_inputs:
result = model.forward(**data)
else:
result = model.forward(data)


modelscope/utils/demo_utils.py (+53, -9)

@@ -123,7 +123,7 @@ INPUT_EXAMPLES = {
'urlPaths': {
'outUrls': [{
'outputKey': OutputKeys.OUTPUT_PCM,
'fileType': 'wav'
'fileType': 'pcm'
}]
}
},
@@ -134,7 +134,7 @@ INPUT_EXAMPLES = {
'urlPaths': {
'outUrls': [{
'outputKey': OutputKeys.OUTPUT_PCM,
'fileType': 'wav'
'fileType': 'pcm'
}]
}
},
@@ -147,7 +147,13 @@ INPUT_EXAMPLES = {
'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-grounding/visual_grounding.png',
'a blue turtle-like pokemon with round head'
],
'urlPaths': {}
'urlPaths': {
'inUrls': [{
'name': 'image'
}, {
'name': 'text'
}]
}
},
TasksIODescriptions.visual_question_answering: {
'task':
@@ -156,7 +162,16 @@ INPUT_EXAMPLES = {
'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/visual_question_answering.png',
'what is grown on the plant?'
],
'urlPaths': {}
'urlPaths': {
'inUrls': [{
'name': 'image'
}, {
'name': 'text'
}],
'outUrls': [{
'outputKey': 'text'
}]
}
},
TasksIODescriptions.visual_entailment: {
'task':
@@ -165,7 +180,14 @@ INPUT_EXAMPLES = {
'http://xingchen-data.oss-cn-zhangjiakou.aliyuncs.com/maas/visual-entailment/visual_entailment.jpg',
'there are two birds.', 'test'
],
'urlPaths': {}
'urlPaths': {
'inUrls': [{
'name': 'image'
}, {
'name': 'text'
}],
'outUrls': [{}]
}
},
TasksIODescriptions.generative_multi_modal_embedding: {
'task':
@@ -174,7 +196,14 @@ INPUT_EXAMPLES = {
'http://clip-multimodal.oss-cn-beijing.aliyuncs.com/lingchen/demo/dogs.jpg',
'dogs playing in the grass'
],
'urlPaths': {}
'urlPaths': {
'inUrls': [{
'name': 'image'
}, {
'name': 'text'
}],
'outUrls': [{}]
}
},
}

@@ -192,7 +221,13 @@ class DemoCompatibilityCheck(object):
print('testing demo: ', self.task, self.model_id)
test_pipline = pipeline(self.task, self.model_id)
req = INPUT_EXAMPLES[TASKS_INPUT_TEMPLATES[self.task]]
output = test_pipline(preprocess(req))
inputs = preprocess(req)
params = req.get('parameters', {})
# modelscope inference
if params != {}:
output = test_pipline(inputs, **params)
else:
output = test_pipline(inputs)
json.dumps(output, cls=NumpyEncoder)
result = postprocess(req, output)
print(result)
@@ -215,11 +250,21 @@ class NumpyEncoder(json.JSONEncoder):


def preprocess(req):
in_urls = req.get('urlPaths').get('inUrls')
if len(req['inputs']) == 1:
inputs = req['inputs'][0]
else:
inputs = tuple(req['inputs'])
return inputs
if in_urls is None or len(in_urls) == 0:
return inputs

inputs_dict = {}
for i, in_url in enumerate(in_urls):
input_name = in_url.get('name')
if input_name is None or input_name == '':
return inputs
inputs_dict[input_name] = req['inputs'][i]
return inputs_dict


def postprocess(req, resp):
@@ -242,4 +287,3 @@ def postprocess(req, resp):
out_mem_file = io.BytesIO()
out_mem_file.write(new_resp.get(output_key))
return type(out_mem_file)
# TODO(lingcai.wl): support more file type

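A worked illustration of the updated `preprocess` routing, built from the visual question answering example defined in this file: when the `inUrls` entries carry names, the positional inputs are turned into a dict keyed by those names (assuming `preprocess` is importable as shown).

```python
from modelscope.utils.demo_utils import preprocess

req = {
    'inputs': [
        'http://225252-file.oss-cn-hangzhou-zmf.aliyuncs.com/maas_demo/visual_question_answering.png',
        'what is grown on the plant?'
    ],
    'urlPaths': {'inUrls': [{'name': 'image'}, {'name': 'text'}]},
}
inputs = preprocess(req)
# expected: {'image': '<the image url>', 'text': 'what is grown on the plant?'}
```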
modelscope/utils/torch_utils.py (+4, -0)

@@ -115,6 +115,10 @@ def get_dist_info() -> Tuple[int, int]:
return rank, world_size


def get_local_rank():
return int(os.environ.get('LOCAL_RANK', 0))


def is_master():
rank, _ = get_dist_info()
return rank == 0

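A minimal sketch of how the new helper is consumed: torch's distributed launchers export `LOCAL_RANK` for each process, and the trainer change above uses it to bind every process to its own GPU.

```python
import os

os.environ.setdefault('LOCAL_RANK', '0')  # normally set per process by torchrun / torch.distributed.launch
from modelscope.utils.torch_utils import get_local_rank

device_name = f'cuda:{get_local_rank()}'  # mirrors the EpochBasedTrainer change in this commit
print(device_name)
```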

modelscope/version.py (+1, -1)

@@ -1 +1 @@
__version__ = '0.4.3'
__version__ = '0.4.4'

requirements/cv.txt (+1, -1)

@@ -14,7 +14,7 @@ mmcls>=0.21.0
mmdet>=2.25.0
networkx>=2.5
onnxruntime>=1.10
pai-easycv>=0.6.0
pai-easycv>=0.6.3.4
pandas
psutil
regex


tests/pipelines/easycv_pipelines/test_segmentation_pipeline.py (+40, -19)

@@ -1,10 +1,11 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest
from distutils.version import LooseVersion

import easycv
import numpy as np
from PIL import Image

from modelscope.metainfo import Pipelines
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level
@@ -14,7 +15,7 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase):

img_path = 'data/test/images/image_segmentation.jpg'

def _internal_test__(self, model_id):
def _internal_test_(self, model_id):
img = np.asarray(Image.open(self.img_path))

semantic_seg = pipeline(task=Tasks.image_segmentation, model=model_id)
@@ -24,41 +25,61 @@ class EasyCVSegmentationPipelineTest(unittest.TestCase):

results = outputs[0]
self.assertListEqual(
list(img.shape)[:2], list(results['seg_pred'][0].shape))
self.assertListEqual(results['seg_pred'][0][1, 4:10].tolist(),
[161 for i in range(6)])
self.assertListEqual(results['seg_pred'][0][-1, -10:].tolist(),
[133 for i in range(10)])
list(img.shape)[:2], list(results['seg_pred'].shape))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def _internal_test_batch_(self, model_id, num_samples=2, batch_size=2):
# TODO: support in the future
img = np.asarray(Image.open(self.img_path))
num_samples = num_samples
batch_size = batch_size
semantic_seg = pipeline(
task=Tasks.image_segmentation,
model=model_id,
batch_size=batch_size)
outputs = semantic_seg([self.img_path] * num_samples)

self.assertEqual(semantic_seg.predict_op.batch_size, batch_size)
self.assertEqual(len(outputs), num_samples)

for output in outputs:
self.assertListEqual(
list(img.shape)[:2], list(output['seg_pred'].shape))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_segformer_b0(self):
model_id = 'damo/cv_segformer-b0_image_semantic-segmentation_coco-stuff164k'
self._internal_test__(model_id)
self._internal_test_(model_id)
self._internal_test_batch_(model_id)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_segformer_b1(self):
model_id = 'damo/cv_segformer-b1_image_semantic-segmentation_coco-stuff164k'
self._internal_test__(model_id)
self._internal_test_(model_id)
self._internal_test_batch_(model_id)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_segformer_b2(self):
model_id = 'damo/cv_segformer-b2_image_semantic-segmentation_coco-stuff164k'
self._internal_test__(model_id)
self._internal_test_(model_id)
self._internal_test_batch_(model_id)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_segformer_b3(self):
model_id = 'damo/cv_segformer-b3_image_semantic-segmentation_coco-stuff164k'
self._internal_test__(model_id)
self._internal_test_(model_id)
self._internal_test_batch_(model_id)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_segformer_b4(self):
model_id = 'damo/cv_segformer-b4_image_semantic-segmentation_coco-stuff164k'
self._internal_test__(model_id)
self._internal_test_(model_id)
self._internal_test_batch_(model_id)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_segformer_b5(self):
model_id = 'damo/cv_segformer-b5_image_semantic-segmentation_coco-stuff164k'
self._internal_test__(model_id)
self._internal_test_(model_id)
self._internal_test_batch_(model_id)


if __name__ == '__main__':


tests/pipelines/test_face_2d_keypoints.py (+1, -1)

@@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase):

face_2d_keypoints_align = pipeline(
task=Tasks.face_2d_keypoints, model=model_id)
output = face_2d_keypoints_align(img_path)
output = face_2d_keypoints_align(img_path)[0]

output_keypoints = output[OutputKeys.KEYPOINTS]
output_pose = output[OutputKeys.POSES]


tests/pipelines/test_key_word_spotting_farfield.py (+11, -0)

@@ -8,6 +8,7 @@ from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level

TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav'
TEST_SPEECH_FILE_MONO = 'data/test/audios/1ch_nihaomiya.wav'
TEST_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \
'speech_dfsmn_kws_char_farfield_16k_nihaomiya/repo' \
'?Revision=master&FilePath=examples/3ch_nihaomiya.wav'
@@ -26,6 +27,16 @@ class KWSFarfieldTest(unittest.TestCase):
self.assertEqual(len(result['kws_list']), 5)
print(result['kws_list'][-1])

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_mono(self):
kws = pipeline(Tasks.keyword_spotting, model=self.model_id)
inputs = {
'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO)
}
result = kws(inputs)
self.assertEqual(len(result['kws_list']), 5)
print(result['kws_list'][-1])

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_url(self):
kws = pipeline(Tasks.keyword_spotting, model=self.model_id)


tests/pipelines/test_mplug_tasks.py (+8, -8)

@@ -44,8 +44,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
'damo/mplug_visual-question-answering_coco_large_en')
pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model)
image = Image.open('data/test/images/image_mplug_vqa.jpg')
question = 'What is the woman doing?'
input = {'image': image, 'question': question}
text = 'What is the woman doing?'
input = {'image': image, 'text': text}
result = pipeline_vqa(input)
print(result)

@@ -54,8 +54,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
model = 'damo/mplug_visual-question-answering_coco_large_en'
pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model)
image = Image.open('data/test/images/image_mplug_vqa.jpg')
question = 'What is the woman doing?'
input = {'image': image, 'question': question}
text = 'What is the woman doing?'
input = {'image': image, 'text': text}
result = pipeline_vqa(input)
print(result)

@@ -65,8 +65,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
'damo/mplug_image-text-retrieval_flickr30k_large_en')
pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
image = Image.open('data/test/images/image-text-retrieval.jpg')
question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
input = {'image': image, 'question': question}
text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
input = {'image': image, 'text': text}
result = pipeline_retrieval(input)
print(result)

@@ -75,8 +75,8 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck):
model = 'damo/mplug_image-text-retrieval_flickr30k_large_en'
pipeline_retrieval = pipeline(Tasks.image_text_retrieval, model=model)
image = Image.open('data/test/images/image-text-retrieval.jpg')
question = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
input = {'image': image, 'question': question}
text = 'Two young guys with shaggy hair look at their hands while hanging out in the yard.'
input = {'image': image, 'text': text}
result = pipeline_retrieval(input)
print(result)



tests/pipelines/test_ofa_tasks.py (+6, -4)

@@ -147,8 +147,10 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
result = ofa_pipe(input)
print(result)
image_name = image.split('/')[-2]
self.save_img(image, result[OutputKeys.BOXES],
osp.join('large_en_model_' + image_name + '.png'))
self.save_img(
image,
result[OutputKeys.BOXES][0], # just one box
osp.join('large_en_model_' + image_name + '.png'))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_visual_grounding_with_name(self):
@@ -161,7 +163,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
result = ofa_pipe(input)
print(result)
image_name = image.split('/')[-2]
self.save_img(image, result[OutputKeys.BOXES],
self.save_img(image, result[OutputKeys.BOXES][0],
osp.join('large_en_name_' + image_name + '.png'))

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
@@ -174,7 +176,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
result = ofa_pipe(input)
print(result)
image_name = image.split('/')[-1]
self.save_img(image, result[OutputKeys.BOXES],
self.save_img(image, result[OutputKeys.BOXES][0],
osp.join('large_zh_name_' + image_name))

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')


tests/run_config.yaml (+1, -0)

@@ -9,6 +9,7 @@ isolated: # test cases that may require excessive anmount of GPU memory, which
- test_image_super_resolution.py
- test_easycv_trainer.py
- test_segformer.py
- test_segmentation_pipeline.py

envs:
default: # default env, case not in other env will in default, pytorch.


tests/trainers/test_trainer_gpu.py (+32, -2)

@@ -53,7 +53,18 @@ class DummyModel(nn.Module, Model):
return dict(logits=x, loss=loss)


def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs):
class DummyModelForwardInputs(DummyModel):

def forward(self, inputs):
feat, labels = inputs['feat'], inputs['labels']
return super().forward(feat, labels)


def train_func(work_dir,
dist=False,
iterable_dataset=False,
forward_inputs=False,
**kwargs):
json_cfg = {
'task': Tasks.image_classification,
'train': {
@@ -81,7 +92,10 @@ def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs):
with open(config_path, 'w') as f:
json.dump(json_cfg, f)

model = DummyModel()
if forward_inputs:
model = DummyModelForwardInputs()
else:
model = DummyModel()
optimmizer = SGD(model.parameters(), lr=0.01)
lr_scheduler = StepLR(optimmizer, 2)
trainer_name = Trainers.default
@@ -273,6 +287,22 @@ class TrainerTestMultiGpus(DistributedTestCase):
for i in [1, 3, 5]:
self.assertIn(MetricKeys.ACCURACY, lines[i])

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_multi_gpus_forward_inputs(self):
self.start(
train_func,
num_gpus=2,
work_dir=self.tmp_dir,
dist=True,
forward_inputs=True)

results_files = os.listdir(self.tmp_dir)
json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json'))
self.assertEqual(len(json_files), 1)
self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files)
self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files)
self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files)

# TODO: support iters_per_epoch for dist mode
@unittest.skipIf(True, 'need to adapt to DistributedSampler')
def test_multi_gpus_with_iters_per_epoch(self):

