# Conflicts:
#	modelscope/metrics/__init__.py
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8d653a9a1ee49789c3df38e8da96af7118e0d8336d6ed12cd6458efa015071d
size 2327764
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c589d77404ea17d4d24daeb8624dce7e1ac919dc75e6bed44ea9d116f0514150
size 68524
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76bf84536edbaf192a8a699efc62ba2b06056bac12c426ecfcc2e003d91fbd32
size 53219
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ecbc9d0827cfb92e93e7d75868b1724142685dc20d3b32023c3c657a7b688a9c
size 254845
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d510ab26ddc58ffea882c8ef850c1f9bd4444772f2bce7ebea3e76944536c3ae
size 48909
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2c1119e3d521cf2e583b1e85fc9c9afd1d44954b433135039a98050a730932d
size 1127557
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46db348eae61448f1668ce282caec21375e96c3268d53da44aa67ec32cbf4fa5
size 2747938
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:709c1828ed2d56badf2f19a40194da9a5e5e6db2fb73ef55d047407f49bc7a15
size 27616
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:379e11d7fc3734d3ec95afd0d86460b4653fbf4bb1f57f993610d6a6fd30fd3d
size 1702339
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dec0fbb931cb609bf481e56b89cd2fbbab79839f22832c3bbe69a8fae2769cdd
size 167407
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572
size 60801
oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c
size 61239
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c
size 60801
oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1
size 61115
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a49c9bc74a60860c360a4bf4509fe9db915279aaabd953f354f2c38e9be1e6cb
size 2924691
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f58df1d25590c158ae0a04b3999bd44b610cdaddb17d78afd84c34b3f00d4e87
size 4068783
@@ -76,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
ENV SHELL=/bin/bash
# install special package
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl
RUN if [ "$USE_GPU" = "True" ] ; then \
    pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
@@ -24,20 +24,17 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                       DownloadMode)
from modelscope.utils.logger import get_logger
from .errors import (InvalidParameter, NotExistError, RequestError,
                     datahub_raise_on_error, handle_http_response, is_ok,
                     raise_on_error)
from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
                          model_id_to_group_owner_name)
                     datahub_raise_on_error, handle_http_post_error,
                     handle_http_response, is_ok, raise_on_error)
from .utils.utils import get_endpoint, model_id_to_group_owner_name

logger = get_logger()


class HubApi:

    def __init__(self, endpoint=None, dataset_endpoint=None):
    def __init__(self, endpoint=None):
        self.endpoint = endpoint if endpoint is not None else get_endpoint()
        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
        )

    def login(
        self,
@@ -105,17 +102,15 @@ class HubApi:
        path = f'{self.endpoint}/api/v1/models'
        owner_or_group, name = model_id_to_group_owner_name(model_id)
        r = requests.post(
            path,
            json={
                'Path': owner_or_group,
                'Name': name,
                'ChineseName': chinese_name,
                'Visibility': visibility,  # server check
                'License': license
            },
            cookies=cookies)
        r.raise_for_status()
        body = {
            'Path': owner_or_group,
            'Name': name,
            'ChineseName': chinese_name,
            'Visibility': visibility,  # server check
            'License': license
        }
        r = requests.post(path, json=body, cookies=cookies)
        handle_http_post_error(r, path, body)
        raise_on_error(r.json())
        model_repo_url = f'{get_endpoint()}/{model_id}'
        return model_repo_url
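For orientation, a minimal sketch of calling the refactored create_model (the token and ids are placeholders, and login is assumed to take the SDK access token from the ModelScope website):

from modelscope.hub.api import HubApi

api = HubApi()
api.login('your-sdk-token')  # placeholder token
# visibility: 1 = private, 5 = public; the server validates both fields
url = api.create_model(
    model_id='my_group/my_model',
    visibility=5,
    license='Apache License 2.0',
    chinese_name='示例模型')
print(url)  # e.g. https://www.modelscope.cn/my_group/my_model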
@@ -290,7 +285,7 @@
        return files

    def list_datasets(self):
        path = f'{self.dataset_endpoint}/api/v1/datasets'
        path = f'{self.endpoint}/api/v1/datasets'
        headers = None
        params = {}
        r = requests.get(path, params=params, headers=headers)
@@ -317,13 +312,13 @@
                cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        dataset_id = resp['Data']['Id']
        dataset_type = resp['Data']['Type']
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
@@ -341,7 +336,7 @@
            file_path = file_info['Path']
            extension = os.path.splitext(file_path)[-1]
            if extension in dataset_meta_format:
                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                              f'Revision={revision}&FilePath={file_path}'
                r = requests.get(datahub_url)
                r.raise_for_status()
@@ -365,7 +360,7 @@
                              namespace: str,
                              revision: Optional[str] = DEFAULT_DATASET_REVISION):
        if file_name.endswith('.csv'):
            file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
            file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                        f'Revision={revision}&FilePath={file_name}'
        return file_name

@@ -374,7 +369,7 @@
                              dataset_name: str,
                              namespace: str,
                              revision: Optional[str] = DEFAULT_DATASET_REVISION):
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                      f'ststoken?Revision={revision}'
        return self.datahub_remote_call(datahub_url)

@@ -385,7 +380,7 @@
                                   namespace: str,
                                   revision: Optional[str] = DEFAULT_DATASET_REVISION):
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                      f'ststoken?Revision={revision}'

        cookies = requests.utils.dict_from_cookiejar(cookies)
@@ -394,6 +389,19 @@
        raise_on_error(resp)
        return resp['Data']

    def list_oss_dataset_objects(self, dataset_name, namespace, max_limit,
                                 is_recursive, is_filter_dir, revision,
                                 cookies):
        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \
              f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'
        cookies = requests.utils.dict_from_cookiejar(cookies)
        resp = requests.get(url=url, cookies=cookies)
        resp = resp.json()
        raise_on_error(resp)
        resp = resp['Data']
        return resp

    def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
        r = requests.post(url)
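A minimal usage sketch for the new list_oss_dataset_objects (argument values are illustrative; the cookies come from a prior login, retrieved via ModelScopeConfig as in the push_to_hub module below):

from modelscope.hub.api import HubApi, ModelScopeConfig

api = HubApi()
cookies = ModelScopeConfig.get_cookies()  # None unless login() was called first
objects = api.list_oss_dataset_objects(
    dataset_name='my_dataset',
    namespace='my_group',
    max_limit=100,
    is_recursive=True,
    is_filter_dir=True,
    revision='master',
    cookies=cookies)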
@@ -4,6 +4,10 @@ from http import HTTPStatus
from requests.exceptions import HTTPError

from modelscope.utils.logger import get_logger

logger = get_logger()


class NotExistError(Exception):
    pass
@@ -45,15 +49,24 @@ def is_ok(rsp):
    return rsp['Code'] == HTTPStatus.OK and rsp['Success']


def handle_http_post_error(response, url, request_body):
    try:
        response.raise_for_status()
    except HTTPError as error:
        logger.error('Request %s with body: %s exception' %
                     (url, request_body))
        raise error


def handle_http_response(response, logger, cookies, model_id):
    try:
        response.raise_for_status()
    except HTTPError:
    except HTTPError as error:
        if cookies is None:  # code in [403] and
            logger.error(
                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
private. Please login first.')
        raise
        raise error


def raise_on_error(rsp):
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import re
import subprocess
from typing import List
from xmlrpc.client import Boolean
@@ -138,8 +139,8 @@ class GitCommandWrapper(metaclass=Singleton):
                repo_base_dir, repo_name, user_name)
            response = self._run_git_command(*config_user_name_args.split(' '))
            logger.debug(response.stdout.decode('utf8'))
            config_user_email_args = '-C %s/%s config user.name %s' % (
                repo_base_dir, repo_name, user_name)
            config_user_email_args = '-C %s/%s config user.email %s' % (
                repo_base_dir, repo_name, user_email)
            response = self._run_git_command(
                *config_user_email_args.split(' '))
            logger.debug(response.stdout.decode('utf8'))
@@ -177,6 +178,15 @@ class GitCommandWrapper(metaclass=Singleton):
        cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision]
        return self._run_git_command(*cmds)

    def get_remote_branches(self, repo_dir: str):
        cmds = ['-C', '%s' % repo_dir, 'branch', '-r']
        rsp = self._run_git_command(*cmds)
        info = [
            line.strip()
            for line in rsp.stdout.decode('utf8').strip().split(os.linesep)
        ][1:]
        return ['/'.join(line.split('/')[1:]) for line in info]

    def pull(self, repo_dir: str):
        cmds = ['-C', repo_dir, 'pull']
        return self._run_git_command(*cmds)
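To make the get_remote_branches parsing concrete, a sketch of what it does to typical `git branch -r` output (the sample output is illustrative; the real code splits on os.linesep):

stdout = '''  origin/HEAD -> origin/master
  origin/master
  origin/v1.0'''
info = [line.strip() for line in stdout.strip().split('\n')][1:]  # drop the HEAD alias line
print(['/'.join(line.split('/')[1:]) for line in info])  # ['master', 'v1.0']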
@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import datetime
import os
import shutil
import tempfile
import uuid
from typing import Dict, Optional
from uuid import uuid4

from filelock import FileLock

from modelscope import __version__
from modelscope.hub.api import HubApi, ModelScopeConfig
from modelscope.hub.errors import InvalidParameter, NotLoginException
from modelscope.hub.git import GitCommandWrapper
from modelscope.hub.repository import Repository
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger()
def upload_folder(model_id: str,
                  model_dir: str,
                  visibility: int = 0,
                  license: str = None,
                  chinese_name: Optional[str] = None,
                  commit_message: Optional[str] = None,
                  revision: Optional[str] = DEFAULT_MODEL_REVISION):
    """
    Upload a model from the given directory to the given repository. A valid
    model directory must contain a configuration.json file.

    This function uploads the files in the given directory to the given
    repository. If the repository does not exist on the remote, it is created
    automatically with the given visibility, license and chinese_name
    parameters. If the revision does not exist in the remote repository, a new
    branch is created for it.

    HubApi's login must be called with a valid token, which can be obtained
    from ModelScope's website, before calling this function.

    Args:
        model_id (`str`):
            The model id to upload to; the caller must have write permission for it.
        model_dir(`str`):
            The absolute path of the finetune result.
        visibility(`int`, defaults to `0`):
            Visibility of the newly created model (1: private, 5: public). If the
            model does not exist on ModelScope, this function creates it with this
            visibility and the parameter is required. You can ignore this parameter
            if you are sure the model already exists.
        license(`str`, defaults to `None`):
            License of the newly created model (see License). If the model does not
            exist on ModelScope, this function creates it with this license and the
            parameter is required. You can ignore this parameter if you are sure
            the model already exists.
        chinese_name(`str`, *optional*, defaults to `None`):
            Chinese name of the newly created model.
        commit_message(`str`, *optional*, defaults to `None`):
            Commit message of the push request.
        revision (`str`, *optional*, defaults to DEFAULT_MODEL_REVISION):
            Which branch to push to. If the branch does not exist, a new branch is
            created and pushed to.
    """
    if model_id is None:
        raise InvalidParameter('model_id cannot be empty!')
    if model_dir is None:
        raise InvalidParameter('model_dir cannot be empty!')
    if not os.path.exists(model_dir) or os.path.isfile(model_dir):
        raise InvalidParameter('model_dir must be a valid directory.')
    cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
    if not os.path.exists(cfg_file):
        raise ValueError(f'{model_dir} must contain a configuration.json.')
    cookies = ModelScopeConfig.get_cookies()
    if cookies is None:
        raise NotLoginException('Must login before upload!')
    files_to_save = os.listdir(model_dir)
    api = HubApi()
    try:
        api.get_model(model_id=model_id)
    except Exception:
        if visibility is None or license is None:
            raise InvalidParameter(
                'visibility and license cannot be empty if want to create new repo'
            )
        logger.info('Create new model %s' % model_id)
        api.create_model(
            model_id=model_id,
            visibility=visibility,
            license=license,
            chinese_name=chinese_name)
    tmp_dir = tempfile.mkdtemp()
    git_wrapper = GitCommandWrapper()
    try:
        repo = Repository(model_dir=tmp_dir, clone_from=model_id)
        branches = git_wrapper.get_remote_branches(tmp_dir)
        if revision not in branches:
            logger.info('Create new branch %s' % revision)
            git_wrapper.new_branch(tmp_dir, revision)
        git_wrapper.checkout(tmp_dir, revision)
        for f in files_to_save:
            if f[0] != '.':
                src = os.path.join(model_dir, f)
                if os.path.isdir(src):
                    shutil.copytree(src, os.path.join(tmp_dir, f))
                else:
                    shutil.copy(src, tmp_dir)
        if not commit_message:
            date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
            commit_message = '[automsg] push model %s to hub at %s' % (
                model_id, date)
        repo.push(commit_message=commit_message, branch=revision)
    except Exception:
        raise
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
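A minimal usage sketch of upload_folder (the module path and argument values are assumptions for illustration; the token placeholder comes from the ModelScope website):

from modelscope.hub.api import HubApi
from modelscope.hub.push_to_hub import upload_folder  # assumed module path

HubApi().login('your-sdk-token')  # must happen before upload_folder
upload_folder(
    model_id='my_group/my_model',
    model_dir='/abs/path/to/finetune_output',  # must contain configuration.json
    visibility=5,  # 1 = private, 5 = public; used only if the repo is created
    license='Apache License 2.0',
    revision='v1.0')  # the branch is created on the remote if missing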
@@ -4,8 +4,7 @@ import hashlib
import os
from typing import Optional

from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
                                      DEFAULT_MODELSCOPE_DOMAIN,
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
                                      DEFAULT_MODELSCOPE_GROUP,
                                      MODEL_ID_SEPARATOR,
                                      MODELSCOPE_URL_SCHEME)
@@ -44,11 +43,6 @@ def get_endpoint():
    return MODELSCOPE_URL_SCHEME + modelscope_domain


def get_dataset_hub_endpoint():
    return os.environ.get('HUB_DATASET_ENDPOINT',
                          DEFAULT_MODELSCOPE_DATA_ENDPOINT)


def compute_hash(file_path):
    BUFFER_SIZE = 1024 * 64  # 64k buffer size
    sha256_hash = hashlib.sha256()
@@ -14,6 +14,7 @@ class Models(object):
    # vision models
    detection = 'detection'
    realtime_object_detection = 'realtime-object-detection'
    realtime_video_object_detection = 'realtime-video-object-detection'
    scrfd = 'scrfd'
    classification_model = 'ClassificationModel'
    nafnet = 'nafnet'
@@ -27,11 +28,13 @@ class Models(object):
    face_2d_keypoints = 'face-2d-keypoints'
    panoptic_segmentation = 'swinL-panoptic-segmentation'
    image_reid_person = 'passvitb'
    image_inpainting = 'FFTInpainting'
    video_summarization = 'pgl-video-summarization'
    swinL_semantic_segmentation = 'swinL-semantic-segmentation'
    vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
    text_driven_segmentation = 'text-driven-segmentation'
    resnet50_bert = 'resnet50-bert'
    referring_video_object_segmentation = 'swinT-referring-video-object-segmentation'
    fer = 'fer'
    retinaface = 'retinaface'
    shop_segmentation = 'shop-segmentation'
@@ -39,14 +42,18 @@ class Models(object):
    mtcnn = 'mtcnn'
    ulfd = 'ulfd'
    video_inpainting = 'video-inpainting'
    human_wholebody_keypoint = 'human-wholebody-keypoint'
    hand_static = 'hand-static'
    face_human_hand_detection = 'face-human-hand-detection'
    face_emotion = 'face-emotion'
    product_segmentation = 'product-segmentation'
    image_body_reshaping = 'image-body-reshaping'

    # EasyCV models
    yolox = 'YOLOX'
    segformer = 'Segformer'
    hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
    image_object_detection_auto = 'image-object-detection-auto'

    # nlp models
    bert = 'bert'
@@ -66,6 +73,7 @@ class Models(object):
    gcnncrf = 'gcnn-crf'
    bart = 'bart'
    gpt3 = 'gpt3'
    gpt_neo = 'gpt-neo'
    plug = 'plug'
    bert_for_ds = 'bert-for-document-segmentation'
    ponet = 'ponet'
@@ -96,6 +104,7 @@ class TaskModels(object):
    information_extraction = 'information-extraction'
    fill_mask = 'fill-mask'
    feature_extraction = 'feature-extraction'
    text_generation = 'text-generation'


class Heads(object):
@@ -111,6 +120,8 @@ class Heads(object):
    token_classification = 'token-classification'
    # extraction
    information_extraction = 'information-extraction'
    # text gen
    text_generation = 'text-generation'


class Pipelines(object):
@@ -144,6 +155,7 @@ class Pipelines(object):
    salient_detection = 'u2net-salient-detection'
    image_classification = 'image-classification'
    face_detection = 'resnet-face-detection-scrfd10gkps'
    card_detection = 'resnet-card-detection-scrfd34gkps'
    ulfd_face_detection = 'manual-face-detection-ulfd'
    facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
    retina_face_detection = 'resnet50-face-detection-retinaface'
@@ -160,6 +172,7 @@ class Pipelines(object):
    face_image_generation = 'gan-face-image-generation'
    product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
    realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
    realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo'
    face_recognition = 'ir101-face-recognition-cfglint'
    image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
    image2image_translation = 'image-to-image-translation'
@@ -168,6 +181,7 @@ class Pipelines(object):
    ocr_recognition = 'convnextTiny-ocr-recognition'
    image_portrait_enhancement = 'gpen-image-portrait-enhancement'
    image_to_image_generation = 'image-to-image-generation'
    image_object_detection_auto = 'yolox_image-object-detection-auto'
    skin_retouching = 'unet-skin-retouching'
    tinynas_classification = 'tinynas-classification'
    tinynas_detection = 'tinynas-detection'
@@ -178,15 +192,19 @@ class Pipelines(object):
    video_summarization = 'googlenet_pgl_video_summarization'
    image_semantic_segmentation = 'image-semantic-segmentation'
    image_reid_person = 'passvitb-image-reid-person'
    image_inpainting = 'fft-inpainting'
    text_driven_segmentation = 'text-driven-segmentation'
    movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
    shop_segmentation = 'shop-segmentation'
    video_inpainting = 'video-inpainting'
    human_wholebody_keypoint = 'hrnetw48_human-wholebody-keypoint_image'
    pst_action_recognition = 'patchshift-action-recognition'
    hand_static = 'hand-static'
    face_human_hand_detection = 'face-human-hand-detection'
    face_emotion = 'face-emotion'
    product_segmentation = 'product-segmentation'
    image_body_reshaping = 'flow-based-body-reshaping'
    referring_video_object_segmentation = 'referring-video-object-segmentation'

    # nlp tasks
    automatic_post_editing = 'automatic-post-editing'
@@ -211,6 +229,7 @@ class Pipelines(object):
    zero_shot_classification = 'zero-shot-classification'
    text_error_correction = 'text-error-correction'
    plug_generation = 'plug-generation'
    gpt3_generation = 'gpt3-generation'
    faq_question_answering = 'faq-question-answering'
    conversational_text_to_sql = 'conversational-text-to-sql'
    table_question_answering_pipeline = 'table-question-answering-pipeline'
@@ -219,6 +238,9 @@ class Pipelines(object):
    relation_extraction = 'relation-extraction'
    document_segmentation = 'document-segmentation'
    feature_extraction = 'feature-extraction'
    translation_en_to_de = 'translation_en_to_de'  # keep it underscore
    translation_en_to_ro = 'translation_en_to_ro'  # keep it underscore
    translation_en_to_fr = 'translation_en_to_fr'  # keep it underscore

    # audio tasks
    sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -263,6 +285,9 @@ class Trainers(object):
    image_portrait_enhancement = 'image-portrait-enhancement'
    video_summarization = 'video-summarization'
    movie_scene_segmentation = 'movie-scene-segmentation'
    face_detection_scrfd = 'face-detection-scrfd'
    card_detection_scrfd = 'card-detection-scrfd'
    image_inpainting = 'image-inpainting'

    # nlp trainers
    bert_sentiment_analysis = 'bert-sentiment-analysis'
@@ -274,6 +299,7 @@ class Trainers(object):
    # audio trainers
    speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
    speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'


class Preprocessors(object):
@@ -302,6 +328,8 @@ class Preprocessors(object):
    bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
    text_gen_tokenizer = 'text-gen-tokenizer'
    text2text_gen_preprocessor = 'text2text-gen-preprocessor'
    text_gen_jieba_tokenizer = 'text-gen-jieba-tokenizer'
    text2text_translate_preprocessor = 'text2text-translate-preprocessor'
    token_cls_tokenizer = 'token-cls-tokenizer'
    ner_tokenizer = 'ner-tokenizer'
    nli_tokenizer = 'nli-tokenizer'
@@ -324,6 +352,7 @@ class Preprocessors(object):
    re_tokenizer = 're-tokenizer'
    document_segmentation = 'document-segmentation'
    feature_extraction = 'feature-extraction'
    sentence_piece = 'sentence-piece'

    # audio preprocessor
    linear_aec_fbank = 'linear-aec-fbank'
@@ -365,6 +394,8 @@ class Metrics(object):
    video_summarization_metric = 'video-summarization-metric'
    # metric for movie-scene-segmentation task
    movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
    # metric for inpainting task
    image_inpainting_metric = 'image-inpainting-metric'


class Optimizers(object):
@@ -406,6 +437,9 @@ class Hooks(object):
    IterTimerHook = 'IterTimerHook'
    EvaluationHook = 'EvaluationHook'

    # Compression
    SparsityHook = 'SparsityHook'


class LR_Schedulers(object):
    """learning rate scheduler is defined here

@@ -421,6 +455,8 @@ class Datasets(object):
    """
    ClsDataset = 'ClsDataset'
    Face2dKeypointsDataset = 'Face2dKeypointsDataset'
    HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
    HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset'
    SegDataset = 'SegDataset'
    DetDataset = 'DetDataset'
    DetImagesMixDataset = 'DetImagesMixDataset'
@@ -19,6 +19,7 @@ if TYPE_CHECKING:
    from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric
    from .accuracy_metric import AccuracyMetric
    from .bleu_metric import BleuMetric
    from .image_inpainting_metric import ImageInpaintingMetric

else:
    _import_structure = {
@@ -36,6 +37,7 @@ else:
        'token_classification_metric': ['TokenClassificationMetric'],
        'video_summarization_metric': ['VideoSummarizationMetric'],
        'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
        'image_inpainting_metric': ['ImageInpaintingMetric'],
        'accuracy_metric': ['AccuracyMetric'],
        'bleu_metric': ['BleuMetric'],
    }
@@ -35,6 +35,8 @@ class AudioNoiseMetric(Metric):
        total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr
        return {
            'total_loss': total_loss.item(),
            'avg_sisnr': avg_sisnr.item(),
            # the model uses the negative of SiSNR as a calculation shortcut;
            # revert it in the evaluation result
            'avg_sisnr': -avg_sisnr.item(),
            MetricKeys.AVERAGE_LOSS: avg_loss.item()
        }
@@ -18,6 +18,7 @@ class MetricKeys(object):
    SSIM = 'ssim'
    AVERAGE_LOSS = 'avg_loss'
    FScore = 'fscore'
    FID = 'fid'
    BLEU_1 = 'bleu-1'
    BLEU_4 = 'bleu-4'
    ROUGE_1 = 'rouge-1'
@@ -39,6 +40,7 @@ task_default_metrics = {
    Tasks.image_captioning: [Metrics.text_gen_metric],
    Tasks.visual_question_answering: [Metrics.text_gen_metric],
    Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
    Tasks.image_inpainting: [Metrics.image_inpainting_metric],
}
@@ -1,12 +1,16 @@
# ------------------------------------------------------------------------
# Copyright (c) Alibaba, Inc. and its affiliates.
# ------------------------------------------------------------------------
# modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/metrics/psnr_ssim.py
# ------------------------------------------------------------------------
from typing import Dict

import cv2
import numpy as np
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
import torch

from modelscope.metainfo import Metrics
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
                                           torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys
@@ -20,26 +24,249 @@ class ImageDenoiseMetric(Metric):
    label_name = 'target'

    def __init__(self):
        super(ImageDenoiseMetric, self).__init__()
        self.preds = []
        self.labels = []

    def add(self, outputs: Dict, inputs: Dict):
        ground_truths = outputs[ImageDenoiseMetric.label_name]
        eval_results = outputs[ImageDenoiseMetric.pred_name]
        self.preds.append(
            torch_nested_numpify(torch_nested_detach(eval_results)))
        self.labels.append(
            torch_nested_numpify(torch_nested_detach(ground_truths)))
        self.preds.append(eval_results)
        self.labels.append(ground_truths)

    def evaluate(self):
        psnr_list, ssim_list = [], []
        for (pred, label) in zip(self.preds, self.labels):
            psnr_list.append(
                peak_signal_noise_ratio(label[0], pred[0], data_range=255))
            ssim_list.append(
                structural_similarity(
                    label[0], pred[0], multichannel=True, data_range=255))
            psnr_list.append(calculate_psnr(label[0], pred[0], crop_border=0))
            ssim_list.append(calculate_ssim(label[0], pred[0], crop_border=0))
        return {
            MetricKeys.PSNR: np.mean(psnr_list),
            MetricKeys.SSIM: np.mean(ssim_list)
        }
def reorder_image(img, input_order='HWC'):
    """Reorder images to 'HWC' order.

    If the input_order is (h, w), return (h, w, 1);
    If the input_order is (c, h, w), return (h, w, c);
    If the input_order is (h, w, c), return as it is.

    Args:
        img (ndarray): Input image.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            If the input image shape is (h, w), input_order will not have
            effects. Default: 'HWC'.

    Returns:
        ndarray: reordered image.
    """
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'"
        )
    if len(img.shape) == 2:
        img = img[..., None]
    if input_order == 'CHW':
        img = img.transpose(1, 2, 0)
    return img
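A quick self-check sketch of reorder_image on the two supported orders:

import numpy as np

chw = np.zeros((3, 32, 48))
print(reorder_image(chw, input_order='CHW').shape)  # (32, 48, 3)
print(reorder_image(np.zeros((32, 48))).shape)  # (32, 48, 1), channel axis added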
def calculate_psnr(img1, img2, crop_border, input_order='HWC'):
    """Calculate PSNR (Peak Signal-to-Noise Ratio).

    Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio

    Args:
        img1 (ndarray/tensor): Images with range [0, 255]/[0, 1].
        img2 (ndarray/tensor): Images with range [0, 255]/[0, 1].
        crop_border (int): Cropped pixels in each edge of an image. These
            pixels are not involved in the PSNR calculation.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            Default: 'HWC'.

    Returns:
        float: psnr result.
    """
    assert img1.shape == img2.shape, (
        f'Image shapes are different: {img1.shape}, {img2.shape}.')
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f'Wrong input_order {input_order}. Supported input_orders are '
            '"HWC" and "CHW"')
    if type(img1) == torch.Tensor:
        if len(img1.shape) == 4:
            img1 = img1.squeeze(0)
        img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
    if type(img2) == torch.Tensor:
        if len(img2.shape) == 4:
            img2 = img2.squeeze(0)
        img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)
    img1 = reorder_image(img1, input_order=input_order)
    img2 = reorder_image(img2, input_order=input_order)
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
    if crop_border != 0:
        img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
        img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]

    def _psnr(img1, img2):
        mse = np.mean((img1 - img2)**2)
        if mse == 0:
            return float('inf')
        max_value = 1. if img1.max() <= 1 else 255.
        return 20. * np.log10(max_value / np.sqrt(mse))

    return _psnr(img1, img2)
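The helper implements the usual definition PSNR = 20 * log10(MAX / sqrt(MSE)), with MAX inferred from the data range. A small illustrative check:

import numpy as np

a = np.full((8, 8, 3), 100, dtype=np.float64)
b = a + 10  # constant offset, so MSE = 100
print(calculate_psnr(a, b, crop_border=0))  # 20 * log10(255 / 10) ≈ 28.13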
def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True):
    """Calculate SSIM (structural similarity).

    Ref:
    Image quality assessment: From error visibility to structural similarity

    The results are the same as that of the official released MATLAB code in
    https://ece.uwaterloo.ca/~z70wang/research/ssim/.
    For three-channel images, SSIM is calculated for each channel and then
    averaged.

    Args:
        img1 (ndarray): Images with range [0, 255].
        img2 (ndarray): Images with range [0, 255].
        crop_border (int): Cropped pixels in each edge of an image. These
            pixels are not involved in the SSIM calculation.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            Default: 'HWC'.

    Returns:
        float: ssim result.
    """
    assert img1.shape == img2.shape, (
        f'Image shapes are different: {img1.shape}, {img2.shape}.')
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f'Wrong input_order {input_order}. Supported input_orders are '
            '"HWC" and "CHW"')
    if type(img1) == torch.Tensor:
        if len(img1.shape) == 4:
            img1 = img1.squeeze(0)
        img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
    if type(img2) == torch.Tensor:
        if len(img2.shape) == 4:
            img2 = img2.squeeze(0)
        img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)
    img1 = reorder_image(img1, input_order=input_order)
    img2 = reorder_image(img2, input_order=input_order)
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
    if crop_border != 0:
        img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
        img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]

    def _cal_ssim(img1, img2):
        ssims = []
        max_value = 1 if img1.max() <= 1 else 255
        with torch.no_grad():
            final_ssim = _ssim_3d(img1, img2, max_value) if ssim3d else _ssim(
                img1, img2, max_value)
            ssims.append(final_ssim)
        return np.array(ssims).mean()

    return _cal_ssim(img1, img2)
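A small self-check sketch; note that the default ssim3d=True path moves the Gaussian kernel to CUDA, so on a CPU-only machine pass ssim3d=False:

import numpy as np

img = (np.random.rand(64, 64, 3) * 255).astype(np.float64)
print(calculate_ssim(img, img.copy(), crop_border=0, ssim3d=False))  # 1.0 for identical images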
def _ssim(img, img2, max_value):
    """Calculate SSIM (structural similarity) for one channel images.

    It is called by func:`calculate_ssim`.

    Args:
        img (ndarray): Images with range [0, 255] with order 'HWC'.
        img2 (ndarray): Images with range [0, 255] with order 'HWC'.

    Returns:
        float: SSIM result.
    """
    c1 = (0.01 * max_value)**2
    c2 = (0.03 * max_value)**2
    img = img.astype(np.float64)
    img2 = img2.astype(np.float64)
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())
    mu1 = cv2.filter2D(img, -1, window)[5:-5,
                                        5:-5]  # valid mode for window size 11
    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq
    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
    tmp1 = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2)
    tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)
    ssim_map = tmp1 / tmp2
    return ssim_map.mean()


def _3d_gaussian_calculator(img, conv3d):
    out = conv3d(img.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0)
    return out


def _generate_3d_gaussian_kernel():
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())
    kernel_3 = cv2.getGaussianKernel(11, 1.5)
    kernel = torch.tensor(np.stack([window * k for k in kernel_3], axis=0))
    conv3d = torch.nn.Conv3d(
        1,
        1, (11, 11, 11),
        stride=1,
        padding=(5, 5, 5),
        bias=False,
        padding_mode='replicate')
    conv3d.weight.requires_grad = False
    conv3d.weight[0, 0, :, :, :] = kernel
    return conv3d


def _ssim_3d(img1, img2, max_value):
    assert len(img1.shape) == 3 and len(img2.shape) == 3
    """Calculate SSIM (structural similarity) for one channel images.

    It is called by func:`calculate_ssim`.

    Args:
        img1 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'.
        img2 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'.

    Returns:
        float: ssim result.
    """
    C1 = (0.01 * max_value)**2
    C2 = (0.03 * max_value)**2
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)

    kernel = _generate_3d_gaussian_kernel().cuda()
    img1 = torch.tensor(img1).float().cuda()
    img2 = torch.tensor(img2).float().cuda()

    mu1 = _3d_gaussian_calculator(img1, kernel)
    mu2 = _3d_gaussian_calculator(img2, kernel)

    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = _3d_gaussian_calculator(img1**2, kernel) - mu1_sq
    sigma2_sq = _3d_gaussian_calculator(img2**2, kernel) - mu2_sq
    sigma12 = _3d_gaussian_calculator(img1 * img2, kernel) - mu1_mu2
    tmp1 = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
    tmp2 = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
    ssim_map = tmp1 / tmp2
    return float(ssim_map.mean())
@@ -0,0 +1,210 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
from typing import Dict

import numpy as np
import torch
import torch.nn.functional as F
from scipy import linalg

from modelscope.metainfo import Metrics
from modelscope.models.cv.image_inpainting.modules.inception import InceptionV3
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
                                           torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys


def fid_calculate_activation_statistics(act):
    mu = np.mean(act, axis=0)
    sigma = np.cov(act, rowvar=False)
    return mu, sigma


def calculate_frechet_distance(activations_pred, activations_target, eps=1e-6):
    mu1, sigma1 = fid_calculate_activation_statistics(activations_pred)
    mu2, sigma2 = fid_calculate_activation_statistics(activations_target)
    diff = mu1 - mu2
    # Product might be almost singular
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
    # Numerical error might give slight imaginary component
    if np.iscomplexobj(covmean):
        # if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-2):
            m = np.max(np.abs(covmean.imag))
            raise ValueError('Imaginary component {}'.format(m))
        covmean = covmean.real
    tr_covmean = np.trace(covmean)
    return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2)
            - 2 * tr_covmean)
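For reference, this is the standard Fréchet distance between two Gaussians fitted to the activation sets: FID = ||mu_1 - mu_2||^2 + Tr(Sigma_1 + Sigma_2 - 2 (Sigma_1 Sigma_2)^{1/2}), with the eps offset guarding against a nearly singular covariance product.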
class FIDScore(torch.nn.Module):

    def __init__(self, dims=2048, eps=1e-6):
        super().__init__()
        if getattr(FIDScore, '_MODEL', None) is None:
            block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
            FIDScore._MODEL = InceptionV3([block_idx]).eval()
        self.model = FIDScore._MODEL
        self.eps = eps
        self.reset()

    def forward(self, pred_batch, target_batch, mask=None):
        activations_pred = self._get_activations(pred_batch)
        activations_target = self._get_activations(target_batch)
        self.activations_pred.append(activations_pred.detach().cpu())
        self.activations_target.append(activations_target.detach().cpu())

    def get_value(self):
        activations_pred, activations_target = (self.activations_pred,
                                                self.activations_target)
        activations_pred = torch.cat(activations_pred).cpu().numpy()
        activations_target = torch.cat(activations_target).cpu().numpy()
        total_distance = calculate_frechet_distance(
            activations_pred, activations_target, eps=self.eps)
        self.reset()
        return total_distance

    def reset(self):
        self.activations_pred = []
        self.activations_target = []

    def _get_activations(self, batch):
        activations = self.model(batch)[0]
        if activations.shape[2] != 1 or activations.shape[3] != 1:
            assert False, \
                'We should not have got here, because Inception always scales inputs to 299x299'
        activations = activations.squeeze(-1).squeeze(-1)
        return activations
class SSIM(torch.nn.Module):
    """SSIM. Modified from:
    https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py
    """

    def __init__(self, window_size=11, size_average=True):
        super().__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = 1
        self.register_buffer('window',
                             self._create_window(window_size, self.channel))

    def forward(self, img1, img2):
        assert len(img1.shape) == 4

        channel = img1.size()[1]

        if channel == self.channel and self.window.data.type(
        ) == img1.data.type():
            window = self.window
        else:
            window = self._create_window(self.window_size, channel)
            window = window.type_as(img1)
            self.window = window
            self.channel = channel

        return self._ssim(img1, img2, window, self.window_size, channel,
                          self.size_average)

    def _gaussian(self, window_size, sigma):
        gauss = torch.Tensor([
            np.exp(-(x - (window_size // 2))**2 / float(2 * sigma**2))
            for x in range(window_size)
        ])
        return gauss / gauss.sum()

    def _create_window(self, window_size, channel):
        _1D_window = self._gaussian(window_size, 1.5).unsqueeze(1)
        _2D_window = _1D_window.mm(
            _1D_window.t()).float().unsqueeze(0).unsqueeze(0)
        return _2D_window.expand(channel, 1, window_size,
                                 window_size).contiguous()

    def _ssim(self,
              img1,
              img2,
              window,
              window_size,
              channel,
              size_average=True):
        mu1 = F.conv2d(
            img1, window, padding=(window_size // 2), groups=channel)
        mu2 = F.conv2d(
            img2, window, padding=(window_size // 2), groups=channel)

        mu1_sq = mu1.pow(2)
        mu2_sq = mu2.pow(2)
        mu1_mu2 = mu1 * mu2

        sigma1_sq = F.conv2d(
            img1 * img1, window, padding=(window_size // 2),
            groups=channel) - mu1_sq
        sigma2_sq = F.conv2d(
            img2 * img2, window, padding=(window_size // 2),
            groups=channel) - mu2_sq
        sigma12 = F.conv2d(
            img1 * img2, window, padding=(window_size // 2),
            groups=channel) - mu1_mu2

        C1 = 0.01**2
        C2 = 0.03**2

        ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \
            ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

        if size_average:
            return ssim_map.mean()

        return ssim_map.mean(1).mean(1).mean(1)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        return
@METRICS.register_module(
    group_key=default_group, module_name=Metrics.image_inpainting_metric)
class ImageInpaintingMetric(Metric):
    """The metric computation class for image inpainting tasks.
    """

    def __init__(self):
        self.preds = []
        self.targets = []
        self.SSIM = SSIM(window_size=11, size_average=False).eval()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.FID = FIDScore().to(device)

    def add(self, outputs: Dict, inputs: Dict):
        pred = outputs['inpainted']
        target = inputs['image']
        self.preds.append(torch_nested_detach(pred))
        self.targets.append(torch_nested_detach(target))

    def evaluate(self):
        ssim_list = []
        for (pred, target) in zip(self.preds, self.targets):
            ssim_list.append(self.SSIM(pred, target))
            self.FID(pred, target)
        ssim_list = torch_nested_numpify(ssim_list)
        fid = self.FID.get_value()
        return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid}
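A minimal sketch of the metric's add/evaluate cycle (shapes are illustrative; the 'inpainted'/'image' keys match the diff above, and FID is only statistically meaningful over many samples):

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
metric = ImageInpaintingMetric()
for _ in range(8):
    pred = torch.rand(1, 3, 299, 299, device=device)
    target = torch.rand(1, 3, 299, 299, device=device)
    metric.add({'inpainted': pred}, {'image': target})
print(metric.evaluate())  # {'ssim': ..., 'fid': ...}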
@@ -1,3 +1,6 @@
# Part of the implementation is borrowed and modified from PGL-SUM,
# publicly available at https://github.com/e-apostolidis/PGL-SUM
from typing import Dict

import numpy as np
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict
@@ -1,15 +1,14 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Dict

import torch
from typing import Dict, Optional

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.audio.audio_utils import update_conf
from modelscope.utils.constant import Tasks
from .fsmn_sele_v2 import FSMNSeleNetV2
@@ -20,48 +19,38 @@ class FSMNSeleNetV2Decorator(TorchModel):
    MODEL_TXT = 'model.txt'
    SC_CONFIG = 'sound_connect.conf'
    SC_CONF_ITEM_KWS_MODEL = '${kws_model}'

    def __init__(self, model_dir: str, *args, **kwargs):
    def __init__(self,
                 model_dir: str,
                 training: Optional[bool] = False,
                 *args,
                 **kwargs):
        """initialize the dfsmn model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)
        sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
        model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
        model_bin_file = os.path.join(model_dir,
                                      ModelFile.TORCH_MODEL_BIN_FILE)
        self._model = None
        if os.path.exists(model_bin_file):
            kwargs.pop('device')
            self._model = FSMNSeleNetV2(*args, **kwargs)
            checkpoint = torch.load(model_bin_file)
            self._model.load_state_dict(checkpoint, strict=False)
        self._sc = None
        if os.path.exists(model_txt_file):
            with open(sc_config_file) as f:
                lines = f.readlines()
            with open(sc_config_file, 'w') as f:
                for line in lines:
                    if self.SC_CONF_ITEM_KWS_MODEL in line:
                        line = line.replace(self.SC_CONF_ITEM_KWS_MODEL,
                                            model_txt_file)
                    f.write(line)
            import py_sound_connect
            self._sc = py_sound_connect.SoundConnect(sc_config_file)
            self.size_in = self._sc.bytesPerBlockIn()
            self.size_out = self._sc.bytesPerBlockOut()
        if self._model is None and self._sc is None:
            raise Exception(
                f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.'
            )
        if training:
            self.model = FSMNSeleNetV2(*args, **kwargs)
        else:
            sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
            model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
            self._sc = None
            if os.path.exists(model_txt_file):
                conf_dict = dict(mode=56542, kws_model=model_txt_file)
                update_conf(sc_config_file, sc_config_file, conf_dict)
                import py_sound_connect
                self._sc = py_sound_connect.SoundConnect(sc_config_file)
                self.size_in = self._sc.bytesPerBlockIn()
                self.size_out = self._sc.bytesPerBlockOut()
            else:
                raise Exception(
                    f'Invalid model directory! Failed to load model file: {model_txt_file}.'
                )

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        ...
        return self.model.forward(input)

    def forward_decode(self, data: bytes):
        result = {'pcm': self._sc.process(data, self.size_out)}
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict
@@ -4,14 +4,16 @@
from . import (action_recognition, animal_recognition, body_2d_keypoints,
               body_3d_keypoints, cartoon, cmdssl_video_embedding,
               crowd_counting, face_2d_keypoints, face_detection,
               face_generation, image_classification, image_color_enhance,
               image_colorization, image_denoise, image_instance_segmentation,
               face_generation, human_wholebody_keypoint, image_classification,
               image_color_enhance, image_colorization, image_denoise,
               image_inpainting, image_instance_segmentation,
               image_panoptic_segmentation, image_portrait_enhancement,
               image_reid_person, image_semantic_segmentation,
               image_to_image_generation, image_to_image_translation,
               movie_scene_segmentation, object_detection,
               product_retrieval_embedding, realtime_object_detection,
               salient_detection, shop_segmentation, super_resolution,
               referring_video_object_segmentation, salient_detection,
               shop_segmentation, super_resolution,
               video_single_object_tracking, video_summarization, virual_tryon)

# yapf: enable
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict, Optional, Union
@@ -1,10 +1,10 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# Modified by Ke Sun (sunk@mail.ustc.edu.cn)
# https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py
# ------------------------------------------------------------------------------
"""
Copyright (c) Microsoft
Licensed under the MIT License.
Written by Bin Xiao (Bin.Xiao@microsoft.com)
Modified by Ke Sun (sunk@mail.ustc.edu.cn)
https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py
"""
import functools
import logging
@@ -8,12 +8,14 @@ if TYPE_CHECKING:
    from .mtcnn import MtcnnFaceDetector
    from .retinaface import RetinaFaceDetection
    from .ulfd_slim import UlfdFaceDetector
    from .scrfd import ScrfdDetect

else:
    _import_structure = {
        'ulfd_slim': ['UlfdFaceDetector'],
        'retinaface': ['RetinaFaceDetection'],
        'mtcnn': ['MtcnnFaceDetector'],
        'mogface': ['MogFaceDetector']
        'mogface': ['MogFaceDetector'],
        'scrfd': ['ScrfdDetect']
    }

    import sys
@@ -1,189 +0,0 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
"""
import numpy as np
from mmdet.datasets.builder import PIPELINES
from numpy import random
| @PIPELINES.register_module() | |||
| class RandomSquareCrop(object): | |||
| """Random crop the image & bboxes, the cropped patches have minimum IoU | |||
| requirement with original image & bboxes, the IoU threshold is randomly | |||
| selected from min_ious. | |||
| Args: | |||
| min_ious (tuple): minimum IoU threshold for all intersections with | |||
| bounding boxes | |||
| min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, | |||
| where a >= min_crop_size). | |||
| Note: | |||
| The keys for bboxes, labels and masks should be paired. That is, \ | |||
| `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ | |||
| `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. | |||
| """ | |||
| def __init__(self, | |||
| crop_ratio_range=None, | |||
| crop_choice=None, | |||
| bbox_clip_border=True): | |||
| self.crop_ratio_range = crop_ratio_range | |||
| self.crop_choice = crop_choice | |||
| self.bbox_clip_border = bbox_clip_border | |||
| assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) | |||
| if self.crop_ratio_range is not None: | |||
| self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range | |||
| self.bbox2label = { | |||
| 'gt_bboxes': 'gt_labels', | |||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||
| } | |||
| self.bbox2mask = { | |||
| 'gt_bboxes': 'gt_masks', | |||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||
| } | |||
| def __call__(self, results): | |||
| """Call function to crop images and bounding boxes with minimum IoU | |||
| constraint. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Result dict with images and bounding boxes cropped, \ | |||
| 'img_shape' key is updated. | |||
| """ | |||
| if 'img_fields' in results: | |||
| assert results['img_fields'] == ['img'], \ | |||
| 'Only single img_fields is allowed' | |||
| img = results['img'] | |||
| assert 'bbox_fields' in results | |||
| assert 'gt_bboxes' in results | |||
| boxes = results['gt_bboxes'] | |||
| h, w, c = img.shape | |||
| scale_retry = 0 | |||
| if self.crop_ratio_range is not None: | |||
| max_scale = self.crop_ratio_max | |||
| else: | |||
| max_scale = np.amax(self.crop_choice) | |||
| while True: | |||
| scale_retry += 1 | |||
| if scale_retry == 1 or max_scale > 1.0: | |||
| if self.crop_ratio_range is not None: | |||
| scale = np.random.uniform(self.crop_ratio_min, | |||
| self.crop_ratio_max) | |||
| elif self.crop_choice is not None: | |||
| scale = np.random.choice(self.crop_choice) | |||
| else: | |||
| scale = scale * 1.2 | |||
| for i in range(250): | |||
| short_side = min(w, h) | |||
| cw = int(scale * short_side) | |||
| ch = cw | |||
| # TODO +1 | |||
| if w == cw: | |||
| left = 0 | |||
| elif w > cw: | |||
| left = random.randint(0, w - cw) | |||
| else: | |||
| left = random.randint(w - cw, 0) | |||
| if h == ch: | |||
| top = 0 | |||
| elif h > ch: | |||
| top = random.randint(0, h - ch) | |||
| else: | |||
| top = random.randint(h - ch, 0) | |||
| patch = np.array( | |||
| (int(left), int(top), int(left + cw), int(top + ch)), | |||
| dtype=np.int) | |||
| # center of boxes should inside the crop img | |||
| # only adjust boxes and instance masks when the gt is not empty | |||
| # adjust boxes | |||
| def is_center_of_bboxes_in_patch(boxes, patch): | |||
| # TODO >= | |||
| center = (boxes[:, :2] + boxes[:, 2:]) / 2 | |||
| mask = \ | |||
| ((center[:, 0] > patch[0]) | |||
| * (center[:, 1] > patch[1]) | |||
| * (center[:, 0] < patch[2]) | |||
| * (center[:, 1] < patch[3])) | |||
| return mask | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| if not mask.any(): | |||
| continue | |||
| for key in results.get('bbox_fields', []): | |||
| boxes = results[key].copy() | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| boxes = boxes[mask] | |||
| if self.bbox_clip_border: | |||
| boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) | |||
| boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) | |||
| boxes -= np.tile(patch[:2], 2) | |||
| results[key] = boxes | |||
| # labels | |||
| label_key = self.bbox2label.get(key) | |||
| if label_key in results: | |||
| results[label_key] = results[label_key][mask] | |||
| # keypoints field | |||
| if key == 'gt_bboxes': | |||
| for kps_key in results.get('keypoints_fields', []): | |||
| keypointss = results[kps_key].copy() | |||
| keypointss = keypointss[mask, :, :] | |||
| if self.bbox_clip_border: | |||
| keypointss[:, :, : | |||
| 2] = keypointss[:, :, :2].clip( | |||
| max=patch[2:]) | |||
| keypointss[:, :, : | |||
| 2] = keypointss[:, :, :2].clip( | |||
| min=patch[:2]) | |||
| keypointss[:, :, 0] -= patch[0] | |||
| keypointss[:, :, 1] -= patch[1] | |||
| results[kps_key] = keypointss | |||
| # mask fields | |||
| mask_key = self.bbox2mask.get(key) | |||
| if mask_key in results: | |||
| results[mask_key] = results[mask_key][mask.nonzero() | |||
| [0]].crop(patch) | |||
| # adjust the img no matter whether the gt is empty before crop | |||
| rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 | |||
| patch_from = patch.copy() | |||
| patch_from[0] = max(0, patch_from[0]) | |||
| patch_from[1] = max(0, patch_from[1]) | |||
| patch_from[2] = min(img.shape[1], patch_from[2]) | |||
| patch_from[3] = min(img.shape[0], patch_from[3]) | |||
| patch_to = patch.copy() | |||
| patch_to[0] = max(0, patch_to[0] * -1) | |||
| patch_to[1] = max(0, patch_to[1] * -1) | |||
| patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) | |||
| patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) | |||
| rimg[patch_to[1]:patch_to[3], | |||
| patch_to[0]:patch_to[2], :] = img[ | |||
| patch_from[1]:patch_from[3], | |||
| patch_from[0]:patch_from[2], :] | |||
| img = rimg | |||
| results['img'] = img | |||
| results['img_shape'] = img.shape | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(crop_ratio_range={self.crop_ratio_range}, ' | |||
| repr_str += f'crop_choice={self.crop_choice}, ' | |||
| repr_str += f'bbox_clip_border={self.bbox_clip_border})' | |||
| return repr_str | |||
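| # Hypothetical usage sketch (not part of this diff): the transform is | |||
| # registered with mmdet's PIPELINES registry, so it can be configured by | |||
| # name inside a data pipeline. Exactly one of crop_ratio_range and | |||
| # crop_choice must be set, as enforced by the XOR assert in __init__. | |||
| example_pipeline = [ | |||
| dict(type='LoadImageFromFile'), | |||
| dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0]), | |||
| ] | |||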
| @@ -1,3 +1,5 @@ | |||
| # The implementation is based on MogFace, available at | |||
| # https://github.com/damo-cv/MogFace | |||
| import os | |||
| import cv2 | |||
| @@ -0,0 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .scrfd_detect import ScrfdDetect | |||
| @@ -6,7 +6,7 @@ import numpy as np | |||
| import torch | |||
| def bbox2result(bboxes, labels, num_classes, kps=None): | |||
| def bbox2result(bboxes, labels, num_classes, kps=None, num_kps=5): | |||
| """Convert detection results to a list of numpy arrays. | |||
| Args: | |||
| @@ -17,7 +17,7 @@ def bbox2result(bboxes, labels, num_classes, kps=None): | |||
| Returns: | |||
| list(ndarray): bbox results of each class | |||
| """ | |||
| bbox_len = 5 if kps is None else 5 + 10 # if has kps, add 10 kps into bbox | |||
| bbox_len = 5 if kps is None else 5 + num_kps * 2 # if has kps, add num_kps*2 into bbox | |||
| if bboxes.shape[0] == 0: | |||
| return [ | |||
| np.zeros((0, bbox_len), dtype=np.float32) | |||
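| # Worked check of the new layout (assumption drawn from the lines above): | |||
| # each row holds 4 bbox coords + 1 score, plus x/y per keypoint, so | |||
| # num_kps=5 gives bbox_len = 5 + 5 * 2 = 15 and num_kps=4 gives 13. | |||
| def expected_bbox_len(has_kps, num_kps=5): | |||
| # mirrors the bbox_len expression in bbox2result | |||
| return 5 if not has_kps else 5 + num_kps * 2 | |||
| assert expected_bbox_len(True, 5) == 15 | |||
| assert expected_bbox_len(True, 4) == 13 | |||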
| @@ -17,6 +17,7 @@ def multiclass_nms(multi_bboxes, | |||
| Args: | |||
| multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) | |||
| multi_kps (Tensor): shape (n, #class*num_kps*2) or (n, num_kps*2) | |||
| multi_scores (Tensor): shape (n, #class), where the last column | |||
| contains scores of the background class, but this will be ignored. | |||
| score_thr (float): bbox threshold, bboxes with scores lower than it | |||
| @@ -36,16 +37,18 @@ def multiclass_nms(multi_bboxes, | |||
| num_classes = multi_scores.size(1) - 1 | |||
| # exclude background category | |||
| kps = None | |||
| if multi_kps is not None: | |||
| num_kps = int((multi_kps.shape[1] / num_classes) / 2) | |||
| if multi_bboxes.shape[1] > 4: | |||
| bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) | |||
| if multi_kps is not None: | |||
| kps = multi_kps.view(multi_scores.size(0), -1, 10) | |||
| kps = multi_kps.view(multi_scores.size(0), -1, num_kps * 2) | |||
| else: | |||
| bboxes = multi_bboxes[:, None].expand( | |||
| multi_scores.size(0), num_classes, 4) | |||
| if multi_kps is not None: | |||
| kps = multi_kps[:, None].expand( | |||
| multi_scores.size(0), num_classes, 10) | |||
| multi_scores.size(0), num_classes, num_kps * 2) | |||
| scores = multi_scores[:, :-1] | |||
| if score_factors is not None: | |||
| @@ -56,7 +59,7 @@ def multiclass_nms(multi_bboxes, | |||
| bboxes = bboxes.reshape(-1, 4) | |||
| if kps is not None: | |||
| kps = kps.reshape(-1, 10) | |||
| kps = kps.reshape(-1, num_kps * 2) | |||
| scores = scores.reshape(-1) | |||
| labels = labels.reshape(-1) | |||
| @@ -2,6 +2,12 @@ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines | |||
| """ | |||
| from .auto_augment import RotateV2 | |||
| from .formating import DefaultFormatBundleV2 | |||
| from .loading import LoadAnnotationsV2 | |||
| from .transforms import RandomSquareCrop | |||
| __all__ = ['RandomSquareCrop'] | |||
| __all__ = [ | |||
| 'RandomSquareCrop', 'LoadAnnotationsV2', 'RotateV2', | |||
| 'DefaultFormatBundleV2' | |||
| ] | |||
| @@ -0,0 +1,271 @@ | |||
| """ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py | |||
| """ | |||
| import copy | |||
| import cv2 | |||
| import mmcv | |||
| import numpy as np | |||
| from mmdet.datasets.builder import PIPELINES | |||
| _MAX_LEVEL = 10 | |||
| def level_to_value(level, max_value): | |||
| """Map from level to values based on max_value.""" | |||
| return (level / _MAX_LEVEL) * max_value | |||
| def random_negative(value, random_negative_prob): | |||
| """Randomly negate value based on random_negative_prob.""" | |||
| return -value if np.random.rand() < random_negative_prob else value | |||
| def bbox2fields(): | |||
| """The key correspondence from bboxes to labels, masks and | |||
| segmentations.""" | |||
| bbox2label = { | |||
| 'gt_bboxes': 'gt_labels', | |||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||
| } | |||
| bbox2mask = { | |||
| 'gt_bboxes': 'gt_masks', | |||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||
| } | |||
| bbox2seg = { | |||
| 'gt_bboxes': 'gt_semantic_seg', | |||
| } | |||
| return bbox2label, bbox2mask, bbox2seg | |||
| @PIPELINES.register_module() | |||
| class RotateV2(object): | |||
| """Apply Rotate Transformation to image (and its corresponding bbox, mask, | |||
| segmentation). | |||
| Args: | |||
| level (int | float): The level should be in range [0, _MAX_LEVEL]. | |||
| scale (int | float): Isotropic scale factor. Same in | |||
| ``mmcv.imrotate``. | |||
| center (int | float | tuple[float]): Center point (w, h) of the | |||
| rotation in the source image. If None, the center of the | |||
| image will be used. Same in ``mmcv.imrotate``. | |||
| img_fill_val (int | float | tuple): The fill value for image border. | |||
| If float, the same value will be used for all the three | |||
| channels of image. If tuple, it should have 3 elements (i.e. | |||
| one per image channel). | |||
| seg_ignore_label (int): The fill value used for segmentation map. | |||
| Note this value must equal ``ignore_label`` in ``semantic_head`` | |||
| of the corresponding config. Default 255. | |||
| prob (float): The probability of performing the transformation; | |||
| it should be in range [0, 1]. | |||
| max_rotate_angle (int | float): The maximum angle for the rotate | |||
| transformation. | |||
| random_negative_prob (float): The probability that turns the | |||
| offset negative. | |||
| """ | |||
| def __init__(self, | |||
| level, | |||
| scale=1, | |||
| center=None, | |||
| img_fill_val=128, | |||
| seg_ignore_label=255, | |||
| prob=0.5, | |||
| max_rotate_angle=30, | |||
| random_negative_prob=0.5): | |||
| assert isinstance(level, (int, float)), \ | |||
| f'The level must be type int or float. got {type(level)}.' | |||
| assert 0 <= level <= _MAX_LEVEL, \ | |||
| f'The level should be in range [0,{_MAX_LEVEL}]. got {level}.' | |||
| assert isinstance(scale, (int, float)), \ | |||
| f'The scale must be type int or float. got type {type(scale)}.' | |||
| if isinstance(center, (int, float)): | |||
| center = (center, center) | |||
| elif isinstance(center, tuple): | |||
| assert len(center) == 2, 'center with type tuple must have '\ | |||
| f'2 elements. got {len(center)} elements.' | |||
| else: | |||
| assert center is None, 'center must be None or type int, '\ | |||
| f'float or tuple, got type {type(center)}.' | |||
| if isinstance(img_fill_val, (float, int)): | |||
| img_fill_val = tuple([float(img_fill_val)] * 3) | |||
| elif isinstance(img_fill_val, tuple): | |||
| assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\ | |||
| f'have 3 elements. got {len(img_fill_val)}.' | |||
| img_fill_val = tuple([float(val) for val in img_fill_val]) | |||
| else: | |||
| raise ValueError( | |||
| 'img_fill_val must be float or tuple with 3 elements.') | |||
| assert np.all([0 <= val <= 255 for val in img_fill_val]), \ | |||
| 'all elements of img_fill_val should be within range [0,255]. '\ | |||
| f'got {img_fill_val}.' | |||
| assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\ | |||
| f'got {prob}.' | |||
| assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\ | |||
| f'should be type int or float. got type {type(max_rotate_angle)}.' | |||
| self.level = level | |||
| self.scale = scale | |||
| # Rotation angle in degrees. Positive values mean | |||
| # clockwise rotation. | |||
| self.angle = level_to_value(level, max_rotate_angle) | |||
| self.center = center | |||
| self.img_fill_val = img_fill_val | |||
| self.seg_ignore_label = seg_ignore_label | |||
| self.prob = prob | |||
| self.max_rotate_angle = max_rotate_angle | |||
| self.random_negative_prob = random_negative_prob | |||
| def _rotate_img(self, results, angle, center=None, scale=1.0): | |||
| """Rotate the image. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| angle (float): Rotation angle in degrees, positive values | |||
| mean clockwise rotation. Same in ``mmcv.imrotate``. | |||
| center (tuple[float], optional): Center point (w, h) of the | |||
| rotation. Same in ``mmcv.imrotate``. | |||
| scale (int | float): Isotropic scale factor. Same in | |||
| ``mmcv.imrotate``. | |||
| """ | |||
| for key in results.get('img_fields', ['img']): | |||
| img = results[key].copy() | |||
| img_rotated = mmcv.imrotate( | |||
| img, angle, center, scale, border_value=self.img_fill_val) | |||
| results[key] = img_rotated.astype(img.dtype) | |||
| results['img_shape'] = results[key].shape | |||
| def _rotate_bboxes(self, results, rotate_matrix): | |||
| """Rotate the bboxes.""" | |||
| h, w, c = results['img_shape'] | |||
| for key in results.get('bbox_fields', []): | |||
| min_x, min_y, max_x, max_y = np.split( | |||
| results[key], results[key].shape[-1], axis=-1) | |||
| coordinates = np.stack([[min_x, min_y], [max_x, min_y], | |||
| [min_x, max_y], | |||
| [max_x, max_y]]) # [4, 2, nb_bbox, 1] | |||
| # pad 1 to convert from format [x, y] to homogeneous | |||
| # coordinates format [x, y, 1] | |||
| coordinates = np.concatenate( | |||
| (coordinates, | |||
| np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)), | |||
| axis=1) # [4, 3, nb_bbox, 1] | |||
| coordinates = coordinates.transpose( | |||
| (2, 0, 1, 3)) # [nb_bbox, 4, 3, 1] | |||
| rotated_coords = np.matmul(rotate_matrix, | |||
| coordinates) # [nb_bbox, 4, 2, 1] | |||
| rotated_coords = rotated_coords[..., 0] # [nb_bbox, 4, 2] | |||
| min_x, min_y = np.min( | |||
| rotated_coords[:, :, 0], axis=1), np.min( | |||
| rotated_coords[:, :, 1], axis=1) | |||
| max_x, max_y = np.max( | |||
| rotated_coords[:, :, 0], axis=1), np.max( | |||
| rotated_coords[:, :, 1], axis=1) | |||
| results[key] = np.stack([min_x, min_y, max_x, max_y], | |||
| axis=-1).astype(results[key].dtype) | |||
| def _rotate_keypoints90(self, results, angle): | |||
| """Rotate the keypoints, only valid when angle in [-90,90,-180,180]""" | |||
| if angle not in [-90, 90, 180, -180 | |||
| ] or self.scale != 1 or self.center is not None: | |||
| return | |||
| for key in results.get('keypoints_fields', []): | |||
| k = results[key] | |||
| if angle == 90: | |||
| w, h, c = results['img'].shape | |||
| new = np.stack([h - k[..., 1], k[..., 0], k[..., 2]], axis=-1) | |||
| elif angle == -90: | |||
| w, h, c = results['img'].shape | |||
| new = np.stack([k[..., 1], w - k[..., 0], k[..., 2]], axis=-1) | |||
| else: | |||
| h, w, c = results['img'].shape | |||
| new = np.stack([w - k[..., 0], h - k[..., 1], k[..., 2]], | |||
| axis=-1) | |||
| # a keypoint is invalid if its third value is -1 | |||
| kps_invalid = new[..., -1][:, -1] == -1 | |||
| new[kps_invalid] = np.zeros(new.shape[1:]) - 1 | |||
| results[key] = new | |||
| def _rotate_masks(self, | |||
| results, | |||
| angle, | |||
| center=None, | |||
| scale=1.0, | |||
| fill_val=0): | |||
| """Rotate the masks.""" | |||
| h, w, c = results['img_shape'] | |||
| for key in results.get('mask_fields', []): | |||
| masks = results[key] | |||
| results[key] = masks.rotate((h, w), angle, center, scale, fill_val) | |||
| def _rotate_seg(self, | |||
| results, | |||
| angle, | |||
| center=None, | |||
| scale=1.0, | |||
| fill_val=255): | |||
| """Rotate the segmentation map.""" | |||
| for key in results.get('seg_fields', []): | |||
| seg = results[key].copy() | |||
| results[key] = mmcv.imrotate( | |||
| seg, angle, center, scale, | |||
| border_value=fill_val).astype(seg.dtype) | |||
| def _filter_invalid(self, results, min_bbox_size=0): | |||
| """Filter bboxes and corresponding masks too small after rotate | |||
| augmentation.""" | |||
| bbox2label, bbox2mask, _ = bbox2fields() | |||
| for key in results.get('bbox_fields', []): | |||
| bbox_w = results[key][:, 2] - results[key][:, 0] | |||
| bbox_h = results[key][:, 3] - results[key][:, 1] | |||
| valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size) | |||
| valid_inds = np.nonzero(valid_inds)[0] | |||
| results[key] = results[key][valid_inds] | |||
| # label fields. e.g. gt_labels and gt_labels_ignore | |||
| label_key = bbox2label.get(key) | |||
| if label_key in results: | |||
| results[label_key] = results[label_key][valid_inds] | |||
| # mask fields, e.g. gt_masks and gt_masks_ignore | |||
| mask_key = bbox2mask.get(key) | |||
| if mask_key in results: | |||
| results[mask_key] = results[mask_key][valid_inds] | |||
| def __call__(self, results): | |||
| """Call function to rotate images, bounding boxes, masks and semantic | |||
| segmentation maps. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Rotated results. | |||
| """ | |||
| if np.random.rand() > self.prob: | |||
| return results | |||
| h, w = results['img'].shape[:2] | |||
| center = self.center | |||
| if center is None: | |||
| center = ((w - 1) * 0.5, (h - 1) * 0.5) | |||
| angle = random_negative(self.angle, self.random_negative_prob) | |||
| self._rotate_img(results, angle, center, self.scale) | |||
| rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale) | |||
| self._rotate_bboxes(results, rotate_matrix) | |||
| self._rotate_keypoints90(results, angle) | |||
| self._rotate_masks(results, angle, center, self.scale, fill_val=0) | |||
| self._rotate_seg( | |||
| results, angle, center, self.scale, fill_val=self.seg_ignore_label) | |||
| self._filter_invalid(results) | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(level={self.level}, ' | |||
| repr_str += f'scale={self.scale}, ' | |||
| repr_str += f'center={self.center}, ' | |||
| repr_str += f'img_fill_val={self.img_fill_val}, ' | |||
| repr_str += f'seg_ignore_label={self.seg_ignore_label}, ' | |||
| repr_str += f'prob={self.prob}, ' | |||
| repr_str += f'max_rotate_angle={self.max_rotate_angle}, ' | |||
| repr_str += f'random_negative_prob={self.random_negative_prob})' | |||
| return repr_str | |||
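| # Hypothetical config sketch: with level=10 and max_rotate_angle=30, the | |||
| # magnitude is level_to_value(10, 30) = 30 degrees; the sign is flipped | |||
| # with probability random_negative_prob, and the whole transform is | |||
| # skipped with probability 1 - prob. | |||
| example_rotate = dict( | |||
| type='RotateV2', level=10, max_rotate_angle=30, prob=0.5, | |||
| random_negative_prob=0.5, img_fill_val=128) | |||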
| @@ -0,0 +1,113 @@ | |||
| """ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/formating.py | |||
| """ | |||
| from collections.abc import Sequence | |||
| import mmcv | |||
| import numpy as np | |||
| import torch | |||
| from mmcv.parallel import DataContainer as DC | |||
| from mmdet.datasets.builder import PIPELINES | |||
| def to_tensor(data): | |||
| """Convert objects of various python types to :obj:`torch.Tensor`. | |||
| Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, | |||
| :class:`Sequence`, :class:`int` and :class:`float`. | |||
| Args: | |||
| data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to | |||
| be converted. | |||
| """ | |||
| if isinstance(data, torch.Tensor): | |||
| return data | |||
| elif isinstance(data, np.ndarray): | |||
| return torch.from_numpy(data) | |||
| elif isinstance(data, Sequence) and not mmcv.is_str(data): | |||
| return torch.tensor(data) | |||
| elif isinstance(data, int): | |||
| return torch.LongTensor([data]) | |||
| elif isinstance(data, float): | |||
| return torch.FloatTensor([data]) | |||
| else: | |||
| raise TypeError(f'type {type(data)} cannot be converted to tensor.') | |||
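| # Quick dispatch sketch for to_tensor (illustrative only): | |||
| # to_tensor(np.zeros((2, 3))) -> float64 tensor of shape (2, 3) | |||
| # to_tensor(3) -> tensor([3]) via torch.LongTensor | |||
| # to_tensor(0.5) -> tensor([0.5000]) via torch.FloatTensor | |||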
| @PIPELINES.register_module() | |||
| class DefaultFormatBundleV2(object): | |||
| """Default formatting bundle. | |||
| It simplifies the pipeline of formatting common fields, including "img", | |||
| "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". | |||
| These fields are formatted as follows. | |||
| - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) | |||
| - proposals: (1)to tensor, (2)to DataContainer | |||
| - gt_bboxes: (1)to tensor, (2)to DataContainer | |||
| - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer | |||
| - gt_labels: (1)to tensor, (2)to DataContainer | |||
| - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) | |||
| - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ | |||
| (3)to DataContainer (stack=True) | |||
| """ | |||
| def __call__(self, results): | |||
| """Call function to transform and format common fields in results. | |||
| Args: | |||
| results (dict): Result dict contains the data to convert. | |||
| Returns: | |||
| dict: The result dict contains the data that is formatted with \ | |||
| default bundle. | |||
| """ | |||
| if 'img' in results: | |||
| img = results['img'] | |||
| # add default meta keys | |||
| results = self._add_default_meta_keys(results) | |||
| if len(img.shape) < 3: | |||
| img = np.expand_dims(img, -1) | |||
| img = np.ascontiguousarray(img.transpose(2, 0, 1)) | |||
| results['img'] = DC(to_tensor(img), stack=True) | |||
| for key in [ | |||
| 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss', | |||
| 'gt_labels' | |||
| ]: | |||
| if key not in results: | |||
| continue | |||
| results[key] = DC(to_tensor(results[key])) | |||
| if 'gt_masks' in results: | |||
| results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) | |||
| if 'gt_semantic_seg' in results: | |||
| results['gt_semantic_seg'] = DC( | |||
| to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) | |||
| return results | |||
| def _add_default_meta_keys(self, results): | |||
| """Add default meta keys. | |||
| We set default meta keys including `pad_shape`, `scale_factor` and | |||
| `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and | |||
| `Pad` are implemented during the whole pipeline. | |||
| Args: | |||
| results (dict): Result dict contains the data to convert. | |||
| Returns: | |||
| results (dict): Updated result dict contains the data to convert. | |||
| """ | |||
| img = results['img'] | |||
| results.setdefault('pad_shape', img.shape) | |||
| results.setdefault('scale_factor', 1.0) | |||
| num_channels = 1 if len(img.shape) < 3 else img.shape[2] | |||
| results.setdefault( | |||
| 'img_norm_cfg', | |||
| dict( | |||
| mean=np.zeros(num_channels, dtype=np.float32), | |||
| std=np.ones(num_channels, dtype=np.float32), | |||
| to_rgb=False)) | |||
| return results | |||
| def __repr__(self): | |||
| return self.__class__.__name__ | |||
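| # Minimal usage sketch (keys are illustrative): | |||
| # bundle = DefaultFormatBundleV2() | |||
| # results = bundle(dict(img=np.zeros((640, 640, 3), dtype=np.uint8), | |||
| # gt_bboxes=np.zeros((1, 4), dtype=np.float32))) | |||
| # results['img'] is then a DataContainer (stack=True) holding a CHW | |||
| # tensor, i.e. results['img'].data.shape == torch.Size([3, 640, 640]), | |||
| # and pad_shape / scale_factor / img_norm_cfg get default values. | |||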
| @@ -0,0 +1,225 @@ | |||
| """ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/loading.py | |||
| """ | |||
| import os.path as osp | |||
| import numpy as np | |||
| import pycocotools.mask as maskUtils | |||
| from mmdet.core import BitmapMasks, PolygonMasks | |||
| from mmdet.datasets.builder import PIPELINES | |||
| @PIPELINES.register_module() | |||
| class LoadAnnotationsV2(object): | |||
| """Load mutiple types of annotations. | |||
| Args: | |||
| with_bbox (bool): Whether to parse and load the bbox annotation. | |||
| Default: True. | |||
| with_label (bool): Whether to parse and load the label annotation. | |||
| Default: True. | |||
| with_keypoints (bool): Whether to parse and load the keypoints annotation. | |||
| Default: False. | |||
| with_mask (bool): Whether to parse and load the mask annotation. | |||
| Default: False. | |||
| with_seg (bool): Whether to parse and load the semantic segmentation | |||
| annotation. Default: False. | |||
| poly2mask (bool): Whether to convert the instance masks from polygons | |||
| to bitmaps. Default: True. | |||
| file_client_args (dict): Arguments to instantiate a FileClient. | |||
| See :class:`mmcv.fileio.FileClient` for details. | |||
| Defaults to ``dict(backend='disk')``. | |||
| """ | |||
| def __init__(self, | |||
| with_bbox=True, | |||
| with_label=True, | |||
| with_keypoints=False, | |||
| with_mask=False, | |||
| with_seg=False, | |||
| poly2mask=True, | |||
| file_client_args=dict(backend='disk')): | |||
| self.with_bbox = with_bbox | |||
| self.with_label = with_label | |||
| self.with_keypoints = with_keypoints | |||
| self.with_mask = with_mask | |||
| self.with_seg = with_seg | |||
| self.poly2mask = poly2mask | |||
| self.file_client_args = file_client_args.copy() | |||
| self.file_client = None | |||
| def _load_bboxes(self, results): | |||
| """Private function to load bounding box annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded bounding box annotations. | |||
| """ | |||
| ann_info = results['ann_info'] | |||
| results['gt_bboxes'] = ann_info['bboxes'].copy() | |||
| gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) | |||
| if gt_bboxes_ignore is not None: | |||
| results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() | |||
| results['bbox_fields'].append('gt_bboxes_ignore') | |||
| results['bbox_fields'].append('gt_bboxes') | |||
| return results | |||
| def _load_keypoints(self, results): | |||
| """Private function to load bounding box annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded bounding box annotations. | |||
| """ | |||
| ann_info = results['ann_info'] | |||
| results['gt_keypointss'] = ann_info['keypointss'].copy() | |||
| results['keypoints_fields'] = ['gt_keypointss'] | |||
| return results | |||
| def _load_labels(self, results): | |||
| """Private function to load label annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded label annotations. | |||
| """ | |||
| results['gt_labels'] = results['ann_info']['labels'].copy() | |||
| return results | |||
| def _poly2mask(self, mask_ann, img_h, img_w): | |||
| """Private function to convert masks represented with polygon to | |||
| bitmaps. | |||
| Args: | |||
| mask_ann (list | dict): Polygon mask annotation input. | |||
| img_h (int): The height of output mask. | |||
| img_w (int): The width of output mask. | |||
| Returns: | |||
| numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). | |||
| """ | |||
| if isinstance(mask_ann, list): | |||
| # polygon -- a single object might consist of multiple parts | |||
| # we merge all parts into one mask rle code | |||
| rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) | |||
| rle = maskUtils.merge(rles) | |||
| elif isinstance(mask_ann['counts'], list): | |||
| # uncompressed RLE | |||
| rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) | |||
| else: | |||
| # rle | |||
| rle = mask_ann | |||
| mask = maskUtils.decode(rle) | |||
| return mask | |||
| def process_polygons(self, polygons): | |||
| """Convert polygons to list of ndarray and filter invalid polygons. | |||
| Args: | |||
| polygons (list[list]): Polygons of one instance. | |||
| Returns: | |||
| list[numpy.ndarray]: Processed polygons. | |||
| """ | |||
| polygons = [np.array(p) for p in polygons] | |||
| valid_polygons = [] | |||
| for polygon in polygons: | |||
| if len(polygon) % 2 == 0 and len(polygon) >= 6: | |||
| valid_polygons.append(polygon) | |||
| return valid_polygons | |||
| def _load_masks(self, results): | |||
| """Private function to load mask annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded mask annotations. | |||
| If ``self.poly2mask`` is set ``True``, `gt_mask` will contain | |||
| :obj:`BitmapMasks`. Otherwise, :obj:`PolygonMasks` is used. | |||
| """ | |||
| h, w = results['img_info']['height'], results['img_info']['width'] | |||
| gt_masks = results['ann_info']['masks'] | |||
| if self.poly2mask: | |||
| gt_masks = BitmapMasks( | |||
| [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) | |||
| else: | |||
| gt_masks = PolygonMasks( | |||
| [self.process_polygons(polygons) for polygons in gt_masks], h, | |||
| w) | |||
| results['gt_masks'] = gt_masks | |||
| results['mask_fields'].append('gt_masks') | |||
| return results | |||
| def _load_semantic_seg(self, results): | |||
| """Private function to load semantic segmentation annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`dataset`. | |||
| Returns: | |||
| dict: The dict contains loaded semantic segmentation annotations. | |||
| """ | |||
| import mmcv | |||
| if self.file_client is None: | |||
| self.file_client = mmcv.FileClient(**self.file_client_args) | |||
| filename = osp.join(results['seg_prefix'], | |||
| results['ann_info']['seg_map']) | |||
| img_bytes = self.file_client.get(filename) | |||
| results['gt_semantic_seg'] = mmcv.imfrombytes( | |||
| img_bytes, flag='unchanged').squeeze() | |||
| results['seg_fields'].append('gt_semantic_seg') | |||
| return results | |||
| def __call__(self, results): | |||
| """Call function to load multiple types annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded bounding box, label, mask and | |||
| semantic segmentation annotations. | |||
| """ | |||
| if self.with_bbox: | |||
| results = self._load_bboxes(results) | |||
| if results is None: | |||
| return None | |||
| if self.with_label: | |||
| results = self._load_labels(results) | |||
| if self.with_keypoints: | |||
| results = self._load_keypoints(results) | |||
| if self.with_mask: | |||
| results = self._load_masks(results) | |||
| if self.with_seg: | |||
| results = self._load_semantic_seg(results) | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(with_bbox={self.with_bbox}, ' | |||
| repr_str += f'with_label={self.with_label}, ' | |||
| repr_str += f'with_keypoints={self.with_keypoints}, ' | |||
| repr_str += f'with_mask={self.with_mask}, ' | |||
| repr_str += f'with_seg={self.with_seg}, ' | |||
| repr_str += f'poly2mask={self.poly2mask}, ' | |||
| repr_str += f'file_client_args={self.file_client_args})' | |||
| return repr_str | |||
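| # Hypothetical pipeline entry showing the new keypoints switch alongside | |||
| # the existing flags; keypoints land under results['gt_keypointss'] and | |||
| # are tracked via results['keypoints_fields']. | |||
| example_load = dict( | |||
| type='LoadAnnotationsV2', with_bbox=True, with_label=True, | |||
| with_keypoints=True) | |||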
| @@ -0,0 +1,737 @@ | |||
| """ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py | |||
| """ | |||
| import mmcv | |||
| import numpy as np | |||
| from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps | |||
| from mmdet.datasets.builder import PIPELINES | |||
| from numpy import random | |||
| @PIPELINES.register_module() | |||
| class ResizeV2(object): | |||
| """Resize images & bbox & mask &kps. | |||
| This transform resizes the input image to some scale. Bboxes and masks are | |||
| then resized with the same scale factor. If the input dict contains the key | |||
| "scale", then the scale in the input dict is used, otherwise the specified | |||
| scale in the init method is used. If the input dict contains the key | |||
| "scale_factor" (if MultiScaleFlipAug does not give img_scale but | |||
| scale_factor), the actual scale will be computed by image shape and | |||
| scale_factor. | |||
| `img_scale` can either be a tuple (single-scale) or a list of tuple | |||
| (multi-scale). There are 3 multiscale modes: | |||
| - ``ratio_range is not None``: randomly sample a ratio from the ratio \ | |||
| range and multiply it with the image scale. | |||
| - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ | |||
| sample a scale from the multiscale range. | |||
| - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ | |||
| sample a scale from multiple scales. | |||
| Args: | |||
| img_scale (tuple or list[tuple]): Image scales for resizing. | |||
| multiscale_mode (str): Either "range" or "value". | |||
| ratio_range (tuple[float]): (min_ratio, max_ratio) | |||
| keep_ratio (bool): Whether to keep the aspect ratio when resizing the | |||
| image. | |||
| bbox_clip_border (bool, optional): Whether clip the objects outside | |||
| the border of the image. Defaults to True. | |||
| backend (str): Image resize backend, choices are 'cv2' and 'pillow'. | |||
| These two backends generate slightly different results. Defaults | |||
| to 'cv2'. | |||
| override (bool, optional): Whether to override `scale` and | |||
| `scale_factor` so as to call resize twice. Default False. If True, | |||
| after the first resizing, the existing `scale` and `scale_factor` | |||
| will be ignored so the second resizing can be allowed. | |||
| This option is a work-around for multiple times of resize in DETR. | |||
| Defaults to False. | |||
| """ | |||
| def __init__(self, | |||
| img_scale=None, | |||
| multiscale_mode='range', | |||
| ratio_range=None, | |||
| keep_ratio=True, | |||
| bbox_clip_border=True, | |||
| backend='cv2', | |||
| override=False): | |||
| if img_scale is None: | |||
| self.img_scale = None | |||
| else: | |||
| if isinstance(img_scale, list): | |||
| self.img_scale = img_scale | |||
| else: | |||
| self.img_scale = [img_scale] | |||
| assert mmcv.is_list_of(self.img_scale, tuple) | |||
| if ratio_range is not None: | |||
| # mode 1: given a scale and a range of image ratio | |||
| assert len(self.img_scale) == 1 | |||
| else: | |||
| # mode 2: given multiple scales or a range of scales | |||
| assert multiscale_mode in ['value', 'range'] | |||
| self.backend = backend | |||
| self.multiscale_mode = multiscale_mode | |||
| self.ratio_range = ratio_range | |||
| self.keep_ratio = keep_ratio | |||
| # TODO: refactor the override option in Resize | |||
| self.override = override | |||
| self.bbox_clip_border = bbox_clip_border | |||
| @staticmethod | |||
| def random_select(img_scales): | |||
| """Randomly select an img_scale from given candidates. | |||
| Args: | |||
| img_scales (list[tuple]): Image scales for selection. | |||
| Returns: | |||
| (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \ | |||
| where ``img_scale`` is the selected image scale and \ | |||
| ``scale_idx`` is the selected index in the given candidates. | |||
| """ | |||
| assert mmcv.is_list_of(img_scales, tuple) | |||
| scale_idx = np.random.randint(len(img_scales)) | |||
| img_scale = img_scales[scale_idx] | |||
| return img_scale, scale_idx | |||
| @staticmethod | |||
| def random_sample(img_scales): | |||
| """Randomly sample an img_scale when ``multiscale_mode=='range'``. | |||
| Args: | |||
| img_scales (list[tuple]): Image scale range for sampling. | |||
| There must be two tuples in img_scales, which specify the lower | |||
| and upper bound of image scales. | |||
| Returns: | |||
| (tuple, None): Returns a tuple ``(img_scale, None)``, where \ | |||
| ``img_scale`` is the sampled scale and None is just a placeholder \ | |||
| to be consistent with :func:`random_select`. | |||
| """ | |||
| assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 | |||
| img_scale_long = [max(s) for s in img_scales] | |||
| img_scale_short = [min(s) for s in img_scales] | |||
| long_edge = np.random.randint( | |||
| min(img_scale_long), | |||
| max(img_scale_long) + 1) | |||
| short_edge = np.random.randint( | |||
| min(img_scale_short), | |||
| max(img_scale_short) + 1) | |||
| img_scale = (long_edge, short_edge) | |||
| return img_scale, None | |||
| @staticmethod | |||
| def random_sample_ratio(img_scale, ratio_range): | |||
| """Randomly sample an img_scale when ``ratio_range`` is specified. | |||
| A ratio will be randomly sampled from the range specified by | |||
| ``ratio_range``. Then it would be multiplied with ``img_scale`` to | |||
| generate sampled scale. | |||
| Args: | |||
| img_scale (tuple): Image scale base to multiply with ratio. | |||
| ratio_range (tuple[float]): The minimum and maximum ratio to scale | |||
| the ``img_scale``. | |||
| Returns: | |||
| (tuple, None): Returns a tuple ``(scale, None)``, where \ | |||
| ``scale`` is the sampled ratio multiplied with ``img_scale`` and \ | |||
| None is just a placeholder to be consistent with \ | |||
| :func:`random_select`. | |||
| """ | |||
| assert isinstance(img_scale, tuple) and len(img_scale) == 2 | |||
| min_ratio, max_ratio = ratio_range | |||
| assert min_ratio <= max_ratio | |||
| ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio | |||
| scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) | |||
| return scale, None | |||
| def _random_scale(self, results): | |||
| """Randomly sample an img_scale according to ``ratio_range`` and | |||
| ``multiscale_mode``. | |||
| If ``ratio_range`` is specified, a ratio will be sampled and be | |||
| multiplied with ``img_scale``. | |||
| If multiple scales are specified by ``img_scale``, a scale will be | |||
| sampled according to ``multiscale_mode``. | |||
| Otherwise, single scale will be used. | |||
| Args: | |||
| results (dict): Result dict from :obj:`dataset`. | |||
| Returns: | |||
| dict: Two new keys ``scale`` and ``scale_idx`` are added into \ | |||
| ``results``, which would be used by subsequent pipelines. | |||
| """ | |||
| if self.ratio_range is not None: | |||
| scale, scale_idx = self.random_sample_ratio( | |||
| self.img_scale[0], self.ratio_range) | |||
| elif len(self.img_scale) == 1: | |||
| scale, scale_idx = self.img_scale[0], 0 | |||
| elif self.multiscale_mode == 'range': | |||
| scale, scale_idx = self.random_sample(self.img_scale) | |||
| elif self.multiscale_mode == 'value': | |||
| scale, scale_idx = self.random_select(self.img_scale) | |||
| else: | |||
| raise NotImplementedError | |||
| results['scale'] = scale | |||
| results['scale_idx'] = scale_idx | |||
| def _resize_img(self, results): | |||
| """Resize images with ``results['scale']``.""" | |||
| for key in results.get('img_fields', ['img']): | |||
| if self.keep_ratio: | |||
| img, scale_factor = mmcv.imrescale( | |||
| results[key], | |||
| results['scale'], | |||
| return_scale=True, | |||
| backend=self.backend) | |||
| # the w_scale and h_scale have a minor difference | |||
| # a real fix should be done in the mmcv.imrescale in the future | |||
| new_h, new_w = img.shape[:2] | |||
| h, w = results[key].shape[:2] | |||
| w_scale = new_w / w | |||
| h_scale = new_h / h | |||
| else: | |||
| img, w_scale, h_scale = mmcv.imresize( | |||
| results[key], | |||
| results['scale'], | |||
| return_scale=True, | |||
| backend=self.backend) | |||
| results[key] = img | |||
| scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], | |||
| dtype=np.float32) | |||
| results['img_shape'] = img.shape | |||
| # in case that there is no padding | |||
| results['pad_shape'] = img.shape | |||
| results['scale_factor'] = scale_factor | |||
| results['keep_ratio'] = self.keep_ratio | |||
| def _resize_bboxes(self, results): | |||
| """Resize bounding boxes with ``results['scale_factor']``.""" | |||
| for key in results.get('bbox_fields', []): | |||
| bboxes = results[key] * results['scale_factor'] | |||
| if self.bbox_clip_border: | |||
| img_shape = results['img_shape'] | |||
| bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) | |||
| bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) | |||
| results[key] = bboxes | |||
| def _resize_keypoints(self, results): | |||
| """Resize keypoints with ``results['scale_factor']``.""" | |||
| for key in results.get('keypoints_fields', []): | |||
| keypointss = results[key].copy() | |||
| factors = results['scale_factor'] | |||
| assert factors[0] == factors[2] | |||
| assert factors[1] == factors[3] | |||
| keypointss[:, :, 0] *= factors[0] | |||
| keypointss[:, :, 1] *= factors[1] | |||
| if self.bbox_clip_border: | |||
| img_shape = results['img_shape'] | |||
| keypointss[:, :, 0] = np.clip(keypointss[:, :, 0], 0, | |||
| img_shape[1]) | |||
| keypointss[:, :, 1] = np.clip(keypointss[:, :, 1], 0, | |||
| img_shape[0]) | |||
| results[key] = keypointss | |||
| def _resize_masks(self, results): | |||
| """Resize masks with ``results['scale']``""" | |||
| for key in results.get('mask_fields', []): | |||
| if results[key] is None: | |||
| continue | |||
| if self.keep_ratio: | |||
| results[key] = results[key].rescale(results['scale']) | |||
| else: | |||
| results[key] = results[key].resize(results['img_shape'][:2]) | |||
| def _resize_seg(self, results): | |||
| """Resize semantic segmentation map with ``results['scale']``.""" | |||
| for key in results.get('seg_fields', []): | |||
| if self.keep_ratio: | |||
| gt_seg = mmcv.imrescale( | |||
| results[key], | |||
| results['scale'], | |||
| interpolation='nearest', | |||
| backend=self.backend) | |||
| else: | |||
| gt_seg = mmcv.imresize( | |||
| results[key], | |||
| results['scale'], | |||
| interpolation='nearest', | |||
| backend=self.backend) | |||
| results[key] = gt_seg  # write back under the iterated seg key | |||
| def __call__(self, results): | |||
| """Call function to resize images, bounding boxes, masks, semantic | |||
| segmentation map. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \ | |||
| 'keep_ratio' keys are added into result dict. | |||
| """ | |||
| if 'scale' not in results: | |||
| if 'scale_factor' in results: | |||
| img_shape = results['img'].shape[:2] | |||
| scale_factor = results['scale_factor'] | |||
| assert isinstance(scale_factor, float) | |||
| results['scale'] = tuple( | |||
| [int(x * scale_factor) for x in img_shape][::-1]) | |||
| else: | |||
| self._random_scale(results) | |||
| else: | |||
| if not self.override: | |||
| assert 'scale_factor' not in results, ( | |||
| 'scale and scale_factor cannot be both set.') | |||
| else: | |||
| results.pop('scale') | |||
| if 'scale_factor' in results: | |||
| results.pop('scale_factor') | |||
| self._random_scale(results) | |||
| self._resize_img(results) | |||
| self._resize_bboxes(results) | |||
| self._resize_keypoints(results) | |||
| self._resize_masks(results) | |||
| self._resize_seg(results) | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(img_scale={self.img_scale}, ' | |||
| repr_str += f'multiscale_mode={self.multiscale_mode}, ' | |||
| repr_str += f'ratio_range={self.ratio_range}, ' | |||
| repr_str += f'keep_ratio={self.keep_ratio}, ' | |||
| repr_str += f'bbox_clip_border={self.bbox_clip_border})' | |||
| return repr_str | |||
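| # Config sketches for the three multiscale modes described in the class | |||
| # docstring (values here are illustrative): | |||
| resize_by_value = dict(type='ResizeV2', img_scale=[(640, 640), (800, 800)], | |||
| multiscale_mode='value', keep_ratio=True) | |||
| resize_by_range = dict(type='ResizeV2', img_scale=[(480, 480), (800, 800)], | |||
| multiscale_mode='range', keep_ratio=True) | |||
| resize_by_ratio = dict(type='ResizeV2', img_scale=(640, 640), | |||
| ratio_range=(0.5, 1.5), keep_ratio=True) | |||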
| @PIPELINES.register_module() | |||
| class RandomFlipV2(object): | |||
| """Flip the image & bbox & mask & kps. | |||
| If the input dict contains the key "flip", then the flag will be used, | |||
| otherwise it will be randomly decided by a ratio specified in the init | |||
| method. | |||
| When random flip is enabled, ``flip_ratio``/``direction`` can either be a | |||
| float/string or tuple of float/string. There are 3 flip modes: | |||
| - ``flip_ratio`` is float, ``direction`` is string: the image will be | |||
| ``direction``ly flipped with probability of ``flip_ratio`` . | |||
| E.g., ``flip_ratio=0.5``, ``direction='horizontal'``, | |||
| then image will be horizontally flipped with probability of 0.5. | |||
| - ``flip_ratio`` is float, ``direction`` is list of string: the image will | |||
| be ``direction[i]``ly flipped with probability of | |||
| ``flip_ratio/len(direction)``. | |||
| E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``, | |||
| then image will be horizontally flipped with probability of 0.25, | |||
| vertically with probability of 0.25. | |||
| - ``flip_ratio`` is list of float, ``direction`` is list of string: | |||
| given ``len(flip_ratio) == len(direction)``, the image will | |||
| be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``. | |||
| E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal', | |||
| 'vertical']``, then image will be horizontally flipped with probability | |||
| of 0.3, vertically with probability of 0.5 | |||
| Args: | |||
| flip_ratio (float | list[float], optional): The flipping probability. | |||
| Default: None. | |||
| direction (str | list[str], optional): The flipping direction. Options | |||
| are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'. | |||
| If input is a list, the length must equal ``flip_ratio``. Each | |||
| element in ``flip_ratio`` indicates the flip probability of | |||
| corresponding direction. | |||
| """ | |||
| def __init__(self, flip_ratio=None, direction='horizontal'): | |||
| if isinstance(flip_ratio, list): | |||
| assert mmcv.is_list_of(flip_ratio, float) | |||
| assert 0 <= sum(flip_ratio) <= 1 | |||
| elif isinstance(flip_ratio, float): | |||
| assert 0 <= flip_ratio <= 1 | |||
| elif flip_ratio is None: | |||
| pass | |||
| else: | |||
| raise ValueError('flip_ratios must be None, float, ' | |||
| 'or list of float') | |||
| self.flip_ratio = flip_ratio | |||
| valid_directions = ['horizontal', 'vertical', 'diagonal'] | |||
| if isinstance(direction, str): | |||
| assert direction in valid_directions | |||
| elif isinstance(direction, list): | |||
| assert mmcv.is_list_of(direction, str) | |||
| assert set(direction).issubset(set(valid_directions)) | |||
| else: | |||
| raise ValueError('direction must be either str or list of str') | |||
| self.direction = direction | |||
| if isinstance(flip_ratio, list): | |||
| assert len(self.flip_ratio) == len(self.direction) | |||
| self.count = 0 | |||
| def bbox_flip(self, bboxes, img_shape, direction): | |||
| """Flip bboxes horizontally. | |||
| Args: | |||
| bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k) | |||
| img_shape (tuple[int]): Image shape (height, width) | |||
| direction (str): Flip direction. Options are 'horizontal', | |||
| 'vertical'. | |||
| Returns: | |||
| numpy.ndarray: Flipped bounding boxes. | |||
| """ | |||
| assert bboxes.shape[-1] % 4 == 0 | |||
| flipped = bboxes.copy() | |||
| if direction == 'horizontal': | |||
| w = img_shape[1] | |||
| flipped[..., 0::4] = w - bboxes[..., 2::4] | |||
| flipped[..., 2::4] = w - bboxes[..., 0::4] | |||
| elif direction == 'vertical': | |||
| h = img_shape[0] | |||
| flipped[..., 1::4] = h - bboxes[..., 3::4] | |||
| flipped[..., 3::4] = h - bboxes[..., 1::4] | |||
| elif direction == 'diagonal': | |||
| w = img_shape[1] | |||
| h = img_shape[0] | |||
| flipped[..., 0::4] = w - bboxes[..., 2::4] | |||
| flipped[..., 1::4] = h - bboxes[..., 3::4] | |||
| flipped[..., 2::4] = w - bboxes[..., 0::4] | |||
| flipped[..., 3::4] = h - bboxes[..., 1::4] | |||
| else: | |||
| raise ValueError(f"Invalid flipping direction '{direction}'") | |||
| return flipped | |||
| def keypoints_flip(self, keypointss, img_shape, direction): | |||
| """Flip keypoints horizontally.""" | |||
| assert direction == 'horizontal' | |||
| assert keypointss.shape[-1] == 3 | |||
| num_kps = keypointss.shape[1] | |||
| assert num_kps in [4, 5], f'Only support num_kps=4 or 5, got: {num_kps}' | |||
| assert keypointss.ndim == 3 | |||
| flipped = keypointss.copy() | |||
| if num_kps == 5: | |||
| flip_order = [1, 0, 2, 4, 3] | |||
| elif num_kps == 4: | |||
| flip_order = [3, 2, 1, 0] | |||
| for idx, a in enumerate(flip_order): | |||
| flipped[:, idx, :] = keypointss[:, a, :] | |||
| w = img_shape[1] | |||
| flipped[..., 0] = w - flipped[..., 0] | |||
| return flipped | |||
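| # Note on flip_order (assuming the usual 5-point face layout: left eye, | |||
| # right eye, nose, left mouth corner, right mouth corner): a horizontal | |||
| # flip swaps the left/right pairs, so indices [1, 0, 2, 4, 3] map each | |||
| # landmark to its mirrored counterpart; the 4-point order [3, 2, 1, 0] | |||
| # reverses an assumed left-to-right layout. | |||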
| def __call__(self, results): | |||
| """Call function to flip bounding boxes, masks, semantic segmentation | |||
| maps. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Flipped results, 'flip', 'flip_direction' keys are added \ | |||
| into result dict. | |||
| """ | |||
| if 'flip' not in results: | |||
| if isinstance(self.direction, list): | |||
| # None means non-flip | |||
| direction_list = self.direction + [None] | |||
| else: | |||
| # None means non-flip | |||
| direction_list = [self.direction, None] | |||
| if isinstance(self.flip_ratio, list): | |||
| non_flip_ratio = 1 - sum(self.flip_ratio) | |||
| flip_ratio_list = self.flip_ratio + [non_flip_ratio] | |||
| else: | |||
| non_flip_ratio = 1 - self.flip_ratio | |||
| # exclude non-flip | |||
| single_ratio = self.flip_ratio / (len(direction_list) - 1) | |||
| flip_ratio_list = [single_ratio] * (len(direction_list) | |||
| - 1) + [non_flip_ratio] | |||
| cur_dir = np.random.choice(direction_list, p=flip_ratio_list) | |||
| results['flip'] = cur_dir is not None | |||
| if 'flip_direction' not in results: | |||
| results['flip_direction'] = cur_dir | |||
| if results['flip']: | |||
| # flip image | |||
| for key in results.get('img_fields', ['img']): | |||
| results[key] = mmcv.imflip( | |||
| results[key], direction=results['flip_direction']) | |||
| # flip bboxes | |||
| for key in results.get('bbox_fields', []): | |||
| results[key] = self.bbox_flip(results[key], | |||
| results['img_shape'], | |||
| results['flip_direction']) | |||
| # flip kps | |||
| for key in results.get('keypoints_fields', []): | |||
| results[key] = self.keypoints_flip(results[key], | |||
| results['img_shape'], | |||
| results['flip_direction']) | |||
| # flip masks | |||
| for key in results.get('mask_fields', []): | |||
| results[key] = results[key].flip(results['flip_direction']) | |||
| # flip segs | |||
| for key in results.get('seg_fields', []): | |||
| results[key] = mmcv.imflip( | |||
| results[key], direction=results['flip_direction']) | |||
| return results | |||
| def __repr__(self): | |||
| return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' | |||
| @PIPELINES.register_module() | |||
| class RandomSquareCrop(object): | |||
| """Random crop the image & bboxes, the cropped patches have minimum IoU | |||
| requirement with original image & bboxes, the IoU threshold is randomly | |||
| selected from min_ious. | |||
| Args: | |||
| min_ious (tuple): minimum IoU threshold for all intersections with | |||
| bounding boxes | |||
| min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, | |||
| where a >= min_crop_size). | |||
| Note: | |||
| The keys for bboxes, labels and masks should be paired. That is, \ | |||
| `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ | |||
| `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. | |||
| """ | |||
| def __init__(self, | |||
| crop_ratio_range=None, | |||
| crop_choice=None, | |||
| bbox_clip_border=True, | |||
| big_face_ratio=0, | |||
| big_face_crop_choice=None): | |||
| self.crop_ratio_range = crop_ratio_range | |||
| self.crop_choice = crop_choice | |||
| self.big_face_crop_choice = big_face_crop_choice | |||
| self.bbox_clip_border = bbox_clip_border | |||
| assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) | |||
| if self.crop_ratio_range is not None: | |||
| self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range | |||
| self.bbox2label = { | |||
| 'gt_bboxes': 'gt_labels', | |||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||
| } | |||
| self.bbox2mask = { | |||
| 'gt_bboxes': 'gt_masks', | |||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||
| } | |||
| assert 0 <= big_face_ratio <= 1.0 | |||
| self.big_face_ratio = big_face_ratio | |||
| def __call__(self, results): | |||
| """Call function to crop images and bounding boxes with minimum IoU | |||
| constraint. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Result dict with images and bounding boxes cropped, \ | |||
| 'img_shape' key is updated. | |||
| """ | |||
| if 'img_fields' in results: | |||
| assert results['img_fields'] == ['img'], \ | |||
| 'Only single img_fields is allowed' | |||
| img = results['img'] | |||
| assert 'bbox_fields' in results | |||
| assert 'gt_bboxes' in results | |||
| # try to augment big-face images | |||
| find_bigface = False | |||
| if np.random.random() < self.big_face_ratio: | |||
| min_size = 100 # h and w | |||
| expand_ratio = 0.3 # expand ratio of the cropped face along both w and h | |||
| bbox = results['gt_bboxes'].copy() | |||
| lmks = results['gt_keypointss'].copy() | |||
| label = results['gt_labels'].copy() | |||
| # filter small faces | |||
| size_mask = ((bbox[:, 2] - bbox[:, 0]) > min_size) * ( | |||
| (bbox[:, 3] - bbox[:, 1]) > min_size) | |||
| bbox = bbox[size_mask] | |||
| lmks = lmks[size_mask] | |||
| label = label[size_mask] | |||
| # randomly choose a face that has no overlap with others | |||
| if len(bbox) > 0: | |||
| overlaps = bbox_overlaps(bbox, bbox) | |||
| overlaps -= np.eye(overlaps.shape[0]) | |||
| iou_mask = np.sum(overlaps, axis=1) == 0 | |||
| bbox = bbox[iou_mask] | |||
| lmks = lmks[iou_mask] | |||
| label = label[iou_mask] | |||
| if len(bbox) > 0: | |||
| choice = np.random.randint(len(bbox)) | |||
| bbox = bbox[choice] | |||
| lmks = lmks[choice] | |||
| label = [label[choice]] | |||
| w = bbox[2] - bbox[0] | |||
| h = bbox[3] - bbox[1] | |||
| x1 = bbox[0] - w * expand_ratio | |||
| x2 = bbox[2] + w * expand_ratio | |||
| y1 = bbox[1] - h * expand_ratio | |||
| y2 = bbox[3] + h * expand_ratio | |||
| x1, x2 = np.clip([x1, x2], 0, img.shape[1]) | |||
| y1, y2 = np.clip([y1, y2], 0, img.shape[0]) | |||
| bbox -= np.tile([x1, y1], 2) | |||
| lmks -= (x1, y1, 0) | |||
| find_bigface = True | |||
| img = img[int(y1):int(y2), int(x1):int(x2), :] | |||
| results['gt_bboxes'] = np.expand_dims(bbox, axis=0) | |||
| results['gt_keypointss'] = np.expand_dims(lmks, axis=0) | |||
| results['gt_labels'] = np.array(label) | |||
| results['img'] = img | |||
| boxes = results['gt_bboxes'] | |||
| h, w, c = img.shape | |||
| if self.crop_ratio_range is not None: | |||
| max_scale = self.crop_ratio_max | |||
| else: | |||
| max_scale = np.amax(self.crop_choice) | |||
| scale_retry = 0 | |||
| while True: | |||
| scale_retry += 1 | |||
| if scale_retry == 1 or max_scale > 1.0: | |||
| if self.crop_ratio_range is not None: | |||
| scale = np.random.uniform(self.crop_ratio_min, | |||
| self.crop_ratio_max) | |||
| elif self.crop_choice is not None: | |||
| scale = np.random.choice(self.crop_choice) | |||
| else: | |||
| scale = scale * 1.2 | |||
| if find_bigface: | |||
| # select a scale from big_face_crop_choice if in big_face mode | |||
| scale = np.random.choice(self.big_face_crop_choice) | |||
| for i in range(250): | |||
| long_side = max(w, h) | |||
| cw = int(scale * long_side) | |||
| ch = cw | |||
| # TODO +1 | |||
| if w == cw: | |||
| left = 0 | |||
| elif w > cw: | |||
| left = random.randint(0, w - cw) | |||
| else: | |||
| left = random.randint(w - cw, 0) | |||
| if h == ch: | |||
| top = 0 | |||
| elif h > ch: | |||
| top = random.randint(0, h - ch) | |||
| else: | |||
| top = random.randint(h - ch, 0) | |||
| patch = np.array( | |||
| (int(left), int(top), int(left + cw), int(top + ch)), | |||
| dtype=np.int32) | |||
| # the center of a box should be inside the cropped patch | |||
| # only adjust boxes and instance masks when the gt is not empty | |||
| # adjust boxes | |||
| def is_center_of_bboxes_in_patch(boxes, patch): | |||
| # TODO >= | |||
| center = (boxes[:, :2] + boxes[:, 2:]) / 2 | |||
| mask = \ | |||
| ((center[:, 0] > patch[0]) | |||
| * (center[:, 1] > patch[1]) | |||
| * (center[:, 0] < patch[2]) | |||
| * (center[:, 1] < patch[3])) | |||
| return mask | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| if not mask.any(): | |||
| continue | |||
| for key in results.get('bbox_fields', []): | |||
| boxes = results[key].copy() | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| boxes = boxes[mask] | |||
| if self.bbox_clip_border: | |||
| boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) | |||
| boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) | |||
| boxes -= np.tile(patch[:2], 2) | |||
| results[key] = boxes | |||
| # labels | |||
| label_key = self.bbox2label.get(key) | |||
| if label_key in results: | |||
| results[label_key] = results[label_key][mask] | |||
| # keypoints field | |||
| if key == 'gt_bboxes': | |||
| for kps_key in results.get('keypoints_fields', []): | |||
| keypointss = results[kps_key].copy() | |||
| keypointss = keypointss[mask, :, :] | |||
| if self.bbox_clip_border: | |||
| keypointss[:, :, : | |||
| 2] = keypointss[:, :, :2].clip( | |||
| max=patch[2:]) | |||
| keypointss[:, :, : | |||
| 2] = keypointss[:, :, :2].clip( | |||
| min=patch[:2]) | |||
| keypointss[:, :, 0] -= patch[0] | |||
| keypointss[:, :, 1] -= patch[1] | |||
| results[kps_key] = keypointss | |||
| # mask fields | |||
| mask_key = self.bbox2mask.get(key) | |||
| if mask_key in results: | |||
| results[mask_key] = results[mask_key][mask.nonzero() | |||
| [0]].crop(patch) | |||
| # adjust the img no matter whether the gt is empty before crop | |||
| rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 | |||
| patch_from = patch.copy() | |||
| patch_from[0] = max(0, patch_from[0]) | |||
| patch_from[1] = max(0, patch_from[1]) | |||
| patch_from[2] = min(img.shape[1], patch_from[2]) | |||
| patch_from[3] = min(img.shape[0], patch_from[3]) | |||
| patch_to = patch.copy() | |||
| patch_to[0] = max(0, patch_to[0] * -1) | |||
| patch_to[1] = max(0, patch_to[1] * -1) | |||
| patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) | |||
| patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) | |||
| rimg[patch_to[1]:patch_to[3], | |||
| patch_to[0]:patch_to[2], :] = img[ | |||
| patch_from[1]:patch_from[3], | |||
| patch_from[0]:patch_from[2], :] | |||
| img = rimg | |||
| results['img'] = img | |||
| results['img_shape'] = img.shape | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(crop_ratio_range={self.crop_ratio_range}, ' | |||
| repr_str += f'crop_choice={self.crop_choice}, ' | |||
| repr_str += f'big_face_ratio={self.big_face_ratio}, ' | |||
| repr_str += f'bbox_clip_border={self.bbox_clip_border})' | |||
| return repr_str | |||
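| # Hypothetical config for the new big-face branch: roughly big_face_ratio | |||
| # of samples crop tightly around a single isolated face and then draw the | |||
| # crop scale from big_face_crop_choice instead of crop_choice. | |||
| example_crop = dict( | |||
| type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0], | |||
| big_face_ratio=0.1, big_face_crop_choice=[1.5, 2.0]) | |||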
| @@ -13,7 +13,7 @@ class RetinaFaceDataset(CustomDataset): | |||
| CLASSES = ('FG', ) | |||
| def __init__(self, min_size=None, **kwargs): | |||
| self.NK = 5 | |||
| self.NK = kwargs.pop('num_kps', 5) | |||
| self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} | |||
| self.min_size = min_size | |||
| self.gt_path = kwargs.get('gt_path') | |||
| @@ -33,7 +33,8 @@ class RetinaFaceDataset(CustomDataset): | |||
| if len(values) > 4: | |||
| if len(values) > 5: | |||
| kps = np.array( | |||
| values[4:19], dtype=np.float32).reshape((self.NK, 3)) | |||
| values[4:4 + self.NK * 3], dtype=np.float32).reshape( | |||
| (self.NK, 3)) | |||
| for li in range(kps.shape[0]): | |||
| if (kps[li, :] == -1).all(): | |||
| kps[li][2] = 0.0 # weight = 0, ignore | |||
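| # Sketch of one annotation row under the generalized parsing (format | |||
| # assumed from the slice above): 4 bbox values followed by NK triplets | |||
| # of (x, y, visibility), where a row of -1s marks an ignored landmark. | |||
| # With NK=5 the keypoints occupy values[4:19]; with NK=4, values[4:16]. | |||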
| @@ -103,6 +103,7 @@ class SCRFDHead(AnchorHead): | |||
| scale_mode=1, | |||
| dw_conv=False, | |||
| use_kps=False, | |||
| num_kps=5, | |||
| loss_kps=dict( | |||
| type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), | |||
| **kwargs): | |||
| @@ -116,7 +117,7 @@ class SCRFDHead(AnchorHead): | |||
| self.scale_mode = scale_mode | |||
| self.use_dfl = True | |||
| self.dw_conv = dw_conv | |||
| self.NK = 5 | |||
| self.NK = num_kps | |||
| self.extra_flops = 0.0 | |||
| if loss_dfl is None or not loss_dfl: | |||
| self.use_dfl = False | |||
| @@ -323,8 +324,8 @@ class SCRFDHead(AnchorHead): | |||
| batch_size, -1, self.cls_out_channels).sigmoid() | |||
| bbox_pred = bbox_pred.permute(0, 2, 3, | |||
| 1).reshape(batch_size, -1, 4) | |||
| kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10) | |||
| kps_pred = kps_pred.permute(0, 2, 3, | |||
| 1).reshape(batch_size, -1, self.NK * 2) | |||
| return cls_score, bbox_pred, kps_pred | |||
| def forward_train(self, | |||
| @@ -788,7 +789,7 @@ class SCRFDHead(AnchorHead): | |||
| if self.use_dfl: | |||
| kps_pred = self.integral(kps_pred) * stride[0] | |||
| else: | |||
| kps_pred = kps_pred.reshape((-1, 10)) * stride[0] | |||
| kps_pred = kps_pred.reshape((-1, self.NK * 2)) * stride[0] | |||
| nms_pre = cfg.get('nms_pre', -1) | |||
| if nms_pre > 0 and scores.shape[0] > nms_pre: | |||
| @@ -815,7 +816,7 @@ class SCRFDHead(AnchorHead): | |||
| mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) | |||
| if mlvl_kps is not None: | |||
| scale_factor2 = torch.tensor( | |||
| [scale_factor[0], scale_factor[1]] * 5) | |||
| [scale_factor[0], scale_factor[1]] * self.NK) | |||
| mlvl_kps /= scale_factor2.to(mlvl_kps.device) | |||
| mlvl_scores = torch.cat(mlvl_scores) | |||
| @@ -54,7 +54,13 @@ class SCRFD(SingleStageDetector): | |||
| gt_bboxes_ignore) | |||
| return losses | |||
| def simple_test(self, img, img_metas, rescale=False): | |||
| def simple_test(self, | |||
| img, | |||
| img_metas, | |||
| rescale=False, | |||
| repeat_head=1, | |||
| output_kps_var=0, | |||
| output_results=1): | |||
| """Test function without test time augmentation. | |||
| Args: | |||
| @@ -62,6 +68,9 @@ class SCRFD(SingleStageDetector): | |||
| img_metas (list[dict]): List of image information. | |||
| rescale (bool, optional): Whether to rescale the results. | |||
| Defaults to False. | |||
| repeat_head (int): number of times to repeat inference in the head. | |||
| output_kps_var (int): whether to output the keypoint variance as a quality measure. | |||
| output_results (int): 0: return nothing; 1: return bbox only; 2: return both bbox and kps. | |||
| Returns: | |||
| list[list[np.ndarray]]: BBox results of each image and classes. | |||
| @@ -69,40 +78,71 @@ class SCRFD(SingleStageDetector): | |||
| corresponds to each class. | |||
| """ | |||
| x = self.extract_feat(img) | |||
| outs = self.bbox_head(x) | |||
| if torch.onnx.is_in_onnx_export(): | |||
| print('single_stage.py in-onnx-export') | |||
| print(outs.__class__) | |||
| cls_score, bbox_pred, kps_pred = outs | |||
| for c in cls_score: | |||
| print(c.shape) | |||
| for c in bbox_pred: | |||
| print(c.shape) | |||
| if self.bbox_head.use_kps: | |||
| for c in kps_pred: | |||
| assert repeat_head >= 1 | |||
| kps_out0 = [] | |||
| kps_out1 = [] | |||
| kps_out2 = [] | |||
| for i in range(repeat_head): | |||
| outs = self.bbox_head(x) | |||
| kps_out0 += [outs[2][0].detach().cpu().numpy()] | |||
| kps_out1 += [outs[2][1].detach().cpu().numpy()] | |||
| kps_out2 += [outs[2][2].detach().cpu().numpy()] | |||
| if output_kps_var: | |||
| var0 = np.var(np.vstack(kps_out0), axis=0).mean() | |||
| var1 = np.var(np.vstack(kps_out1), axis=0).mean() | |||
| var2 = np.var(np.vstack(kps_out2), axis=0).mean() | |||
| var = np.mean([var0, var1, var2]) | |||
| else: | |||
| var = None | |||
| if output_results > 0: | |||
| if torch.onnx.is_in_onnx_export(): | |||
| print('single_stage.py in-onnx-export') | |||
| print(outs.__class__) | |||
| cls_score, bbox_pred, kps_pred = outs | |||
| for c in cls_score: | |||
| print(c.shape) | |||
| for c in bbox_pred: | |||
| print(c.shape) | |||
| return (cls_score, bbox_pred, kps_pred) | |||
| else: | |||
| return (cls_score, bbox_pred) | |||
| bbox_list = self.bbox_head.get_bboxes( | |||
| *outs, img_metas, rescale=rescale) | |||
| if self.bbox_head.use_kps: | |||
| for c in kps_pred: | |||
| print(c.shape) | |||
| return (cls_score, bbox_pred, kps_pred) | |||
| else: | |||
| return (cls_score, bbox_pred) | |||
| bbox_list = self.bbox_head.get_bboxes( | |||
| *outs, img_metas, rescale=rescale) | |||
| # return kps if use_kps | |||
| if len(bbox_list[0]) == 2: | |||
| bbox_results = [ | |||
| bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) | |||
| for det_bboxes, det_labels in bbox_list | |||
| ] | |||
| elif len(bbox_list[0]) == 3: | |||
| bbox_results = [ | |||
| bbox2result( | |||
| det_bboxes, | |||
| det_labels, | |||
| self.bbox_head.num_classes, | |||
| kps=det_kps) | |||
| for det_bboxes, det_labels, det_kps in bbox_list | |||
| ] | |||
| return bbox_results | |||
| # return kps if use_kps | |||
| if len(bbox_list[0]) == 2: | |||
| bbox_results = [ | |||
| bbox2result(det_bboxes, det_labels, | |||
| self.bbox_head.num_classes) | |||
| for det_bboxes, det_labels in bbox_list | |||
| ] | |||
| elif len(bbox_list[0]) == 3: | |||
| if output_results == 2: | |||
| bbox_results = [ | |||
| bbox2result( | |||
| det_bboxes, | |||
| det_labels, | |||
| self.bbox_head.num_classes, | |||
| kps=det_kps, | |||
| num_kps=self.bbox_head.NK) | |||
| for det_bboxes, det_labels, det_kps in bbox_list | |||
| ] | |||
| elif output_results == 1: | |||
| bbox_results = [ | |||
| bbox2result(det_bboxes, det_labels, | |||
| self.bbox_head.num_classes) | |||
| for det_bboxes, det_labels, _ in bbox_list | |||
| ] | |||
| else: | |||
| bbox_results = None | |||
| if var is not None: | |||
| return bbox_results, var | |||
| else: | |||
| return bbox_results | |||
| def feature_test(self, img): | |||
| x = self.extract_feat(img) | |||
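The new `simple_test` flags allow probing keypoint stability by running the head several times and averaging the per-level variance of its keypoint outputs. A hedged usage sketch — `detector`, `img`, and `img_metas` are assumed to come from a standard mmdet test setup and are not part of this diff:

```python
# Sketch only: estimate keypoint stability via repeated head inference.
# Assumes the head was built with use_kps=True so it emits kps predictions.
bbox_results, kps_var = detector.simple_test(
    img,
    img_metas,
    rescale=True,
    repeat_head=5,     # run the bbox head 5 times on the same features
    output_kps_var=1,  # return the mean variance of the kps outputs
    output_results=1)  # 1: boxes only; 2: boxes and keypoints
print(f'keypoint variance (quality proxy): {kps_var:.6f}')
```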
| @@ -0,0 +1,71 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os.path as osp | |||
| from copy import deepcopy | |||
| from typing import Any, Dict | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| __all__ = ['ScrfdDetect'] | |||
| @MODELS.register_module(Tasks.face_detection, module_name=Models.scrfd) | |||
| class ScrfdDetect(TorchModel): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """initialize the face detection model from the `model_dir` path. | |||
| Args: | |||
| model_dir (str): the model path. | |||
| """ | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| from mmcv import Config | |||
| from mmcv.parallel import MMDataParallel | |||
| from mmcv.runner import load_checkpoint | |||
| from mmdet.models import build_detector | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD | |||
| cfg = Config.fromfile(osp.join(model_dir, 'mmcv_scrfd.py')) | |||
| ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) | |||
| cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3) | |||
| detector = build_detector(cfg.model) | |||
| logger.info(f'loading model from {ckpt_path}') | |||
| device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') | |||
| load_checkpoint(detector, ckpt_path, map_location=device) | |||
| detector = MMDataParallel(detector, device_ids=[0]) | |||
| detector.eval() | |||
| self.detector = detector | |||
| logger.info('load model done') | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| result = self.detector( | |||
| return_loss=False, | |||
| rescale=True, | |||
| img=[input['img'][0].unsqueeze(0)], | |||
| img_metas=[[dict(input['img_metas'][0].data)]], | |||
| output_results=2) | |||
| assert result is not None | |||
| result = result[0][0] | |||
| bboxes = result[:, :4].tolist() | |||
| kpss = result[:, 5:].tolist() | |||
| scores = result[:, 4].tolist() | |||
| return { | |||
| OutputKeys.SCORES: scores, | |||
| OutputKeys.BOXES: bboxes, | |||
| OutputKeys.KEYPOINTS: kpss | |||
| } | |||
| def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: | |||
| return input | |||
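For reference, a minimal invocation sketch of `ScrfdDetect`. The model directory layout follows the constants used above; `img_tensor` and `img_meta` (an mmcv DataContainer with a `.data` attribute) are assumed to be produced by the usual mmdet test pipeline:

```python
# Minimal sketch, not a verified snippet: model_dir must contain
# mmcv_scrfd.py and the torch checkpoint named in ModelFile.
from modelscope.outputs import OutputKeys

detector = ScrfdDetect(model_dir='/path/to/scrfd_model', score_thr=0.5)
outputs = detector({'img': [img_tensor], 'img_metas': [img_meta]})
print(outputs[OutputKeys.SCORES])     # one confidence per detected face
print(outputs[OutputKeys.BOXES])      # [x1, y1, x2, y2] per face
print(outputs[OutputKeys.KEYPOINTS])  # flattened landmark coordinates
```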
| @@ -0,0 +1,20 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .hand_2d_keypoints import Hand2dKeyPoints | |||
| else: | |||
| _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']} | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1,16 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from easycv.models.pose import TopDown | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||
| from modelscope.utils.constant import Tasks | |||
| @MODELS.register_module( | |||
| group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints) | |||
| class Hand2dKeyPoints(EasyCVBaseModel, TopDown): | |||
| def __init__(self, model_dir=None, *args, **kwargs): | |||
| EasyCVBaseModel.__init__(self, model_dir, args, kwargs) | |||
| TopDown.__init__(self, *args, **kwargs) | |||
| @@ -0,0 +1,22 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .human_wholebody_keypoint import HumanWholeBodyKeypoint | |||
| else: | |||
| _import_structure = { | |||
| 'human_wholebody_keypoint': ['HumanWholeBodyKeypoint'] | |||
| } | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1,17 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from easycv.models.pose.top_down import TopDown | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||
| from modelscope.utils.constant import Tasks | |||
| @MODELS.register_module( | |||
| group_key=Tasks.human_wholebody_keypoint, | |||
| module_name=Models.human_wholebody_keypoint) | |||
| class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown): | |||
| def __init__(self, model_dir=None, *args, **kwargs): | |||
| EasyCVBaseModel.__init__(self, model_dir, args, kwargs) | |||
| TopDown.__init__(self, *args, **kwargs) | |||
| @@ -0,0 +1,20 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .image_body_reshaping import ImageBodyReshaping | |||
| else: | |||
| _import_structure = {'image_body_reshaping': ['ImageBodyReshaping']} | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1,128 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| from typing import Any, Dict | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import Tensor, TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from .model import FlowGenerator | |||
| from .person_info import PersonInfo | |||
| from .pose_estimator.body import Body | |||
| from .slim_utils import image_warp_grid1, resize_on_long_side | |||
| logger = get_logger() | |||
| __all__ = ['ImageBodyReshaping'] | |||
| @MODELS.register_module( | |||
| Tasks.image_body_reshaping, module_name=Models.image_body_reshaping) | |||
| class ImageBodyReshaping(TorchModel): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """initialize the image body reshaping model from the `model_dir` path. | |||
| Args: | |||
| model_dir (str): the model path. | |||
| """ | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| if torch.cuda.is_available(): | |||
| self.device = torch.device('cuda') | |||
| else: | |||
| self.device = torch.device('cpu') | |||
| self.degree = 1.0 | |||
| self.reshape_model = FlowGenerator(n_channels=16).to(self.device) | |||
| model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
| checkpoints = torch.load(model_path, map_location=torch.device('cpu')) | |||
| self.reshape_model.load_state_dict( | |||
| checkpoints['state_dict'], strict=True) | |||
| self.reshape_model.eval() | |||
| logger.info('load body reshaping model done') | |||
| pose_model_ckpt = os.path.join(model_dir, 'body_pose_model.pth') | |||
| self.pose_esti = Body(pose_model_ckpt, self.device) | |||
| logger.info('load pose model done') | |||
| def pred_joints(self, img): | |||
| if img is None: | |||
| return None | |||
| small_src, resize_scale = resize_on_long_side(img, 300) | |||
| body_joints = self.pose_esti(small_src) | |||
| if body_joints.shape[0] >= 1: | |||
| body_joints[:, :, :2] = body_joints[:, :, :2] / resize_scale | |||
| return body_joints | |||
| def pred_flow(self, img): | |||
| body_joints = self.pred_joints(img) | |||
| small_size = 1200 | |||
| if img.shape[0] > small_size or img.shape[1] > small_size: | |||
| _img, _scale = resize_on_long_side(img, small_size) | |||
| body_joints[:, :, :2] = body_joints[:, :, :2] * _scale | |||
| else: | |||
| _img = img | |||
| # We only reshape one person | |||
| if body_joints.shape[0] != 1: | |||
| return None | |||
| person = PersonInfo(body_joints[0]) | |||
| with torch.no_grad(): | |||
| person_pred = person.pred_flow(_img, self.reshape_model, | |||
| self.device) | |||
| flow = np.dstack((person_pred['rDx'], person_pred['rDy'])) | |||
| scale = img.shape[0] * 1.0 / flow.shape[0] | |||
| flow = cv2.resize(flow, (img.shape[1], img.shape[0])) | |||
| flow *= scale | |||
| return flow | |||
| def warp(self, src_img, flow): | |||
| X_flow = flow[..., 0] | |||
| Y_flow = flow[..., 1] | |||
| X_flow = np.ascontiguousarray(X_flow) | |||
| Y_flow = np.ascontiguousarray(Y_flow) | |||
| pred = image_warp_grid1(X_flow, Y_flow, src_img, 1.0, 0, 0) | |||
| return pred | |||
| def inference(self, img): | |||
| img = img.cpu().numpy() | |||
| img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) | |||
| flow = self.pred_flow(img) | |||
| if flow is None: | |||
| return img | |||
| assert flow.shape[:2] == img.shape[:2] | |||
| mag, ang = cv2.cartToPolar(flow[..., 0] + 1e-8, flow[..., 1] + 1e-8) | |||
| mag -= 3 | |||
| mag[mag <= 0] = 0 | |||
| x, y = cv2.polarToCart(mag, ang, angleInDegrees=False) | |||
| flow = np.dstack((x, y)) | |||
| flow *= self.degree | |||
| pred = self.warp(img, flow) | |||
| out_img = np.clip(pred, 0, 255) | |||
| logger.info('model inference done') | |||
| return out_img.astype(np.uint8) | |||
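A hedged end-to-end sketch of the reshaping model. The checkpoint file names follow the constants loaded in `__init__` above; the paths are placeholders. The input is a BGR image tensor (the `inference` method converts BGR to RGB internally), and the output is an RGB uint8 array:

```python
# Illustrative usage only.
import cv2
import torch

model = ImageBodyReshaping(model_dir='/path/to/body_reshaping_model')
bgr = cv2.imread('person.jpg')                 # BGR uint8, as cv2 reads it
out_rgb = model.inference(torch.from_numpy(bgr))
cv2.imwrite('reshaped.jpg', cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR))
```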
| @@ -0,0 +1,189 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class ConvLayer(nn.Module): | |||
| def __init__(self, in_ch, out_ch): | |||
| super(ConvLayer, self).__init__() | |||
| self.conv = nn.Sequential( | |||
| nn.ReflectionPad2d(1), | |||
| nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=0), | |||
| nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True)) | |||
| def forward(self, x): | |||
| x = self.conv(x) | |||
| return x | |||
| class SASA(nn.Module): | |||
| def __init__(self, in_dim): | |||
| super(SASA, self).__init__() | |||
| self.chanel_in = in_dim | |||
| self.query_conv = nn.Conv2d( | |||
| in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) | |||
| self.key_conv = nn.Conv2d( | |||
| in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) | |||
| self.value_conv = nn.Conv2d( | |||
| in_channels=in_dim, out_channels=in_dim, kernel_size=1) | |||
| self.mag_conv = nn.Conv2d( | |||
| in_channels=5, out_channels=in_dim // 32, kernel_size=1) | |||
| self.gamma = nn.Parameter(torch.zeros(1)) | |||
| self.softmax = nn.Softmax(dim=-1) | |||
| self.sigmoid = nn.Sigmoid() | |||
| def structure_encoder(self, paf_mag, target_height, target_width): | |||
| torso_mask = torch.sum(paf_mag[:, 1:3, :, :], dim=1, keepdim=True) | |||
| torso_mask = torch.clamp(torso_mask, 0, 1) | |||
| arms_mask = torch.sum(paf_mag[:, 4:8, :, :], dim=1, keepdim=True) | |||
| arms_mask = torch.clamp(arms_mask, 0, 1) | |||
| legs_mask = torch.sum(paf_mag[:, 8:12, :, :], dim=1, keepdim=True) | |||
| legs_mask = torch.clamp(legs_mask, 0, 1) | |||
| fg_mask = paf_mag[:, 12, :, :].unsqueeze(1) | |||
| bg_mask = 1 - fg_mask | |||
| Y = torch.cat((arms_mask, torso_mask, legs_mask, fg_mask, bg_mask), | |||
| dim=1) | |||
| Y = F.interpolate(Y, size=(target_height, target_width), mode='area') | |||
| return Y | |||
| def forward(self, X, PAF_mag): | |||
| """extract self-attention features. | |||
| Args: | |||
| X : input feature maps (B x C x H x W) | |||
| PAF_mag : PAF magnitude maps (B x C x H x W); 1 denotes connectivity, 0 denotes non-connectivity | |||
| Returns: | |||
| out : self-attention value + input feature (same shape as X) | |||
| Y : structure maps (B x 5 x H x W) produced by the structure encoder | |||
| """ | |||
| m_batchsize, C, height, width = X.size() | |||
| Y = self.structure_encoder(PAF_mag, height, width) | |||
| connectivity_mask_vec = self.mag_conv(Y).view(m_batchsize, -1, | |||
| width * height) | |||
| affinity = torch.bmm( | |||
| connectivity_mask_vec.permute(0, 2, 1), connectivity_mask_vec) | |||
| affinity_centered = affinity - torch.mean(affinity) | |||
| affinity_sigmoid = self.sigmoid(affinity_centered) | |||
| proj_query = self.query_conv(X).view(m_batchsize, -1, | |||
| width * height).permute(0, 2, 1) | |||
| proj_key = self.key_conv(X).view(m_batchsize, -1, width * height) | |||
| selfatten_map = torch.bmm(proj_query, proj_key) | |||
| selfatten_centered = selfatten_map - torch.mean( | |||
| selfatten_map) # centering | |||
| selfatten_sigmoid = self.sigmoid(selfatten_centered) | |||
| SASA_map = selfatten_sigmoid * affinity_sigmoid | |||
| proj_value = self.value_conv(X).view(m_batchsize, -1, width * height) | |||
| out = torch.bmm(proj_value, SASA_map.permute(0, 2, 1)) | |||
| out = out.view(m_batchsize, C, height, width) | |||
| out = self.gamma * out + X | |||
| return out, Y | |||
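As a shape sanity check (illustrative tensors only): the SASA block preserves the feature-map shape, and the structure encoder collapses the PAF magnitudes into five masks (arms / torso / legs / foreground / background) at feature resolution:

```python
# Shape check with made-up tensors: X is the 1024-channel encoder output,
# PAF_mag a 13-channel part-affinity magnitude map at input resolution.
import torch

sasa = SASA(in_dim=1024)
X = torch.randn(1, 1024, 16, 16)
PAF_mag = torch.rand(1, 13, 64, 64)
out, Y = sasa(X, PAF_mag)
assert out.shape == X.shape        # attention residual keeps feature shape
assert Y.shape == (1, 5, 16, 16)   # arms / torso / legs / fg / bg maps
```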
| class FlowGenerator(nn.Module): | |||
| def __init__(self, n_channels, deep_supervision=False): | |||
| super(FlowGenerator, self).__init__() | |||
| self.deep_supervision = deep_supervision | |||
| self.Encoder = nn.Sequential( | |||
| ConvLayer(n_channels, 64), | |||
| ConvLayer(64, 64), | |||
| nn.MaxPool2d(2), | |||
| ConvLayer(64, 128), | |||
| ConvLayer(128, 128), | |||
| nn.MaxPool2d(2), | |||
| ConvLayer(128, 256), | |||
| ConvLayer(256, 256), | |||
| nn.MaxPool2d(2), | |||
| ConvLayer(256, 512), | |||
| ConvLayer(512, 512), | |||
| nn.MaxPool2d(2), | |||
| ConvLayer(512, 1024), | |||
| ConvLayer(1024, 1024), | |||
| ConvLayer(1024, 1024), | |||
| ConvLayer(1024, 1024), | |||
| ConvLayer(1024, 1024), | |||
| ) | |||
| self.SASA = SASA(in_dim=1024) | |||
| self.Decoder = nn.Sequential( | |||
| ConvLayer(1024, 1024), | |||
| nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), | |||
| ConvLayer(1024, 512), | |||
| ConvLayer(512, 512), | |||
| nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), | |||
| ConvLayer(512, 256), | |||
| ConvLayer(256, 256), | |||
| ConvLayer(256, 128), | |||
| ConvLayer(128, 64), | |||
| ConvLayer(64, 32), | |||
| nn.Conv2d(32, 2, kernel_size=1, padding=0), | |||
| nn.Tanh(), | |||
| nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True), | |||
| ) | |||
| dilation_ksize = 17 | |||
| self.dilation = torch.nn.MaxPool2d( | |||
| kernel_size=dilation_ksize, | |||
| stride=1, | |||
| padding=int((dilation_ksize - 1) / 2)) | |||
| def warp(self, x, flow, mode='bilinear', padding_mode='zeros', coff=0.2): | |||
| n, c, h, w = x.size() | |||
| yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)]) | |||
| xv = xv.float() / (w - 1) * 2.0 - 1 | |||
| yv = yv.float() / (h - 1) * 2.0 - 1 | |||
| grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0) | |||
| grid = grid.to(flow.device) | |||
| grid_x = grid + 2 * flow * coff | |||
| warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode) | |||
| return warp_x | |||
| def forward(self, img, skeleton_map, coef=0.2): | |||
| """extract self-attention features. | |||
| Args: | |||
| img : input numpy image | |||
| skeleton_map : skeleton map of input image | |||
| coef: warp degree | |||
| Returns: | |||
| warp_x : warped image | |||
| flow: predicted flow | |||
| """ | |||
| img_concat = torch.cat((img, skeleton_map), dim=1) | |||
| X = self.Encoder(img_concat) | |||
| _, _, height, width = X.size() | |||
| # directly get PAF magnitude from skeleton maps via dilation | |||
| PAF_mag = self.dilation((skeleton_map + 1.0) * 0.5) | |||
| out, Y = self.SASA(X, PAF_mag) | |||
| flow = self.Decoder(out) | |||
| flow = flow.permute(0, 2, 3, 1) # [n, 2, h, w] ==> [n, h, w, 2] | |||
| warp_x = self.warp(img, flow, coff=coef) | |||
| warp_x = torch.clamp(warp_x, min=-1.0, max=1.0) | |||
| return warp_x, flow | |||
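A quick forward-pass sketch with assumed shapes: three image channels plus a 13-channel skeleton map match `FlowGenerator(n_channels=16)` as the model is constructed above; inputs are expected in [-1, 1] as prepared in `PersonInfo`:

```python
# Assumed shapes only; eval() avoids batch-size-1 BatchNorm statistics.
import torch

gen = FlowGenerator(n_channels=16).eval()
img = torch.randn(1, 3, 256, 256)
skel = torch.randn(1, 13, 256, 256).clamp(-1, 1)
with torch.no_grad():
    warped, flow = gen(img, skel)
assert warped.shape == img.shape
assert flow.shape == (1, 256, 256, 2)  # per-pixel (dx, dy) offsets
```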
| @@ -0,0 +1,339 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import copy | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from .slim_utils import (enlarge_box_tblr, gen_skeleton_map, | |||
| get_map_fusion_map_cuda, get_mask_bbox, | |||
| resize_on_long_side) | |||
| class PersonInfo(object): | |||
| def __init__(self, joints): | |||
| self.joints = joints | |||
| self.flow = None | |||
| self.pad_boder = False | |||
| self.height_expand = 0 | |||
| self.width_expand = 0 | |||
| self.coeff = 0.2 | |||
| self.network_input_W = 256 | |||
| self.network_input_H = 256 | |||
| self.divider = 20 | |||
| self.flow_scales = ['upper_2'] | |||
| def update_attribute(self, pad_boder, height_expand, width_expand): | |||
| self.pad_boder = pad_boder | |||
| self.height_expand = height_expand | |||
| self.width_expand = width_expand | |||
| if pad_boder: | |||
| self.joints[:, 0] += width_expand | |||
| self.joints[:, 1] += height_expand | |||
| def pred_flow(self, img, flow_net, device): | |||
| with torch.no_grad(): | |||
| if img is None: | |||
| print('image is none') | |||
| self.flow = None | |||
| return None | |||
| if len(img.shape) == 2: | |||
| img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) | |||
| if self.pad_boder: | |||
| height_expand = self.height_expand | |||
| width_expand = self.width_expand | |||
| pad_img = cv2.copyMakeBorder( | |||
| img, | |||
| height_expand, | |||
| height_expand, | |||
| width_expand, | |||
| width_expand, | |||
| cv2.BORDER_CONSTANT, | |||
| value=(127, 127, 127)) | |||
| else: | |||
| height_expand = 0 | |||
| width_expand = 0 | |||
| pad_img = img.copy() | |||
| canvas = np.zeros( | |||
| shape=(pad_img.shape[0], pad_img.shape[1]), dtype=np.float32) | |||
| self.human_joint_box = self.__joint_to_body_box() | |||
| self.human_box = enlarge_box_tblr( | |||
| self.human_joint_box, pad_img, ratio=0.25) | |||
| human_box_height = self.human_box[1] - self.human_box[0] | |||
| human_box_width = self.human_box[3] - self.human_box[2] | |||
| self.leg_joint_box = self.__joint_to_leg_box() | |||
| self.leg_box = enlarge_box_tblr( | |||
| self.leg_joint_box, pad_img, ratio=0.25) | |||
| self.arm_joint_box = self.__joint_to_arm_box() | |||
| self.arm_box = enlarge_box_tblr( | |||
| self.arm_joint_box, pad_img, ratio=0.1) | |||
| x_flows = [] | |||
| y_flows = [] | |||
| multi_bbox = [] | |||
| for scale in self.flow_scales: # better for metric | |||
| scale_value = float(scale.split('_')[-1]) | |||
| arm_box = copy.deepcopy(self.arm_box) | |||
| if arm_box[0] is None: | |||
| arm_box = self.human_box | |||
| arm_box_height = arm_box[1] - arm_box[0] | |||
| arm_box_width = arm_box[3] - arm_box[2] | |||
| roi_bbox = None | |||
| if arm_box_width < human_box_width * 0.1 or arm_box_height < human_box_height * 0.1: | |||
| roi_bbox = self.human_box | |||
| else: | |||
| arm_box = enlarge_box_tblr( | |||
| arm_box, pad_img, ratio=scale_value) | |||
| if scale == 'upper_0.2': | |||
| arm_box[0] = min(arm_box[0], int(self.joints[0][1])) | |||
| if scale.startswith('upper'): | |||
| roi_bbox = [ | |||
| max(self.human_box[0], arm_box[0]), | |||
| min(self.human_box[1], arm_box[1]), | |||
| max(self.human_box[2], arm_box[2]), | |||
| min(self.human_box[3], arm_box[3]) | |||
| ] | |||
| if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ | |||
| 3] - roi_bbox[2] < 1: | |||
| continue | |||
| elif scale.startswith('lower'): | |||
| roi_bbox = [ | |||
| max(self.human_box[0], self.leg_box[0]), | |||
| min(self.human_box[1], self.leg_box[1]), | |||
| max(self.human_box[2], self.leg_box[2]), | |||
| min(self.human_box[3], self.leg_box[3]) | |||
| ] | |||
| if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ | |||
| 3] - roi_bbox[2] < 1: | |||
| continue | |||
| skel_map, roi_bbox = gen_skeleton_map( | |||
| self.joints, 'depth', input_roi_box=roi_bbox) | |||
| if roi_bbox is None: | |||
| continue | |||
| if skel_map.dtype != np.float32: | |||
| skel_map = skel_map.astype(np.float32) | |||
| skel_map -= 1.0 # [0,2] ->[-1,1] | |||
| multi_bbox.append(roi_bbox) | |||
| roi_bbox_height = roi_bbox[1] - roi_bbox[0] | |||
| roi_bbox_width = roi_bbox[3] - roi_bbox[2] | |||
| assert skel_map.shape[0] == roi_bbox_height | |||
| assert skel_map.shape[1] == roi_bbox_width | |||
| roi_height_pad = roi_bbox_height // self.divider | |||
| roi_width_pad = roi_bbox_width // self.divider | |||
| paded_roi_h = roi_bbox_height + 2 * roi_height_pad | |||
| paded_roi_w = roi_bbox_width + 2 * roi_width_pad | |||
| roi_height_pad_joint = skel_map.shape[0] // self.divider | |||
| roi_width_pad_joint = skel_map.shape[1] // self.divider | |||
| skel_map = np.pad( | |||
| skel_map, | |||
| ((roi_height_pad_joint, roi_height_pad_joint), | |||
| (roi_width_pad_joint, roi_width_pad_joint), (0, 0)), | |||
| 'constant', | |||
| constant_values=-1) | |||
| skel_map_resized = cv2.resize( | |||
| skel_map, (self.network_input_W, self.network_input_H)) | |||
| skel_map_resized[skel_map_resized < 0] = -1.0 | |||
| skel_map_resized[skel_map_resized > -0.5] = 1.0 | |||
| skel_map_transformed = torch.from_numpy( | |||
| skel_map_resized.transpose((2, 0, 1))) | |||
| roi_npy = pad_img[roi_bbox[0]:roi_bbox[1], | |||
| roi_bbox[2]:roi_bbox[3], :].copy() | |||
| if roi_npy.dtype != np.float32: | |||
| roi_npy = roi_npy.astype(np.float32) | |||
| roi_npy = np.pad(roi_npy, | |||
| ((roi_height_pad, roi_height_pad), | |||
| (roi_width_pad, roi_width_pad), (0, 0)), | |||
| 'edge') | |||
| roi_npy = roi_npy[:, :, ::-1] | |||
| roi_npy = cv2.resize( | |||
| roi_npy, (self.network_input_W, self.network_input_H)) | |||
| roi_npy *= 1.0 / 255 | |||
| roi_npy -= 0.5 | |||
| roi_npy *= 2 | |||
| rgb_tensor = torch.from_numpy(roi_npy.transpose((2, 0, 1))) | |||
| rgb_tensor = rgb_tensor.unsqueeze(0).to(device) | |||
| skel_map_tensor = skel_map_transformed.unsqueeze(0).to(device) | |||
| warped_img_val, flow_field_val = flow_net( | |||
| rgb_tensor, skel_map_tensor | |||
| ) # inference, connectivity_mask [1,12,16,16] | |||
| flow_field_val = flow_field_val.detach().squeeze().cpu().numpy() | |||
| flow_field_val = cv2.resize( | |||
| flow_field_val, (paded_roi_w, paded_roi_h), | |||
| interpolation=cv2.INTER_LINEAR) | |||
| flow_field_val[..., 0] = flow_field_val[ | |||
| ..., 0] * paded_roi_w * 0.5 * 2 * self.coeff | |||
| flow_field_val[..., 1] = flow_field_val[ | |||
| ..., 1] * paded_roi_h * 0.5 * 2 * self.coeff | |||
| # remove pad areas | |||
| flow_field_val = flow_field_val[ | |||
| roi_height_pad:flow_field_val.shape[0] - roi_height_pad, | |||
| roi_width_pad:flow_field_val.shape[1] - roi_width_pad, :] | |||
| diffuse_width = max(roi_bbox_width // 3, 1) | |||
| diffuse_height = max(roi_bbox_height // 3, 1) | |||
| assert roi_bbox_width == flow_field_val.shape[1] | |||
| assert roi_bbox_height == flow_field_val.shape[0] | |||
| origin_flow = np.zeros( | |||
| (pad_img.shape[0] + 2 * diffuse_height, | |||
| pad_img.shape[1] + 2 * diffuse_width, 2), | |||
| dtype=np.float32) | |||
| flow_field_val = np.pad(flow_field_val, | |||
| ((diffuse_height, diffuse_height), | |||
| (diffuse_width, diffuse_width), | |||
| (0, 0)), 'linear_ramp') | |||
| origin_flow[roi_bbox[0]:roi_bbox[1] + 2 * diffuse_height, | |||
| roi_bbox[2]:roi_bbox[3] | |||
| + 2 * diffuse_width] = flow_field_val | |||
| origin_flow = origin_flow[diffuse_height:-diffuse_height, | |||
| diffuse_width:-diffuse_width, :] | |||
| x_flows.append(origin_flow[..., 0]) | |||
| y_flows.append(origin_flow[..., 1]) | |||
| if len(x_flows) == 0: | |||
| return { | |||
| 'rDx': np.zeros(canvas.shape[:2], dtype=np.float32), | |||
| 'rDy': np.zeros(canvas.shape[:2], dtype=np.float32), | |||
| 'multi_bbox': multi_bbox, | |||
| 'x_fusion_map': np.ones(canvas.shape[:2], dtype=np.float32), | |||
| 'y_fusion_map': np.ones(canvas.shape[:2], dtype=np.float32) | |||
| } | |||
| else: | |||
| origin_rDx, origin_rDy, x_fusion_map, y_fusion_map = self.blend_multiscale_flow( | |||
| x_flows, y_flows, device=device) | |||
| return { | |||
| 'rDx': origin_rDx, | |||
| 'rDy': origin_rDy, | |||
| 'multi_bbox': multi_bbox, | |||
| 'x_fusion_map': x_fusion_map, | |||
| 'y_fusion_map': y_fusion_map | |||
| } | |||
| @staticmethod | |||
| def blend_multiscale_flow(x_flows, y_flows, device=None): | |||
| scale_num = len(x_flows) | |||
| if scale_num == 1: | |||
| return x_flows[0], y_flows[0], np.ones_like( | |||
| x_flows[0]), np.ones_like(x_flows[0]) | |||
| origin_rDx = np.zeros((x_flows[0].shape[0], x_flows[0].shape[1]), | |||
| dtype=np.float32) | |||
| origin_rDy = np.zeros((y_flows[0].shape[0], y_flows[0].shape[1]), | |||
| dtype=np.float32) | |||
| x_fusion_map, x_acc_map = get_map_fusion_map_cuda( | |||
| x_flows, 1, device=device) | |||
| y_fusion_map, y_acc_map = get_map_fusion_map_cuda( | |||
| y_flows, 1, device=device) | |||
| x_flow_map = 1.0 / x_fusion_map | |||
| y_flow_map = 1.0 / y_fusion_map | |||
| all_acc_map = x_acc_map + y_acc_map | |||
| all_acc_map = all_acc_map.astype(np.uint8) | |||
| roi_box = get_mask_bbox(all_acc_map, threshold=1) | |||
| if roi_box[0] is None or roi_box[1] - roi_box[0] <= 0 or roi_box[ | |||
| 3] - roi_box[2] <= 0: | |||
| roi_box = [0, x_flow_map.shape[0], 0, x_flow_map.shape[1]] | |||
| roi_x_flow_map = x_flow_map[roi_box[0]:roi_box[1], | |||
| roi_box[2]:roi_box[3]] | |||
| roi_y_flow_map = y_flow_map[roi_box[0]:roi_box[1], | |||
| roi_box[2]:roi_box[3]] | |||
| roi_width = roi_x_flow_map.shape[1] | |||
| roi_height = roi_x_flow_map.shape[0] | |||
| roi_x_flow_map, scale = resize_on_long_side(roi_x_flow_map, 320) | |||
| roi_y_flow_map, scale = resize_on_long_side(roi_y_flow_map, 320) | |||
| roi_x_flow_map = cv2.blur(roi_x_flow_map, (55, 55)) | |||
| roi_y_flow_map = cv2.blur(roi_y_flow_map, (55, 55)) | |||
| roi_x_flow_map = cv2.resize(roi_x_flow_map, (roi_width, roi_height)) | |||
| roi_y_flow_map = cv2.resize(roi_y_flow_map, (roi_width, roi_height)) | |||
| x_flow_map[roi_box[0]:roi_box[1], | |||
| roi_box[2]:roi_box[3]] = roi_x_flow_map | |||
| y_flow_map[roi_box[0]:roi_box[1], | |||
| roi_box[2]:roi_box[3]] = roi_y_flow_map | |||
| for i in range(scale_num): | |||
| origin_rDx += x_flows[i] | |||
| origin_rDy += y_flows[i] | |||
| origin_rDx *= x_flow_map | |||
| origin_rDy *= y_flow_map | |||
| return origin_rDx, origin_rDy, x_flow_map, y_flow_map | |||
| def __joint_to_body_box(self): | |||
| joint_left = int(np.min(self.joints, axis=0)[0]) | |||
| joint_right = int(np.max(self.joints, axis=0)[0]) | |||
| joint_top = int(np.min(self.joints, axis=0)[1]) | |||
| joint_bottom = int(np.max(self.joints, axis=0)[1]) | |||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||
| def __joint_to_leg_box(self): | |||
| leg_joints = self.joints[8:, :] | |||
| if np.max(leg_joints, axis=0)[2] < 0.05: | |||
| return [0, 0, 0, 0] | |||
| joint_left = int(np.min(leg_joints, axis=0)[0]) | |||
| joint_right = int(np.max(leg_joints, axis=0)[0]) | |||
| joint_top = int(np.min(leg_joints, axis=0)[1]) | |||
| joint_bottom = int(np.max(leg_joints, axis=0)[1]) | |||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||
| def __joint_to_arm_box(self): | |||
| arm_joints = self.joints[2:8, :] | |||
| if np.max(arm_joints, axis=0)[2] < 0.05: | |||
| return [0, 0, 0, 0] | |||
| joint_left = int(np.min(arm_joints, axis=0)[0]) | |||
| joint_right = int(np.max(arm_joints, axis=0)[0]) | |||
| joint_top = int(np.min(arm_joints, axis=0)[1]) | |||
| joint_bottom = int(np.max(arm_joints, axis=0)[1]) | |||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||
| @@ -0,0 +1,272 @@ | |||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||
| import math | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from scipy.ndimage.filters import gaussian_filter | |||
| from .model import BodyposeModel | |||
| from .util import pad_rightdown_corner, transfer | |||
| class Body(object): | |||
| def __init__(self, model_path, device): | |||
| self.model = BodyposeModel().to(device) | |||
| model_dict = transfer(self.model, torch.load(model_path)) | |||
| self.model.load_state_dict(model_dict) | |||
| self.model.eval() | |||
| def __call__(self, oriImg): | |||
| scale_search = [0.5] | |||
| boxsize = 368 | |||
| stride = 8 | |||
| padValue = 128 | |||
| thre1 = 0.1 | |||
| thre2 = 0.05 | |||
| bodyparts = 18 | |||
| multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] | |||
| heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19)) | |||
| paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) | |||
| for m in range(len(multiplier)): | |||
| scale = multiplier[m] | |||
| imageToTest = cv2.resize( | |||
| oriImg, (0, 0), | |||
| fx=scale, | |||
| fy=scale, | |||
| interpolation=cv2.INTER_CUBIC) | |||
| imageToTest_padded, pad = pad_rightdown_corner( | |||
| imageToTest, stride, padValue) | |||
| im = np.transpose( | |||
| np.float32(imageToTest_padded[:, :, :, np.newaxis]), | |||
| (3, 2, 0, 1)) / 256 - 0.5 | |||
| im = np.ascontiguousarray(im) | |||
| data = torch.from_numpy(im).float() | |||
| if torch.cuda.is_available(): | |||
| data = data.cuda() | |||
| with torch.no_grad(): | |||
| Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data) | |||
| Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy() | |||
| Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy() | |||
| # extract outputs, resize, and remove padding | |||
| heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), | |||
| (1, 2, 0)) # output 1 is heatmaps | |||
| heatmap = cv2.resize( | |||
| heatmap, (0, 0), | |||
| fx=stride, | |||
| fy=stride, | |||
| interpolation=cv2.INTER_CUBIC) | |||
| heatmap = heatmap[:imageToTest_padded.shape[0] | |||
| - pad[2], :imageToTest_padded.shape[1] | |||
| - pad[3], :] | |||
| heatmap = cv2.resize( | |||
| heatmap, (oriImg.shape[1], oriImg.shape[0]), | |||
| interpolation=cv2.INTER_CUBIC) | |||
| paf = np.transpose(np.squeeze(Mconv7_stage6_L1), | |||
| (1, 2, 0)) # output 0 is PAFs | |||
| paf = cv2.resize( | |||
| paf, (0, 0), | |||
| fx=stride, | |||
| fy=stride, | |||
| interpolation=cv2.INTER_CUBIC) | |||
| paf = paf[:imageToTest_padded.shape[0] | |||
| - pad[2], :imageToTest_padded.shape[1] - pad[3], :] | |||
| paf = cv2.resize( | |||
| paf, (oriImg.shape[1], oriImg.shape[0]), | |||
| interpolation=cv2.INTER_CUBIC) | |||
| heatmap_avg = heatmap_avg + heatmap / len(multiplier) | |||
| paf_avg = paf_avg + paf / len(multiplier) | |||
| all_peaks = [] | |||
| peak_counter = 0 | |||
| for part in range(bodyparts): | |||
| map_ori = heatmap_avg[:, :, part] | |||
| one_heatmap = gaussian_filter(map_ori, sigma=3) | |||
| map_left = np.zeros(one_heatmap.shape) | |||
| map_left[1:, :] = one_heatmap[:-1, :] | |||
| map_right = np.zeros(one_heatmap.shape) | |||
| map_right[:-1, :] = one_heatmap[1:, :] | |||
| map_up = np.zeros(one_heatmap.shape) | |||
| map_up[:, 1:] = one_heatmap[:, :-1] | |||
| map_down = np.zeros(one_heatmap.shape) | |||
| map_down[:, :-1] = one_heatmap[:, 1:] | |||
| peaks_binary = np.logical_and.reduce( | |||
| (one_heatmap >= map_left, one_heatmap >= map_right, | |||
| one_heatmap >= map_up, one_heatmap >= map_down, | |||
| one_heatmap > thre1)) | |||
| peaks = list( | |||
| zip(np.nonzero(peaks_binary)[1], | |||
| np.nonzero(peaks_binary)[0])) # note reverse | |||
| peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks] | |||
| peak_id = range(peak_counter, peak_counter + len(peaks)) | |||
| peaks_with_score_and_id = [ | |||
| peaks_with_score[i] + (peak_id[i], ) | |||
| for i in range(len(peak_id)) | |||
| ] | |||
| all_peaks.append(peaks_with_score_and_id) | |||
| peak_counter += len(peaks) | |||
| # find connection in the specified sequence, center 29 is in the position 15 | |||
| limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], | |||
| [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], | |||
| [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]] | |||
| # the middle joints heatmap correpondence | |||
| mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], | |||
| [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30], | |||
| [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38], | |||
| [45, 46]] | |||
| connection_all = [] | |||
| special_k = [] | |||
| mid_num = 10 | |||
| for k in range(len(mapIdx)): | |||
| score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]] | |||
| candA = all_peaks[limbSeq[k][0] - 1] | |||
| candB = all_peaks[limbSeq[k][1] - 1] | |||
| nA = len(candA) | |||
| nB = len(candB) | |||
| if (nA != 0 and nB != 0): | |||
| connection_candidate = [] | |||
| for i in range(nA): | |||
| for j in range(nB): | |||
| vec = np.subtract(candB[j][:2], candA[i][:2]) | |||
| norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1]) | |||
| norm = max(0.001, norm) | |||
| vec = np.divide(vec, norm) | |||
| startend = list( | |||
| zip( | |||
| np.linspace( | |||
| candA[i][0], candB[j][0], num=mid_num), | |||
| np.linspace( | |||
| candA[i][1], candB[j][1], num=mid_num))) | |||
| vec_x = np.array([ | |||
| score_mid[int(round(startend[item][1])), | |||
| int(round(startend[item][0])), 0] | |||
| for item in range(len(startend)) | |||
| ]) | |||
| vec_y = np.array([ | |||
| score_mid[int(round(startend[item][1])), | |||
| int(round(startend[item][0])), 1] | |||
| for item in range(len(startend)) | |||
| ]) | |||
| score_midpts = np.multiply( | |||
| vec_x, vec[0]) + np.multiply(vec_y, vec[1]) | |||
| temp1 = sum(score_midpts) / len(score_midpts) | |||
| temp2 = min(0.5 * oriImg.shape[0] / norm - 1, 0) | |||
| score_with_dist_prior = temp1 + temp2 | |||
| criterion1 = len(np.nonzero( | |||
| score_midpts > thre2)[0]) > 0.8 * len(score_midpts) | |||
| criterion2 = score_with_dist_prior > 0 | |||
| if criterion1 and criterion2: | |||
| connection_candidate.append([ | |||
| i, j, score_with_dist_prior, | |||
| score_with_dist_prior + candA[i][2] | |||
| + candB[j][2] | |||
| ]) | |||
| connection_candidate = sorted( | |||
| connection_candidate, key=lambda x: x[2], reverse=True) | |||
| connection = np.zeros((0, 5)) | |||
| for c in range(len(connection_candidate)): | |||
| i, j, s = connection_candidate[c][0:3] | |||
| if (i not in connection[:, 3] | |||
| and j not in connection[:, 4]): | |||
| connection = np.vstack( | |||
| [connection, [candA[i][3], candB[j][3], s, i, j]]) | |||
| if (len(connection) >= min(nA, nB)): | |||
| break | |||
| connection_all.append(connection) | |||
| else: | |||
| special_k.append(k) | |||
| connection_all.append([]) | |||
| # last number in each row is the total parts number of that person | |||
| # the second last number in each row is the score of the overall configuration | |||
| subset = -1 * np.ones((0, 20)) | |||
| candidate = np.array( | |||
| [item for sublist in all_peaks for item in sublist]) | |||
| for k in range(len(mapIdx)): | |||
| if k not in special_k: | |||
| partAs = connection_all[k][:, 0] | |||
| partBs = connection_all[k][:, 1] | |||
| indexA, indexB = np.array(limbSeq[k]) - 1 | |||
| for i in range(len(connection_all[k])): # = 1:size(temp,1) | |||
| found = 0 | |||
| subset_idx = [-1, -1] | |||
| for j in range(len(subset)): # 1:size(subset,1): | |||
| if subset[j][indexA] == partAs[i] or subset[j][ | |||
| indexB] == partBs[i]: | |||
| subset_idx[found] = j | |||
| found += 1 | |||
| if found == 1: | |||
| j = subset_idx[0] | |||
| if subset[j][indexB] != partBs[i]: | |||
| subset[j][indexB] = partBs[i] | |||
| subset[j][-1] += 1 | |||
| subset[j][-2] += candidate[ | |||
| partBs[i].astype(int), | |||
| 2] + connection_all[k][i][2] | |||
| elif found == 2: # if found 2 and disjoint, merge them | |||
| j1, j2 = subset_idx | |||
| tmp1 = (subset[j1] >= 0).astype(int) | |||
| tmp2 = (subset[j2] >= 0).astype(int) | |||
| membership = (tmp1 + tmp2)[:-2] | |||
| if len(np.nonzero(membership == 2)[0]) == 0: # merge | |||
| subset[j1][:-2] += (subset[j2][:-2] + 1) | |||
| subset[j1][-2:] += subset[j2][-2:] | |||
| subset[j1][-2] += connection_all[k][i][2] | |||
| subset = np.delete(subset, j2, 0) | |||
| else: # as like found == 1 | |||
| subset[j1][indexB] = partBs[i] | |||
| subset[j1][-1] += 1 | |||
| subset[j1][-2] += candidate[ | |||
| partBs[i].astype(int), | |||
| 2] + connection_all[k][i][2] | |||
| # if find no partA in the subset, create a new subset | |||
| elif not found and k < 17: | |||
| row = -1 * np.ones(20) | |||
| row[indexA] = partAs[i] | |||
| row[indexB] = partBs[i] | |||
| row[-1] = 2 | |||
| row[-2] = sum( | |||
| candidate[connection_all[k][i, :2].astype(int), | |||
| 2]) + connection_all[k][i][2] | |||
| subset = np.vstack([subset, row]) | |||
| # delete some rows of subset which has few parts occur | |||
| deleteIdx = [] | |||
| for i in range(len(subset)): | |||
| if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4: | |||
| deleteIdx.append(i) | |||
| subset = np.delete(subset, deleteIdx, axis=0) | |||
| # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts | |||
| # candidate: x, y, score, id | |||
| count = subset.shape[0] | |||
| joints = np.zeros(shape=(count, bodyparts, 3)) | |||
| for i in range(count): | |||
| for j in range(bodyparts): | |||
| joints[i, j, :3] = candidate[int(subset[i, j]), :3] | |||
| confidence = 1.0 if subset[i, j] >= 0 else 0.0 | |||
| joints[i, j, 2] *= confidence | |||
| return joints | |||
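Usage sketch for the pose estimator; the checkpoint name matches the one loaded in `ImageBodyReshaping` above, and the image path is a placeholder:

```python
# Illustrative: joints come back as (num_people, 18, 3) with x, y, score,
# and a zero score marks an undetected joint.
import cv2
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
body = Body('body_pose_model.pth', device)
joints = body(cv2.imread('person.jpg'))  # BGR input, as cv2.imread returns
for person in joints:
    detected = (person[:, 2] > 0).sum()
    print(f'{detected} of 18 joints detected')
```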
| @@ -0,0 +1,141 @@ | |||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| def make_layers(block, no_relu_layers): | |||
| layers = [] | |||
| for layer_name, v in block.items(): | |||
| if 'pool' in layer_name: | |||
| layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2]) | |||
| layers.append((layer_name, layer)) | |||
| else: | |||
| conv2d = nn.Conv2d( | |||
| in_channels=v[0], | |||
| out_channels=v[1], | |||
| kernel_size=v[2], | |||
| stride=v[3], | |||
| padding=v[4]) | |||
| layers.append((layer_name, conv2d)) | |||
| if layer_name not in no_relu_layers: | |||
| layers.append(('relu_' + layer_name, nn.ReLU(inplace=True))) | |||
| return nn.Sequential(OrderedDict(layers)) | |||
| class BodyposeModel(nn.Module): | |||
| def __init__(self): | |||
| super(BodyposeModel, self).__init__() | |||
| # these layers have no relu layer | |||
| no_relu_layers = [ | |||
| 'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1', | |||
| 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2', | |||
| 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1', | |||
| 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2' | |||
| ] | |||
| blocks = {} | |||
| block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]), | |||
| ('conv1_2', [64, 64, 3, 1, 1]), | |||
| ('pool1_stage1', [2, 2, 0]), | |||
| ('conv2_1', [64, 128, 3, 1, 1]), | |||
| ('conv2_2', [128, 128, 3, 1, 1]), | |||
| ('pool2_stage1', [2, 2, 0]), | |||
| ('conv3_1', [128, 256, 3, 1, 1]), | |||
| ('conv3_2', [256, 256, 3, 1, 1]), | |||
| ('conv3_3', [256, 256, 3, 1, 1]), | |||
| ('conv3_4', [256, 256, 3, 1, 1]), | |||
| ('pool3_stage1', [2, 2, 0]), | |||
| ('conv4_1', [256, 512, 3, 1, 1]), | |||
| ('conv4_2', [512, 512, 3, 1, 1]), | |||
| ('conv4_3_CPM', [512, 256, 3, 1, 1]), | |||
| ('conv4_4_CPM', [256, 128, 3, 1, 1])]) | |||
| # Stage 1 | |||
| block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), | |||
| ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]), | |||
| ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), | |||
| ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]), | |||
| ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])]) | |||
| block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), | |||
| ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]), | |||
| ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), | |||
| ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]), | |||
| ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])]) | |||
| blocks['block1_1'] = block1_1 | |||
| blocks['block1_2'] = block1_2 | |||
| self.model0 = make_layers(block0, no_relu_layers) | |||
| # Stages 2 - 6 | |||
| for i in range(2, 7): | |||
| blocks['block%d_1' % i] = OrderedDict([ | |||
| ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]), | |||
| ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]), | |||
| ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0]) | |||
| ]) | |||
| blocks['block%d_2' % i] = OrderedDict([ | |||
| ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]), | |||
| ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]), | |||
| ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0]) | |||
| ]) | |||
| for k in blocks.keys(): | |||
| blocks[k] = make_layers(blocks[k], no_relu_layers) | |||
| self.model1_1 = blocks['block1_1'] | |||
| self.model2_1 = blocks['block2_1'] | |||
| self.model3_1 = blocks['block3_1'] | |||
| self.model4_1 = blocks['block4_1'] | |||
| self.model5_1 = blocks['block5_1'] | |||
| self.model6_1 = blocks['block6_1'] | |||
| self.model1_2 = blocks['block1_2'] | |||
| self.model2_2 = blocks['block2_2'] | |||
| self.model3_2 = blocks['block3_2'] | |||
| self.model4_2 = blocks['block4_2'] | |||
| self.model5_2 = blocks['block5_2'] | |||
| self.model6_2 = blocks['block6_2'] | |||
| def forward(self, x): | |||
| out1 = self.model0(x) | |||
| out1_1 = self.model1_1(out1) | |||
| out1_2 = self.model1_2(out1) | |||
| out2 = torch.cat([out1_1, out1_2, out1], 1) | |||
| out2_1 = self.model2_1(out2) | |||
| out2_2 = self.model2_2(out2) | |||
| out3 = torch.cat([out2_1, out2_2, out1], 1) | |||
| out3_1 = self.model3_1(out3) | |||
| out3_2 = self.model3_2(out3) | |||
| out4 = torch.cat([out3_1, out3_2, out1], 1) | |||
| out4_1 = self.model4_1(out4) | |||
| out4_2 = self.model4_2(out4) | |||
| out5 = torch.cat([out4_1, out4_2, out1], 1) | |||
| out5_1 = self.model5_1(out5) | |||
| out5_2 = self.model5_2(out5) | |||
| out6 = torch.cat([out5_1, out5_2, out1], 1) | |||
| out6_1 = self.model6_1(out6) | |||
| out6_2 = self.model6_2(out6) | |||
| return out6_1, out6_2 | |||
| @@ -0,0 +1,33 @@ | |||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||
| import numpy as np | |||
| def pad_rightdown_corner(img, stride, padValue): | |||
| h = img.shape[0] | |||
| w = img.shape[1] | |||
| pad = 4 * [None] | |||
| pad[0] = 0 # up | |||
| pad[1] = 0 # left | |||
| pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down | |||
| pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right | |||
| img_padded = img | |||
| pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1)) | |||
| img_padded = np.concatenate((pad_up, img_padded), axis=0) | |||
| pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1)) | |||
| img_padded = np.concatenate((pad_left, img_padded), axis=1) | |||
| pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1)) | |||
| img_padded = np.concatenate((img_padded, pad_down), axis=0) | |||
| pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1)) | |||
| img_padded = np.concatenate((img_padded, pad_right), axis=1) | |||
| return img_padded, pad | |||
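The padding helper only grows the bottom/right edges, rounding each dimension up to the next multiple of `stride`. A small behavioral check (values illustrative):

```python
# pad = [up, left, down, right]; only down/right can be non-zero here.
import numpy as np

img = np.zeros((367, 639, 3), dtype=np.float32)
padded, pad = pad_rightdown_corner(img, stride=8, padValue=128)
assert padded.shape[:2] == (368, 640)
assert pad == [0, 0, 1, 1]
```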
| def transfer(model, model_weights): | |||
| transfered_model_weights = {} | |||
| for weights_name in model.state_dict().keys(): | |||
| transfered_model_weights[weights_name] = model_weights['.'.join( | |||
| weights_name.split('.')[1:])] | |||
| return transfered_model_weights | |||
| @@ -0,0 +1,507 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import math | |||
| import os | |||
| import random | |||
| import cv2 | |||
| import numba | |||
| import numpy as np | |||
| import torch | |||
| def resize_on_long_side(img, long_side=800): | |||
| src_height = img.shape[0] | |||
| src_width = img.shape[1] | |||
| if src_height > src_width: | |||
| scale = long_side * 1.0 / src_height | |||
| _img = cv2.resize( | |||
| img, (int(src_width * scale), long_side), | |||
| interpolation=cv2.INTER_LINEAR) | |||
| else: | |||
| scale = long_side * 1.0 / src_width | |||
| _img = cv2.resize( | |||
| img, (long_side, int(src_height * scale)), | |||
| interpolation=cv2.INTER_LINEAR) | |||
| return _img, scale | |||
| def point_in_box(pt, box): | |||
| pt_x = pt[0] | |||
| pt_y = pt[1] | |||
| if pt_x >= box[0] and pt_x <= box[0] + box[2] and pt_y >= box[ | |||
| 1] and pt_y <= box[1] + box[3]: | |||
| return True | |||
| else: | |||
| return False | |||
| def enlarge_box_tblr(roi_bbox, mask, ratio=0.4, use_long_side=True): | |||
| if roi_bbox is None or None in roi_bbox: | |||
| return [None, None, None, None] | |||
| top = roi_bbox[0] | |||
| bottom = roi_bbox[1] | |||
| left = roi_bbox[2] | |||
| right = roi_bbox[3] | |||
| roi_width = roi_bbox[3] - roi_bbox[2] | |||
| roi_height = roi_bbox[1] - roi_bbox[0] | |||
| right = left + roi_width | |||
| bottom = top + roi_height | |||
| long_side = roi_width if roi_width > roi_height else roi_height | |||
| if use_long_side: | |||
| new_left = left - int(long_side * ratio) | |||
| else: | |||
| new_left = left - int(roi_width * ratio) | |||
| new_left = 1 if new_left < 0 else new_left | |||
| if use_long_side: | |||
| new_top = top - int(long_side * ratio) | |||
| else: | |||
| new_top = top - int(roi_height * ratio) | |||
| new_top = 1 if new_top < 0 else new_top | |||
| if use_long_side: | |||
| new_right = right + int(long_side * ratio) | |||
| else: | |||
| new_right = right + int(roi_width * ratio) | |||
| new_right = mask.shape[1] - 2 if new_right > mask.shape[1] else new_right | |||
| if use_long_side: | |||
| new_bottom = bottom + int(long_side * ratio) | |||
| else: | |||
| new_bottom = bottom + int(roi_height * ratio) | |||
| new_bottom = mask.shape[0] - 2 if new_bottom > mask.shape[0] else new_bottom | |||
| bbox = [new_top, new_bottom, new_left, new_right] | |||
| return bbox | |||
| def gen_PAF(image, joints): | |||
| assert joints.shape[0] == 18 | |||
| assert joints.shape[1] == 3 | |||
| org_h = image.shape[0] | |||
| org_w = image.shape[1] | |||
| small_image, resize_scale = resize_on_long_side(image, 120) | |||
| joints[:, :2] = joints[:, :2] * resize_scale | |||
| joint_left = int(np.min(joints, axis=0)[0]) | |||
| joint_right = int(np.max(joints, axis=0)[0]) | |||
| joint_top = int(np.min(joints, axis=0)[1]) | |||
| joint_bottom = int(np.max(joints, axis=0)[1]) | |||
| limb_width = min( | |||
| abs(joint_right - joint_left), abs(joint_bottom - joint_top)) // 6 | |||
| if limb_width % 2 == 0: | |||
| limb_width += 1 | |||
| kernel_size = limb_width | |||
| part_orders = [(5, 11), (2, 8), (5, 6), (6, 7), (2, 3), (3, 4), (11, 12), | |||
| (12, 13), (8, 9), (9, 10)] | |||
| map_list = [] | |||
| mask_list = [] | |||
| PAF_all = np.zeros( | |||
| shape=(small_image.shape[0], small_image.shape[1], 2), | |||
| dtype=np.float32) | |||
| for c, pair in enumerate(part_orders): | |||
| idx_a_name = pair[0] | |||
| idx_b_name = pair[1] | |||
| jointa = joints[idx_a_name] | |||
| jointb = joints[idx_b_name] | |||
| confidence_threshold = 0.05 | |||
| if jointa[2] > confidence_threshold and jointb[ | |||
| 2] > confidence_threshold: | |||
| canvas = np.zeros( | |||
| shape=(small_image.shape[0], small_image.shape[1]), | |||
| dtype=np.uint8) | |||
| canvas = cv2.line(canvas, (int(jointa[0]), int(jointa[1])), | |||
| (int(jointb[0]), int(jointb[1])), | |||
| (255, 255, 255), 5) | |||
| kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, | |||
| (kernel_size, kernel_size)) | |||
| canvas = cv2.dilate(canvas, kernel, 1) | |||
| canvas = cv2.GaussianBlur(canvas, (kernel_size, kernel_size), 0) | |||
| canvas = canvas.astype(np.float32) / 255 | |||
| PAF = np.zeros( | |||
| shape=(small_image.shape[0], small_image.shape[1], 2), | |||
| dtype=np.float32) | |||
| PAF[..., 0] = jointb[0] - jointa[0] | |||
| PAF[..., 1] = jointb[1] - jointa[1] | |||
| mag, ang = cv2.cartToPolar(PAF[..., 0], PAF[..., 1]) | |||
| PAF /= (np.dstack((mag, mag)) + 1e-5) | |||
| single_PAF = PAF * np.dstack((canvas, canvas)) | |||
| map_list.append( | |||
| cv2.GaussianBlur(single_PAF, | |||
| (kernel_size * 3, kernel_size * 3), 0)) | |||
| mask_list.append( | |||
| cv2.GaussianBlur(canvas.copy(), | |||
| (kernel_size * 3, kernel_size * 3), 0)) | |||
| PAF_all = PAF_all * (1.0 - np.dstack( | |||
| (canvas, canvas))) + single_PAF | |||
| PAF_all = cv2.GaussianBlur(PAF_all, (kernel_size * 3, kernel_size * 3), 0) | |||
| PAF_all = cv2.resize( | |||
| PAF_all, (org_w, org_h), interpolation=cv2.INTER_LINEAR) | |||
| map_list.append(PAF_all) | |||
| return PAF_all, map_list, mask_list | |||
| def gen_skeleton_map(joints, stack_mode='column', input_roi_box=None): | |||
| if isinstance(joints, list): | |||
| joints = np.array(joints) | |||
| assert stack_mode == 'column' or stack_mode == 'depth' | |||
| part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), | |||
| (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] | |||
| def link(img, a, b, color, line_width, scale=1.0, x_offset=0, y_offset=0): | |||
| jointa = joints[a] | |||
| jointb = joints[b] | |||
| temp1 = int((jointa[0] - x_offset) * scale) | |||
| temp2 = int((jointa[1] - y_offset) * scale) | |||
| temp3 = int((jointb[0] - x_offset) * scale) | |||
| temp4 = int((jointb[1] - y_offset) * scale) | |||
| cv2.line(img, (temp1, temp2), (temp3, temp4), color, line_width) | |||
| roi_box = input_roi_box | |||
| roi_box_width = roi_box[3] - roi_box[2] | |||
| roi_box_height = roi_box[1] - roi_box[0] | |||
| short_side_length = min(roi_box_width, roi_box_height) | |||
| line_width = short_side_length // 30 | |||
| line_width = max(line_width, 2) | |||
| map_cube = np.zeros( | |||
| shape=(roi_box_height, roi_box_width, len(part_orders) + 1), | |||
| dtype=np.float32) | |||
| use_line_width = min(5, line_width) | |||
| fx = use_line_width * 1.0 / line_width # fx is at most 1 | |||
| if fx < 0.99: | |||
| map_cube = cv2.resize(map_cube, (0, 0), fx=fx, fy=fx) | |||
| for c, pair in enumerate(part_orders): | |||
| tmp = map_cube[..., c].copy() | |||
| link( | |||
| tmp, | |||
| pair[0], | |||
| pair[1], (2.0, 2.0, 2.0), | |||
| use_line_width, | |||
| scale=fx, | |||
| x_offset=roi_box[2], | |||
| y_offset=roi_box[0]) | |||
| map_cube[..., c] = tmp | |||
| tmp = map_cube[..., -1].copy() | |||
| link( | |||
| tmp, | |||
| pair[0], | |||
| pair[1], (2.0, 2.0, 2.0), | |||
| use_line_width, | |||
| scale=fx, | |||
| x_offset=roi_box[2], | |||
| y_offset=roi_box[0]) | |||
| map_cube[..., -1] = tmp | |||
| map_cube = cv2.resize(map_cube, (roi_box_width, roi_box_height)) | |||
| if stack_mode == 'depth': | |||
| return map_cube, roi_box | |||
| elif stack_mode == 'column': | |||
| joint_maps = [] | |||
| for c in range(len(part_orders) + 1): | |||
| joint_maps.append(map_cube[..., c]) | |||
| joint_map = np.column_stack(joint_maps) | |||
| return joint_map, roi_box | |||
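A hedged usage sketch of gen_skeleton_map, assuming the function above is in scope. The indexing in the body implies 18 OpenPose-style joints of (x, y, confidence) and an input_roi_box ordered [top, bottom, left, right]:

import numpy as np

joints = np.zeros((18, 3), dtype=np.float32)
joints[:, 0] = np.linspace(60, 180, 18)  # x
joints[:, 1] = np.linspace(40, 220, 18)  # y
joints[:, 2] = 1.0                       # confidence
joint_map, roi_box = gen_skeleton_map(
    joints, stack_mode='column', input_roi_box=[0, 256, 0, 256])
print(joint_map.shape)  # (256, 256 * 13): 12 limb channels plus one composite, stacked column-wise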
| def plot_one_box(x, img, color=None, label=None, line_thickness=None): | |||
| tl = line_thickness or round( | |||
| 0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness | |||
| color = color or [random.randint(0, 255) for _ in range(3)] | |||
| c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) | |||
| cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) | |||
| if label: | |||
| tf = max(tl - 1, 1) # font thickness | |||
| t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] | |||
| c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 | |||
| cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled | |||
| cv2.putText( | |||
| img, | |||
| label, (c1[0], c1[1] - 2), | |||
| 0, | |||
| tl / 3, [225, 255, 255], | |||
| thickness=tf, | |||
| lineType=cv2.LINE_AA) | |||
| def draw_line(im, points, color, stroke_size=2, closed=False): | |||
| points = points.astype(np.int32) | |||
| for i in range(len(points) - 1): | |||
| cv2.line(im, tuple(points[i]), tuple(points[i + 1]), color, | |||
| stroke_size) | |||
| if closed: | |||
| cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size) | |||
| def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2): | |||
| left = bbox[0] | |||
| top = bbox[1] | |||
| right = bbox[2] | |||
| bottom = bbox[3] | |||
| roi_width = right - left | |||
| roi_height = bottom - top | |||
| new_left = left - int(roi_width * enlarge_ratio) | |||
| new_left = 0 if new_left < 0 else new_left | |||
| new_top = top - int(roi_height * enlarge_ratio) | |||
| new_top = 0 if new_top < 0 else new_top | |||
| new_right = right + int(roi_width * enlarge_ratio) | |||
| new_right = img_width if new_right > img_width else new_right | |||
| new_bottom = bottom + int(roi_height * enlarge_ratio) | |||
| new_bottom = img_height if new_bottom > img_height else new_bottom | |||
| bbox = [new_left, new_top, new_right, new_bottom] | |||
| bbox = [int(x) for x in bbox] | |||
| return bbox | |||
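For example, enlarging a 100 x 100 box by the default 20% per side pushes every edge outward and clamps the result to the image bounds:

print(enlarged_bbox([10, 10, 110, 110], img_width=120, img_height=120))
# [0, 0, 120, 120]: each edge moves by 20 px and is clipped at the border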
| def get_map_fusion_map_cuda(map_list, threshold=1, device=torch.device('cpu')): | |||
| map_list_cuda = [torch.from_numpy(x).to(device) for x in map_list] | |||
| map_concat = torch.stack(tuple(map_list_cuda), dim=-1) | |||
| map_concat = torch.abs(map_concat) | |||
| map_concat[map_concat < threshold] = 0 | |||
| map_concat[map_concat > 1e-5] = 1.0 | |||
| sum_map = torch.sum(map_concat, dim=2) | |||
| a = torch.ones_like(sum_map) | |||
| acc_map = torch.where(sum_map > 0, a * 2.0, torch.zeros_like(sum_map)) | |||
| fusion_map = torch.where(sum_map < 0.5, a * 1.5, sum_map) | |||
| fusion_map = fusion_map.float() | |||
| acc_map = acc_map.float() | |||
| fusion_map = fusion_map.cpu().numpy().astype(np.float32) | |||
| acc_map = acc_map.cpu().numpy().astype(np.float32) | |||
| return fusion_map, acc_map | |||
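A small CPU check of the fusion logic above (a sketch, assuming the function is in scope): fused pixels carry the count of maps that fire there, pixels no map covers are set to 1.5, and acc_map marks any coverage with 2.0.

import numpy as np

m1 = np.zeros((4, 4), dtype=np.float32)
m1[0, 0] = 1.0
m2 = np.zeros((4, 4), dtype=np.float32)
m2[0, 0] = 1.0
m2[1, 1] = 1.0
fusion_map, acc_map = get_map_fusion_map_cuda([m1, m2], threshold=1)
print(fusion_map[0, 0], fusion_map[1, 1], fusion_map[2, 2])  # 2.0 1.0 1.5
print(acc_map[0, 0], acc_map[2, 2])                          # 2.0 0.0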
| def gen_border_shade(height, width, height_band, width_band): | |||
| height_ratio = height_band * 1.0 / height | |||
| width_ratio = width_band * 1.0 / width | |||
| _height_band = int(256 * height_ratio) | |||
| _width_band = int(256 * width_ratio) | |||
| canvas = np.zeros((256, 256), dtype=np.float32) | |||
| canvas[_height_band // 2:-_height_band // 2, | |||
| _width_band // 2:-_width_band // 2] = 1.0 | |||
| canvas = cv2.blur(canvas, (_height_band, _width_band)) | |||
| canvas = cv2.resize(canvas, (width, height)) | |||
| return canvas | |||
| def get_mask_bbox(mask, threshold=127): | |||
| ret, mask = cv2.threshold(mask, threshold, 1, 0) | |||
| if cv2.countNonZero(mask) == 0: | |||
| return [None, None, None, None] | |||
| col_acc = np.sum(mask, 0) | |||
| row_acc = np.sum(mask, 1) | |||
| col_acc = col_acc.tolist() | |||
| row_acc = row_acc.tolist() | |||
| for x in range(len(col_acc)): | |||
| if col_acc[x] > 0: | |||
| left = x | |||
| break | |||
| for x in range(1, len(col_acc)): | |||
| if col_acc[-x] > 0: | |||
| right = len(col_acc) - x | |||
| break | |||
| for x in range(len(row_acc)): | |||
| if row_acc[x] > 0: | |||
| top = x | |||
| break | |||
| for x in range(1, len(row_acc)): | |||
| if row_acc[-x] > 0: | |||
| bottom = len(row_acc) - x | |||
| break | |||
| return [top, bottom, left, right] | |||
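The scan loops above are equivalent to taking the min/max of the nonzero coordinates; a quick cross-check (a sketch, assuming get_mask_bbox is in scope):

import numpy as np

mask = np.zeros((64, 64), dtype=np.uint8)
mask[10:20, 30:40] = 255
top, bottom, left, right = get_mask_bbox(mask)
ys, xs = np.where(mask > 127)
assert (top, bottom, left, right) == (ys.min(), ys.max(), xs.min(), xs.max())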
| def visualize_flow(flow): | |||
| h, w = flow.shape[:2] | |||
| hsv = np.zeros((h, w, 3), np.uint8) | |||
| mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1]) | |||
| hsv[..., 0] = ang * 180 / np.pi / 2 | |||
| hsv[..., 1] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX) | |||
| hsv[..., 2] = 255 | |||
| bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) | |||
| bgr = bgr * 1.0 / 255 | |||
| return bgr.astype(np.float32) | |||
| def vis_joints(image, joints, color, show_text=True, confidence_threshold=0.1): | |||
| part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), | |||
| (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] | |||
| abandon_idxs = [0, 1, 14, 15, 16, 17] | |||
| # draw joints | |||
| for i, joint in enumerate(joints): | |||
| if i in abandon_idxs: | |||
| continue | |||
| if joint[-1] > confidence_threshold: | |||
| cv2.circle(image, (int(joint[0]), int(joint[1])), 1, color, 2) | |||
| if show_text: | |||
| cv2.putText(image, | |||
| str(i) + '[{:.2f}]'.format(joint[-1]), | |||
| (int(joint[0]), int(joint[1])), | |||
| cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) | |||
| # draw link | |||
| for pair in part_orders: | |||
| if joints[pair[0]][-1] > confidence_threshold and joints[ | |||
| pair[1]][-1] > confidence_threshold: | |||
| cv2.line(image, (int(joints[pair[0]][0]), int(joints[pair[0]][1])), | |||
| (int(joints[pair[1]][0]), int(joints[pair[1]][1])), color, | |||
| 2) | |||
| return image | |||
| def get_heatmap_cv(img, magn, max_flow_mag): | |||
| min_flow_mag = .5 | |||
| cv_magn = np.clip( | |||
| 255 * (magn - min_flow_mag) / (max_flow_mag - min_flow_mag + 1e-7), | |||
| a_min=0, | |||
| a_max=255).astype(np.uint8) | |||
| if img.dtype != np.uint8: | |||
| img = (255 * img).astype(np.uint8) | |||
| heatmap_img = cv2.applyColorMap(cv_magn, cv2.COLORMAP_JET) | |||
| heatmap_img = heatmap_img[..., ::-1] | |||
| h, w = magn.shape | |||
| img_alpha = np.ones((h, w), dtype=np.double)[:, :, None] | |||
| heatmap_alpha = np.clip( | |||
| magn / (max_flow_mag + 1e-7), a_min=1e-7, a_max=1)[:, :, None]**.7 | |||
| heatmap_alpha[heatmap_alpha < .2] **= .5 # lift the alpha of faint-flow regions in place | |||
| pm_hm = heatmap_img * heatmap_alpha | |||
| pm_img = img * img_alpha | |||
| cv_out = pm_hm + pm_img * (1 - heatmap_alpha) | |||
| cv_out = np.clip(cv_out, a_min=0, a_max=255).astype(np.uint8) | |||
| return cv_out | |||
| def save_heatmap_cv(img, flow, suppression=2): | |||
| flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2) | |||
| flow_magn -= suppression | |||
| flow_magn[flow_magn <= 0] = 0 | |||
| cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3) | |||
| return cv_out | |||
| @numba.jit(nopython=True, parallel=False) | |||
| def bilinear_interp(x, y, v11, v12, v21, v22): | |||
| temp1 = (v11 * (1 - y) + v12 * y) * (1 - x) | |||
| temp2 = (v21 * (1 - y) + v22 * y) * x | |||
| result = temp1 + temp2 | |||
| return result | |||
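Sanity check (a sketch): at a corner the interpolation returns that corner's value, and at the cell center the mean of all four. Note that image_warp_grid1 below passes the fractional row offset as x and the fractional column offset as y.

print(bilinear_interp(0.0, 0.0, 10.0, 20.0, 30.0, 40.0))  # 10.0 (v11)
print(bilinear_interp(0.5, 0.5, 10.0, 20.0, 30.0, 40.0))  # 25.0, the mean of the four corners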
| @numba.jit(nopython=True, parallel=False) | |||
| def image_warp_grid1(rDx, rDy, oriImg, transRatio, width_expand, | |||
| height_expand): | |||
| srcW = oriImg.shape[1] | |||
| srcH = oriImg.shape[0] | |||
| newImg = oriImg.copy() | |||
| for i in range(srcH): | |||
| for j in range(srcW): | |||
| _i = i | |||
| _j = j | |||
| deltaX = rDx[_i, _j] | |||
| deltaY = rDy[_i, _j] | |||
| nx = _j + deltaX * transRatio | |||
| ny = _i + deltaY * transRatio | |||
| if nx >= srcW - width_expand - 1: | |||
| if nx > srcW - 1: | |||
| nx = srcW - 1 | |||
| if ny >= srcH - height_expand - 1: | |||
| if ny > srcH - 1: | |||
| ny = srcH - 1 | |||
| if nx < width_expand: | |||
| if nx < 0: | |||
| nx = 0 | |||
| if ny < height_expand: | |||
| if ny < 0: | |||
| ny = 0 | |||
| nxi = int(math.floor(nx)) | |||
| nyi = int(math.floor(ny)) | |||
| nxi1 = int(math.ceil(nx)) | |||
| nyi1 = int(math.ceil(ny)) | |||
| for ll in range(3): | |||
| newImg[_i, _j, | |||
| ll] = bilinear_interp(ny - nyi, nx - nxi, | |||
| oriImg[nyi, nxi, | |||
| ll], oriImg[nyi, nxi1, ll], | |||
| oriImg[nyi1, nxi, | |||
| ll], oriImg[nyi1, nxi1, | |||
| ll]) | |||
| return newImg | |||
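A toy invocation (a sketch with assumed inputs): a constant +2 px horizontal displacement makes every output pixel sample two columns to its right, clamped at the border by the expand/bounds checks above.

import numpy as np

ori = np.arange(6 * 6 * 3, dtype=np.float64).reshape(6, 6, 3)
rDx = np.full((6, 6), 2.0)
rDy = np.zeros((6, 6))
warped = image_warp_grid1(rDx, rDy, ori, 1.0, 0, 0)
print(np.allclose(warped[0, 0], ori[0, 2]))  # True: pixel (0, 0) now samples (0, 2)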
| @@ -1,3 +1,6 @@ | |||
| # The implementation is adapted from CSRNet by Jingwen He, | |||
| # made publicly available at https://github.com/hejingwenhejingwen/CSRNet | |||
| import functools | |||
| import math | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os.path as osp | |||
| from copy import deepcopy | |||
| from typing import Dict, Union | |||
| @@ -1,3 +1,8 @@ | |||
| # ------------------------------------------------------------------------ | |||
| # Modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/models/archs/NAFNet_arch.py | |||
| # Copyright (c) 2022 megvii-model. All Rights Reserved. | |||
| # ------------------------------------------------------------------------ | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,3 +1,8 @@ | |||
| # ------------------------------------------------------------------------ | |||
| # Modified from BasicSR (https://github.com/xinntao/BasicSR) | |||
| # Copyright 2018-2020 BasicSR Authors | |||
| # ------------------------------------------------------------------------ | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,8 +1,8 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| from copy import deepcopy | |||
| from typing import Any, Dict, Union | |||
| import numpy as np | |||
| import torch.cuda | |||
| from torch.nn.parallel import DataParallel, DistributedDataParallel | |||
| @@ -77,13 +77,8 @@ class NAFNetForImageDenoise(TorchModel): | |||
| def _evaluate_postprocess(self, input: Tensor, | |||
| target: Tensor) -> Dict[str, list]: | |||
| preds = self.model(input) | |||
| preds = list(torch.split(preds, 1, 0)) | |||
| targets = list(torch.split(target, 1, 0)) | |||
| preds = [(pred.data * 255.).squeeze(0).permute( | |||
| 1, 2, 0).cpu().numpy().astype(np.uint8) for pred in preds] | |||
| targets = [(target.data * 255.).squeeze(0).permute( | |||
| 1, 2, 0).cpu().numpy().astype(np.uint8) for target in targets] | |||
| preds = list(torch.split(preds.clamp(0, 1), 1, 0)) | |||
| targets = list(torch.split(target.clamp(0, 1), 1, 0)) | |||
| return {'pred': preds, 'target': targets} | |||
| @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .image_denoise_dataset import PairedImageDataset | |||
| from .model import FFTInpainting | |||
| else: | |||
| _import_structure = { | |||
| 'image_denoise_dataset': ['PairedImageDataset'], | |||
| 'model': ['FFTInpainting'], | |||
| } | |||
| import sys | |||
| @@ -0,0 +1,75 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| from typing import Dict, Tuple | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from modelscope.utils.logger import get_logger | |||
| from .modules.adversarial import NonSaturatingWithR1 | |||
| from .modules.ffc import FFCResNetGenerator | |||
| from .modules.perceptual import ResNetPL | |||
| from .modules.pix2pixhd import NLayerDiscriminator | |||
| LOGGER = get_logger() | |||
| class BaseInpaintingTrainingModule(nn.Module): | |||
| def __init__(self, | |||
| model_dir='', | |||
| use_ddp=True, | |||
| predict_only=False, | |||
| visualize_each_iters=100, | |||
| average_generator=False, | |||
| generator_avg_beta=0.999, | |||
| average_generator_start_step=30000, | |||
| average_generator_period=10, | |||
| store_discr_outputs_for_vis=False, | |||
| **kwargs): | |||
| super().__init__() | |||
| LOGGER.info( | |||
| f'BaseInpaintingTrainingModule init called, predict_only is {predict_only}' | |||
| ) | |||
| self.generator = FFCResNetGenerator() | |||
| self.use_ddp = use_ddp | |||
| if not predict_only: | |||
| self.discriminator = NLayerDiscriminator() | |||
| self.adversarial_loss = NonSaturatingWithR1( | |||
| weight=10, | |||
| gp_coef=0.001, | |||
| mask_as_fake_target=True, | |||
| allow_scale_mask=True) | |||
| self.average_generator = average_generator | |||
| self.generator_avg_beta = generator_avg_beta | |||
| self.average_generator_start_step = average_generator_start_step | |||
| self.average_generator_period = average_generator_period | |||
| self.generator_average = None | |||
| self.last_generator_averaging_step = -1 | |||
| self.store_discr_outputs_for_vis = store_discr_outputs_for_vis | |||
| self.loss_l1 = nn.L1Loss(reduction='none') | |||
| self.loss_resnet_pl = ResNetPL(weight=30, weights_path=model_dir) | |||
| self.visualize_each_iters = visualize_each_iters | |||
| LOGGER.info('BaseInpaintingTrainingModule init done') | |||
| def forward(self, batch: Dict[str, | |||
| torch.Tensor]) -> Dict[str, torch.Tensor]: | |||
| """Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys""" | |||
| raise NotImplementedError() | |||
| def generator_loss(self, | |||
| batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| raise NotImplementedError() | |||
| def discriminator_loss( | |||
| self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| raise NotImplementedError() | |||
| @@ -0,0 +1,210 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import bisect | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from modelscope.utils.logger import get_logger | |||
| from .base import BaseInpaintingTrainingModule | |||
| from .modules.feature_matching import feature_matching_loss, masked_l1_loss | |||
| LOGGER = get_logger() | |||
| def set_requires_grad(module, value): | |||
| for param in module.parameters(): | |||
| param.requires_grad = value | |||
| def add_prefix_to_keys(dct, prefix): | |||
| return {prefix + k: v for k, v in dct.items()} | |||
| class LinearRamp: | |||
| def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): | |||
| self.start_value = start_value | |||
| self.end_value = end_value | |||
| self.start_iter = start_iter | |||
| self.end_iter = end_iter | |||
| def __call__(self, i): | |||
| if i < self.start_iter: | |||
| return self.start_value | |||
| if i >= self.end_iter: | |||
| return self.end_value | |||
| part = (i - self.start_iter) / (self.end_iter - self.start_iter) | |||
| return self.start_value * (1 - part) + self.end_value * part | |||
| class LadderRamp: | |||
| def __init__(self, start_iters, values): | |||
| self.start_iters = start_iters | |||
| self.values = values | |||
| assert len(values) == len(start_iters) + 1, (len(values), | |||
| len(start_iters)) | |||
| def __call__(self, i): | |||
| segment_i = bisect.bisect_right(self.start_iters, i) | |||
| return self.values[segment_i] | |||
| def get_ramp(kind='ladder', **kwargs): | |||
| if kind == 'linear': | |||
| return LinearRamp(**kwargs) | |||
| if kind == 'ladder': | |||
| return LadderRamp(**kwargs) | |||
| raise ValueError(f'Unexpected ramp kind: {kind}') | |||
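Both ramp flavors in action (a sketch, assuming the classes above are in scope): LinearRamp interpolates across an iteration window, while the ladder form returned by get_ramp steps through fixed values at the given start iterations.

ramp = LinearRamp(start_value=0, end_value=1, start_iter=100, end_iter=200)
print(ramp(50), ramp(150), ramp(300))         # 0 0.5 1
ladder = get_ramp(kind='ladder', start_iters=[1000, 5000], values=[0.0, 0.1, 0.3])
print(ladder(0), ladder(2000), ladder(9000))  # 0.0 0.1 0.3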
| class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule): | |||
| def __init__(self, | |||
| model_dir='', | |||
| predict_only=False, | |||
| concat_mask=True, | |||
| rescale_scheduler_kwargs=None, | |||
| image_to_discriminator='predicted_image', | |||
| add_noise_kwargs=None, | |||
| noise_fill_hole=False, | |||
| const_area_crop_kwargs=None, | |||
| distance_weighter_kwargs=None, | |||
| distance_weighted_mask_for_discr=False, | |||
| fake_fakes_proba=0, | |||
| fake_fakes_generator_kwargs=None, | |||
| **kwargs): | |||
| super().__init__(model_dir=model_dir, predict_only=predict_only) | |||
| self.concat_mask = concat_mask | |||
| self.rescale_size_getter = get_ramp( | |||
| **rescale_scheduler_kwargs | |||
| ) if rescale_scheduler_kwargs is not None else None | |||
| self.image_to_discriminator = image_to_discriminator | |||
| self.add_noise_kwargs = add_noise_kwargs | |||
| self.noise_fill_hole = noise_fill_hole | |||
| self.const_area_crop_kwargs = const_area_crop_kwargs | |||
| self.refine_mask_for_losses = None | |||
| self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr | |||
| self.feature_matching_weight = 100 | |||
| self.losses_l1_weight_known = 10 | |||
| self.losses_l1_weight_missing = 0 | |||
| self.fake_fakes_proba = fake_fakes_proba | |||
| def forward(self, batch): | |||
| img = batch['image'] | |||
| mask = batch['mask'] | |||
| masked_img = img * (1 - mask) | |||
| if self.concat_mask: | |||
| masked_img = torch.cat([masked_img, mask], dim=1) | |||
| batch['predicted_image'] = self.generator(masked_img) | |||
| batch['inpainted'] = mask * batch['predicted_image'] + ( | |||
| 1 - mask) * batch['image'] | |||
| batch['mask_for_losses'] = mask | |||
| return batch | |||
| def generator_loss(self, batch): | |||
| img = batch['image'] | |||
| predicted_img = batch[self.image_to_discriminator] | |||
| original_mask = batch['mask'] | |||
| supervised_mask = batch['mask_for_losses'] | |||
| # L1 | |||
| l1_value = masked_l1_loss(predicted_img, img, supervised_mask, | |||
| self.losses_l1_weight_known, | |||
| self.losses_l1_weight_missing) | |||
| total_loss = l1_value | |||
| metrics = dict(gen_l1=l1_value) | |||
| # discriminator | |||
| # adversarial_loss calls backward by itself | |||
| mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask | |||
| self.adversarial_loss.pre_generator_step( | |||
| real_batch=img, | |||
| fake_batch=predicted_img, | |||
| generator=self.generator, | |||
| discriminator=self.discriminator) | |||
| discr_real_pred, discr_real_features = self.discriminator(img) | |||
| discr_fake_pred, discr_fake_features = self.discriminator( | |||
| predicted_img) | |||
| adv_gen_loss, adv_metrics = self.adversarial_loss.generator_loss( | |||
| real_batch=img, | |||
| fake_batch=predicted_img, | |||
| discr_real_pred=discr_real_pred, | |||
| discr_fake_pred=discr_fake_pred, | |||
| mask=mask_for_discr) | |||
| total_loss = total_loss + adv_gen_loss | |||
| metrics['gen_adv'] = adv_gen_loss | |||
| metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) | |||
| # feature matching | |||
| if self.feature_matching_weight > 0: | |||
| need_mask_in_fm = False | |||
| mask_for_fm = supervised_mask if need_mask_in_fm else None | |||
| fm_value = feature_matching_loss( | |||
| discr_fake_features, discr_real_features, | |||
| mask=mask_for_fm) * self.feature_matching_weight | |||
| total_loss = total_loss + fm_value | |||
| metrics['gen_fm'] = fm_value | |||
| if self.loss_resnet_pl is not None: | |||
| resnet_pl_value = self.loss_resnet_pl(predicted_img, img) | |||
| total_loss = total_loss + resnet_pl_value | |||
| metrics['gen_resnet_pl'] = resnet_pl_value | |||
| return total_loss, metrics | |||
| def discriminator_loss(self, batch): | |||
| total_loss = 0 | |||
| metrics = {} | |||
| predicted_img = batch[self.image_to_discriminator].detach() | |||
| self.adversarial_loss.pre_discriminator_step( | |||
| real_batch=batch['image'], | |||
| fake_batch=predicted_img, | |||
| generator=self.generator, | |||
| discriminator=self.discriminator) | |||
| discr_real_pred, discr_real_features = self.discriminator( | |||
| batch['image']) | |||
| discr_fake_pred, discr_fake_features = self.discriminator( | |||
| predicted_img) | |||
| adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss( | |||
| real_batch=batch['image'], | |||
| fake_batch=predicted_img, | |||
| discr_real_pred=discr_real_pred, | |||
| discr_fake_pred=discr_fake_pred, | |||
| mask=batch['mask']) | |||
| total_loss = (total_loss + adv_discr_loss) * 0.1 | |||
| metrics['discr_adv'] = adv_discr_loss | |||
| metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) | |||
| return total_loss, metrics | |||
| def _do_step(self, batch, optimizer_idx=None): | |||
| if optimizer_idx == 0: # step for generator | |||
| set_requires_grad(self.generator, True) | |||
| set_requires_grad(self.discriminator, False) | |||
| elif optimizer_idx == 1: # step for discriminator | |||
| set_requires_grad(self.generator, False) | |||
| set_requires_grad(self.discriminator, True) | |||
| batch = self(batch) | |||
| total_loss = 0 | |||
| if optimizer_idx is None or optimizer_idx == 0: # step for generator | |||
| total_loss, metrics = self.generator_loss(batch) | |||
| elif optimizer_idx is None or optimizer_idx == 1: # step for discriminator | |||
| total_loss, metrics = self.discriminator_loss(batch) | |||
| result = dict(loss=total_loss) | |||
| return result | |||
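_do_step encodes the usual GAN alternation: optimizer_idx 0 freezes the discriminator and steps the generator, optimizer_idx 1 does the reverse. A self-contained toy version of that pattern (not this module's actual losses), reusing set_requires_grad from above:

import torch
import torch.nn as nn

gen, disc = nn.Linear(4, 4), nn.Linear(4, 1)
opts = [torch.optim.Adam(gen.parameters()), torch.optim.Adam(disc.parameters())]
x = torch.randn(8, 4)
for optimizer_idx in (0, 1):
    set_requires_grad(gen, optimizer_idx == 0)
    set_requires_grad(disc, optimizer_idx == 1)
    fake = gen(x)
    if optimizer_idx == 0:   # generator step: push D(fake) up
        loss = -disc(fake).mean()
    else:                    # discriminator step: push D(fake.detach()) down
        loss = disc(fake.detach()).mean()
    opts[optimizer_idx].zero_grad()
    loss.backward()
    opts[optimizer_idx].step()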
| @@ -0,0 +1,36 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| from typing import Any, Dict, Optional, Union | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base.base_torch_model import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| LOGGER = get_logger() | |||
| @MODELS.register_module( | |||
| Tasks.image_inpainting, module_name=Models.image_inpainting) | |||
| class FFTInpainting(TorchModel): | |||
| def __init__(self, model_dir: str, **kwargs): | |||
| super().__init__(model_dir, **kwargs) | |||
| from .default import DefaultInpaintingTrainingModule | |||
| pretrained = kwargs.get('pretrained', True) | |||
| predict_only = kwargs.get('predict_only', False) | |||
| net = DefaultInpaintingTrainingModule( | |||
| model_dir=model_dir, predict_only=predict_only) | |||
| if pretrained: | |||
| path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
| LOGGER.info(f'loading pretrained model from {path}') | |||
| state = torch.load(path, map_location='cpu') | |||
| net.load_state_dict(state, strict=False) | |||
| self.model = net | |||
| def forward(self, inputs): | |||
| return self.model(inputs) | |||
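A hedged end-to-end sketch through the ModelScope pipeline API; the model id and the input dict keys below are illustrative assumptions, not taken from this diff:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inpainting = pipeline(Tasks.image_inpainting, model='damo/cv_fft_inpainting_lama')  # assumed model id
result = inpainting({'img': 'input.png', 'mask': 'mask.png'})  # assumed input keys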
| @@ -0,0 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .base import ModelBuilder | |||
| @@ -0,0 +1,380 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import os | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torch.nn.modules import BatchNorm2d | |||
| from . import resnet | |||
| NUM_CLASS = 150 | |||
| # Model Builder | |||
| class ModelBuilder: | |||
| # custom weights initialization | |||
| @staticmethod | |||
| def weights_init(m): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| nn.init.kaiming_normal_(m.weight.data) | |||
| elif classname.find('BatchNorm') != -1: | |||
| m.weight.data.fill_(1.) | |||
| m.bias.data.fill_(1e-4) | |||
| @staticmethod | |||
| def build_encoder(arch='resnet50dilated', | |||
| fc_dim=512, | |||
| weights='', | |||
| model_dir=''): | |||
| pretrained = len(weights) == 0 | |||
| arch = arch.lower() | |||
| if arch == 'resnet50dilated': | |||
| orig_resnet = resnet.__dict__['resnet50']( | |||
| pretrained=pretrained, model_dir=model_dir) | |||
| net_encoder = ResnetDilated(orig_resnet, dilate_scale=8) | |||
| elif arch == 'resnet50': | |||
| orig_resnet = resnet.__dict__['resnet50']( | |||
| pretrained=pretrained, model_dir=model_dir) | |||
| net_encoder = Resnet(orig_resnet) | |||
| else: | |||
| raise Exception('Architecture undefined!') | |||
| # encoders are usually pretrained | |||
| # net_encoder.apply(ModelBuilder.weights_init) | |||
| if len(weights) > 0: | |||
| print('Loading weights for net_encoder') | |||
| net_encoder.load_state_dict( | |||
| torch.load(weights, map_location=lambda storage, loc: storage), | |||
| strict=False) | |||
| return net_encoder | |||
| @staticmethod | |||
| def build_decoder(arch='ppm_deepsup', | |||
| fc_dim=512, | |||
| num_class=NUM_CLASS, | |||
| weights='', | |||
| use_softmax=False, | |||
| drop_last_conv=False): | |||
| arch = arch.lower() | |||
| if arch == 'ppm_deepsup': | |||
| net_decoder = PPMDeepsup( | |||
| num_class=num_class, | |||
| fc_dim=fc_dim, | |||
| use_softmax=use_softmax, | |||
| drop_last_conv=drop_last_conv) | |||
| elif arch == 'c1_deepsup': | |||
| net_decoder = C1DeepSup( | |||
| num_class=num_class, | |||
| fc_dim=fc_dim, | |||
| use_softmax=use_softmax, | |||
| drop_last_conv=drop_last_conv) | |||
| else: | |||
| raise Exception('Architecture undefined!') | |||
| net_decoder.apply(ModelBuilder.weights_init) | |||
| if len(weights) > 0: | |||
| print('Loading weights for net_decoder') | |||
| net_decoder.load_state_dict( | |||
| torch.load(weights, map_location=lambda storage, loc: storage), | |||
| strict=False) | |||
| return net_decoder | |||
| @staticmethod | |||
| def get_decoder(weights_path, arch_encoder, arch_decoder, fc_dim, | |||
| drop_last_conv, *args, **kwargs): | |||
| path = os.path.join( | |||
| weights_path, 'ade20k', | |||
| f'ade20k-{arch_encoder}-{arch_decoder}/decoder_epoch_20.pth') | |||
| return ModelBuilder.build_decoder( | |||
| arch=arch_decoder, | |||
| fc_dim=fc_dim, | |||
| weights=path, | |||
| use_softmax=True, | |||
| drop_last_conv=drop_last_conv) | |||
| @staticmethod | |||
| def get_encoder(weights_path, arch_encoder, arch_decoder, fc_dim, | |||
| segmentation, *args, **kwargs): | |||
| if segmentation: | |||
| path = os.path.join( | |||
| weights_path, 'ade20k', | |||
| f'ade20k-{arch_encoder}-{arch_decoder}/encoder_epoch_20.pth') | |||
| else: | |||
| path = '' | |||
| return ModelBuilder.build_encoder( | |||
| arch=arch_encoder, | |||
| fc_dim=fc_dim, | |||
| weights=path, | |||
| model_dir=weights_path) | |||
| def conv3x3_bn_relu(in_planes, out_planes, stride=1): | |||
| return nn.Sequential( | |||
| nn.Conv2d( | |||
| in_planes, | |||
| out_planes, | |||
| kernel_size=3, | |||
| stride=stride, | |||
| padding=1, | |||
| bias=False), | |||
| BatchNorm2d(out_planes), | |||
| nn.ReLU(inplace=True), | |||
| ) | |||
| # pyramid pooling, deep supervision | |||
| class PPMDeepsup(nn.Module): | |||
| def __init__(self, | |||
| num_class=NUM_CLASS, | |||
| fc_dim=4096, | |||
| use_softmax=False, | |||
| pool_scales=(1, 2, 3, 6), | |||
| drop_last_conv=False): | |||
| super().__init__() | |||
| self.use_softmax = use_softmax | |||
| self.drop_last_conv = drop_last_conv | |||
| self.ppm = [] | |||
| for scale in pool_scales: | |||
| self.ppm.append( | |||
| nn.Sequential( | |||
| nn.AdaptiveAvgPool2d(scale), | |||
| nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False), | |||
| BatchNorm2d(512), nn.ReLU(inplace=True))) | |||
| self.ppm = nn.ModuleList(self.ppm) | |||
| self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) | |||
| self.conv_last = nn.Sequential( | |||
| nn.Conv2d( | |||
| fc_dim + len(pool_scales) * 512, | |||
| 512, | |||
| kernel_size=3, | |||
| padding=1, | |||
| bias=False), BatchNorm2d(512), nn.ReLU(inplace=True), | |||
| nn.Dropout2d(0.1), nn.Conv2d(512, num_class, kernel_size=1)) | |||
| self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) | |||
| self.dropout_deepsup = nn.Dropout2d(0.1) | |||
| def forward(self, conv_out, segSize=None): | |||
| conv5 = conv_out[-1] | |||
| input_size = conv5.size() | |||
| ppm_out = [conv5] | |||
| for pool_scale in self.ppm: | |||
| ppm_out.append( | |||
| nn.functional.interpolate( | |||
| pool_scale(conv5), (input_size[2], input_size[3]), | |||
| mode='bilinear', | |||
| align_corners=False)) | |||
| ppm_out = torch.cat(ppm_out, 1) | |||
| if self.drop_last_conv: | |||
| return ppm_out | |||
| else: | |||
| x = self.conv_last(ppm_out) | |||
| if self.use_softmax: # is True during inference | |||
| x = nn.functional.interpolate( | |||
| x, size=segSize, mode='bilinear', align_corners=False) | |||
| x = nn.functional.softmax(x, dim=1) | |||
| return x | |||
| # deep sup | |||
| conv4 = conv_out[-2] | |||
| _ = self.cbr_deepsup(conv4) | |||
| _ = self.dropout_deepsup(_) | |||
| _ = self.conv_last_deepsup(_) | |||
| x = nn.functional.log_softmax(x, dim=1) | |||
| _ = nn.functional.log_softmax(_, dim=1) | |||
| return (x, _) | |||
| class Resnet(nn.Module): | |||
| def __init__(self, orig_resnet): | |||
| super(Resnet, self).__init__() | |||
| # take pretrained resnet, except AvgPool and FC | |||
| self.conv1 = orig_resnet.conv1 | |||
| self.bn1 = orig_resnet.bn1 | |||
| self.relu1 = orig_resnet.relu1 | |||
| self.conv2 = orig_resnet.conv2 | |||
| self.bn2 = orig_resnet.bn2 | |||
| self.relu2 = orig_resnet.relu2 | |||
| self.conv3 = orig_resnet.conv3 | |||
| self.bn3 = orig_resnet.bn3 | |||
| self.relu3 = orig_resnet.relu3 | |||
| self.maxpool = orig_resnet.maxpool | |||
| self.layer1 = orig_resnet.layer1 | |||
| self.layer2 = orig_resnet.layer2 | |||
| self.layer3 = orig_resnet.layer3 | |||
| self.layer4 = orig_resnet.layer4 | |||
| def forward(self, x, return_feature_maps=False): | |||
| conv_out = [] | |||
| x = self.relu1(self.bn1(self.conv1(x))) | |||
| x = self.relu2(self.bn2(self.conv2(x))) | |||
| x = self.relu3(self.bn3(self.conv3(x))) | |||
| x = self.maxpool(x) | |||
| x = self.layer1(x) | |||
| conv_out.append(x) | |||
| x = self.layer2(x) | |||
| conv_out.append(x) | |||
| x = self.layer3(x) | |||
| conv_out.append(x) | |||
| x = self.layer4(x) | |||
| conv_out.append(x) | |||
| if return_feature_maps: | |||
| return conv_out | |||
| return [x] | |||
| # Resnet Dilated | |||
| class ResnetDilated(nn.Module): | |||
| def __init__(self, orig_resnet, dilate_scale=8): | |||
| super().__init__() | |||
| from functools import partial | |||
| if dilate_scale == 8: | |||
| orig_resnet.layer3.apply(partial(self._nostride_dilate, dilate=2)) | |||
| orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=4)) | |||
| elif dilate_scale == 16: | |||
| orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=2)) | |||
| # take pretrained resnet, except AvgPool and FC | |||
| self.conv1 = orig_resnet.conv1 | |||
| self.bn1 = orig_resnet.bn1 | |||
| self.relu1 = orig_resnet.relu1 | |||
| self.conv2 = orig_resnet.conv2 | |||
| self.bn2 = orig_resnet.bn2 | |||
| self.relu2 = orig_resnet.relu2 | |||
| self.conv3 = orig_resnet.conv3 | |||
| self.bn3 = orig_resnet.bn3 | |||
| self.relu3 = orig_resnet.relu3 | |||
| self.maxpool = orig_resnet.maxpool | |||
| self.layer1 = orig_resnet.layer1 | |||
| self.layer2 = orig_resnet.layer2 | |||
| self.layer3 = orig_resnet.layer3 | |||
| self.layer4 = orig_resnet.layer4 | |||
| def _nostride_dilate(self, m, dilate): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| # the convolution with stride | |||
| if m.stride == (2, 2): | |||
| m.stride = (1, 1) | |||
| if m.kernel_size == (3, 3): | |||
| m.dilation = (dilate // 2, dilate // 2) | |||
| m.padding = (dilate // 2, dilate // 2) | |||
| # other convolutions | |||
| else: | |||
| if m.kernel_size == (3, 3): | |||
| m.dilation = (dilate, dilate) | |||
| m.padding = (dilate, dilate) | |||
| def forward(self, x, return_feature_maps=False): | |||
| conv_out = [] | |||
| x = self.relu1(self.bn1(self.conv1(x))) | |||
| x = self.relu2(self.bn2(self.conv2(x))) | |||
| x = self.relu3(self.bn3(self.conv3(x))) | |||
| x = self.maxpool(x) | |||
| x = self.layer1(x) | |||
| conv_out.append(x) | |||
| x = self.layer2(x) | |||
| conv_out.append(x) | |||
| x = self.layer3(x) | |||
| conv_out.append(x) | |||
| x = self.layer4(x) | |||
| conv_out.append(x) | |||
| if return_feature_maps: | |||
| return conv_out | |||
| return [x] | |||
| # last conv, deep supervision | |||
| class C1DeepSup(nn.Module): | |||
| def __init__(self, | |||
| num_class=150, | |||
| fc_dim=2048, | |||
| use_softmax=False, | |||
| drop_last_conv=False): | |||
| super(C1DeepSup, self).__init__() | |||
| self.use_softmax = use_softmax | |||
| self.drop_last_conv = drop_last_conv | |||
| self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) | |||
| self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) | |||
| # last conv | |||
| self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) | |||
| self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) | |||
| def forward(self, conv_out, segSize=None): | |||
| conv5 = conv_out[-1] | |||
| x = self.cbr(conv5) | |||
| if self.drop_last_conv: | |||
| return x | |||
| else: | |||
| x = self.conv_last(x) | |||
| if self.use_softmax: # is True during inference | |||
| x = nn.functional.interpolate( | |||
| x, size=segSize, mode='bilinear', align_corners=False) | |||
| x = nn.functional.softmax(x, dim=1) | |||
| return x | |||
| # deep sup | |||
| conv4 = conv_out[-2] | |||
| _ = self.cbr_deepsup(conv4) | |||
| _ = self.conv_last_deepsup(_) | |||
| x = nn.functional.log_softmax(x, dim=1) | |||
| _ = nn.functional.log_softmax(_, dim=1) | |||
| return (x, _) | |||
| # last conv | |||
| class C1(nn.Module): | |||
| def __init__(self, num_class=150, fc_dim=2048, use_softmax=False): | |||
| super(C1, self).__init__() | |||
| self.use_softmax = use_softmax | |||
| self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) | |||
| # last conv | |||
| self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) | |||
| def forward(self, conv_out, segSize=None): | |||
| conv5 = conv_out[-1] | |||
| x = self.cbr(conv5) | |||
| x = self.conv_last(x) | |||
| if self.use_softmax: # is True during inference | |||
| x = nn.functional.interpolate( | |||
| x, size=segSize, mode='bilinear', align_corners=False) | |||
| x = nn.functional.softmax(x, dim=1) | |||
| else: | |||
| x = nn.functional.log_softmax(x, dim=1) | |||
| return x | |||
| @@ -0,0 +1,183 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import math | |||
| import os | |||
| import torch | |||
| import torch.nn as nn | |||
| from torch.nn import BatchNorm2d | |||
| __all__ = ['ResNet', 'resnet50'] | |||
| def conv3x3(in_planes, out_planes, stride=1): | |||
| '3x3 convolution with padding' | |||
| return nn.Conv2d( | |||
| in_planes, | |||
| out_planes, | |||
| kernel_size=3, | |||
| stride=stride, | |||
| padding=1, | |||
| bias=False) | |||
| class BasicBlock(nn.Module): | |||
| expansion = 1 | |||
| def __init__(self, inplanes, planes, stride=1, downsample=None): | |||
| super(BasicBlock, self).__init__() | |||
| self.conv1 = conv3x3(inplanes, planes, stride) | |||
| self.bn1 = BatchNorm2d(planes) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| self.conv2 = conv3x3(planes, planes) | |||
| self.bn2 = BatchNorm2d(planes) | |||
| self.downsample = downsample | |||
| self.stride = stride | |||
| def forward(self, x): | |||
| residual = x | |||
| out = self.conv1(x) | |||
| out = self.bn1(out) | |||
| out = self.relu(out) | |||
| out = self.conv2(out) | |||
| out = self.bn2(out) | |||
| if self.downsample is not None: | |||
| residual = self.downsample(x) | |||
| out += residual | |||
| out = self.relu(out) | |||
| return out | |||
| class Bottleneck(nn.Module): | |||
| expansion = 4 | |||
| def __init__(self, inplanes, planes, stride=1, downsample=None): | |||
| super(Bottleneck, self).__init__() | |||
| self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) | |||
| self.bn1 = BatchNorm2d(planes) | |||
| self.conv2 = nn.Conv2d( | |||
| planes, | |||
| planes, | |||
| kernel_size=3, | |||
| stride=stride, | |||
| padding=1, | |||
| bias=False) | |||
| self.bn2 = BatchNorm2d(planes) | |||
| self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) | |||
| self.bn3 = BatchNorm2d(planes * 4) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| self.downsample = downsample | |||
| self.stride = stride | |||
| def forward(self, x): | |||
| residual = x | |||
| out = self.conv1(x) | |||
| out = self.bn1(out) | |||
| out = self.relu(out) | |||
| out = self.conv2(out) | |||
| out = self.bn2(out) | |||
| out = self.relu(out) | |||
| out = self.conv3(out) | |||
| out = self.bn3(out) | |||
| if self.downsample is not None: | |||
| residual = self.downsample(x) | |||
| out += residual | |||
| out = self.relu(out) | |||
| return out | |||
| class ResNet(nn.Module): | |||
| def __init__(self, block, layers, num_classes=1000): | |||
| self.inplanes = 128 | |||
| super(ResNet, self).__init__() | |||
| self.conv1 = conv3x3(3, 64, stride=2) | |||
| self.bn1 = BatchNorm2d(64) | |||
| self.relu1 = nn.ReLU(inplace=True) | |||
| self.conv2 = conv3x3(64, 64) | |||
| self.bn2 = BatchNorm2d(64) | |||
| self.relu2 = nn.ReLU(inplace=True) | |||
| self.conv3 = conv3x3(64, 128) | |||
| self.bn3 = BatchNorm2d(128) | |||
| self.relu3 = nn.ReLU(inplace=True) | |||
| self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
| self.layer1 = self._make_layer(block, 64, layers[0]) | |||
| self.layer2 = self._make_layer(block, 128, layers[1], stride=2) | |||
| self.layer3 = self._make_layer(block, 256, layers[2], stride=2) | |||
| self.layer4 = self._make_layer(block, 512, layers[3], stride=2) | |||
| self.avgpool = nn.AvgPool2d(7, stride=1) | |||
| self.fc = nn.Linear(512 * block.expansion, num_classes) | |||
| for m in self.modules(): | |||
| if isinstance(m, nn.Conv2d): | |||
| n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | |||
| m.weight.data.normal_(0, math.sqrt(2. / n)) | |||
| elif isinstance(m, BatchNorm2d): | |||
| m.weight.data.fill_(1) | |||
| m.bias.data.zero_() | |||
| def _make_layer(self, block, planes, blocks, stride=1): | |||
| downsample = None | |||
| if stride != 1 or self.inplanes != planes * block.expansion: | |||
| downsample = nn.Sequential( | |||
| nn.Conv2d( | |||
| self.inplanes, | |||
| planes * block.expansion, | |||
| kernel_size=1, | |||
| stride=stride, | |||
| bias=False), | |||
| BatchNorm2d(planes * block.expansion), | |||
| ) | |||
| layers = [] | |||
| layers.append(block(self.inplanes, planes, stride, downsample)) | |||
| self.inplanes = planes * block.expansion | |||
| for i in range(1, blocks): | |||
| layers.append(block(self.inplanes, planes)) | |||
| return nn.Sequential(*layers) | |||
| def forward(self, x): | |||
| x = self.relu1(self.bn1(self.conv1(x))) | |||
| x = self.relu2(self.bn2(self.conv2(x))) | |||
| x = self.relu3(self.bn3(self.conv3(x))) | |||
| x = self.maxpool(x) | |||
| x = self.layer1(x) | |||
| x = self.layer2(x) | |||
| x = self.layer3(x) | |||
| x = self.layer4(x) | |||
| x = self.avgpool(x) | |||
| x = x.view(x.size(0), -1) | |||
| x = self.fc(x) | |||
| return x | |||
| def resnet50(pretrained=False, model_dir='', **kwargs): | |||
| """Constructs a ResNet-50 model. | |||
| Args: | |||
| pretrained (bool): If True, returns a model pre-trained on ImageNet | |||
| """ | |||
| model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) | |||
| if pretrained: | |||
| cached_file = os.path.join(model_dir, 'resnet50-imagenet.pth') | |||
| model.load_state_dict( | |||
| torch.load(cached_file, map_location='cpu'), strict=False) | |||
| return model | |||
| @@ -0,0 +1,167 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| from typing import Dict, Optional, Tuple | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class BaseAdversarialLoss: | |||
| def pre_generator_step(self, real_batch: torch.Tensor, | |||
| fake_batch: torch.Tensor, generator: nn.Module, | |||
| discriminator: nn.Module): | |||
| """ | |||
| Prepare for generator step | |||
| :param real_batch: Tensor, a batch of real samples | |||
| :param fake_batch: Tensor, a batch of samples produced by generator | |||
| :param generator: | |||
| :param discriminator: | |||
| :return: None | |||
| """ | |||
| def pre_discriminator_step(self, real_batch: torch.Tensor, | |||
| fake_batch: torch.Tensor, generator: nn.Module, | |||
| discriminator: nn.Module): | |||
| """ | |||
| Prepare for discriminator step | |||
| :param real_batch: Tensor, a batch of real samples | |||
| :param fake_batch: Tensor, a batch of samples produced by generator | |||
| :param generator: | |||
| :param discriminator: | |||
| :return: None | |||
| """ | |||
| def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, | |||
| discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, | |||
| mask: Optional[torch.Tensor] = None) \ | |||
| -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| """ | |||
| Calculate generator loss | |||
| :param real_batch: Tensor, a batch of real samples | |||
| :param fake_batch: Tensor, a batch of samples produced by generator | |||
| :param discr_real_pred: Tensor, discriminator output for real_batch | |||
| :param discr_fake_pred: Tensor, discriminator output for fake_batch | |||
| :param mask: Tensor, actual mask, which was at input of generator when making fake_batch | |||
| :return: total generator loss along with some values that might be interesting to log | |||
| """ | |||
| raise NotImplementedError | |||
| def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, | |||
| discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, | |||
| mask: Optional[torch.Tensor] = None) \ | |||
| -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| """ | |||
| Calculate discriminator loss and call .backward() on it | |||
| :param real_batch: Tensor, a batch of real samples | |||
| :param fake_batch: Tensor, a batch of samples produced by generator | |||
| :param discr_real_pred: Tensor, discriminator output for real_batch | |||
| :param discr_fake_pred: Tensor, discriminator output for fake_batch | |||
| :param mask: Tensor, actual mask, which was at input of generator when making fake_batch | |||
| :return: total discriminator loss along with some values that might be interesting to log | |||
| """ | |||
| raise NotImplementedError | |||
| def interpolate_mask(self, mask, shape): | |||
| assert mask is not None | |||
| assert self.allow_scale_mask or shape == mask.shape[-2:] | |||
| if shape != mask.shape[-2:] and self.allow_scale_mask: | |||
| if self.mask_scale_mode == 'maxpool': | |||
| mask = F.adaptive_max_pool2d(mask, shape) | |||
| else: | |||
| mask = F.interpolate( | |||
| mask, size=shape, mode=self.mask_scale_mode) | |||
| return mask | |||
| def make_r1_gp(discr_real_pred, real_batch): | |||
| if torch.is_grad_enabled(): | |||
| grad_real = torch.autograd.grad( | |||
| outputs=discr_real_pred.sum(), | |||
| inputs=real_batch, | |||
| create_graph=True)[0] | |||
| grad_penalty = (grad_real.view(grad_real.shape[0], | |||
| -1).norm(2, dim=1)**2).mean() | |||
| else: | |||
| grad_penalty = 0 | |||
| real_batch.requires_grad = False | |||
| return grad_penalty | |||
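make_r1_gp is the R1 regularizer: the mean squared gradient norm of the real logits with respect to the real inputs. For a linear discriminator D(x) = w·x the penalty is exactly ||w||^2, which gives a quick check (a sketch, assuming make_r1_gp is in scope):

import torch

w = torch.tensor([[1.0, 2.0]])
real = torch.randn(4, 2, requires_grad=True)
pred = real @ w.t()                       # toy discriminator output
gp = make_r1_gp(pred, real)
print(gp.item(), (w.norm() ** 2).item())  # both 5.0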
| class NonSaturatingWithR1(BaseAdversarialLoss): | |||
| def __init__(self, | |||
| gp_coef=5, | |||
| weight=1, | |||
| mask_as_fake_target=False, | |||
| allow_scale_mask=False, | |||
| mask_scale_mode='nearest', | |||
| extra_mask_weight_for_gen=0, | |||
| use_unmasked_for_gen=True, | |||
| use_unmasked_for_discr=True): | |||
| self.gp_coef = gp_coef | |||
| self.weight = weight | |||
| # use for discr => use for gen; | |||
| # otherwise we teach only the discr to pay attention to very small difference | |||
| assert use_unmasked_for_gen or (not use_unmasked_for_discr) | |||
| # mask as target => use unmasked for discr: | |||
| # if we don't care about unmasked regions at all | |||
| # then it doesn't matter if the value of mask_as_fake_target is true or false | |||
| assert use_unmasked_for_discr or (not mask_as_fake_target) | |||
| self.use_unmasked_for_gen = use_unmasked_for_gen | |||
| self.use_unmasked_for_discr = use_unmasked_for_discr | |||
| self.mask_as_fake_target = mask_as_fake_target | |||
| self.allow_scale_mask = allow_scale_mask | |||
| self.mask_scale_mode = mask_scale_mode | |||
| self.extra_mask_weight_for_gen = extra_mask_weight_for_gen | |||
| def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, | |||
| discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, | |||
| mask=None) \ | |||
| -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| fake_loss = F.softplus(-discr_fake_pred) | |||
| if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \ | |||
| not self.use_unmasked_for_gen: # == if masked region should be treated differently | |||
| mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) | |||
| if not self.use_unmasked_for_gen: | |||
| fake_loss = fake_loss * mask | |||
| else: | |||
| pixel_weights = 1 + mask * self.extra_mask_weight_for_gen | |||
| fake_loss = fake_loss * pixel_weights | |||
| return fake_loss.mean() * self.weight, dict() | |||
| def pre_discriminator_step(self, real_batch: torch.Tensor, | |||
| fake_batch: torch.Tensor, generator: nn.Module, | |||
| discriminator: nn.Module): | |||
| real_batch.requires_grad = True | |||
| def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, | |||
| discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, | |||
| mask=None) \ | |||
| -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| real_loss = F.softplus(-discr_real_pred) | |||
| grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef | |||
| fake_loss = F.softplus(discr_fake_pred) | |||
| if not self.use_unmasked_for_discr or self.mask_as_fake_target: | |||
| # == if masked region should be treated differently | |||
| mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) | |||
| # use_unmasked_for_discr=False only makes sense for fakes; | |||
| # for reals there is no difference between the two regions | |||
| fake_loss = fake_loss * mask | |||
| if self.mask_as_fake_target: | |||
| fake_loss = fake_loss + (1 | |||
| - mask) * F.softplus(-discr_fake_pred) | |||
| sum_discr_loss = real_loss + grad_penalty + fake_loss | |||
| metrics = dict( | |||
| discr_real_out=discr_real_pred.mean(), | |||
| discr_fake_out=discr_fake_pred.mean(), | |||
| discr_real_gp=grad_penalty) | |||
| return sum_discr_loss.mean(), metrics | |||
| @@ -0,0 +1,45 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| from typing import List | |||
| import torch | |||
| import torch.nn.functional as F | |||
| def masked_l2_loss(pred, target, mask, weight_known, weight_missing): | |||
| per_pixel_l2 = F.mse_loss(pred, target, reduction='none') | |||
| pixel_weights = mask * weight_missing + (1 - mask) * weight_known | |||
| return (pixel_weights * per_pixel_l2).mean() | |||
| def masked_l1_loss(pred, target, mask, weight_known, weight_missing): | |||
| per_pixel_l1 = F.l1_loss(pred, target, reduction='none') | |||
| pixel_weights = mask * weight_missing + (1 - mask) * weight_known | |||
| return (pixel_weights * per_pixel_l1).mean() | |||
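A worked check (a sketch) with the weights the training module above actually uses (weight_known=10, weight_missing=0): only errors outside the hole contribute.

import torch

pred = torch.ones(1, 1, 2, 2)
target = torch.zeros(1, 1, 2, 2)
mask = torch.tensor([[[[1.0, 0.0], [0.0, 0.0]]]])  # 1 marks the hole
loss = masked_l1_loss(pred, target, mask, weight_known=10, weight_missing=0)
print(loss.item())  # 7.5 = (0 + 10 + 10 + 10) / 4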
| def feature_matching_loss(fake_features: List[torch.Tensor], | |||
| target_features: List[torch.Tensor], | |||
| mask=None): | |||
| if mask is None: | |||
| res = torch.stack([ | |||
| F.mse_loss(fake_feat, target_feat) | |||
| for fake_feat, target_feat in zip(fake_features, target_features) | |||
| ]).mean() | |||
| else: | |||
| res = 0 | |||
| norm = 0 | |||
| for fake_feat, target_feat in zip(fake_features, target_features): | |||
| cur_mask = F.interpolate( | |||
| mask, | |||
| size=fake_feat.shape[-2:], | |||
| mode='bilinear', | |||
| align_corners=False) | |||
| error_weights = 1 - cur_mask | |||
| cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean() | |||
| res = res + cur_val | |||
| norm += 1 | |||
| res = res / norm | |||
| return res | |||
| @@ -0,0 +1,588 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from kornia.geometry.transform import rotate | |||
| def get_activation(kind='tanh'): | |||
| if kind == 'tanh': | |||
| return nn.Tanh() | |||
| if kind == 'sigmoid': | |||
| return nn.Sigmoid() | |||
| if kind is False: | |||
| return nn.Identity() | |||
| raise ValueError(f'Unknown activation kind {kind}') | |||
| class SELayer(nn.Module): | |||
| def __init__(self, channel, reduction=16): | |||
| super(SELayer, self).__init__() | |||
| self.avg_pool = nn.AdaptiveAvgPool2d(1) | |||
| self.fc = nn.Sequential( | |||
| nn.Linear(channel, channel // reduction, bias=False), | |||
| nn.ReLU(inplace=True), | |||
| nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid()) | |||
| def forward(self, x): | |||
| b, c, _, _ = x.size() | |||
| y = self.avg_pool(x).view(b, c) | |||
| y = self.fc(y).view(b, c, 1, 1) | |||
| res = x * y.expand_as(x) | |||
| return res | |||
| class FourierUnit(nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| groups=1, | |||
| spatial_scale_factor=None, | |||
| spatial_scale_mode='bilinear', | |||
| spectral_pos_encoding=False, | |||
| use_se=False, | |||
| se_kwargs=None, | |||
| ffc3d=False, | |||
| fft_norm='ortho'): | |||
| # bn_layer not used | |||
| super(FourierUnit, self).__init__() | |||
| self.groups = groups | |||
| self.conv_layer = torch.nn.Conv2d( | |||
| in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0), | |||
| out_channels=out_channels * 2, | |||
| kernel_size=1, | |||
| stride=1, | |||
| padding=0, | |||
| groups=self.groups, | |||
| bias=False) | |||
| self.bn = torch.nn.BatchNorm2d(out_channels * 2) | |||
| self.relu = torch.nn.ReLU(inplace=True) | |||
| # squeeze and excitation block | |||
| self.use_se = use_se | |||
| if use_se: | |||
| if se_kwargs is None: | |||
| se_kwargs = {} | |||
| self.se = SELayer(self.conv_layer.in_channels, **se_kwargs) | |||
| self.spatial_scale_factor = spatial_scale_factor | |||
| self.spatial_scale_mode = spatial_scale_mode | |||
| self.spectral_pos_encoding = spectral_pos_encoding | |||
| self.ffc3d = ffc3d | |||
| self.fft_norm = fft_norm | |||
| def forward(self, x): | |||
| batch = x.shape[0] | |||
| if self.spatial_scale_factor is not None: | |||
| orig_size = x.shape[-2:] | |||
| x = F.interpolate( | |||
| x, | |||
| scale_factor=self.spatial_scale_factor, | |||
| mode=self.spatial_scale_mode, | |||
| align_corners=False) | |||
| # (batch, c, h, w/2+1, 2) | |||
| fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1) | |||
| ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm) | |||
| ffted = torch.stack((ffted.real, ffted.imag), dim=-1) | |||
| ffted = ffted.permute(0, 1, 4, 2, | |||
| 3).contiguous() # (batch, c, 2, h, w/2+1) | |||
| ffted = ffted.view(( | |||
| batch, | |||
| -1, | |||
| ) + ffted.size()[3:]) | |||
| if self.spectral_pos_encoding: | |||
| height, width = ffted.shape[-2:] | |||
| coords_vert = torch.linspace(0, 1, | |||
| height)[None, None, :, None].expand( | |||
| batch, 1, height, width).to(ffted) | |||
| coords_hor = torch.linspace(0, 1, | |||
| width)[None, None, None, :].expand( | |||
| batch, 1, height, width).to(ffted) | |||
| ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1) | |||
| if self.use_se: | |||
| ffted = self.se(ffted) | |||
| ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1) | |||
| ffted = self.relu(self.bn(ffted)) | |||
| ffted = ffted.view(( | |||
| batch, | |||
| -1, | |||
| 2, | |||
| ) + ffted.size()[2:]).permute( | |||
| 0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2) | |||
| ffted = torch.complex(ffted[..., 0], ffted[..., 1]) | |||
| ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:] | |||
| output = torch.fft.irfftn( | |||
| ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm) | |||
| if self.spatial_scale_factor is not None: | |||
| output = F.interpolate( | |||
| output, | |||
| size=orig_size, | |||
| mode=self.spatial_scale_mode, | |||
| align_corners=False) | |||
| return output | |||
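Shape-wise the unit is a 1x1 convolution in the rfft2 domain: real and imaginary parts are stacked into channels, convolved, and transformed back to the input resolution. A quick check (a sketch, assuming FourierUnit is in scope):

import torch

fu = FourierUnit(in_channels=8, out_channels=8)
x = torch.randn(2, 8, 16, 16)
print(fu(x).shape)  # torch.Size([2, 8, 16, 16])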
| class SpectralTransform(nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| stride=1, | |||
| groups=1, | |||
| enable_lfu=True, | |||
| **fu_kwargs): | |||
| # bn_layer not used | |||
| super(SpectralTransform, self).__init__() | |||
| self.enable_lfu = enable_lfu | |||
| if stride == 2: | |||
| self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2) | |||
| else: | |||
| self.downsample = nn.Identity() | |||
| self.stride = stride | |||
| self.conv1 = nn.Sequential( | |||
| nn.Conv2d( | |||
| in_channels, | |||
| out_channels // 2, | |||
| kernel_size=1, | |||
| groups=groups, | |||
| bias=False), nn.BatchNorm2d(out_channels // 2), | |||
| nn.ReLU(inplace=True)) | |||
| self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups, | |||
| **fu_kwargs) | |||
| if self.enable_lfu: | |||
| self.lfu = FourierUnit(out_channels // 2, out_channels // 2, | |||
| groups) | |||
| self.conv2 = torch.nn.Conv2d( | |||
| out_channels // 2, | |||
| out_channels, | |||
| kernel_size=1, | |||
| groups=groups, | |||
| bias=False) | |||
| def forward(self, x): | |||
| x = self.downsample(x) | |||
| x = self.conv1(x) | |||
| output = self.fu(x) | |||
| if self.enable_lfu: | |||
| n, c, h, w = x.shape | |||
| split_no = 2 | |||
| split_s = h // split_no | |||
| xs = torch.cat( | |||
| torch.split(x[:, :c // 4], split_s, dim=-2), | |||
| dim=1).contiguous() | |||
| xs = torch.cat( | |||
| torch.split(xs, split_s, dim=-1), dim=1).contiguous() | |||
| xs = self.lfu(xs) | |||
| xs = xs.repeat(1, 1, split_no, split_no).contiguous() | |||
| else: | |||
| xs = 0 | |||
| output = self.conv2(x + output + xs) | |||
| return output | |||
| class LearnableSpatialTransformWrapper(nn.Module): | |||
| def __init__(self, | |||
| impl, | |||
| pad_coef=0.5, | |||
| angle_init_range=80, | |||
| train_angle=True): | |||
| super().__init__() | |||
| self.impl = impl | |||
| self.angle = torch.rand(1) * angle_init_range | |||
| if train_angle: | |||
| self.angle = nn.Parameter(self.angle, requires_grad=True) | |||
| self.pad_coef = pad_coef | |||
| def forward(self, x): | |||
| if torch.is_tensor(x): | |||
| return self.inverse_transform(self.impl(self.transform(x)), x) | |||
| elif isinstance(x, tuple): | |||
| x_trans = tuple(self.transform(elem) for elem in x) | |||
| y_trans = self.impl(x_trans) | |||
| return tuple( | |||
| self.inverse_transform(elem, orig_x) | |||
| for elem, orig_x in zip(y_trans, x)) | |||
| else: | |||
| raise ValueError(f'Unexpected input type {type(x)}') | |||
| def transform(self, x): | |||
| height, width = x.shape[2:] | |||
| pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) | |||
| x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect') | |||
| x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded)) | |||
| return x_padded_rotated | |||
| def inverse_transform(self, y_padded_rotated, orig_x): | |||
| height, width = orig_x.shape[2:] | |||
| pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) | |||
| y_padded = rotate( | |||
| y_padded_rotated, angle=-self.angle.to(y_padded_rotated)) | |||
| y_height, y_width = y_padded.shape[2:] | |||
| y = y_padded[:, :, pad_h:y_height - pad_h, pad_w:y_width - pad_w] | |||
| return y | |||
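The wrapper is a pad-rotate-apply-unrotate-crop sandwich, so the output shape matches the input shape; for example with an identity inner module (a sketch, assuming kornia is installed as imported above):

import torch
import torch.nn as nn

wrapper = LearnableSpatialTransformWrapper(nn.Identity(), train_angle=False)
x = torch.randn(1, 3, 32, 32)
print(wrapper(x).shape)  # torch.Size([1, 3, 32, 32])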
| class FFC(nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| ratio_gin, | |||
| ratio_gout, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| groups=1, | |||
| bias=False, | |||
| enable_lfu=True, | |||
| padding_type='reflect', | |||
| gated=False, | |||
| **spectral_kwargs): | |||
| super(FFC, self).__init__() | |||
| assert stride == 1 or stride == 2, 'Stride should be 1 or 2.' | |||
| self.stride = stride | |||
| in_cg = int(in_channels * ratio_gin) | |||
| in_cl = in_channels - in_cg | |||
| out_cg = int(out_channels * ratio_gout) | |||
| out_cl = out_channels - out_cg | |||
| self.ratio_gin = ratio_gin | |||
| self.ratio_gout = ratio_gout | |||
| self.global_in_num = in_cg | |||
| module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d | |||
| self.convl2l = module( | |||
| in_cl, | |||
| out_cl, | |||
| kernel_size, | |||
| stride, | |||
| padding, | |||
| dilation, | |||
| groups, | |||
| bias, | |||
| padding_mode=padding_type) | |||
| module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d | |||
| self.convl2g = module( | |||
| in_cl, | |||
| out_cg, | |||
| kernel_size, | |||
| stride, | |||
| padding, | |||
| dilation, | |||
| groups, | |||
| bias, | |||
| padding_mode=padding_type) | |||
| module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d | |||
| self.convg2l = module( | |||
| in_cg, | |||
| out_cl, | |||
| kernel_size, | |||
| stride, | |||
| padding, | |||
| dilation, | |||
| groups, | |||
| bias, | |||
| padding_mode=padding_type) | |||
| module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform | |||
| self.convg2g = module(in_cg, out_cg, stride, | |||
| 1 if groups == 1 else groups // 2, enable_lfu, | |||
| **spectral_kwargs) | |||
| self.gated = gated | |||
| module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d | |||
| self.gate = module(in_channels, 2, 1) | |||
| def forward(self, x): | |||
| x_l, x_g = x if type(x) is tuple else (x, 0) | |||
| out_xl, out_xg = 0, 0 | |||
| if self.gated: | |||
| total_input_parts = [x_l] | |||
| if torch.is_tensor(x_g): | |||
| total_input_parts.append(x_g) | |||
| total_input = torch.cat(total_input_parts, dim=1) | |||
| gates = torch.sigmoid(self.gate(total_input)) | |||
| g2l_gate, l2g_gate = gates.chunk(2, dim=1) | |||
| else: | |||
| g2l_gate, l2g_gate = 1, 1 | |||
| if self.ratio_gout != 1: | |||
| out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate | |||
| if self.ratio_gout != 0: | |||
| out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g) | |||
| return out_xl, out_xg | |||
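# Channel-split arithmetic for the FFC above, using LaMa's typical bottleneck
# setting (illustrative numbers only):
in_channels, ratio_gin = 512, 0.75
in_cg = int(in_channels * ratio_gin)  # 384 channels take the global/spectral path
in_cl = in_channels - in_cg           # 128 channels stay on the local conv path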
| class FFC_BN_ACT(nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| ratio_gin, | |||
| ratio_gout, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| groups=1, | |||
| bias=False, | |||
| norm_layer=nn.BatchNorm2d, | |||
| activation_layer=nn.Identity, | |||
| padding_type='reflect', | |||
| enable_lfu=True, | |||
| **kwargs): | |||
| super(FFC_BN_ACT, self).__init__() | |||
| self.ffc = FFC( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| ratio_gin, | |||
| ratio_gout, | |||
| stride, | |||
| padding, | |||
| dilation, | |||
| groups, | |||
| bias, | |||
| enable_lfu, | |||
| padding_type=padding_type, | |||
| **kwargs) | |||
| lnorm = nn.Identity if ratio_gout == 1 else norm_layer | |||
| gnorm = nn.Identity if ratio_gout == 0 else norm_layer | |||
| global_channels = int(out_channels * ratio_gout) | |||
| self.bn_l = lnorm(out_channels - global_channels) | |||
| self.bn_g = gnorm(global_channels) | |||
| lact = nn.Identity if ratio_gout == 1 else activation_layer | |||
| gact = nn.Identity if ratio_gout == 0 else activation_layer | |||
| self.act_l = lact(inplace=True) | |||
| self.act_g = gact(inplace=True) | |||
| def forward(self, x): | |||
| x_l, x_g = self.ffc(x) | |||
| x_l = self.act_l(self.bn_l(x_l)) | |||
| x_g = self.act_g(self.bn_g(x_g)) | |||
| return x_l, x_g | |||
| class FFCResnetBlock(nn.Module): | |||
| def __init__(self, | |||
| dim, | |||
| padding_type, | |||
| norm_layer, | |||
| activation_layer=nn.ReLU, | |||
| dilation=1, | |||
| spatial_transform_kwargs=None, | |||
| inline=False, | |||
| **conv_kwargs): | |||
| super().__init__() | |||
| self.conv1 = FFC_BN_ACT( | |||
| dim, | |||
| dim, | |||
| kernel_size=3, | |||
| padding=dilation, | |||
| dilation=dilation, | |||
| norm_layer=norm_layer, | |||
| activation_layer=activation_layer, | |||
| padding_type=padding_type, | |||
| **conv_kwargs) | |||
| self.conv2 = FFC_BN_ACT( | |||
| dim, | |||
| dim, | |||
| kernel_size=3, | |||
| padding=dilation, | |||
| dilation=dilation, | |||
| norm_layer=norm_layer, | |||
| activation_layer=activation_layer, | |||
| padding_type=padding_type, | |||
| **conv_kwargs) | |||
| if spatial_transform_kwargs is not None: | |||
| self.conv1 = LearnableSpatialTransformWrapper( | |||
| self.conv1, **spatial_transform_kwargs) | |||
| self.conv2 = LearnableSpatialTransformWrapper( | |||
| self.conv2, **spatial_transform_kwargs) | |||
| self.inline = inline | |||
| def forward(self, x): | |||
| if self.inline: | |||
| x_l = x[:, :-self.conv1.ffc.global_in_num] | |||
| x_g = x[:, -self.conv1.ffc.global_in_num:] | |||
| else: | |||
| x_l, x_g = x if type(x) is tuple else (x, 0) | |||
| id_l, id_g = x_l, x_g | |||
| x_l, x_g = self.conv1((x_l, x_g)) | |||
| x_l, x_g = self.conv2((x_l, x_g)) | |||
| x_l, x_g = id_l + x_l, id_g + x_g | |||
| out = x_l, x_g | |||
| if self.inline: | |||
| out = torch.cat(out, dim=1) | |||
| return out | |||
| class ConcatTupleLayer(nn.Module): | |||
| def forward(self, x): | |||
| assert isinstance(x, tuple) | |||
| x_l, x_g = x | |||
| assert torch.is_tensor(x_l) or torch.is_tensor(x_g) | |||
| if not torch.is_tensor(x_g): | |||
| return x_l | |||
| return torch.cat(x, dim=1) | |||
| class FFCResNetGenerator(nn.Module): | |||
| def __init__(self, | |||
| input_nc=4, | |||
| output_nc=3, | |||
| ngf=64, | |||
| n_downsampling=3, | |||
| n_blocks=18, | |||
| norm_layer=nn.BatchNorm2d, | |||
| padding_type='reflect', | |||
| activation_layer=nn.ReLU, | |||
| up_norm_layer=nn.BatchNorm2d, | |||
| up_activation=nn.ReLU(True), | |||
| init_conv_kwargs={ | |||
| 'ratio_gin': 0, | |||
| 'ratio_gout': 0, | |||
| 'enable_lfu': False | |||
| }, | |||
| downsample_conv_kwargs={ | |||
| 'ratio_gin': 0, | |||
| 'ratio_gout': 0, | |||
| 'enable_lfu': False | |||
| }, | |||
| resnet_conv_kwargs={ | |||
| 'ratio_gin': 0.75, | |||
| 'ratio_gout': 0.75, | |||
| 'enable_lfu': False | |||
| }, | |||
| spatial_transform_layers=None, | |||
| spatial_transform_kwargs={}, | |||
| add_out_act='sigmoid', | |||
| max_features=1024, | |||
| out_ffc=False, | |||
| out_ffc_kwargs={}): | |||
| assert (n_blocks >= 0) | |||
| super().__init__() | |||
| model = [ | |||
| nn.ReflectionPad2d(3), | |||
| FFC_BN_ACT( | |||
| input_nc, | |||
| ngf, | |||
| kernel_size=7, | |||
| padding=0, | |||
| norm_layer=norm_layer, | |||
| activation_layer=activation_layer, | |||
| **init_conv_kwargs) | |||
| ] | |||
| # downsample | |||
| for i in range(n_downsampling): | |||
| mult = 2**i | |||
| if i == n_downsampling - 1: | |||
| cur_conv_kwargs = dict(downsample_conv_kwargs) | |||
| cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get( | |||
| 'ratio_gin', 0) | |||
| else: | |||
| cur_conv_kwargs = downsample_conv_kwargs | |||
| model += [ | |||
| FFC_BN_ACT( | |||
| min(max_features, ngf * mult), | |||
| min(max_features, ngf * mult * 2), | |||
| kernel_size=3, | |||
| stride=2, | |||
| padding=1, | |||
| norm_layer=norm_layer, | |||
| activation_layer=activation_layer, | |||
| **cur_conv_kwargs) | |||
| ] | |||
| mult = 2**n_downsampling | |||
| feats_num_bottleneck = min(max_features, ngf * mult) | |||
| # resnet blocks | |||
| for i in range(n_blocks): | |||
| cur_resblock = FFCResnetBlock( | |||
| feats_num_bottleneck, | |||
| padding_type=padding_type, | |||
| activation_layer=activation_layer, | |||
| norm_layer=norm_layer, | |||
| **resnet_conv_kwargs) | |||
| if spatial_transform_layers is not None and i in spatial_transform_layers: | |||
| cur_resblock = LearnableSpatialTransformWrapper( | |||
| cur_resblock, **spatial_transform_kwargs) | |||
| model += [cur_resblock] | |||
| model += [ConcatTupleLayer()] | |||
| # upsample | |||
| for i in range(n_downsampling): | |||
| mult = 2**(n_downsampling - i) | |||
| model += [ | |||
| nn.ConvTranspose2d( | |||
| min(max_features, ngf * mult), | |||
| min(max_features, int(ngf * mult / 2)), | |||
| kernel_size=3, | |||
| stride=2, | |||
| padding=1, | |||
| output_padding=1), | |||
| up_norm_layer(min(max_features, int(ngf * mult / 2))), | |||
| up_activation | |||
| ] | |||
| if out_ffc: | |||
| model += [ | |||
| FFCResnetBlock( | |||
| ngf, | |||
| padding_type=padding_type, | |||
| activation_layer=activation_layer, | |||
| norm_layer=norm_layer, | |||
| inline=True, | |||
| **out_ffc_kwargs) | |||
| ] | |||
| model += [ | |||
| nn.ReflectionPad2d(3), | |||
| nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0) | |||
| ] | |||
| if add_out_act: | |||
| model.append( | |||
| get_activation('tanh' if add_out_act is True else add_out_act)) | |||
| self.model = nn.Sequential(*model) | |||
| def forward(self, input): | |||
| return self.model(input) | |||
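# Smoke-test sketch for the generator above (assumption: the 4 input channels
# are RGB + mask, matching the default input_nc=4):
import torch
gen = FFCResNetGenerator()
out = gen(torch.randn(1, 4, 256, 256))  # -> (1, 3, 256, 256), sigmoid output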
| @@ -0,0 +1,324 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torchvision import models | |||
| from modelscope.utils.logger import get_logger | |||
| try: | |||
| from torchvision.models.utils import load_state_dict_from_url | |||
| except ImportError: | |||
| from torch.utils.model_zoo import load_url as load_state_dict_from_url | |||
| # Inception weights ported to Pytorch from | |||
| # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz | |||
| FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/' \ | |||
| 'fid_weights/pt_inception-2015-12-05-6726825d.pth' | |||
| LOGGER = get_logger() | |||
| class InceptionV3(nn.Module): | |||
| """Pretrained InceptionV3 network returning feature maps""" | |||
| # Index of default block of inception to return, | |||
| # corresponds to output of final average pooling | |||
| DEFAULT_BLOCK_INDEX = 3 | |||
| # Maps feature dimensionality to their output blocks indices | |||
| BLOCK_INDEX_BY_DIM = { | |||
| 64: 0, # First max pooling features | |||
| 192: 1, # Second max pooling features | |||
| 768: 2, # Pre-aux classifier features | |||
| 2048: 3 # Final average pooling features | |||
| } | |||
| def __init__(self, | |||
| output_blocks=[DEFAULT_BLOCK_INDEX], | |||
| resize_input=True, | |||
| normalize_input=True, | |||
| requires_grad=False, | |||
| use_fid_inception=True): | |||
| """Build pretrained InceptionV3 | |||
| Parameters | |||
| ---------- | |||
| output_blocks : list of int | |||
| Indices of blocks to return features of. Possible values are: | |||
| - 0: corresponds to output of first max pooling | |||
| - 1: corresponds to output of second max pooling | |||
| - 2: corresponds to output which is fed to aux classifier | |||
| - 3: corresponds to output of final average pooling | |||
| resize_input : bool | |||
| If true, bilinearly resizes input to width and height 299 before | |||
| feeding input to model. As the network without fully connected | |||
| layers is fully convolutional, it should be able to handle inputs | |||
| of arbitrary size, so resizing might not be strictly needed | |||
| normalize_input : bool | |||
| If true, scales the input from range (0, 1) to the range the | |||
| pretrained Inception network expects, namely (-1, 1) | |||
| requires_grad : bool | |||
| If true, parameters of the model require gradients. Possibly useful | |||
| for finetuning the network | |||
| use_fid_inception : bool | |||
| If true, uses the pretrained Inception model used in Tensorflow's | |||
| FID implementation. If false, uses the pretrained Inception model | |||
| available in torchvision. The FID Inception model has different | |||
| weights and a slightly different structure from torchvision's | |||
| Inception model. If you want to compute FID scores, you are | |||
| strongly advised to set this parameter to true to get comparable | |||
| results. | |||
| """ | |||
| super(InceptionV3, self).__init__() | |||
| self.resize_input = resize_input | |||
| self.normalize_input = normalize_input | |||
| self.output_blocks = sorted(output_blocks) | |||
| self.last_needed_block = max(output_blocks) | |||
| assert self.last_needed_block <= 3, \ | |||
| 'Last possible output block index is 3' | |||
| self.blocks = nn.ModuleList() | |||
| if use_fid_inception: | |||
| inception = fid_inception_v3() | |||
| else: | |||
| inception = models.inception_v3(pretrained=True) | |||
| # Block 0: input to maxpool1 | |||
| block0 = [ | |||
| inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3, | |||
| inception.Conv2d_2b_3x3, | |||
| nn.MaxPool2d(kernel_size=3, stride=2) | |||
| ] | |||
| self.blocks.append(nn.Sequential(*block0)) | |||
| # Block 1: maxpool1 to maxpool2 | |||
| if self.last_needed_block >= 1: | |||
| block1 = [ | |||
| inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3, | |||
| nn.MaxPool2d(kernel_size=3, stride=2) | |||
| ] | |||
| self.blocks.append(nn.Sequential(*block1)) | |||
| # Block 2: maxpool2 to aux classifier | |||
| if self.last_needed_block >= 2: | |||
| block2 = [ | |||
| inception.Mixed_5b, | |||
| inception.Mixed_5c, | |||
| inception.Mixed_5d, | |||
| inception.Mixed_6a, | |||
| inception.Mixed_6b, | |||
| inception.Mixed_6c, | |||
| inception.Mixed_6d, | |||
| inception.Mixed_6e, | |||
| ] | |||
| self.blocks.append(nn.Sequential(*block2)) | |||
| # Block 3: aux classifier to final avgpool | |||
| if self.last_needed_block >= 3: | |||
| block3 = [ | |||
| inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c, | |||
| nn.AdaptiveAvgPool2d(output_size=(1, 1)) | |||
| ] | |||
| self.blocks.append(nn.Sequential(*block3)) | |||
| for param in self.parameters(): | |||
| param.requires_grad = requires_grad | |||
| def forward(self, inp): | |||
| """Get Inception feature maps | |||
| Parameters | |||
| ---------- | |||
| inp : torch.autograd.Variable | |||
| Input tensor of shape Bx3xHxW. Values are expected to be in | |||
| range (0, 1) | |||
| Returns | |||
| ------- | |||
| List of torch.autograd.Variable, corresponding to the selected output | |||
| block, sorted ascending by index | |||
| """ | |||
| outp = [] | |||
| x = inp | |||
| if self.resize_input: | |||
| x = F.interpolate( | |||
| x, size=(299, 299), mode='bilinear', align_corners=False) | |||
| if self.normalize_input: | |||
| x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) | |||
| for idx, block in enumerate(self.blocks): | |||
| x = block(x) | |||
| if idx in self.output_blocks: | |||
| outp.append(x) | |||
| if idx == self.last_needed_block: | |||
| break | |||
| return outp | |||
| def fid_inception_v3(): | |||
| """Build pretrained Inception model for FID computation | |||
| The Inception model for FID computation uses a different set of weights | |||
| and has a slightly different structure than torchvision's Inception. | |||
| This method first constructs torchvision's Inception and then patches the | |||
| necessary parts that are different in the FID Inception model. | |||
| """ | |||
| LOGGER.info('fid_inception_v3 called') | |||
| inception = models.inception_v3( | |||
| num_classes=1008, aux_logits=False, pretrained=False) | |||
| LOGGER.info('models.inception_v3 done') | |||
| inception.Mixed_5b = FIDInceptionA(192, pool_features=32) | |||
| inception.Mixed_5c = FIDInceptionA(256, pool_features=64) | |||
| inception.Mixed_5d = FIDInceptionA(288, pool_features=64) | |||
| inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128) | |||
| inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160) | |||
| inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160) | |||
| inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192) | |||
| inception.Mixed_7b = FIDInceptionE_1(1280) | |||
| inception.Mixed_7c = FIDInceptionE_2(2048) | |||
| LOGGER.info('fid_inception_v3 patching done') | |||
| state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True) | |||
| LOGGER.info('fid_inception_v3 weights downloaded') | |||
| inception.load_state_dict(state_dict) | |||
| LOGGER.info('fid_inception_v3 weights loaded into model') | |||
| return inception | |||
| class FIDInceptionA(models.inception.InceptionA): | |||
| """InceptionA block patched for FID computation""" | |||
| def __init__(self, in_channels, pool_features): | |||
| super(FIDInceptionA, self).__init__(in_channels, pool_features) | |||
| def forward(self, x): | |||
| branch1x1 = self.branch1x1(x) | |||
| branch5x5 = self.branch5x5_1(x) | |||
| branch5x5 = self.branch5x5_2(branch5x5) | |||
| branch3x3dbl = self.branch3x3dbl_1(x) | |||
| branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) | |||
| branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) | |||
| # Patch: Tensorflow's average pool does not use the padded zeros in | |||
| # its average calculation | |||
| branch_pool = F.avg_pool2d( | |||
| x, kernel_size=3, stride=1, padding=1, count_include_pad=False) | |||
| branch_pool = self.branch_pool(branch_pool) | |||
| outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] | |||
| return torch.cat(outputs, 1) | |||
| class FIDInceptionC(models.inception.InceptionC): | |||
| """InceptionC block patched for FID computation""" | |||
| def __init__(self, in_channels, channels_7x7): | |||
| super(FIDInceptionC, self).__init__(in_channels, channels_7x7) | |||
| def forward(self, x): | |||
| branch1x1 = self.branch1x1(x) | |||
| branch7x7 = self.branch7x7_1(x) | |||
| branch7x7 = self.branch7x7_2(branch7x7) | |||
| branch7x7 = self.branch7x7_3(branch7x7) | |||
| branch7x7dbl = self.branch7x7dbl_1(x) | |||
| branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) | |||
| branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) | |||
| branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) | |||
| branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) | |||
| # Patch: Tensorflow's average pool does not use the padded zeros in | |||
| # its average calculation | |||
| branch_pool = F.avg_pool2d( | |||
| x, kernel_size=3, stride=1, padding=1, count_include_pad=False) | |||
| branch_pool = self.branch_pool(branch_pool) | |||
| outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] | |||
| return torch.cat(outputs, 1) | |||
| class FIDInceptionE_1(models.inception.InceptionE): | |||
| """First InceptionE block patched for FID computation""" | |||
| def __init__(self, in_channels): | |||
| super(FIDInceptionE_1, self).__init__(in_channels) | |||
| def forward(self, x): | |||
| branch1x1 = self.branch1x1(x) | |||
| branch3x3 = self.branch3x3_1(x) | |||
| branch3x3 = [ | |||
| self.branch3x3_2a(branch3x3), | |||
| self.branch3x3_2b(branch3x3), | |||
| ] | |||
| branch3x3 = torch.cat(branch3x3, 1) | |||
| branch3x3dbl = self.branch3x3dbl_1(x) | |||
| branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) | |||
| branch3x3dbl = [ | |||
| self.branch3x3dbl_3a(branch3x3dbl), | |||
| self.branch3x3dbl_3b(branch3x3dbl), | |||
| ] | |||
| branch3x3dbl = torch.cat(branch3x3dbl, 1) | |||
| # Patch: Tensorflow's average pool does not use the padded zeros in | |||
| # its average calculation | |||
| branch_pool = F.avg_pool2d( | |||
| x, kernel_size=3, stride=1, padding=1, count_include_pad=False) | |||
| branch_pool = self.branch_pool(branch_pool) | |||
| outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] | |||
| return torch.cat(outputs, 1) | |||
| class FIDInceptionE_2(models.inception.InceptionE): | |||
| """Second InceptionE block patched for FID computation""" | |||
| def __init__(self, in_channels): | |||
| super(FIDInceptionE_2, self).__init__(in_channels) | |||
| def forward(self, x): | |||
| branch1x1 = self.branch1x1(x) | |||
| branch3x3 = self.branch3x3_1(x) | |||
| branch3x3 = [ | |||
| self.branch3x3_2a(branch3x3), | |||
| self.branch3x3_2b(branch3x3), | |||
| ] | |||
| branch3x3 = torch.cat(branch3x3, 1) | |||
| branch3x3dbl = self.branch3x3dbl_1(x) | |||
| branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) | |||
| branch3x3dbl = [ | |||
| self.branch3x3dbl_3a(branch3x3dbl), | |||
| self.branch3x3dbl_3b(branch3x3dbl), | |||
| ] | |||
| branch3x3dbl = torch.cat(branch3x3dbl, 1) | |||
| # Patch: The FID Inception model uses max pooling instead of average | |||
| # pooling. This is likely an error in this specific Inception | |||
| # implementation, as other Inception models use average pooling here | |||
| # (which matches the description in the paper). | |||
| branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) | |||
| branch_pool = self.branch_pool(branch_pool) | |||
| outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] | |||
| return torch.cat(outputs, 1) | |||
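# Sketch of the intended FID usage (illustrative; fetches the ported weights
# on first call): extract the 2048-d final-pool features for a batch of
# images in [0, 1].
import torch
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]
model = InceptionV3(output_blocks=[block_idx]).eval()
feats = model(torch.rand(2, 3, 299, 299))[0]  # -> (2, 2048, 1, 1)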
| @@ -0,0 +1,47 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import torchvision | |||
| from .ade20k import ModelBuilder | |||
| IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None] | |||
| IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None] | |||
| class ResNetPL(nn.Module): | |||
| def __init__(self, | |||
| weight=1, | |||
| weights_path=None, | |||
| arch_encoder='resnet50dilated', | |||
| segmentation=True): | |||
| super().__init__() | |||
| self.impl = ModelBuilder.get_encoder( | |||
| weights_path=weights_path, | |||
| arch_encoder=arch_encoder, | |||
| arch_decoder='ppm_deepsup', | |||
| fc_dim=2048, | |||
| segmentation=segmentation) | |||
| self.impl.eval() | |||
| for w in self.impl.parameters(): | |||
| w.requires_grad_(False) | |||
| self.weight = weight | |||
| def forward(self, pred, target): | |||
| pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred) | |||
| target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target) | |||
| pred_feats = self.impl(pred, return_feature_maps=True) | |||
| target_feats = self.impl(target, return_feature_maps=True) | |||
| result = torch.stack([ | |||
| F.mse_loss(cur_pred, cur_target) | |||
| for cur_pred, cur_target in zip(pred_feats, target_feats) | |||
| ]).sum() * self.weight | |||
| return result | |||
| @@ -0,0 +1,75 @@ | |||
| """ | |||
| The implementation is adopted from | |||
| https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py | |||
| """ | |||
| import collections | |||
| import functools | |||
| import logging | |||
| from collections import defaultdict | |||
| from functools import partial | |||
| import numpy as np | |||
| import torch.nn as nn | |||
| # Defines the PatchGAN discriminator with the specified arguments. | |||
| class NLayerDiscriminator(nn.Module): | |||
| def __init__( | |||
| self, | |||
| input_nc=3, | |||
| ndf=64, | |||
| n_layers=4, | |||
| norm_layer=nn.BatchNorm2d, | |||
| ): | |||
| super().__init__() | |||
| self.n_layers = n_layers | |||
| kw = 4 | |||
| padw = int(np.ceil((kw - 1.0) / 2)) | |||
| sequence = [[ | |||
| nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), | |||
| nn.LeakyReLU(0.2, True) | |||
| ]] | |||
| nf = ndf | |||
| for n in range(1, n_layers): | |||
| nf_prev = nf | |||
| nf = min(nf * 2, 512) | |||
| cur_model = [] | |||
| cur_model += [ | |||
| nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw), | |||
| norm_layer(nf), | |||
| nn.LeakyReLU(0.2, True) | |||
| ] | |||
| sequence.append(cur_model) | |||
| nf_prev = nf | |||
| nf = min(nf * 2, 512) | |||
| cur_model = [] | |||
| cur_model += [ | |||
| nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw), | |||
| norm_layer(nf), | |||
| nn.LeakyReLU(0.2, True) | |||
| ] | |||
| sequence.append(cur_model) | |||
| sequence += [[ | |||
| nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw) | |||
| ]] | |||
| for n in range(len(sequence)): | |||
| setattr(self, 'model' + str(n), nn.Sequential(*sequence[n])) | |||
| def get_all_activations(self, x): | |||
| res = [x] | |||
| for n in range(self.n_layers + 2): | |||
| model = getattr(self, 'model' + str(n)) | |||
| res.append(model(res[-1])) | |||
| return res[1:] | |||
| def forward(self, x): | |||
| act = self.get_all_activations(x) | |||
| return act[-1], act[:-1] | |||
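# Usage sketch (illustrative): the discriminator returns the final patch
# logits plus every intermediate activation, which LaMa consumes for its
# feature-matching loss.
import torch
disc = NLayerDiscriminator(input_nc=3)
logits, feats = disc(torch.randn(1, 3, 256, 256))
# logits: (1, 1, h', w') patch map; feats: list of n_layers + 1 feature maps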
| @@ -0,0 +1,393 @@ | |||
| ''' | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| ''' | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| from kornia.filters import gaussian_blur2d | |||
| from kornia.geometry.transform import resize | |||
| from kornia.morphology import erosion | |||
| from torch.nn import functional as F | |||
| from torch.optim import SGD, Adam | |||
| from tqdm import tqdm | |||
| from .modules.ffc import FFCResnetBlock | |||
| def move_to_device(obj, device): | |||
| if isinstance(obj, nn.Module): | |||
| return obj.to(device) | |||
| if torch.is_tensor(obj): | |||
| return obj.to(device) | |||
| if isinstance(obj, (tuple, list)): | |||
| return [move_to_device(el, device) for el in obj] | |||
| if isinstance(obj, dict): | |||
| return {name: move_to_device(val, device) for name, val in obj.items()} | |||
| raise ValueError(f'Unexpected type {type(obj)}') | |||
| def ceil_modulo(x, mod): | |||
| if x % mod == 0: | |||
| return x | |||
| return (x // mod + 1) * mod | |||
| def pad_tensor_to_modulo(img, mod): | |||
| batch_size, channels, height, width = img.shape | |||
| out_height = ceil_modulo(height, mod) | |||
| out_width = ceil_modulo(width, mod) | |||
| return F.pad( | |||
| img, | |||
| pad=(0, out_width - width, 0, out_height - height), | |||
| mode='reflect') | |||
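# Sketch (assumed toy size): pad a 250x333 image so both sides are divisible
# by 8; the padding is reflected on the bottom/right only, matching the crop
# performed later in refine_predict.
import torch
y = pad_tensor_to_modulo(torch.rand(1, 3, 250, 333), 8)  # -> (1, 3, 256, 336)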
| def _pyrdown(im: torch.Tensor, downsize: tuple = None): | |||
| """downscale the image""" | |||
| if downsize is None: | |||
| downsize = (im.shape[2] // 2, im.shape[3] // 2) | |||
| assert im.shape[1] == 3, 'Expected shape for the input to be (n, 3, height, width)' | |||
| im = gaussian_blur2d(im, kernel_size=(5, 5), sigma=(1.0, 1.0)) | |||
| im = F.interpolate(im, size=downsize, mode='bilinear', align_corners=False) | |||
| return im | |||
| def _pyrdown_mask(mask: torch.Tensor, | |||
| downsize: tuple = None, | |||
| eps: float = 1e-8, | |||
| blur_mask: bool = True, | |||
| round_up: bool = True): | |||
| """downscale the mask tensor | |||
| Parameters | |||
| ---------- | |||
| mask : torch.Tensor | |||
| mask of size (B, 1, H, W) | |||
| downsize : tuple, optional | |||
| size to downscale to. If None, image is downscaled to half, by default None | |||
| eps : float, optional | |||
| threshold value for binarizing the mask, by default 1e-8 | |||
| blur_mask : bool, optional | |||
| if True, apply gaussian filter before downscaling, by default True | |||
| round_up : bool, optional | |||
| if True, values above eps are rounded up to 1; otherwise, values below 1 - eps are rounded down to 0, by default True | |||
| Returns | |||
| ------- | |||
| torch.Tensor | |||
| downscaled mask | |||
| """ | |||
| if downsize is None: | |||
| downsize = (mask.shape[2] // 2, mask.shape[3] // 2) | |||
| assert mask.shape[1] == 1, 'Expected shape for the input to be (n, 1, height, width)' | |||
| if blur_mask is True: | |||
| mask = gaussian_blur2d(mask, kernel_size=(5, 5), sigma=(1.0, 1.0)) | |||
| mask = F.interpolate( | |||
| mask, size=downsize, mode='bilinear', align_corners=False) | |||
| else: | |||
| mask = F.interpolate( | |||
| mask, size=downsize, mode='bilinear', align_corners=False) | |||
| if round_up: | |||
| mask[mask >= eps] = 1 | |||
| mask[mask < eps] = 0 | |||
| else: | |||
| mask[mask >= 1.0 - eps] = 1 | |||
| mask[mask < 1.0 - eps] = 0 | |||
| return mask | |||
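# Sketch (assumed toy mask): halving a hard 0/1 mask with round_up=True is
# conservative, so any downscaled pixel touching a hole stays marked as hole.
import torch
m = torch.zeros(1, 1, 8, 8)
m[..., 2:6, 2:6] = 1.0
m_small = _pyrdown_mask(m)  # -> (1, 1, 4, 4), still strictly binary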
| def _erode_mask(mask: torch.Tensor, | |||
| ekernel: torch.Tensor = None, | |||
| eps: float = 1e-8): | |||
| """erode the mask, and set gray pixels to 0""" | |||
| if ekernel is not None: | |||
| mask = erosion(mask, ekernel) | |||
| mask[mask >= 1.0 - eps] = 1 | |||
| mask[mask < 1.0 - eps] = 0 | |||
| return mask | |||
| def _l1_loss(pred: torch.Tensor, | |||
| pred_downscaled: torch.Tensor, | |||
| ref: torch.Tensor, | |||
| mask: torch.Tensor, | |||
| mask_downscaled: torch.Tensor, | |||
| image: torch.Tensor, | |||
| on_pred: bool = True): | |||
| """l1 loss on src pixels, and downscaled predictions if on_pred=True""" | |||
| loss = torch.mean(torch.abs(pred[mask < 1e-8] - image[mask < 1e-8])) | |||
| if on_pred: | |||
| loss += torch.mean( | |||
| torch.abs(pred_downscaled[mask_downscaled >= 1e-8] | |||
| - ref[mask_downscaled >= 1e-8])) | |||
| return loss | |||
| def _infer(image: torch.Tensor, | |||
| mask: torch.Tensor, | |||
| forward_front: nn.Module, | |||
| forward_rears: nn.Module, | |||
| ref_lower_res: torch.Tensor, | |||
| orig_shape: tuple, | |||
| devices: list, | |||
| scale_ind: int, | |||
| n_iters: int = 15, | |||
| lr: float = 0.002): | |||
| """Performs inference with refinement at a given scale. | |||
| Parameters | |||
| ---------- | |||
| image : torch.Tensor | |||
| input image to be inpainted, of size (1,3,H,W) | |||
| mask : torch.Tensor | |||
| input inpainting mask, of size (1,1,H,W) | |||
| forward_front : nn.Module | |||
| the front part of the inpainting network | |||
| forward_rears : nn.Module | |||
| the rear part of the inpainting network | |||
| ref_lower_res : torch.Tensor | |||
| the inpainting at previous scale, used as reference image | |||
| orig_shape : tuple | |||
| shape of the original input image before padding | |||
| devices : list | |||
| list of available devices | |||
| scale_ind : int | |||
| the scale index | |||
| n_iters : int, optional | |||
| number of iterations of refinement, by default 15 | |||
| lr : float, optional | |||
| learning rate, by default 0.002 | |||
| Returns | |||
| ------- | |||
| torch.Tensor | |||
| inpainted image | |||
| """ | |||
| masked_image = image * (1 - mask) | |||
| masked_image = torch.cat([masked_image, mask], dim=1) | |||
| mask = mask.repeat(1, 3, 1, 1) | |||
| if ref_lower_res is not None: | |||
| ref_lower_res = ref_lower_res.detach() | |||
| with torch.no_grad(): | |||
| z1, z2 = forward_front(masked_image) | |||
| # Inference | |||
| mask = mask.to(devices[-1]) | |||
| ekernel = torch.from_numpy( | |||
| cv2.getStructuringElement(cv2.MORPH_ELLIPSE, | |||
| (15, 15)).astype(bool)).float() | |||
| ekernel = ekernel.to(devices[-1]) | |||
| image = image.to(devices[-1]) | |||
| z1, z2 = z1.detach().to(devices[0]), z2.detach().to(devices[0]) | |||
| z1.requires_grad, z2.requires_grad = True, True | |||
| optimizer = Adam([z1, z2], lr=lr) | |||
| pbar = tqdm(range(n_iters), leave=False) | |||
| for idi in pbar: | |||
| optimizer.zero_grad() | |||
| input_feat = (z1, z2) | |||
| for idd, forward_rear in enumerate(forward_rears): | |||
| output_feat = forward_rear(input_feat) | |||
| if idd < len(devices) - 1: | |||
| midz1, midz2 = output_feat | |||
| midz1, midz2 = midz1.to(devices[idd + 1]), midz2.to( | |||
| devices[idd + 1]) | |||
| input_feat = (midz1, midz2) | |||
| else: | |||
| pred = output_feat | |||
| if ref_lower_res is None: | |||
| break | |||
| losses = {} | |||
| # scaled loss with downsampler | |||
| pred_downscaled = _pyrdown(pred[:, :, :orig_shape[0], :orig_shape[1]]) | |||
| mask_downscaled = _pyrdown_mask( | |||
| mask[:, :1, :orig_shape[0], :orig_shape[1]], | |||
| blur_mask=False, | |||
| round_up=False) | |||
| mask_downscaled = _erode_mask(mask_downscaled, ekernel=ekernel) | |||
| mask_downscaled = mask_downscaled.repeat(1, 3, 1, 1) | |||
| losses['ms_l1'] = _l1_loss( | |||
| pred, | |||
| pred_downscaled, | |||
| ref_lower_res, | |||
| mask, | |||
| mask_downscaled, | |||
| image, | |||
| on_pred=True) | |||
| loss = sum(losses.values()) | |||
| pbar.set_description( | |||
| 'Refining scale {} using scale {} ...current loss: {:.4f}'.format( | |||
| scale_ind + 1, scale_ind, loss.item())) | |||
| if idi < n_iters - 1: | |||
| loss.backward() | |||
| optimizer.step() | |||
| del pred_downscaled | |||
| del loss | |||
| del pred | |||
| # "pred" is the prediction after Plug-n-Play module | |||
| inpainted = mask * pred + (1 - mask) * image | |||
| inpainted = inpainted.detach().cpu() | |||
| return inpainted | |||
| def _get_image_mask_pyramid(batch: dict, min_side: int, max_scales: int, | |||
| px_budget: int): | |||
| """Build the image mask pyramid | |||
| Parameters | |||
| ---------- | |||
| batch : dict | |||
| batch containing image, mask, etc | |||
| min_side : int | |||
| minimum side length to limit the number of scales of the pyramid | |||
| max_scales : int | |||
| maximum number of scales allowed | |||
| px_budget : int | |||
| the product H*W cannot exceed this budget, because of resource constraints | |||
| Returns | |||
| ------- | |||
| tuple | |||
| image-mask pyramid in the form of list of images and list of masks | |||
| """ | |||
| assert batch['image'].shape[0] == 1, 'refiner only works on batches of size 1!' | |||
| h, w = batch['unpad_to_size'] | |||
| h, w = h[0].item(), w[0].item() | |||
| image = batch['image'][..., :h, :w] | |||
| mask = batch['mask'][..., :h, :w] | |||
| if h * w > px_budget: | |||
| # resize | |||
| ratio = np.sqrt(px_budget / float(h * w)) | |||
| h_orig, w_orig = h, w | |||
| h, w = int(h * ratio), int(w * ratio) | |||
| print(f'Original image too large for refinement! ' | |||
| f'Resizing {(h_orig, w_orig)} to {(h, w)}...') | |||
| image = resize( | |||
| image, (h, w), interpolation='bilinear', align_corners=False) | |||
| mask = resize( | |||
| mask, (h, w), interpolation='bilinear', align_corners=False) | |||
| mask[mask > 1e-8] = 1 | |||
| breadth = min(h, w) | |||
| n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))), | |||
| max_scales) | |||
| ls_images = [] | |||
| ls_masks = [] | |||
| ls_images.append(image) | |||
| ls_masks.append(mask) | |||
| for _ in range(n_scales - 1): | |||
| image_p = _pyrdown(ls_images[-1]) | |||
| mask_p = _pyrdown_mask(ls_masks[-1]) | |||
| ls_images.append(image_p) | |||
| ls_masks.append(mask_p) | |||
| # reverse the lists because we want the lowest resolution image as index 0 | |||
| return ls_images[::-1], ls_masks[::-1] | |||
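# Scale-count sketch for the pyramid above (illustrative numbers): a
# 2048x1536 input with min_side=512 yields three scales before hitting the
# max_scales cap.
import numpy as np
breadth, min_side, max_scales = 1536, 512, 8
n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))), max_scales)  # -> 3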
| def refine_predict(batch: dict, inpainter: nn.Module, gpu_ids: str, | |||
| modulo: int, n_iters: int, lr: float, min_side: int, | |||
| max_scales: int, px_budget: int): | |||
| """Refines the inpainting of the network | |||
| Parameters | |||
| ---------- | |||
| batch : dict | |||
| image-mask batch, currently we assume the batchsize to be 1 | |||
| inpainter : nn.Module | |||
| the inpainting neural network | |||
| gpu_ids : str | |||
| the GPU ids of the machine to use. If only single GPU, use: "0," | |||
| modulo : int | |||
| pad the image to ensure dimension % modulo == 0 | |||
| n_iters : int | |||
| number of iterations of refinement for each scale | |||
| lr : float | |||
| learning rate | |||
| min_side : int | |||
| all sides of image on all scales should be >= min_side / sqrt(2) | |||
| max_scales : int | |||
| max number of downscaling scales for the image-mask pyramid | |||
| px_budget : int | |||
| pixels budget. Any image will be resized to satisfy height*width <= px_budget | |||
| Returns | |||
| ------- | |||
| torch.Tensor | |||
| inpainted image of size (1,3,H,W) | |||
| """ | |||
| inpainter = inpainter.model | |||
| assert not inpainter.training | |||
| assert not inpainter.add_noise_kwargs | |||
| assert inpainter.concat_mask | |||
| gpu_ids = [ | |||
| f'cuda:{gpuid}' for gpuid in gpu_ids.replace(' ', '').split(',') | |||
| if gpuid.isdigit() | |||
| ] | |||
| n_resnet_blocks = 0 | |||
| first_resblock_ind = 0 | |||
| found_first_resblock = False | |||
| for idl in range(len(inpainter.generator.model)): | |||
| if isinstance(inpainter.generator.model[idl], FFCResnetBlock): | |||
| n_resnet_blocks += 1 | |||
| found_first_resblock = True | |||
| elif not found_first_resblock: | |||
| first_resblock_ind += 1 | |||
| resblocks_per_gpu = n_resnet_blocks // len(gpu_ids) | |||
| devices = [torch.device(gpu_id) for gpu_id in gpu_ids] | |||
| # split the model into front, and rear parts | |||
| forward_front = inpainter.generator.model[0:first_resblock_ind] | |||
| forward_front.to(devices[0]) | |||
| forward_rears = [] | |||
| for idd in range(len(gpu_ids)): | |||
| if idd < len(gpu_ids) - 1: | |||
| forward_rears.append( | |||
| inpainter.generator.model[first_resblock_ind | |||
| + resblocks_per_gpu | |||
| * (idd):first_resblock_ind | |||
| + resblocks_per_gpu * (idd + 1)]) | |||
| else: | |||
| forward_rears.append( | |||
| inpainter.generator.model[first_resblock_ind | |||
| + resblocks_per_gpu * (idd):]) | |||
| forward_rears[idd].to(devices[idd]) | |||
| ls_images, ls_masks = _get_image_mask_pyramid(batch, min_side, max_scales, | |||
| px_budget) | |||
| image_inpainted = None | |||
| for ids, (image, mask) in enumerate(zip(ls_images, ls_masks)): | |||
| orig_shape = image.shape[2:] | |||
| image = pad_tensor_to_modulo(image, modulo) | |||
| mask = pad_tensor_to_modulo(mask, modulo) | |||
| mask[mask >= 1e-8] = 1.0 | |||
| mask[mask < 1e-8] = 0.0 | |||
| image, mask = move_to_device(image, devices[0]), move_to_device( | |||
| mask, devices[0]) | |||
| if image_inpainted is not None: | |||
| image_inpainted = move_to_device(image_inpainted, devices[-1]) | |||
| image_inpainted = _infer(image, mask, forward_front, forward_rears, | |||
| image_inpainted, orig_shape, devices, ids, | |||
| n_iters, lr) | |||
| image_inpainted = image_inpainted[:, :, :orig_shape[0], :orig_shape[1]] | |||
| # detach everything to save resources | |||
| image = image.detach().cpu() | |||
| mask = mask.detach().cpu() | |||
| return image_inpainted | |||
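# Call sketch for the refiner (hyper-parameter values follow the LaMa
# refinement defaults; `batch` and `inpainter` are assumed to be the loaded
# image-mask batch and inpainting model wrapper, so this is not runnable
# as-is):
# refined = refine_predict(batch, inpainter, gpu_ids='0,', modulo=8,
#                          n_iters=15, lr=0.002, min_side=512,
#                          max_scales=3, px_budget=1800000)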
| @@ -10,7 +10,7 @@ if TYPE_CHECKING: | |||
| else: | |||
| _import_structure = { | |||
| 'mmdet_model': ['DetectionModel'], | |||
| 'yolox_pai': ['YOLOX'] | |||
| 'yolox_pai': ['YOLOX'], | |||
| } | |||
| import sys | |||
| @@ -9,6 +9,9 @@ from modelscope.utils.constant import Tasks | |||
| @MODELS.register_module( | |||
| group_key=Tasks.image_object_detection, module_name=Models.yolox) | |||
| @MODELS.register_module( | |||
| group_key=Tasks.image_object_detection, | |||
| module_name=Models.image_object_detection_auto) | |||
| class YOLOX(EasyCVBaseModel, _YOLOX): | |||
| def __init__(self, model_dir=None, *args, **kwargs): | |||
| @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .realtime_detector import RealtimeDetector | |||
| from .realtime_video_detector import RealtimeVideoDetector | |||
| else: | |||
| _import_structure = { | |||
| 'realtime_detector': ['RealtimeDetector'], | |||
| 'realtime_video_detector': ['RealtimeVideoDetector'], | |||
| } | |||
| import sys | |||
| @@ -0,0 +1,117 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import argparse | |||
| import logging as logger | |||
| import os | |||
| import os.path as osp | |||
| import time | |||
| import cv2 | |||
| import json | |||
| import torch | |||
| from tqdm import tqdm | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base.base_torch_model import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.preprocessors import LoadImage | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from .yolox.data.data_augment import ValTransform | |||
| from .yolox.exp import get_exp_by_name | |||
| from .yolox.utils import postprocess | |||
| @MODELS.register_module( | |||
| group_key=Tasks.video_object_detection, | |||
| module_name=Models.realtime_video_object_detection) | |||
| class RealtimeVideoDetector(TorchModel): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| self.config = Config.from_file( | |||
| os.path.join(self.model_dir, ModelFile.CONFIGURATION)) | |||
| # model type | |||
| self.exp = get_exp_by_name(self.config.model_type) | |||
| # build model | |||
| self.model = self.exp.get_model() | |||
| model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) | |||
| ckpt = torch.load(model_path, map_location='cpu') | |||
| # load the model state dict | |||
| self.model.load_state_dict(ckpt['model']) | |||
| self.model.eval() | |||
| # params setting | |||
| self.exp.num_classes = self.config.num_classes | |||
| self.confthre = self.config.conf_thr | |||
| self.num_classes = self.exp.num_classes | |||
| self.nmsthre = self.exp.nmsthre | |||
| self.test_size = self.exp.test_size | |||
| self.preproc = ValTransform(legacy=False) | |||
| self.current_buffer = None | |||
| self.label_mapping = self.config['labels'] | |||
| def inference(self, img): | |||
| with torch.no_grad(): | |||
| outputs, self.current_buffer = self.model( | |||
| img, buffer=self.current_buffer, mode='on_pipe') | |||
| return outputs | |||
| def forward(self, inputs): | |||
| return self.inference_video(inputs) | |||
| def preprocess(self, img): | |||
| img = LoadImage.convert_to_ndarray(img) | |||
| height, width = img.shape[:2] | |||
| self.ratio = min(self.test_size[0] / img.shape[0], | |||
| self.test_size[1] / img.shape[1]) | |||
| img, _ = self.preproc(img, None, self.test_size) | |||
| img = torch.from_numpy(img).unsqueeze(0) | |||
| img = img.float() | |||
| # Automatic video decoding and preprocessing are not supported by Pipeline/Model, | |||
| # so the preprocessed frame tensor is moved to the model's device here | |||
| if next(self.model.parameters()).is_cuda: | |||
| img = img.to(next(self.model.parameters()).device) | |||
| return img | |||
| def postprocess(self, input): | |||
| outputs = postprocess( | |||
| input, | |||
| self.num_classes, | |||
| self.confthre, | |||
| self.nmsthre, | |||
| class_agnostic=True) | |||
| if len(outputs) == 1: | |||
| bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio | |||
| scores = outputs[0][:, 5].cpu().numpy() | |||
| labels = outputs[0][:, 6].cpu().int().numpy() | |||
| pred_label_names = [] | |||
| for lab in labels: | |||
| pred_label_names.append(self.label_mapping[lab]) | |||
| return bboxes, scores, pred_label_names | |||
| def inference_video(self, v_path): | |||
| outputs = [] | |||
| desc = 'Detecting video: {}'.format(v_path) | |||
| for frame, result in tqdm( | |||
| self.inference_video_iter(v_path), desc=desc): | |||
| outputs.append(result) | |||
| return outputs | |||
| def inference_video_iter(self, v_path): | |||
| capture = cv2.VideoCapture(v_path) | |||
| while capture.isOpened(): | |||
| ret, frame = capture.read() | |||
| if not ret: | |||
| break | |||
| output = self.preprocess(frame) | |||
| output = self.inference(output) | |||
| output = self.postprocess(output) | |||
| yield frame, output | |||
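# Usage sketch (illustrative; '<model_dir>' is a placeholder for a downloaded
# model directory): each element of `outputs` is a (bboxes, scores,
# label_names) tuple for one decoded frame.
# detector = RealtimeVideoDetector('<model_dir>')
# outputs = detector('input_video.mp4')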
| @@ -13,6 +13,8 @@ def get_exp_by_name(exp_name): | |||
| from .default import YoloXNanoExp as YoloXExp | |||
| elif exp == 'yolox_tiny': | |||
| from .default import YoloXTinyExp as YoloXExp | |||
| elif exp == 'streamyolo': | |||
| from .default import StreamYoloExp as YoloXExp | |||
| else: | |||
| pass | |||
| return YoloXExp() | |||
| @@ -1,5 +1,5 @@ | |||
| # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | |||
| from .streamyolo import StreamYoloExp | |||
| from .yolox_nano import YoloXNanoExp | |||
| from .yolox_s import YoloXSExp | |||
| from .yolox_tiny import YoloXTinyExp | |||
| @@ -0,0 +1,43 @@ | |||
| # The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO | |||
| import os | |||
| import sys | |||
| import torch | |||
| import torch.nn as nn | |||
| from ..yolox_base import Exp as YoloXExp | |||
| class StreamYoloExp(YoloXExp): | |||
| def __init__(self): | |||
| super(StreamYoloExp, self).__init__() | |||
| self.depth = 1.0 | |||
| self.width = 1.0 | |||
| self.num_classes = 8 | |||
| self.test_size = (600, 960) | |||
| self.test_conf = 0.3 | |||
| self.nmsthre = 0.65 | |||
| def get_model(self): | |||
| from ...models import StreamYOLO, DFPPAFPN, TALHead | |||
| def init_yolo(M): | |||
| for m in M.modules(): | |||
| if isinstance(m, nn.BatchNorm2d): | |||
| m.eps = 1e-3 | |||
| m.momentum = 0.03 | |||
| if getattr(self, 'model', None) is None: | |||
| in_channels = [256, 512, 1024] | |||
| backbone = DFPPAFPN( | |||
| self.depth, self.width, in_channels=in_channels) | |||
| head = TALHead( | |||
| self.num_classes, | |||
| self.width, | |||
| in_channels=in_channels, | |||
| gamma=1.0, | |||
| ignore_thr=0.5, | |||
| ignore_value=1.6) | |||
| self.model = StreamYOLO(backbone, head) | |||
| return self.model | |||
| @@ -1,5 +1,4 @@ | |||
| # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | |||
| import os | |||
| import random | |||