# Conflicts:
#	modelscope/metrics/__init__.py
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8d653a9a1ee49789c3df38e8da96af7118e0d8336d6ed12cd6458efa015071d
size 2327764
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c589d77404ea17d4d24daeb8624dce7e1ac919dc75e6bed44ea9d116f0514150
size 68524
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76bf84536edbaf192a8a699efc62ba2b06056bac12c426ecfcc2e003d91fbd32
size 53219
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ecbc9d0827cfb92e93e7d75868b1724142685dc20d3b32023c3c657a7b688a9c
size 254845
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d510ab26ddc58ffea882c8ef850c1f9bd4444772f2bce7ebea3e76944536c3ae
size 48909
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2c1119e3d521cf2e583b1e85fc9c9afd1d44954b433135039a98050a730932d
size 1127557
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46db348eae61448f1668ce282caec21375e96c3268d53da44aa67ec32cbf4fa5
size 2747938
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:709c1828ed2d56badf2f19a40194da9a5e5e6db2fb73ef55d047407f49bc7a15
size 27616
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:379e11d7fc3734d3ec95afd0d86460b4653fbf4bb1f57f993610d6a6fd30fd3d
size 1702339
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dec0fbb931cb609bf481e56b89cd2fbbab79839f22832c3bbe69a8fae2769cdd
size 167407
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572
size 60801
oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c
size 61239
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c
size 60801
oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1
size 61115
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a49c9bc74a60860c360a4bf4509fe9db915279aaabd953f354f2c38e9be1e6cb
size 2924691
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f58df1d25590c158ae0a04b3999bd44b610cdaddb17d78afd84c34b3f00d4e87
size 4068783
@@ -76,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
ENV SHELL=/bin/bash
# install special package
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl
RUN if [ "$USE_GPU" = "True" ] ; then \
    pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
@@ -24,20 +24,17 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                       DownloadMode)
from modelscope.utils.logger import get_logger
from .errors import (InvalidParameter, NotExistError, RequestError,
                     datahub_raise_on_error, handle_http_response, is_ok,
                     raise_on_error)
from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
                          model_id_to_group_owner_name)
                     datahub_raise_on_error, handle_http_post_error,
                     handle_http_response, is_ok, raise_on_error)
from .utils.utils import get_endpoint, model_id_to_group_owner_name

logger = get_logger()


class HubApi:

    def __init__(self, endpoint=None, dataset_endpoint=None):
    def __init__(self, endpoint=None):
        self.endpoint = endpoint if endpoint is not None else get_endpoint()
        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
        )

    def login(
        self,
@@ -105,17 +102,15 @@ class HubApi:
        path = f'{self.endpoint}/api/v1/models'
        owner_or_group, name = model_id_to_group_owner_name(model_id)
        r = requests.post(
            path,
            json={
                'Path': owner_or_group,
                'Name': name,
                'ChineseName': chinese_name,
                'Visibility': visibility,  # server check
                'License': license
            },
            cookies=cookies)
        r.raise_for_status()
        body = {
            'Path': owner_or_group,
            'Name': name,
            'ChineseName': chinese_name,
            'Visibility': visibility,  # server check
            'License': license
        }
        r = requests.post(path, json=body, cookies=cookies)
        handle_http_post_error(r, path, body)
        raise_on_error(r.json())
        model_repo_url = f'{get_endpoint()}/{model_id}'
        return model_repo_url
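For orientation, a minimal sketch of calling the refactored create_model (the token and ids are placeholders, and login is assumed to take the SDK access token from the ModelScope website):

from modelscope.hub.api import HubApi

api = HubApi()
api.login('your-sdk-token')  # placeholder token
# visibility: 1 = private, 5 = public; the server validates both fields
url = api.create_model(
    model_id='my_group/my_model',
    visibility=5,
    license='Apache License 2.0',
    chinese_name='示例模型')
print(url)  # e.g. https://www.modelscope.cn/my_group/my_model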
@@ -290,7 +285,7 @@
        return files

    def list_datasets(self):
        path = f'{self.dataset_endpoint}/api/v1/datasets'
        path = f'{self.endpoint}/api/v1/datasets'
        headers = None
        params = {}
        r = requests.get(path, params=params, headers=headers)
@@ -317,13 +312,13 @@
                cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        dataset_id = resp['Data']['Id']
        dataset_type = resp['Data']['Type']
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
@@ -341,7 +336,7 @@
            file_path = file_info['Path']
            extension = os.path.splitext(file_path)[-1]
            if extension in dataset_meta_format:
                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                              f'Revision={revision}&FilePath={file_path}'
                r = requests.get(datahub_url)
                r.raise_for_status()
@@ -365,7 +360,7 @@
                              namespace: str,
                              revision: Optional[str] = DEFAULT_DATASET_REVISION):
        if file_name.endswith('.csv'):
            file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
            file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                        f'Revision={revision}&FilePath={file_name}'
        return file_name

@@ -374,7 +369,7 @@
                              dataset_name: str,
                              namespace: str,
                              revision: Optional[str] = DEFAULT_DATASET_REVISION):
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                      f'ststoken?Revision={revision}'
        return self.datahub_remote_call(datahub_url)

@@ -385,7 +380,7 @@
                                   namespace: str,
                                   revision: Optional[str] = DEFAULT_DATASET_REVISION):
        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                      f'ststoken?Revision={revision}'

        cookies = requests.utils.dict_from_cookiejar(cookies)
@@ -394,6 +389,19 @@
        raise_on_error(resp)
        return resp['Data']

    def list_oss_dataset_objects(self, dataset_name, namespace, max_limit,
                                 is_recursive, is_filter_dir, revision,
                                 cookies):
        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \
              f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'
        cookies = requests.utils.dict_from_cookiejar(cookies)
        resp = requests.get(url=url, cookies=cookies)
        resp = resp.json()
        raise_on_error(resp)
        resp = resp['Data']
        return resp

    def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
        r = requests.post(url)
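A minimal usage sketch for the new list_oss_dataset_objects (argument values are illustrative; the cookies come from a prior login, retrieved via ModelScopeConfig as in the push_to_hub module below):

from modelscope.hub.api import HubApi, ModelScopeConfig

api = HubApi()
cookies = ModelScopeConfig.get_cookies()  # None unless login() was called first
objects = api.list_oss_dataset_objects(
    dataset_name='my_dataset',
    namespace='my_group',
    max_limit=100,
    is_recursive=True,
    is_filter_dir=True,
    revision='master',
    cookies=cookies)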
@@ -4,6 +4,10 @@ from http import HTTPStatus
from requests.exceptions import HTTPError

from modelscope.utils.logger import get_logger

logger = get_logger()


class NotExistError(Exception):
    pass
@@ -45,15 +49,24 @@ def is_ok(rsp):
    return rsp['Code'] == HTTPStatus.OK and rsp['Success']


def handle_http_post_error(response, url, request_body):
    try:
        response.raise_for_status()
    except HTTPError as error:
        logger.error('Request %s with body: %s exception' %
                     (url, request_body))
        raise error


def handle_http_response(response, logger, cookies, model_id):
    try:
        response.raise_for_status()
    except HTTPError:
    except HTTPError as error:
        if cookies is None:  # code in [403] and
            logger.error(
                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
private. Please login first.')
        raise
        raise error


def raise_on_error(rsp):
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import re
import subprocess
from typing import List
from xmlrpc.client import Boolean
@@ -138,8 +139,8 @@ class GitCommandWrapper(metaclass=Singleton):
                repo_base_dir, repo_name, user_name)
            response = self._run_git_command(*config_user_name_args.split(' '))
            logger.debug(response.stdout.decode('utf8'))
            config_user_email_args = '-C %s/%s config user.name %s' % (
                repo_base_dir, repo_name, user_name)
            config_user_email_args = '-C %s/%s config user.email %s' % (
                repo_base_dir, repo_name, user_email)
            response = self._run_git_command(
                *config_user_email_args.split(' '))
            logger.debug(response.stdout.decode('utf8'))
@@ -177,6 +178,15 @@ class GitCommandWrapper(metaclass=Singleton):
        cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision]
        return self._run_git_command(*cmds)

    def get_remote_branches(self, repo_dir: str):
        cmds = ['-C', '%s' % repo_dir, 'branch', '-r']
        rsp = self._run_git_command(*cmds)
        info = [
            line.strip()
            for line in rsp.stdout.decode('utf8').strip().split(os.linesep)
        ][1:]
        return ['/'.join(line.split('/')[1:]) for line in info]

    def pull(self, repo_dir: str):
        cmds = ['-C', repo_dir, 'pull']
        return self._run_git_command(*cmds)
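To make the get_remote_branches parsing concrete, a sketch of what it does to typical `git branch -r` output (the sample output is illustrative; the real code splits on os.linesep):

stdout = '''  origin/HEAD -> origin/master
  origin/master
  origin/v1.0'''
info = [line.strip() for line in stdout.strip().split('\n')][1:]  # drop the HEAD alias line
print(['/'.join(line.split('/')[1:]) for line in info])  # ['master', 'v1.0']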
@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import datetime
import os
import shutil
import tempfile
import uuid
from typing import Dict, Optional
from uuid import uuid4

from filelock import FileLock

from modelscope import __version__
from modelscope.hub.api import HubApi, ModelScopeConfig
from modelscope.hub.errors import InvalidParameter, NotLoginException
from modelscope.hub.git import GitCommandWrapper
from modelscope.hub.repository import Repository
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger()
def upload_folder(model_id: str,
                  model_dir: str,
                  visibility: int = 0,
                  license: str = None,
                  chinese_name: Optional[str] = None,
                  commit_message: Optional[str] = None,
                  revision: Optional[str] = DEFAULT_MODEL_REVISION):
    """
    Upload a model from the given directory to the given repository. A valid
    model directory must contain a configuration.json file.

    This function uploads the files in the given directory to the given
    repository. If the repository does not exist on the remote, it is created
    automatically with the given visibility, license and chinese_name
    parameters. If the revision does not exist in the remote repository, a new
    branch is created for it.

    HubApi's login must be called with a valid token, which can be obtained
    from ModelScope's website, before calling this function.

    Args:
        model_id (`str`):
            The model id to upload to; the caller must have write permission for it.
        model_dir(`str`):
            The absolute path of the finetune result.
        visibility(`int`, defaults to `0`):
            Visibility of the newly created model (1: private, 5: public). If the
            model does not exist on ModelScope, this function creates it with this
            visibility and the parameter is required. You can ignore this parameter
            if you are sure the model already exists.
        license(`str`, defaults to `None`):
            License of the newly created model (see License). If the model does not
            exist on ModelScope, this function creates it with this license and the
            parameter is required. You can ignore this parameter if you are sure
            the model already exists.
        chinese_name(`str`, *optional*, defaults to `None`):
            Chinese name of the newly created model.
        commit_message(`str`, *optional*, defaults to `None`):
            Commit message of the push request.
        revision (`str`, *optional*, defaults to DEFAULT_MODEL_REVISION):
            Which branch to push to. If the branch does not exist, a new branch is
            created and pushed to.
    """
    if model_id is None:
        raise InvalidParameter('model_id cannot be empty!')
    if model_dir is None:
        raise InvalidParameter('model_dir cannot be empty!')
    if not os.path.exists(model_dir) or os.path.isfile(model_dir):
        raise InvalidParameter('model_dir must be a valid directory.')
    cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
    if not os.path.exists(cfg_file):
        raise ValueError(f'{model_dir} must contain a configuration.json.')
    cookies = ModelScopeConfig.get_cookies()
    if cookies is None:
        raise NotLoginException('Must login before upload!')
    files_to_save = os.listdir(model_dir)
    api = HubApi()
    try:
        api.get_model(model_id=model_id)
    except Exception:
        if visibility is None or license is None:
            raise InvalidParameter(
                'visibility and license cannot be empty if want to create new repo'
            )
        logger.info('Create new model %s' % model_id)
        api.create_model(
            model_id=model_id,
            visibility=visibility,
            license=license,
            chinese_name=chinese_name)
    tmp_dir = tempfile.mkdtemp()
    git_wrapper = GitCommandWrapper()
    try:
        repo = Repository(model_dir=tmp_dir, clone_from=model_id)
        branches = git_wrapper.get_remote_branches(tmp_dir)
        if revision not in branches:
            logger.info('Create new branch %s' % revision)
            git_wrapper.new_branch(tmp_dir, revision)
        git_wrapper.checkout(tmp_dir, revision)
        for f in files_to_save:
            if f[0] != '.':
                src = os.path.join(model_dir, f)
                if os.path.isdir(src):
                    shutil.copytree(src, os.path.join(tmp_dir, f))
                else:
                    shutil.copy(src, tmp_dir)
        if not commit_message:
            date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
            commit_message = '[automsg] push model %s to hub at %s' % (
                model_id, date)
        repo.push(commit_message=commit_message, branch=revision)
    except Exception:
        raise
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
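A minimal usage sketch of upload_folder (the module path and argument values are assumptions for illustration; the token placeholder comes from the ModelScope website):

from modelscope.hub.api import HubApi
from modelscope.hub.push_to_hub import upload_folder  # assumed module path

HubApi().login('your-sdk-token')  # must happen before upload_folder
upload_folder(
    model_id='my_group/my_model',
    model_dir='/abs/path/to/finetune_output',  # must contain configuration.json
    visibility=5,  # 1 = private, 5 = public; used only if the repo is created
    license='Apache License 2.0',
    revision='v1.0')  # the branch is created on the remote if missing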
@@ -4,8 +4,7 @@ import hashlib
import os
from typing import Optional

from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
                                      DEFAULT_MODELSCOPE_DOMAIN,
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
                                      DEFAULT_MODELSCOPE_GROUP,
                                      MODEL_ID_SEPARATOR,
                                      MODELSCOPE_URL_SCHEME)
@@ -44,11 +43,6 @@ def get_endpoint():
    return MODELSCOPE_URL_SCHEME + modelscope_domain


def get_dataset_hub_endpoint():
    return os.environ.get('HUB_DATASET_ENDPOINT',
                          DEFAULT_MODELSCOPE_DATA_ENDPOINT)


def compute_hash(file_path):
    BUFFER_SIZE = 1024 * 64  # 64k buffer size
    sha256_hash = hashlib.sha256()
@@ -14,6 +14,7 @@ class Models(object):
    # vision models
    detection = 'detection'
    realtime_object_detection = 'realtime-object-detection'
    realtime_video_object_detection = 'realtime-video-object-detection'
    scrfd = 'scrfd'
    classification_model = 'ClassificationModel'
    nafnet = 'nafnet'
@@ -27,11 +28,13 @@ class Models(object):
    face_2d_keypoints = 'face-2d-keypoints'
    panoptic_segmentation = 'swinL-panoptic-segmentation'
    image_reid_person = 'passvitb'
    image_inpainting = 'FFTInpainting'
    video_summarization = 'pgl-video-summarization'
    swinL_semantic_segmentation = 'swinL-semantic-segmentation'
    vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
    text_driven_segmentation = 'text-driven-segmentation'
    resnet50_bert = 'resnet50-bert'
    referring_video_object_segmentation = 'swinT-referring-video-object-segmentation'
    fer = 'fer'
    retinaface = 'retinaface'
    shop_segmentation = 'shop-segmentation'
@@ -39,14 +42,18 @@ class Models(object):
    mtcnn = 'mtcnn'
    ulfd = 'ulfd'
    video_inpainting = 'video-inpainting'
    human_wholebody_keypoint = 'human-wholebody-keypoint'
    hand_static = 'hand-static'
    face_human_hand_detection = 'face-human-hand-detection'
    face_emotion = 'face-emotion'
    product_segmentation = 'product-segmentation'
    image_body_reshaping = 'image-body-reshaping'

    # EasyCV models
    yolox = 'YOLOX'
    segformer = 'Segformer'
    hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
    image_object_detection_auto = 'image-object-detection-auto'

    # nlp models
    bert = 'bert'
@@ -66,6 +73,7 @@ class Models(object):
    gcnncrf = 'gcnn-crf'
    bart = 'bart'
    gpt3 = 'gpt3'
    gpt_neo = 'gpt-neo'
    plug = 'plug'
    bert_for_ds = 'bert-for-document-segmentation'
    ponet = 'ponet'
@@ -96,6 +104,7 @@ class TaskModels(object):
    information_extraction = 'information-extraction'
    fill_mask = 'fill-mask'
    feature_extraction = 'feature-extraction'
    text_generation = 'text-generation'


class Heads(object):
@@ -111,6 +120,8 @@ class Heads(object):
    token_classification = 'token-classification'
    # extraction
    information_extraction = 'information-extraction'
    # text gen
    text_generation = 'text-generation'


class Pipelines(object):
@@ -144,6 +155,7 @@ class Pipelines(object):
    salient_detection = 'u2net-salient-detection'
    image_classification = 'image-classification'
    face_detection = 'resnet-face-detection-scrfd10gkps'
    card_detection = 'resnet-card-detection-scrfd34gkps'
    ulfd_face_detection = 'manual-face-detection-ulfd'
    facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
    retina_face_detection = 'resnet50-face-detection-retinaface'
@@ -160,6 +172,7 @@ class Pipelines(object):
    face_image_generation = 'gan-face-image-generation'
    product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
    realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
    realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo'
    face_recognition = 'ir101-face-recognition-cfglint'
    image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
    image2image_translation = 'image-to-image-translation'
@@ -168,6 +181,7 @@ class Pipelines(object):
    ocr_recognition = 'convnextTiny-ocr-recognition'
    image_portrait_enhancement = 'gpen-image-portrait-enhancement'
    image_to_image_generation = 'image-to-image-generation'
    image_object_detection_auto = 'yolox_image-object-detection-auto'
    skin_retouching = 'unet-skin-retouching'
    tinynas_classification = 'tinynas-classification'
    tinynas_detection = 'tinynas-detection'
@@ -178,15 +192,19 @@ class Pipelines(object):
    video_summarization = 'googlenet_pgl_video_summarization'
    image_semantic_segmentation = 'image-semantic-segmentation'
    image_reid_person = 'passvitb-image-reid-person'
    image_inpainting = 'fft-inpainting'
    text_driven_segmentation = 'text-driven-segmentation'
    movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
    shop_segmentation = 'shop-segmentation'
    video_inpainting = 'video-inpainting'
    human_wholebody_keypoint = 'hrnetw48_human-wholebody-keypoint_image'
    pst_action_recognition = 'patchshift-action-recognition'
    hand_static = 'hand-static'
    face_human_hand_detection = 'face-human-hand-detection'
    face_emotion = 'face-emotion'
    product_segmentation = 'product-segmentation'
    image_body_reshaping = 'flow-based-body-reshaping'
    referring_video_object_segmentation = 'referring-video-object-segmentation'

    # nlp tasks
    automatic_post_editing = 'automatic-post-editing'
@@ -211,6 +229,7 @@ class Pipelines(object):
    zero_shot_classification = 'zero-shot-classification'
    text_error_correction = 'text-error-correction'
    plug_generation = 'plug-generation'
    gpt3_generation = 'gpt3-generation'
    faq_question_answering = 'faq-question-answering'
    conversational_text_to_sql = 'conversational-text-to-sql'
    table_question_answering_pipeline = 'table-question-answering-pipeline'
@@ -219,6 +238,9 @@ class Pipelines(object):
    relation_extraction = 'relation-extraction'
    document_segmentation = 'document-segmentation'
    feature_extraction = 'feature-extraction'
    translation_en_to_de = 'translation_en_to_de'  # keep it underscore
    translation_en_to_ro = 'translation_en_to_ro'  # keep it underscore
    translation_en_to_fr = 'translation_en_to_fr'  # keep it underscore

    # audio tasks
    sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -263,6 +285,9 @@ class Trainers(object):
    image_portrait_enhancement = 'image-portrait-enhancement'
    video_summarization = 'video-summarization'
    movie_scene_segmentation = 'movie-scene-segmentation'
    face_detection_scrfd = 'face-detection-scrfd'
    card_detection_scrfd = 'card-detection-scrfd'
    image_inpainting = 'image-inpainting'

    # nlp trainers
    bert_sentiment_analysis = 'bert-sentiment-analysis'
@@ -274,6 +299,7 @@ class Trainers(object):
    # audio trainers
    speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
    speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'


class Preprocessors(object):
@@ -302,6 +328,8 @@ class Preprocessors(object):
    bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
    text_gen_tokenizer = 'text-gen-tokenizer'
    text2text_gen_preprocessor = 'text2text-gen-preprocessor'
    text_gen_jieba_tokenizer = 'text-gen-jieba-tokenizer'
    text2text_translate_preprocessor = 'text2text-translate-preprocessor'
    token_cls_tokenizer = 'token-cls-tokenizer'
    ner_tokenizer = 'ner-tokenizer'
    nli_tokenizer = 'nli-tokenizer'
@@ -324,6 +352,7 @@ class Preprocessors(object):
    re_tokenizer = 're-tokenizer'
    document_segmentation = 'document-segmentation'
    feature_extraction = 'feature-extraction'
    sentence_piece = 'sentence-piece'

    # audio preprocessor
    linear_aec_fbank = 'linear-aec-fbank'
@@ -365,6 +394,8 @@ class Metrics(object):
    video_summarization_metric = 'video-summarization-metric'
    # metric for movie-scene-segmentation task
    movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
    # metric for inpainting task
    image_inpainting_metric = 'image-inpainting-metric'


class Optimizers(object):
@@ -406,6 +437,9 @@ class Hooks(object):
    IterTimerHook = 'IterTimerHook'
    EvaluationHook = 'EvaluationHook'

    # Compression
    SparsityHook = 'SparsityHook'


class LR_Schedulers(object):
    """learning rate scheduler is defined here

@@ -421,6 +455,8 @@ class Datasets(object):
    """
    ClsDataset = 'ClsDataset'
    Face2dKeypointsDataset = 'Face2dKeypointsDataset'
    HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
    HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset'
    SegDataset = 'SegDataset'
    DetDataset = 'DetDataset'
    DetImagesMixDataset = 'DetImagesMixDataset'
@@ -19,6 +19,7 @@ if TYPE_CHECKING:
    from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric
    from .accuracy_metric import AccuracyMetric
    from .bleu_metric import BleuMetric
    from .image_inpainting_metric import ImageInpaintingMetric

else:
    _import_structure = {
@@ -36,6 +37,7 @@ else:
        'token_classification_metric': ['TokenClassificationMetric'],
        'video_summarization_metric': ['VideoSummarizationMetric'],
        'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
        'image_inpainting_metric': ['ImageInpaintingMetric'],
        'accuracy_metric': ['AccuracyMetric'],
        'bleu_metric': ['BleuMetric'],
    }
@@ -35,6 +35,8 @@ class AudioNoiseMetric(Metric):
        total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr
        return {
            'total_loss': total_loss.item(),
            'avg_sisnr': avg_sisnr.item(),
            # the model uses the negative of SiSNR as a calculation shortcut;
            # revert it in the evaluation result
            'avg_sisnr': -avg_sisnr.item(),
            MetricKeys.AVERAGE_LOSS: avg_loss.item()
        }
@@ -18,6 +18,7 @@ class MetricKeys(object):
    SSIM = 'ssim'
    AVERAGE_LOSS = 'avg_loss'
    FScore = 'fscore'
    FID = 'fid'
    BLEU_1 = 'bleu-1'
    BLEU_4 = 'bleu-4'
    ROUGE_1 = 'rouge-1'
@@ -39,6 +40,7 @@ task_default_metrics = {
    Tasks.image_captioning: [Metrics.text_gen_metric],
    Tasks.visual_question_answering: [Metrics.text_gen_metric],
    Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
    Tasks.image_inpainting: [Metrics.image_inpainting_metric],
}
@@ -1,12 +1,16 @@
# ------------------------------------------------------------------------
# Copyright (c) Alibaba, Inc. and its affiliates.
# ------------------------------------------------------------------------
# modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/metrics/psnr_ssim.py
# ------------------------------------------------------------------------
from typing import Dict

import cv2
import numpy as np
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
import torch

from modelscope.metainfo import Metrics
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
                                           torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys
@@ -20,26 +24,249 @@ class ImageDenoiseMetric(Metric):
    label_name = 'target'

    def __init__(self):
        super(ImageDenoiseMetric, self).__init__()
        self.preds = []
        self.labels = []

    def add(self, outputs: Dict, inputs: Dict):
        ground_truths = outputs[ImageDenoiseMetric.label_name]
        eval_results = outputs[ImageDenoiseMetric.pred_name]
        self.preds.append(
            torch_nested_numpify(torch_nested_detach(eval_results)))
        self.labels.append(
            torch_nested_numpify(torch_nested_detach(ground_truths)))
        self.preds.append(eval_results)
        self.labels.append(ground_truths)

    def evaluate(self):
        psnr_list, ssim_list = [], []
        for (pred, label) in zip(self.preds, self.labels):
            psnr_list.append(
                peak_signal_noise_ratio(label[0], pred[0], data_range=255))
            ssim_list.append(
                structural_similarity(
                    label[0], pred[0], multichannel=True, data_range=255))
            psnr_list.append(calculate_psnr(label[0], pred[0], crop_border=0))
            ssim_list.append(calculate_ssim(label[0], pred[0], crop_border=0))
        return {
            MetricKeys.PSNR: np.mean(psnr_list),
            MetricKeys.SSIM: np.mean(ssim_list)
        }
def reorder_image(img, input_order='HWC'):
    """Reorder images to 'HWC' order.

    If the input_order is (h, w), return (h, w, 1);
    If the input_order is (c, h, w), return (h, w, c);
    If the input_order is (h, w, c), return as it is.

    Args:
        img (ndarray): Input image.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            If the input image shape is (h, w), input_order will not have
            effects. Default: 'HWC'.

    Returns:
        ndarray: reordered image.
    """
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'"
        )
    if len(img.shape) == 2:
        img = img[..., None]
    if input_order == 'CHW':
        img = img.transpose(1, 2, 0)
    return img
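A quick self-check sketch of reorder_image on the two supported orders:

import numpy as np

chw = np.zeros((3, 32, 48))
print(reorder_image(chw, input_order='CHW').shape)  # (32, 48, 3)
print(reorder_image(np.zeros((32, 48))).shape)  # (32, 48, 1), channel axis added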
def calculate_psnr(img1, img2, crop_border, input_order='HWC'):
    """Calculate PSNR (Peak Signal-to-Noise Ratio).

    Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio

    Args:
        img1 (ndarray/tensor): Images with range [0, 255]/[0, 1].
        img2 (ndarray/tensor): Images with range [0, 255]/[0, 1].
        crop_border (int): Cropped pixels in each edge of an image. These
            pixels are not involved in the PSNR calculation.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            Default: 'HWC'.

    Returns:
        float: psnr result.
    """
    assert img1.shape == img2.shape, (
        f'Image shapes are different: {img1.shape}, {img2.shape}.')
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f'Wrong input_order {input_order}. Supported input_orders are '
            '"HWC" and "CHW"')
    if type(img1) == torch.Tensor:
        if len(img1.shape) == 4:
            img1 = img1.squeeze(0)
        img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
    if type(img2) == torch.Tensor:
        if len(img2.shape) == 4:
            img2 = img2.squeeze(0)
        img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)
    img1 = reorder_image(img1, input_order=input_order)
    img2 = reorder_image(img2, input_order=input_order)
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
    if crop_border != 0:
        img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
        img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]

    def _psnr(img1, img2):
        mse = np.mean((img1 - img2)**2)
        if mse == 0:
            return float('inf')
        max_value = 1. if img1.max() <= 1 else 255.
        return 20. * np.log10(max_value / np.sqrt(mse))

    return _psnr(img1, img2)
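The helper implements the usual definition PSNR = 20 * log10(MAX / sqrt(MSE)), with MAX inferred from the data range. A small illustrative check:

import numpy as np

a = np.full((8, 8, 3), 100, dtype=np.float64)
b = a + 10  # constant offset, so MSE = 100
print(calculate_psnr(a, b, crop_border=0))  # 20 * log10(255 / 10) ≈ 28.13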
def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True):
    """Calculate SSIM (structural similarity).

    Ref:
    Image quality assessment: From error visibility to structural similarity

    The results are the same as that of the official released MATLAB code in
    https://ece.uwaterloo.ca/~z70wang/research/ssim/.
    For three-channel images, SSIM is calculated for each channel and then
    averaged.

    Args:
        img1 (ndarray): Images with range [0, 255].
        img2 (ndarray): Images with range [0, 255].
        crop_border (int): Cropped pixels in each edge of an image. These
            pixels are not involved in the SSIM calculation.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            Default: 'HWC'.

    Returns:
        float: ssim result.
    """
    assert img1.shape == img2.shape, (
        f'Image shapes are different: {img1.shape}, {img2.shape}.')
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f'Wrong input_order {input_order}. Supported input_orders are '
            '"HWC" and "CHW"')
    if type(img1) == torch.Tensor:
        if len(img1.shape) == 4:
            img1 = img1.squeeze(0)
        img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
    if type(img2) == torch.Tensor:
        if len(img2.shape) == 4:
            img2 = img2.squeeze(0)
        img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)
    img1 = reorder_image(img1, input_order=input_order)
    img2 = reorder_image(img2, input_order=input_order)
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
    if crop_border != 0:
        img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
        img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]

    def _cal_ssim(img1, img2):
        ssims = []
        max_value = 1 if img1.max() <= 1 else 255
        with torch.no_grad():
            final_ssim = _ssim_3d(img1, img2, max_value) if ssim3d else _ssim(
                img1, img2, max_value)
            ssims.append(final_ssim)
        return np.array(ssims).mean()

    return _cal_ssim(img1, img2)
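A small self-check sketch; note that the default ssim3d=True path moves the Gaussian kernel to CUDA, so on a CPU-only machine pass ssim3d=False:

import numpy as np

img = (np.random.rand(64, 64, 3) * 255).astype(np.float64)
print(calculate_ssim(img, img.copy(), crop_border=0, ssim3d=False))  # 1.0 for identical images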
def _ssim(img, img2, max_value):
    """Calculate SSIM (structural similarity) for one channel images.

    It is called by func:`calculate_ssim`.

    Args:
        img (ndarray): Images with range [0, 255] with order 'HWC'.
        img2 (ndarray): Images with range [0, 255] with order 'HWC'.

    Returns:
        float: SSIM result.
    """
    c1 = (0.01 * max_value)**2
    c2 = (0.03 * max_value)**2
    img = img.astype(np.float64)
    img2 = img2.astype(np.float64)
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())
    mu1 = cv2.filter2D(img, -1, window)[5:-5,
                                        5:-5]  # valid mode for window size 11
    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq
    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
    tmp1 = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2)
    tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)
    ssim_map = tmp1 / tmp2
    return ssim_map.mean()


def _3d_gaussian_calculator(img, conv3d):
    out = conv3d(img.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0)
    return out


def _generate_3d_gaussian_kernel():
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())
    kernel_3 = cv2.getGaussianKernel(11, 1.5)
    kernel = torch.tensor(np.stack([window * k for k in kernel_3], axis=0))
    conv3d = torch.nn.Conv3d(
        1,
        1, (11, 11, 11),
        stride=1,
        padding=(5, 5, 5),
        bias=False,
        padding_mode='replicate')
    conv3d.weight.requires_grad = False
    conv3d.weight[0, 0, :, :, :] = kernel
    return conv3d


def _ssim_3d(img1, img2, max_value):
    assert len(img1.shape) == 3 and len(img2.shape) == 3
    """Calculate SSIM (structural similarity) for one channel images.

    It is called by func:`calculate_ssim`.

    Args:
        img1 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'.
        img2 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'.

    Returns:
        float: ssim result.
    """
    C1 = (0.01 * max_value)**2
    C2 = (0.03 * max_value)**2
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)

    kernel = _generate_3d_gaussian_kernel().cuda()
    img1 = torch.tensor(img1).float().cuda()
    img2 = torch.tensor(img2).float().cuda()

    mu1 = _3d_gaussian_calculator(img1, kernel)
    mu2 = _3d_gaussian_calculator(img2, kernel)

    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = _3d_gaussian_calculator(img1**2, kernel) - mu1_sq
    sigma2_sq = _3d_gaussian_calculator(img2**2, kernel) - mu2_sq
    sigma12 = _3d_gaussian_calculator(img1 * img2, kernel) - mu1_mu2
    tmp1 = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
    tmp2 = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
    ssim_map = tmp1 / tmp2
    return float(ssim_map.mean())
@@ -0,0 +1,210 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
from typing import Dict

import numpy as np
import torch
import torch.nn.functional as F
from scipy import linalg

from modelscope.metainfo import Metrics
from modelscope.models.cv.image_inpainting.modules.inception import InceptionV3
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
                                           torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys


def fid_calculate_activation_statistics(act):
    mu = np.mean(act, axis=0)
    sigma = np.cov(act, rowvar=False)
    return mu, sigma


def calculate_frechet_distance(activations_pred, activations_target, eps=1e-6):
    mu1, sigma1 = fid_calculate_activation_statistics(activations_pred)
    mu2, sigma2 = fid_calculate_activation_statistics(activations_target)
    diff = mu1 - mu2
    # Product might be almost singular
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
    # Numerical error might give slight imaginary component
    if np.iscomplexobj(covmean):
        # if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-2):
            m = np.max(np.abs(covmean.imag))
            raise ValueError('Imaginary component {}'.format(m))
        covmean = covmean.real
    tr_covmean = np.trace(covmean)
    return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2)
            - 2 * tr_covmean)
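For reference, this is the standard Fréchet distance between two Gaussians fitted to the activation sets: FID = ||mu_1 - mu_2||^2 + Tr(Sigma_1 + Sigma_2 - 2 (Sigma_1 Sigma_2)^{1/2}), with the eps offset guarding against a nearly singular covariance product.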
class FIDScore(torch.nn.Module):

    def __init__(self, dims=2048, eps=1e-6):
        super().__init__()
        if getattr(FIDScore, '_MODEL', None) is None:
            block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
            FIDScore._MODEL = InceptionV3([block_idx]).eval()
        self.model = FIDScore._MODEL
        self.eps = eps
        self.reset()

    def forward(self, pred_batch, target_batch, mask=None):
        activations_pred = self._get_activations(pred_batch)
        activations_target = self._get_activations(target_batch)
        self.activations_pred.append(activations_pred.detach().cpu())
        self.activations_target.append(activations_target.detach().cpu())

    def get_value(self):
        activations_pred, activations_target = (self.activations_pred,
                                                self.activations_target)
        activations_pred = torch.cat(activations_pred).cpu().numpy()
        activations_target = torch.cat(activations_target).cpu().numpy()
        total_distance = calculate_frechet_distance(
            activations_pred, activations_target, eps=self.eps)
        self.reset()
        return total_distance

    def reset(self):
        self.activations_pred = []
        self.activations_target = []

    def _get_activations(self, batch):
        activations = self.model(batch)[0]
        if activations.shape[2] != 1 or activations.shape[3] != 1:
            assert False, \
                'We should not have got here, because Inception always scales inputs to 299x299'
        activations = activations.squeeze(-1).squeeze(-1)
        return activations
class SSIM(torch.nn.Module):
    """SSIM. Modified from:
    https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py
    """

    def __init__(self, window_size=11, size_average=True):
        super().__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = 1
        self.register_buffer('window',
                             self._create_window(window_size, self.channel))

    def forward(self, img1, img2):
        assert len(img1.shape) == 4

        channel = img1.size()[1]

        if channel == self.channel and self.window.data.type(
        ) == img1.data.type():
            window = self.window
        else:
            window = self._create_window(self.window_size, channel)
            window = window.type_as(img1)
            self.window = window
            self.channel = channel

        return self._ssim(img1, img2, window, self.window_size, channel,
                          self.size_average)

    def _gaussian(self, window_size, sigma):
        gauss = torch.Tensor([
            np.exp(-(x - (window_size // 2))**2 / float(2 * sigma**2))
            for x in range(window_size)
        ])
        return gauss / gauss.sum()

    def _create_window(self, window_size, channel):
        _1D_window = self._gaussian(window_size, 1.5).unsqueeze(1)
        _2D_window = _1D_window.mm(
            _1D_window.t()).float().unsqueeze(0).unsqueeze(0)
        return _2D_window.expand(channel, 1, window_size,
                                 window_size).contiguous()

    def _ssim(self,
              img1,
              img2,
              window,
              window_size,
              channel,
              size_average=True):
        mu1 = F.conv2d(
            img1, window, padding=(window_size // 2), groups=channel)
        mu2 = F.conv2d(
            img2, window, padding=(window_size // 2), groups=channel)

        mu1_sq = mu1.pow(2)
        mu2_sq = mu2.pow(2)
        mu1_mu2 = mu1 * mu2

        sigma1_sq = F.conv2d(
            img1 * img1, window, padding=(window_size // 2),
            groups=channel) - mu1_sq
        sigma2_sq = F.conv2d(
            img2 * img2, window, padding=(window_size // 2),
            groups=channel) - mu2_sq
        sigma12 = F.conv2d(
            img1 * img2, window, padding=(window_size // 2),
            groups=channel) - mu1_mu2

        C1 = 0.01**2
        C2 = 0.03**2

        ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \
            ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

        if size_average:
            return ssim_map.mean()

        return ssim_map.mean(1).mean(1).mean(1)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        return
@METRICS.register_module(
    group_key=default_group, module_name=Metrics.image_inpainting_metric)
class ImageInpaintingMetric(Metric):
    """The metric computation class for image inpainting tasks.
    """

    def __init__(self):
        self.preds = []
        self.targets = []
        self.SSIM = SSIM(window_size=11, size_average=False).eval()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.FID = FIDScore().to(device)

    def add(self, outputs: Dict, inputs: Dict):
        pred = outputs['inpainted']
        target = inputs['image']
        self.preds.append(torch_nested_detach(pred))
        self.targets.append(torch_nested_detach(target))

    def evaluate(self):
        ssim_list = []
        for (pred, target) in zip(self.preds, self.targets):
            ssim_list.append(self.SSIM(pred, target))
            self.FID(pred, target)
        ssim_list = torch_nested_numpify(ssim_list)
        fid = self.FID.get_value()
        return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid}
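A minimal sketch of the metric's add/evaluate cycle (shapes are illustrative; the 'inpainted'/'image' keys match the diff above, and FID is only statistically meaningful over many samples):

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
metric = ImageInpaintingMetric()
for _ in range(8):
    pred = torch.rand(1, 3, 299, 299, device=device)
    target = torch.rand(1, 3, 299, 299, device=device)
    metric.add({'inpainted': pred}, {'image': target})
print(metric.evaluate())  # {'ssim': ..., 'fid': ...}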
@@ -1,3 +1,6 @@
# Part of the implementation is borrowed and modified from PGL-SUM,
# publicly available at https://github.com/e-apostolidis/PGL-SUM
from typing import Dict

import numpy as np
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict
@@ -1,15 +1,14 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Dict

import torch
from typing import Dict, Optional

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.audio.audio_utils import update_conf
from modelscope.utils.constant import Tasks
from .fsmn_sele_v2 import FSMNSeleNetV2
@@ -20,48 +19,38 @@ class FSMNSeleNetV2Decorator(TorchModel):
    MODEL_TXT = 'model.txt'
    SC_CONFIG = 'sound_connect.conf'
    SC_CONF_ITEM_KWS_MODEL = '${kws_model}'

    def __init__(self, model_dir: str, *args, **kwargs):
    def __init__(self,
                 model_dir: str,
                 training: Optional[bool] = False,
                 *args,
                 **kwargs):
        """initialize the dfsmn model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)
        sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
        model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
        model_bin_file = os.path.join(model_dir,
                                      ModelFile.TORCH_MODEL_BIN_FILE)
        self._model = None
        if os.path.exists(model_bin_file):
            kwargs.pop('device')
            self._model = FSMNSeleNetV2(*args, **kwargs)
            checkpoint = torch.load(model_bin_file)
            self._model.load_state_dict(checkpoint, strict=False)
        self._sc = None
        if os.path.exists(model_txt_file):
            with open(sc_config_file) as f:
                lines = f.readlines()
            with open(sc_config_file, 'w') as f:
                for line in lines:
                    if self.SC_CONF_ITEM_KWS_MODEL in line:
                        line = line.replace(self.SC_CONF_ITEM_KWS_MODEL,
                                            model_txt_file)
                    f.write(line)
            import py_sound_connect
            self._sc = py_sound_connect.SoundConnect(sc_config_file)
            self.size_in = self._sc.bytesPerBlockIn()
            self.size_out = self._sc.bytesPerBlockOut()
        if self._model is None and self._sc is None:
            raise Exception(
                f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.'
            )
        if training:
            self.model = FSMNSeleNetV2(*args, **kwargs)
        else:
            sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
            model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
            self._sc = None
            if os.path.exists(model_txt_file):
                conf_dict = dict(mode=56542, kws_model=model_txt_file)
                update_conf(sc_config_file, sc_config_file, conf_dict)
                import py_sound_connect
                self._sc = py_sound_connect.SoundConnect(sc_config_file)
                self.size_in = self._sc.bytesPerBlockIn()
                self.size_out = self._sc.bytesPerBlockOut()
            else:
                raise Exception(
                    f'Invalid model directory! Failed to load model file: {model_txt_file}.'
                )

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        ...
        return self.model.forward(input)

    def forward_decode(self, data: bytes):
        result = {'pcm': self._sc.process(data, self.size_out)}
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict
@@ -4,14 +4,16 @@
from . import (action_recognition, animal_recognition, body_2d_keypoints,
               body_3d_keypoints, cartoon, cmdssl_video_embedding,
               crowd_counting, face_2d_keypoints, face_detection,
               face_generation, image_classification, image_color_enhance,
               image_colorization, image_denoise, image_instance_segmentation,
               face_generation, human_wholebody_keypoint, image_classification,
               image_color_enhance, image_colorization, image_denoise,
               image_inpainting, image_instance_segmentation,
               image_panoptic_segmentation, image_portrait_enhancement,
               image_reid_person, image_semantic_segmentation,
               image_to_image_generation, image_to_image_translation,
               movie_scene_segmentation, object_detection,
               product_retrieval_embedding, realtime_object_detection,
               salient_detection, shop_segmentation, super_resolution,
               referring_video_object_segmentation, salient_detection,
               shop_segmentation, super_resolution,
               video_single_object_tracking, video_summarization, virual_tryon)

# yapf: enable
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict, Optional, Union
@@ -1,10 +1,10 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# Modified by Ke Sun (sunk@mail.ustc.edu.cn)
# https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py
# ------------------------------------------------------------------------------
"""
Copyright (c) Microsoft
Licensed under the MIT License.
Written by Bin Xiao (Bin.Xiao@microsoft.com)
Modified by Ke Sun (sunk@mail.ustc.edu.cn)
https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py
"""
import functools
import logging
@@ -8,12 +8,14 @@ if TYPE_CHECKING:
    from .mtcnn import MtcnnFaceDetector
    from .retinaface import RetinaFaceDetection
    from .ulfd_slim import UlfdFaceDetector
    from .scrfd import ScrfdDetect

else:
    _import_structure = {
        'ulfd_slim': ['UlfdFaceDetector'],
        'retinaface': ['RetinaFaceDetection'],
        'mtcnn': ['MtcnnFaceDetector'],
        'mogface': ['MogFaceDetector']
        'mogface': ['MogFaceDetector'],
        'scrfd': ['ScrfdDetect']
    }

    import sys
@@ -1,189 +0,0 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
"""
import numpy as np
from mmdet.datasets.builder import PIPELINES
from numpy import random
| @PIPELINES.register_module() | |||
| class RandomSquareCrop(object): | |||
| """Random crop the image & bboxes, the cropped patches have minimum IoU | |||
| requirement with original image & bboxes, the IoU threshold is randomly | |||
| selected from min_ious. | |||
| Args: | |||
| min_ious (tuple): minimum IoU threshold for all intersections with | |||
| bounding boxes | |||
| min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, | |||
| where a >= min_crop_size). | |||
| Note: | |||
| The keys for bboxes, labels and masks should be paired. That is, \ | |||
| `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ | |||
| `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. | |||
| """ | |||
| def __init__(self, | |||
| crop_ratio_range=None, | |||
| crop_choice=None, | |||
| bbox_clip_border=True): | |||
| self.crop_ratio_range = crop_ratio_range | |||
| self.crop_choice = crop_choice | |||
| self.bbox_clip_border = bbox_clip_border | |||
| assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) | |||
| if self.crop_ratio_range is not None: | |||
| self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range | |||
| self.bbox2label = { | |||
| 'gt_bboxes': 'gt_labels', | |||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||
| } | |||
| self.bbox2mask = { | |||
| 'gt_bboxes': 'gt_masks', | |||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||
| } | |||
| def __call__(self, results): | |||
| """Call function to crop images and bounding boxes with minimum IoU | |||
| constraint. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Result dict with images and bounding boxes cropped, \ | |||
| 'img_shape' key is updated. | |||
| """ | |||
| if 'img_fields' in results: | |||
| assert results['img_fields'] == ['img'], \ | |||
| 'Only single img_fields is allowed' | |||
| img = results['img'] | |||
| assert 'bbox_fields' in results | |||
| assert 'gt_bboxes' in results | |||
| boxes = results['gt_bboxes'] | |||
| h, w, c = img.shape | |||
| scale_retry = 0 | |||
| if self.crop_ratio_range is not None: | |||
| max_scale = self.crop_ratio_max | |||
| else: | |||
| max_scale = np.amax(self.crop_choice) | |||
| while True: | |||
| scale_retry += 1 | |||
| if scale_retry == 1 or max_scale > 1.0: | |||
| if self.crop_ratio_range is not None: | |||
| scale = np.random.uniform(self.crop_ratio_min, | |||
| self.crop_ratio_max) | |||
| elif self.crop_choice is not None: | |||
| scale = np.random.choice(self.crop_choice) | |||
| else: | |||
| scale = scale * 1.2 | |||
| for i in range(250): | |||
| short_side = min(w, h) | |||
| cw = int(scale * short_side) | |||
| ch = cw | |||
| # TODO +1 | |||
| if w == cw: | |||
| left = 0 | |||
| elif w > cw: | |||
| left = random.randint(0, w - cw) | |||
| else: | |||
| left = random.randint(w - cw, 0) | |||
| if h == ch: | |||
| top = 0 | |||
| elif h > ch: | |||
| top = random.randint(0, h - ch) | |||
| else: | |||
| top = random.randint(h - ch, 0) | |||
| patch = np.array( | |||
| (int(left), int(top), int(left + cw), int(top + ch)), | |||
| dtype=np.int) | |||
| # center of boxes should inside the crop img | |||
| # only adjust boxes and instance masks when the gt is not empty | |||
| # adjust boxes | |||
| def is_center_of_bboxes_in_patch(boxes, patch): | |||
| # TODO >= | |||
| center = (boxes[:, :2] + boxes[:, 2:]) / 2 | |||
| mask = \ | |||
| ((center[:, 0] > patch[0]) | |||
| * (center[:, 1] > patch[1]) | |||
| * (center[:, 0] < patch[2]) | |||
| * (center[:, 1] < patch[3])) | |||
| return mask | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| if not mask.any(): | |||
| continue | |||
| for key in results.get('bbox_fields', []): | |||
| boxes = results[key].copy() | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| boxes = boxes[mask] | |||
| if self.bbox_clip_border: | |||
| boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) | |||
| boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) | |||
| boxes -= np.tile(patch[:2], 2) | |||
| results[key] = boxes | |||
| # labels | |||
| label_key = self.bbox2label.get(key) | |||
| if label_key in results: | |||
| results[label_key] = results[label_key][mask] | |||
| # keypoints field | |||
| if key == 'gt_bboxes': | |||
| for kps_key in results.get('keypoints_fields', []): | |||
| keypointss = results[kps_key].copy() | |||
| keypointss = keypointss[mask, :, :] | |||
| if self.bbox_clip_border: | |||
| keypointss[:, :, : | |||
| 2] = keypointss[:, :, :2].clip( | |||
| max=patch[2:]) | |||
| keypointss[:, :, : | |||
| 2] = keypointss[:, :, :2].clip( | |||
| min=patch[:2]) | |||
| keypointss[:, :, 0] -= patch[0] | |||
| keypointss[:, :, 1] -= patch[1] | |||
| results[kps_key] = keypointss | |||
| # mask fields | |||
| mask_key = self.bbox2mask.get(key) | |||
| if mask_key in results: | |||
| results[mask_key] = results[mask_key][mask.nonzero() | |||
| [0]].crop(patch) | |||
| # adjust the img no matter whether the gt is empty before crop | |||
| rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 | |||
| patch_from = patch.copy() | |||
| patch_from[0] = max(0, patch_from[0]) | |||
| patch_from[1] = max(0, patch_from[1]) | |||
| patch_from[2] = min(img.shape[1], patch_from[2]) | |||
| patch_from[3] = min(img.shape[0], patch_from[3]) | |||
| patch_to = patch.copy() | |||
| patch_to[0] = max(0, patch_to[0] * -1) | |||
| patch_to[1] = max(0, patch_to[1] * -1) | |||
| patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) | |||
| patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) | |||
| rimg[patch_to[1]:patch_to[3], | |||
| patch_to[0]:patch_to[2], :] = img[ | |||
| patch_from[1]:patch_from[3], | |||
| patch_from[0]:patch_from[2], :] | |||
| img = rimg | |||
| results['img'] = img | |||
| results['img_shape'] = img.shape | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(crop_ratio_range={self.crop_ratio_range}, ' | |||
| repr_str += f'crop_choice={self.crop_choice}, ' | |||
| repr_str += f'bbox_clip_border={self.bbox_clip_border})' | |||
| return repr_str | |||
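| # Hypothetical usage sketch (not part of this diff): the transform is | |||
| # registered with mmdet's PIPELINES registry, so it can be configured by | |||
| # name inside a data pipeline. Exactly one of crop_ratio_range and | |||
| # crop_choice must be set, as enforced by the XOR assert in __init__. | |||
| example_pipeline = [ | |||
| dict(type='LoadImageFromFile'), | |||
| dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0]), | |||
| ] | |||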
| @@ -1,3 +1,5 @@ | |||
| # The implementation is based on MogFace, available at | |||
| # https://github.com/damo-cv/MogFace | |||
| import os | |||
| import cv2 | |||
| @@ -0,0 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .scrfd_detect import ScrfdDetect | |||
| @@ -6,7 +6,7 @@ import numpy as np | |||
| import torch | |||
| def bbox2result(bboxes, labels, num_classes, kps=None): | |||
| def bbox2result(bboxes, labels, num_classes, kps=None, num_kps=5): | |||
| """Convert detection results to a list of numpy arrays. | |||
| Args: | |||
| @@ -17,7 +17,7 @@ def bbox2result(bboxes, labels, num_classes, kps=None): | |||
| Returns: | |||
| list(ndarray): bbox results of each class | |||
| """ | |||
| bbox_len = 5 if kps is None else 5 + 10 # if has kps, add 10 kps into bbox | |||
| bbox_len = 5 if kps is None else 5 + num_kps * 2 # if has kps, add num_kps*2 into bbox | |||
| if bboxes.shape[0] == 0: | |||
| return [ | |||
| np.zeros((0, bbox_len), dtype=np.float32) | |||
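| # Worked check of the new layout (assumption drawn from the lines above): | |||
| # each row holds 4 bbox coords + 1 score, plus x/y per keypoint, so | |||
| # num_kps=5 gives bbox_len = 5 + 5 * 2 = 15 and num_kps=4 gives 13. | |||
| def expected_bbox_len(has_kps, num_kps=5): | |||
| # mirrors the bbox_len expression in bbox2result | |||
| return 5 if not has_kps else 5 + num_kps * 2 | |||
| assert expected_bbox_len(True, 5) == 15 | |||
| assert expected_bbox_len(True, 4) == 13 | |||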
| @@ -17,6 +17,7 @@ def multiclass_nms(multi_bboxes, | |||
| Args: | |||
| multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) | |||
| multi_kps (Tensor): shape (n, #class*num_kps*2) or (n, num_kps*2) | |||
| multi_scores (Tensor): shape (n, #class), where the last column | |||
| contains scores of the background class, but this will be ignored. | |||
| score_thr (float): bbox threshold, bboxes with scores lower than it | |||
| @@ -36,16 +37,18 @@ def multiclass_nms(multi_bboxes, | |||
| num_classes = multi_scores.size(1) - 1 | |||
| # exclude background category | |||
| kps = None | |||
| if multi_kps is not None: | |||
| num_kps = int((multi_kps.shape[1] / num_classes) / 2) | |||
| if multi_bboxes.shape[1] > 4: | |||
| bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) | |||
| if multi_kps is not None: | |||
| kps = multi_kps.view(multi_scores.size(0), -1, 10) | |||
| kps = multi_kps.view(multi_scores.size(0), -1, num_kps * 2) | |||
| else: | |||
| bboxes = multi_bboxes[:, None].expand( | |||
| multi_scores.size(0), num_classes, 4) | |||
| if multi_kps is not None: | |||
| kps = multi_kps[:, None].expand( | |||
| multi_scores.size(0), num_classes, 10) | |||
| multi_scores.size(0), num_classes, num_kps * 2) | |||
| scores = multi_scores[:, :-1] | |||
| if score_factors is not None: | |||
| @@ -56,7 +59,7 @@ def multiclass_nms(multi_bboxes, | |||
| bboxes = bboxes.reshape(-1, 4) | |||
| if kps is not None: | |||
| kps = kps.reshape(-1, 10) | |||
| kps = kps.reshape(-1, num_kps * 2) | |||
| scores = scores.reshape(-1) | |||
| labels = labels.reshape(-1) | |||
| @@ -2,6 +2,12 @@ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines | |||
| """ | |||
| from .auto_augment import RotateV2 | |||
| from .formating import DefaultFormatBundleV2 | |||
| from .loading import LoadAnnotationsV2 | |||
| from .transforms import RandomSquareCrop | |||
| __all__ = ['RandomSquareCrop'] | |||
| __all__ = [ | |||
| 'RandomSquareCrop', 'LoadAnnotationsV2', 'RotateV2', | |||
| 'DefaultFormatBundleV2' | |||
| ] | |||
| @@ -0,0 +1,271 @@ | |||
| """ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py | |||
| """ | |||
| import copy | |||
| import cv2 | |||
| import mmcv | |||
| import numpy as np | |||
| from mmdet.datasets.builder import PIPELINES | |||
| _MAX_LEVEL = 10 | |||
| def level_to_value(level, max_value): | |||
| """Map from level to values based on max_value.""" | |||
| return (level / _MAX_LEVEL) * max_value | |||
| def random_negative(value, random_negative_prob): | |||
| """Randomly negate value based on random_negative_prob.""" | |||
| return -value if np.random.rand() < random_negative_prob else value | |||
| def bbox2fields(): | |||
| """The key correspondence from bboxes to labels, masks and | |||
| segmentations.""" | |||
| bbox2label = { | |||
| 'gt_bboxes': 'gt_labels', | |||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||
| } | |||
| bbox2mask = { | |||
| 'gt_bboxes': 'gt_masks', | |||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||
| } | |||
| bbox2seg = { | |||
| 'gt_bboxes': 'gt_semantic_seg', | |||
| } | |||
| return bbox2label, bbox2mask, bbox2seg | |||
| @PIPELINES.register_module() | |||
| class RotateV2(object): | |||
| """Apply Rotate Transformation to image (and its corresponding bbox, mask, | |||
| segmentation). | |||
| Args: | |||
| level (int | float): The level should be in range [0, _MAX_LEVEL]. | |||
| scale (int | float): Isotropic scale factor. Same in | |||
| ``mmcv.imrotate``. | |||
| center (int | float | tuple[float]): Center point (w, h) of the | |||
| rotation in the source image. If None, the center of the | |||
| image will be used. Same in ``mmcv.imrotate``. | |||
| img_fill_val (int | float | tuple): The fill value for image border. | |||
| If float, the same value will be used for all the three | |||
| channels of image. If tuple, it should have 3 elements (i.e. | |||
| one per image channel). | |||
| seg_ignore_label (int): The fill value used for segmentation map. | |||
| Note this value must equal ``ignore_label`` in ``semantic_head`` | |||
| of the corresponding config. Default 255. | |||
| prob (float): The probability of performing the transformation; | |||
| it should be in range [0, 1]. | |||
| max_rotate_angle (int | float): The maximum angle for the rotate | |||
| transformation. | |||
| random_negative_prob (float): The probability that turns the | |||
| offset negative. | |||
| """ | |||
| def __init__(self, | |||
| level, | |||
| scale=1, | |||
| center=None, | |||
| img_fill_val=128, | |||
| seg_ignore_label=255, | |||
| prob=0.5, | |||
| max_rotate_angle=30, | |||
| random_negative_prob=0.5): | |||
| assert isinstance(level, (int, float)), \ | |||
| f'The level must be type int or float. got {type(level)}.' | |||
| assert 0 <= level <= _MAX_LEVEL, \ | |||
| f'The level should be in range [0,{_MAX_LEVEL}]. got {level}.' | |||
| assert isinstance(scale, (int, float)), \ | |||
| f'The scale must be type int or float. got type {type(scale)}.' | |||
| if isinstance(center, (int, float)): | |||
| center = (center, center) | |||
| elif isinstance(center, tuple): | |||
| assert len(center) == 2, 'center with type tuple must have '\ | |||
| f'2 elements. got {len(center)} elements.' | |||
| else: | |||
| assert center is None, 'center must be None or type int, '\ | |||
| f'float or tuple, got type {type(center)}.' | |||
| if isinstance(img_fill_val, (float, int)): | |||
| img_fill_val = tuple([float(img_fill_val)] * 3) | |||
| elif isinstance(img_fill_val, tuple): | |||
| assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\ | |||
| f'have 3 elements. got {len(img_fill_val)}.' | |||
| img_fill_val = tuple([float(val) for val in img_fill_val]) | |||
| else: | |||
| raise ValueError( | |||
| 'img_fill_val must be float or tuple with 3 elements.') | |||
| assert np.all([0 <= val <= 255 for val in img_fill_val]), \ | |||
| 'all elements of img_fill_val should be within range [0,255]. '\ | |||
| f'got {img_fill_val}.' | |||
| assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\ | |||
| f'got {prob}.' | |||
| assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\ | |||
| f'should be type int or float. got type {type(max_rotate_angle)}.' | |||
| self.level = level | |||
| self.scale = scale | |||
| # Rotation angle in degrees. Positive values mean | |||
| # clockwise rotation. | |||
| self.angle = level_to_value(level, max_rotate_angle) | |||
| self.center = center | |||
| self.img_fill_val = img_fill_val | |||
| self.seg_ignore_label = seg_ignore_label | |||
| self.prob = prob | |||
| self.max_rotate_angle = max_rotate_angle | |||
| self.random_negative_prob = random_negative_prob | |||
| def _rotate_img(self, results, angle, center=None, scale=1.0): | |||
| """Rotate the image. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| angle (float): Rotation angle in degrees, positive values | |||
| mean clockwise rotation. Same in ``mmcv.imrotate``. | |||
| center (tuple[float], optional): Center point (w, h) of the | |||
| rotation. Same in ``mmcv.imrotate``. | |||
| scale (int | float): Isotropic scale factor. Same in | |||
| ``mmcv.imrotate``. | |||
| """ | |||
| for key in results.get('img_fields', ['img']): | |||
| img = results[key].copy() | |||
| img_rotated = mmcv.imrotate( | |||
| img, angle, center, scale, border_value=self.img_fill_val) | |||
| results[key] = img_rotated.astype(img.dtype) | |||
| results['img_shape'] = results[key].shape | |||
| def _rotate_bboxes(self, results, rotate_matrix): | |||
| """Rotate the bboxes.""" | |||
| h, w, c = results['img_shape'] | |||
| for key in results.get('bbox_fields', []): | |||
| min_x, min_y, max_x, max_y = np.split( | |||
| results[key], results[key].shape[-1], axis=-1) | |||
| coordinates = np.stack([[min_x, min_y], [max_x, min_y], | |||
| [min_x, max_y], | |||
| [max_x, max_y]]) # [4, 2, nb_bbox, 1] | |||
| # pad 1 to convert from format [x, y] to homogeneous | |||
| # coordinates format [x, y, 1] | |||
| coordinates = np.concatenate( | |||
| (coordinates, | |||
| np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)), | |||
| axis=1) # [4, 3, nb_bbox, 1] | |||
| coordinates = coordinates.transpose( | |||
| (2, 0, 1, 3)) # [nb_bbox, 4, 3, 1] | |||
| rotated_coords = np.matmul(rotate_matrix, | |||
| coordinates) # [nb_bbox, 4, 2, 1] | |||
| rotated_coords = rotated_coords[..., 0] # [nb_bbox, 4, 2] | |||
| min_x, min_y = np.min( | |||
| rotated_coords[:, :, 0], axis=1), np.min( | |||
| rotated_coords[:, :, 1], axis=1) | |||
| max_x, max_y = np.max( | |||
| rotated_coords[:, :, 0], axis=1), np.max( | |||
| rotated_coords[:, :, 1], axis=1) | |||
| results[key] = np.stack([min_x, min_y, max_x, max_y], | |||
| axis=-1).astype(results[key].dtype) | |||
| def _rotate_keypoints90(self, results, angle): | |||
| """Rotate the keypoints, only valid when angle in [-90,90,-180,180]""" | |||
| if angle not in [-90, 90, 180, -180 | |||
| ] or self.scale != 1 or self.center is not None: | |||
| return | |||
| for key in results.get('keypoints_fields', []): | |||
| k = results[key] | |||
| if angle == 90: | |||
| w, h, c = results['img'].shape | |||
| new = np.stack([h - k[..., 1], k[..., 0], k[..., 2]], axis=-1) | |||
| elif angle == -90: | |||
| w, h, c = results['img'].shape | |||
| new = np.stack([k[..., 1], w - k[..., 0], k[..., 2]], axis=-1) | |||
| else: | |||
| h, w, c = results['img'].shape | |||
| new = np.stack([w - k[..., 0], h - k[..., 1], k[..., 2]], | |||
| axis=-1) | |||
| # a keypoint is invalid if its third value is -1 | |||
| kps_invalid = new[..., -1][:, -1] == -1 | |||
| new[kps_invalid] = np.zeros(new.shape[1:]) - 1 | |||
| results[key] = new | |||
| def _rotate_masks(self, | |||
| results, | |||
| angle, | |||
| center=None, | |||
| scale=1.0, | |||
| fill_val=0): | |||
| """Rotate the masks.""" | |||
| h, w, c = results['img_shape'] | |||
| for key in results.get('mask_fields', []): | |||
| masks = results[key] | |||
| results[key] = masks.rotate((h, w), angle, center, scale, fill_val) | |||
| def _rotate_seg(self, | |||
| results, | |||
| angle, | |||
| center=None, | |||
| scale=1.0, | |||
| fill_val=255): | |||
| """Rotate the segmentation map.""" | |||
| for key in results.get('seg_fields', []): | |||
| seg = results[key].copy() | |||
| results[key] = mmcv.imrotate( | |||
| seg, angle, center, scale, | |||
| border_value=fill_val).astype(seg.dtype) | |||
| def _filter_invalid(self, results, min_bbox_size=0): | |||
| """Filter bboxes and corresponding masks too small after rotate | |||
| augmentation.""" | |||
| bbox2label, bbox2mask, _ = bbox2fields() | |||
| for key in results.get('bbox_fields', []): | |||
| bbox_w = results[key][:, 2] - results[key][:, 0] | |||
| bbox_h = results[key][:, 3] - results[key][:, 1] | |||
| valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size) | |||
| valid_inds = np.nonzero(valid_inds)[0] | |||
| results[key] = results[key][valid_inds] | |||
| # label fields. e.g. gt_labels and gt_labels_ignore | |||
| label_key = bbox2label.get(key) | |||
| if label_key in results: | |||
| results[label_key] = results[label_key][valid_inds] | |||
| # mask fields, e.g. gt_masks and gt_masks_ignore | |||
| mask_key = bbox2mask.get(key) | |||
| if mask_key in results: | |||
| results[mask_key] = results[mask_key][valid_inds] | |||
| def __call__(self, results): | |||
| """Call function to rotate images, bounding boxes, masks and semantic | |||
| segmentation maps. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Rotated results. | |||
| """ | |||
| if np.random.rand() > self.prob: | |||
| return results | |||
| h, w = results['img'].shape[:2] | |||
| center = self.center | |||
| if center is None: | |||
| center = ((w - 1) * 0.5, (h - 1) * 0.5) | |||
| angle = random_negative(self.angle, self.random_negative_prob) | |||
| self._rotate_img(results, angle, center, self.scale) | |||
| rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale) | |||
| self._rotate_bboxes(results, rotate_matrix) | |||
| self._rotate_keypoints90(results, angle) | |||
| self._rotate_masks(results, angle, center, self.scale, fill_val=0) | |||
| self._rotate_seg( | |||
| results, angle, center, self.scale, fill_val=self.seg_ignore_label) | |||
| self._filter_invalid(results) | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(level={self.level}, ' | |||
| repr_str += f'scale={self.scale}, ' | |||
| repr_str += f'center={self.center}, ' | |||
| repr_str += f'img_fill_val={self.img_fill_val}, ' | |||
| repr_str += f'seg_ignore_label={self.seg_ignore_label}, ' | |||
| repr_str += f'prob={self.prob}, ' | |||
| repr_str += f'max_rotate_angle={self.max_rotate_angle}, ' | |||
| repr_str += f'random_negative_prob={self.random_negative_prob})' | |||
| return repr_str | |||
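| # Hypothetical config sketch: with level=10 and max_rotate_angle=30, the | |||
| # magnitude is level_to_value(10, 30) = 30 degrees; the sign is flipped | |||
| # with probability random_negative_prob, and the whole transform is | |||
| # skipped with probability 1 - prob. | |||
| example_rotate = dict( | |||
| type='RotateV2', level=10, max_rotate_angle=30, prob=0.5, | |||
| random_negative_prob=0.5, img_fill_val=128) | |||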
| @@ -0,0 +1,113 @@ | |||
| """ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/formating.py | |||
| """ | |||
| from collections.abc import Sequence | |||
| import mmcv | |||
| import numpy as np | |||
| import torch | |||
| from mmcv.parallel import DataContainer as DC | |||
| from mmdet.datasets.builder import PIPELINES | |||
| def to_tensor(data): | |||
| """Convert objects of various python types to :obj:`torch.Tensor`. | |||
| Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, | |||
| :class:`Sequence`, :class:`int` and :class:`float`. | |||
| Args: | |||
| data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to | |||
| be converted. | |||
| """ | |||
| if isinstance(data, torch.Tensor): | |||
| return data | |||
| elif isinstance(data, np.ndarray): | |||
| return torch.from_numpy(data) | |||
| elif isinstance(data, Sequence) and not mmcv.is_str(data): | |||
| return torch.tensor(data) | |||
| elif isinstance(data, int): | |||
| return torch.LongTensor([data]) | |||
| elif isinstance(data, float): | |||
| return torch.FloatTensor([data]) | |||
| else: | |||
| raise TypeError(f'type {type(data)} cannot be converted to tensor.') | |||
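| # Quick dispatch sketch for to_tensor (illustrative only): | |||
| # to_tensor(np.zeros((2, 3))) -> float64 tensor of shape (2, 3) | |||
| # to_tensor(3) -> tensor([3]) via torch.LongTensor | |||
| # to_tensor(0.5) -> tensor([0.5000]) via torch.FloatTensor | |||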
| @PIPELINES.register_module() | |||
| class DefaultFormatBundleV2(object): | |||
| """Default formatting bundle. | |||
| It simplifies the pipeline of formatting common fields, including "img", | |||
| "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". | |||
| These fields are formatted as follows. | |||
| - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) | |||
| - proposals: (1)to tensor, (2)to DataContainer | |||
| - gt_bboxes: (1)to tensor, (2)to DataContainer | |||
| - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer | |||
| - gt_labels: (1)to tensor, (2)to DataContainer | |||
| - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) | |||
| - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ | |||
| (3)to DataContainer (stack=True) | |||
| """ | |||
| def __call__(self, results): | |||
| """Call function to transform and format common fields in results. | |||
| Args: | |||
| results (dict): Result dict contains the data to convert. | |||
| Returns: | |||
| dict: The result dict contains the data that is formatted with \ | |||
| default bundle. | |||
| """ | |||
| if 'img' in results: | |||
| img = results['img'] | |||
| # add default meta keys | |||
| results = self._add_default_meta_keys(results) | |||
| if len(img.shape) < 3: | |||
| img = np.expand_dims(img, -1) | |||
| img = np.ascontiguousarray(img.transpose(2, 0, 1)) | |||
| results['img'] = DC(to_tensor(img), stack=True) | |||
| for key in [ | |||
| 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss', | |||
| 'gt_labels' | |||
| ]: | |||
| if key not in results: | |||
| continue | |||
| results[key] = DC(to_tensor(results[key])) | |||
| if 'gt_masks' in results: | |||
| results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) | |||
| if 'gt_semantic_seg' in results: | |||
| results['gt_semantic_seg'] = DC( | |||
| to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) | |||
| return results | |||
| def _add_default_meta_keys(self, results): | |||
| """Add default meta keys. | |||
| We set default meta keys including `pad_shape`, `scale_factor` and | |||
| `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and | |||
| `Pad` are implemented during the whole pipeline. | |||
| Args: | |||
| results (dict): Result dict contains the data to convert. | |||
| Returns: | |||
| results (dict): Updated result dict contains the data to convert. | |||
| """ | |||
| img = results['img'] | |||
| results.setdefault('pad_shape', img.shape) | |||
| results.setdefault('scale_factor', 1.0) | |||
| num_channels = 1 if len(img.shape) < 3 else img.shape[2] | |||
| results.setdefault( | |||
| 'img_norm_cfg', | |||
| dict( | |||
| mean=np.zeros(num_channels, dtype=np.float32), | |||
| std=np.ones(num_channels, dtype=np.float32), | |||
| to_rgb=False)) | |||
| return results | |||
| def __repr__(self): | |||
| return self.__class__.__name__ | |||
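| # Minimal usage sketch (keys are illustrative): | |||
| # bundle = DefaultFormatBundleV2() | |||
| # results = bundle(dict(img=np.zeros((640, 640, 3), dtype=np.uint8), | |||
| # gt_bboxes=np.zeros((1, 4), dtype=np.float32))) | |||
| # results['img'] is then a DataContainer (stack=True) holding a CHW | |||
| # tensor, i.e. results['img'].data.shape == torch.Size([3, 640, 640]), | |||
| # and pad_shape / scale_factor / img_norm_cfg get default values. | |||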
| @@ -0,0 +1,225 @@ | |||
| """ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/loading.py | |||
| """ | |||
| import os.path as osp | |||
| import numpy as np | |||
| import pycocotools.mask as maskUtils | |||
| from mmdet.core import BitmapMasks, PolygonMasks | |||
| from mmdet.datasets.builder import PIPELINES | |||
| @PIPELINES.register_module() | |||
| class LoadAnnotationsV2(object): | |||
| """Load mutiple types of annotations. | |||
| Args: | |||
| with_bbox (bool): Whether to parse and load the bbox annotation. | |||
| Default: True. | |||
| with_label (bool): Whether to parse and load the label annotation. | |||
| Default: True. | |||
| with_keypoints (bool): Whether to parse and load the keypoints annotation. | |||
| Default: False. | |||
| with_mask (bool): Whether to parse and load the mask annotation. | |||
| Default: False. | |||
| with_seg (bool): Whether to parse and load the semantic segmentation | |||
| annotation. Default: False. | |||
| poly2mask (bool): Whether to convert the instance masks from polygons | |||
| to bitmaps. Default: True. | |||
| file_client_args (dict): Arguments to instantiate a FileClient. | |||
| See :class:`mmcv.fileio.FileClient` for details. | |||
| Defaults to ``dict(backend='disk')``. | |||
| """ | |||
| def __init__(self, | |||
| with_bbox=True, | |||
| with_label=True, | |||
| with_keypoints=False, | |||
| with_mask=False, | |||
| with_seg=False, | |||
| poly2mask=True, | |||
| file_client_args=dict(backend='disk')): | |||
| self.with_bbox = with_bbox | |||
| self.with_label = with_label | |||
| self.with_keypoints = with_keypoints | |||
| self.with_mask = with_mask | |||
| self.with_seg = with_seg | |||
| self.poly2mask = poly2mask | |||
| self.file_client_args = file_client_args.copy() | |||
| self.file_client = None | |||
| def _load_bboxes(self, results): | |||
| """Private function to load bounding box annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded bounding box annotations. | |||
| """ | |||
| ann_info = results['ann_info'] | |||
| results['gt_bboxes'] = ann_info['bboxes'].copy() | |||
| gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) | |||
| if gt_bboxes_ignore is not None: | |||
| results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() | |||
| results['bbox_fields'].append('gt_bboxes_ignore') | |||
| results['bbox_fields'].append('gt_bboxes') | |||
| return results | |||
| def _load_keypoints(self, results): | |||
| """Private function to load bounding box annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded bounding box annotations. | |||
| """ | |||
| ann_info = results['ann_info'] | |||
| results['gt_keypointss'] = ann_info['keypointss'].copy() | |||
| results['keypoints_fields'] = ['gt_keypointss'] | |||
| return results | |||
| def _load_labels(self, results): | |||
| """Private function to load label annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded label annotations. | |||
| """ | |||
| results['gt_labels'] = results['ann_info']['labels'].copy() | |||
| return results | |||
| def _poly2mask(self, mask_ann, img_h, img_w): | |||
| """Private function to convert masks represented with polygon to | |||
| bitmaps. | |||
| Args: | |||
| mask_ann (list | dict): Polygon mask annotation input. | |||
| img_h (int): The height of output mask. | |||
| img_w (int): The width of output mask. | |||
| Returns: | |||
| numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). | |||
| """ | |||
| if isinstance(mask_ann, list): | |||
| # polygon -- a single object might consist of multiple parts | |||
| # we merge all parts into one mask rle code | |||
| rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) | |||
| rle = maskUtils.merge(rles) | |||
| elif isinstance(mask_ann['counts'], list): | |||
| # uncompressed RLE | |||
| rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) | |||
| else: | |||
| # rle | |||
| rle = mask_ann | |||
| mask = maskUtils.decode(rle) | |||
| return mask | |||
| def process_polygons(self, polygons): | |||
| """Convert polygons to list of ndarray and filter invalid polygons. | |||
| Args: | |||
| polygons (list[list]): Polygons of one instance. | |||
| Returns: | |||
| list[numpy.ndarray]: Processed polygons. | |||
| """ | |||
| polygons = [np.array(p) for p in polygons] | |||
| valid_polygons = [] | |||
| for polygon in polygons: | |||
| if len(polygon) % 2 == 0 and len(polygon) >= 6: | |||
| valid_polygons.append(polygon) | |||
| return valid_polygons | |||
| def _load_masks(self, results): | |||
| """Private function to load mask annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded mask annotations. | |||
| If ``self.poly2mask`` is set ``True``, `gt_mask` will contain | |||
| :obj:`BitmapMasks`. Otherwise, :obj:`PolygonMasks` is used. | |||
| """ | |||
| h, w = results['img_info']['height'], results['img_info']['width'] | |||
| gt_masks = results['ann_info']['masks'] | |||
| if self.poly2mask: | |||
| gt_masks = BitmapMasks( | |||
| [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) | |||
| else: | |||
| gt_masks = PolygonMasks( | |||
| [self.process_polygons(polygons) for polygons in gt_masks], h, | |||
| w) | |||
| results['gt_masks'] = gt_masks | |||
| results['mask_fields'].append('gt_masks') | |||
| return results | |||
| def _load_semantic_seg(self, results): | |||
| """Private function to load semantic segmentation annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`dataset`. | |||
| Returns: | |||
| dict: The dict contains loaded semantic segmentation annotations. | |||
| """ | |||
| import mmcv | |||
| if self.file_client is None: | |||
| self.file_client = mmcv.FileClient(**self.file_client_args) | |||
| filename = osp.join(results['seg_prefix'], | |||
| results['ann_info']['seg_map']) | |||
| img_bytes = self.file_client.get(filename) | |||
| results['gt_semantic_seg'] = mmcv.imfrombytes( | |||
| img_bytes, flag='unchanged').squeeze() | |||
| results['seg_fields'].append('gt_semantic_seg') | |||
| return results | |||
| def __call__(self, results): | |||
| """Call function to load multiple types annotations. | |||
| Args: | |||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||
| Returns: | |||
| dict: The dict contains loaded bounding box, label, mask and | |||
| semantic segmentation annotations. | |||
| """ | |||
| if self.with_bbox: | |||
| results = self._load_bboxes(results) | |||
| if results is None: | |||
| return None | |||
| if self.with_label: | |||
| results = self._load_labels(results) | |||
| if self.with_keypoints: | |||
| results = self._load_keypoints(results) | |||
| if self.with_mask: | |||
| results = self._load_masks(results) | |||
| if self.with_seg: | |||
| results = self._load_semantic_seg(results) | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(with_bbox={self.with_bbox}, ' | |||
| repr_str += f'with_label={self.with_label}, ' | |||
| repr_str += f'with_keypoints={self.with_keypoints}, ' | |||
| repr_str += f'with_mask={self.with_mask}, ' | |||
| repr_str += f'with_seg={self.with_seg}, ' | |||
| repr_str += f'poly2mask={self.poly2mask}, ' | |||
| repr_str += f'file_client_args={self.file_client_args})' | |||
| return repr_str | |||
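| # Hypothetical pipeline entry showing the new keypoints switch alongside | |||
| # the existing flags; keypoints land under results['gt_keypointss'] and | |||
| # are tracked via results['keypoints_fields']. | |||
| example_load = dict( | |||
| type='LoadAnnotationsV2', with_bbox=True, with_label=True, | |||
| with_keypoints=True) | |||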
| @@ -0,0 +1,737 @@ | |||
| """ | |||
| The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py | |||
| """ | |||
| import mmcv | |||
| import numpy as np | |||
| from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps | |||
| from mmdet.datasets.builder import PIPELINES | |||
| from numpy import random | |||
| @PIPELINES.register_module() | |||
| class ResizeV2(object): | |||
| """Resize images & bbox & mask &kps. | |||
| This transform resizes the input image to some scale. Bboxes and masks are | |||
| then resized with the same scale factor. If the input dict contains the key | |||
| "scale", then the scale in the input dict is used, otherwise the specified | |||
| scale in the init method is used. If the input dict contains the key | |||
| "scale_factor" (if MultiScaleFlipAug does not give img_scale but | |||
| scale_factor), the actual scale will be computed by image shape and | |||
| scale_factor. | |||
| `img_scale` can either be a tuple (single-scale) or a list of tuple | |||
| (multi-scale). There are 3 multiscale modes: | |||
| - ``ratio_range is not None``: randomly sample a ratio from the ratio \ | |||
| range and multiply it with the image scale. | |||
| - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ | |||
| sample a scale from the multiscale range. | |||
| - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ | |||
| sample a scale from multiple scales. | |||
| Args: | |||
| img_scale (tuple or list[tuple]): Image scales for resizing. | |||
| multiscale_mode (str): Either "range" or "value". | |||
| ratio_range (tuple[float]): (min_ratio, max_ratio) | |||
| keep_ratio (bool): Whether to keep the aspect ratio when resizing the | |||
| image. | |||
| bbox_clip_border (bool, optional): Whether clip the objects outside | |||
| the border of the image. Defaults to True. | |||
| backend (str): Image resize backend, choices are 'cv2' and 'pillow'. | |||
| These two backends generate slightly different results. Defaults | |||
| to 'cv2'. | |||
| override (bool, optional): Whether to override `scale` and | |||
| `scale_factor` so as to call resize twice. Default False. If True, | |||
| after the first resizing, the existing `scale` and `scale_factor` | |||
| will be ignored so the second resizing can be allowed. | |||
| This option is a work-around for multiple times of resize in DETR. | |||
| Defaults to False. | |||
| """ | |||
| def __init__(self, | |||
| img_scale=None, | |||
| multiscale_mode='range', | |||
| ratio_range=None, | |||
| keep_ratio=True, | |||
| bbox_clip_border=True, | |||
| backend='cv2', | |||
| override=False): | |||
| if img_scale is None: | |||
| self.img_scale = None | |||
| else: | |||
| if isinstance(img_scale, list): | |||
| self.img_scale = img_scale | |||
| else: | |||
| self.img_scale = [img_scale] | |||
| assert mmcv.is_list_of(self.img_scale, tuple) | |||
| if ratio_range is not None: | |||
| # mode 1: given a scale and a range of image ratio | |||
| assert len(self.img_scale) == 1 | |||
| else: | |||
| # mode 2: given multiple scales or a range of scales | |||
| assert multiscale_mode in ['value', 'range'] | |||
| self.backend = backend | |||
| self.multiscale_mode = multiscale_mode | |||
| self.ratio_range = ratio_range | |||
| self.keep_ratio = keep_ratio | |||
| # TODO: refactor the override option in Resize | |||
| self.override = override | |||
| self.bbox_clip_border = bbox_clip_border | |||
| @staticmethod | |||
| def random_select(img_scales): | |||
| """Randomly select an img_scale from given candidates. | |||
| Args: | |||
| img_scales (list[tuple]): Image scales for selection. | |||
| Returns: | |||
| (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \ | |||
| where ``img_scale`` is the selected image scale and \ | |||
| ``scale_idx`` is the selected index in the given candidates. | |||
| """ | |||
| assert mmcv.is_list_of(img_scales, tuple) | |||
| scale_idx = np.random.randint(len(img_scales)) | |||
| img_scale = img_scales[scale_idx] | |||
| return img_scale, scale_idx | |||
| @staticmethod | |||
| def random_sample(img_scales): | |||
| """Randomly sample an img_scale when ``multiscale_mode=='range'``. | |||
| Args: | |||
| img_scales (list[tuple]): Image scale range for sampling. | |||
| There must be two tuples in img_scales, which specify the lower | |||
| and upper bound of image scales. | |||
| Returns: | |||
| (tuple, None): Returns a tuple ``(img_scale, None)``, where \ | |||
| ``img_scale`` is the sampled scale and None is just a placeholder \ | |||
| to be consistent with :func:`random_select`. | |||
| """ | |||
| assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 | |||
| img_scale_long = [max(s) for s in img_scales] | |||
| img_scale_short = [min(s) for s in img_scales] | |||
| long_edge = np.random.randint( | |||
| min(img_scale_long), | |||
| max(img_scale_long) + 1) | |||
| short_edge = np.random.randint( | |||
| min(img_scale_short), | |||
| max(img_scale_short) + 1) | |||
| img_scale = (long_edge, short_edge) | |||
| return img_scale, None | |||
| @staticmethod | |||
| def random_sample_ratio(img_scale, ratio_range): | |||
| """Randomly sample an img_scale when ``ratio_range`` is specified. | |||
| A ratio will be randomly sampled from the range specified by | |||
| ``ratio_range``. Then it would be multiplied with ``img_scale`` to | |||
| generate sampled scale. | |||
| Args: | |||
| img_scale (tuple): Image scale base to multiply with ratio. | |||
| ratio_range (tuple[float]): The minimum and maximum ratio to scale | |||
| the ``img_scale``. | |||
| Returns: | |||
| (tuple, None): Returns a tuple ``(scale, None)``, where \ | |||
| ``scale`` is the sampled ratio multiplied with ``img_scale`` and \ | |||
| None is just a placeholder to be consistent with \ | |||
| :func:`random_select`. | |||
| """ | |||
| assert isinstance(img_scale, tuple) and len(img_scale) == 2 | |||
| min_ratio, max_ratio = ratio_range | |||
| assert min_ratio <= max_ratio | |||
| ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio | |||
| scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) | |||
| return scale, None | |||
| def _random_scale(self, results): | |||
| """Randomly sample an img_scale according to ``ratio_range`` and | |||
| ``multiscale_mode``. | |||
| If ``ratio_range`` is specified, a ratio will be sampled and be | |||
| multiplied with ``img_scale``. | |||
| If multiple scales are specified by ``img_scale``, a scale will be | |||
| sampled according to ``multiscale_mode``. | |||
| Otherwise, single scale will be used. | |||
| Args: | |||
| results (dict): Result dict from :obj:`dataset`. | |||
| Returns: | |||
| dict: Two new keys ``scale`` and ``scale_idx`` are added into \ | |||
| ``results``, which would be used by subsequent pipelines. | |||
| """ | |||
| if self.ratio_range is not None: | |||
| scale, scale_idx = self.random_sample_ratio( | |||
| self.img_scale[0], self.ratio_range) | |||
| elif len(self.img_scale) == 1: | |||
| scale, scale_idx = self.img_scale[0], 0 | |||
| elif self.multiscale_mode == 'range': | |||
| scale, scale_idx = self.random_sample(self.img_scale) | |||
| elif self.multiscale_mode == 'value': | |||
| scale, scale_idx = self.random_select(self.img_scale) | |||
| else: | |||
| raise NotImplementedError | |||
| results['scale'] = scale | |||
| results['scale_idx'] = scale_idx | |||
| def _resize_img(self, results): | |||
| """Resize images with ``results['scale']``.""" | |||
| for key in results.get('img_fields', ['img']): | |||
| if self.keep_ratio: | |||
| img, scale_factor = mmcv.imrescale( | |||
| results[key], | |||
| results['scale'], | |||
| return_scale=True, | |||
| backend=self.backend) | |||
| # the w_scale and h_scale have a minor difference | |||
| # a real fix should be done in the mmcv.imrescale in the future | |||
| new_h, new_w = img.shape[:2] | |||
| h, w = results[key].shape[:2] | |||
| w_scale = new_w / w | |||
| h_scale = new_h / h | |||
| else: | |||
| img, w_scale, h_scale = mmcv.imresize( | |||
| results[key], | |||
| results['scale'], | |||
| return_scale=True, | |||
| backend=self.backend) | |||
| results[key] = img | |||
| scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], | |||
| dtype=np.float32) | |||
| results['img_shape'] = img.shape | |||
| # in case that there is no padding | |||
| results['pad_shape'] = img.shape | |||
| results['scale_factor'] = scale_factor | |||
| results['keep_ratio'] = self.keep_ratio | |||
| def _resize_bboxes(self, results): | |||
| """Resize bounding boxes with ``results['scale_factor']``.""" | |||
| for key in results.get('bbox_fields', []): | |||
| bboxes = results[key] * results['scale_factor'] | |||
| if self.bbox_clip_border: | |||
| img_shape = results['img_shape'] | |||
| bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) | |||
| bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) | |||
| results[key] = bboxes | |||
| def _resize_keypoints(self, results): | |||
| """Resize keypoints with ``results['scale_factor']``.""" | |||
| for key in results.get('keypoints_fields', []): | |||
| keypointss = results[key].copy() | |||
| factors = results['scale_factor'] | |||
| assert factors[0] == factors[2] | |||
| assert factors[1] == factors[3] | |||
| keypointss[:, :, 0] *= factors[0] | |||
| keypointss[:, :, 1] *= factors[1] | |||
| if self.bbox_clip_border: | |||
| img_shape = results['img_shape'] | |||
| keypointss[:, :, 0] = np.clip(keypointss[:, :, 0], 0, | |||
| img_shape[1]) | |||
| keypointss[:, :, 1] = np.clip(keypointss[:, :, 1], 0, | |||
| img_shape[0]) | |||
| results[key] = keypointss | |||
| def _resize_masks(self, results): | |||
| """Resize masks with ``results['scale']``""" | |||
| for key in results.get('mask_fields', []): | |||
| if results[key] is None: | |||
| continue | |||
| if self.keep_ratio: | |||
| results[key] = results[key].rescale(results['scale']) | |||
| else: | |||
| results[key] = results[key].resize(results['img_shape'][:2]) | |||
| def _resize_seg(self, results): | |||
| """Resize semantic segmentation map with ``results['scale']``.""" | |||
| for key in results.get('seg_fields', []): | |||
| if self.keep_ratio: | |||
| gt_seg = mmcv.imrescale( | |||
| results[key], | |||
| results['scale'], | |||
| interpolation='nearest', | |||
| backend=self.backend) | |||
| else: | |||
| gt_seg = mmcv.imresize( | |||
| results[key], | |||
| results['scale'], | |||
| interpolation='nearest', | |||
| backend=self.backend) | |||
| results[key] = gt_seg  # write back under the iterated seg key | |||
| def __call__(self, results): | |||
| """Call function to resize images, bounding boxes, masks, semantic | |||
| segmentation map. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \ | |||
| 'keep_ratio' keys are added into result dict. | |||
| """ | |||
| if 'scale' not in results: | |||
| if 'scale_factor' in results: | |||
| img_shape = results['img'].shape[:2] | |||
| scale_factor = results['scale_factor'] | |||
| assert isinstance(scale_factor, float) | |||
| results['scale'] = tuple( | |||
| [int(x * scale_factor) for x in img_shape][::-1]) | |||
| else: | |||
| self._random_scale(results) | |||
| else: | |||
| if not self.override: | |||
| assert 'scale_factor' not in results, ( | |||
| 'scale and scale_factor cannot be both set.') | |||
| else: | |||
| results.pop('scale') | |||
| if 'scale_factor' in results: | |||
| results.pop('scale_factor') | |||
| self._random_scale(results) | |||
| self._resize_img(results) | |||
| self._resize_bboxes(results) | |||
| self._resize_keypoints(results) | |||
| self._resize_masks(results) | |||
| self._resize_seg(results) | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(img_scale={self.img_scale}, ' | |||
| repr_str += f'multiscale_mode={self.multiscale_mode}, ' | |||
| repr_str += f'ratio_range={self.ratio_range}, ' | |||
| repr_str += f'keep_ratio={self.keep_ratio}, ' | |||
| repr_str += f'bbox_clip_border={self.bbox_clip_border})' | |||
| return repr_str | |||
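| # Config sketches for the three multiscale modes described in the class | |||
| # docstring (values here are illustrative): | |||
| resize_by_value = dict(type='ResizeV2', img_scale=[(640, 640), (800, 800)], | |||
| multiscale_mode='value', keep_ratio=True) | |||
| resize_by_range = dict(type='ResizeV2', img_scale=[(480, 480), (800, 800)], | |||
| multiscale_mode='range', keep_ratio=True) | |||
| resize_by_ratio = dict(type='ResizeV2', img_scale=(640, 640), | |||
| ratio_range=(0.5, 1.5), keep_ratio=True) | |||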
| @PIPELINES.register_module() | |||
| class RandomFlipV2(object): | |||
| """Flip the image & bbox & mask & kps. | |||
| If the input dict contains the key "flip", then the flag will be used, | |||
| otherwise it will be randomly decided by a ratio specified in the init | |||
| method. | |||
| When random flip is enabled, ``flip_ratio``/``direction`` can either be a | |||
| float/string or tuple of float/string. There are 3 flip modes: | |||
| - ``flip_ratio`` is float, ``direction`` is string: the image will be | |||
| ``direction``ly flipped with probability of ``flip_ratio`` . | |||
| E.g., ``flip_ratio=0.5``, ``direction='horizontal'``, | |||
| then image will be horizontally flipped with probability of 0.5. | |||
| - ``flip_ratio`` is float, ``direction`` is list of string: the image will | |||
| be ``direction[i]``ly flipped with probability of | |||
| ``flip_ratio/len(direction)``. | |||
| E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``, | |||
| then image will be horizontally flipped with probability of 0.25, | |||
| vertically with probability of 0.25. | |||
| - ``flip_ratio`` is list of float, ``direction`` is list of string: | |||
| given ``len(flip_ratio) == len(direction)``, the image will | |||
| be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``. | |||
| E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal', | |||
| 'vertical']``, then image will be horizontally flipped with probability | |||
| of 0.3, vertically with probability of 0.5 | |||
| Args: | |||
| flip_ratio (float | list[float], optional): The flipping probability. | |||
| Default: None. | |||
| direction (str | list[str], optional): The flipping direction. Options | |||
| are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'. | |||
| If input is a list, the length must equal ``flip_ratio``. Each | |||
| element in ``flip_ratio`` indicates the flip probability of | |||
| corresponding direction. | |||
| """ | |||
| def __init__(self, flip_ratio=None, direction='horizontal'): | |||
| if isinstance(flip_ratio, list): | |||
| assert mmcv.is_list_of(flip_ratio, float) | |||
| assert 0 <= sum(flip_ratio) <= 1 | |||
| elif isinstance(flip_ratio, float): | |||
| assert 0 <= flip_ratio <= 1 | |||
| elif flip_ratio is None: | |||
| pass | |||
| else: | |||
| raise ValueError('flip_ratios must be None, float, ' | |||
| 'or list of float') | |||
| self.flip_ratio = flip_ratio | |||
| valid_directions = ['horizontal', 'vertical', 'diagonal'] | |||
| if isinstance(direction, str): | |||
| assert direction in valid_directions | |||
| elif isinstance(direction, list): | |||
| assert mmcv.is_list_of(direction, str) | |||
| assert set(direction).issubset(set(valid_directions)) | |||
| else: | |||
| raise ValueError('direction must be either str or list of str') | |||
| self.direction = direction | |||
| if isinstance(flip_ratio, list): | |||
| assert len(self.flip_ratio) == len(self.direction) | |||
| self.count = 0 | |||
| def bbox_flip(self, bboxes, img_shape, direction): | |||
| """Flip bboxes horizontally. | |||
| Args: | |||
| bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k) | |||
| img_shape (tuple[int]): Image shape (height, width) | |||
| direction (str): Flip direction. Options are 'horizontal', | |||
| 'vertical'. | |||
| Returns: | |||
| numpy.ndarray: Flipped bounding boxes. | |||
| """ | |||
| assert bboxes.shape[-1] % 4 == 0 | |||
| flipped = bboxes.copy() | |||
| if direction == 'horizontal': | |||
| w = img_shape[1] | |||
| flipped[..., 0::4] = w - bboxes[..., 2::4] | |||
| flipped[..., 2::4] = w - bboxes[..., 0::4] | |||
| elif direction == 'vertical': | |||
| h = img_shape[0] | |||
| flipped[..., 1::4] = h - bboxes[..., 3::4] | |||
| flipped[..., 3::4] = h - bboxes[..., 1::4] | |||
| elif direction == 'diagonal': | |||
| w = img_shape[1] | |||
| h = img_shape[0] | |||
| flipped[..., 0::4] = w - bboxes[..., 2::4] | |||
| flipped[..., 1::4] = h - bboxes[..., 3::4] | |||
| flipped[..., 2::4] = w - bboxes[..., 0::4] | |||
| flipped[..., 3::4] = h - bboxes[..., 1::4] | |||
| else: | |||
| raise ValueError(f"Invalid flipping direction '{direction}'") | |||
| return flipped | |||
| def keypoints_flip(self, keypointss, img_shape, direction): | |||
| """Flip keypoints horizontally.""" | |||
| assert direction == 'horizontal' | |||
| assert keypointss.shape[-1] == 3 | |||
| num_kps = keypointss.shape[1] | |||
| assert num_kps in [4, 5], f'Only support num_kps=4 or 5, got: {num_kps}' | |||
| assert keypointss.ndim == 3 | |||
| flipped = keypointss.copy() | |||
| if num_kps == 5: | |||
| flip_order = [1, 0, 2, 4, 3] | |||
| elif num_kps == 4: | |||
| flip_order = [3, 2, 1, 0] | |||
| for idx, a in enumerate(flip_order): | |||
| flipped[:, idx, :] = keypointss[:, a, :] | |||
| w = img_shape[1] | |||
| flipped[..., 0] = w - flipped[..., 0] | |||
| return flipped | |||
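| # Note on flip_order (assuming the usual 5-point face layout: left eye, | |||
| # right eye, nose, left mouth corner, right mouth corner): a horizontal | |||
| # flip swaps the left/right pairs, so indices [1, 0, 2, 4, 3] map each | |||
| # landmark to its mirrored counterpart; the 4-point order [3, 2, 1, 0] | |||
| # reverses an assumed left-to-right layout. | |||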
| def __call__(self, results): | |||
| """Call function to flip bounding boxes, masks, semantic segmentation | |||
| maps. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Flipped results, 'flip', 'flip_direction' keys are added \ | |||
| into result dict. | |||
| """ | |||
| if 'flip' not in results: | |||
| if isinstance(self.direction, list): | |||
| # None means non-flip | |||
| direction_list = self.direction + [None] | |||
| else: | |||
| # None means non-flip | |||
| direction_list = [self.direction, None] | |||
| if isinstance(self.flip_ratio, list): | |||
| non_flip_ratio = 1 - sum(self.flip_ratio) | |||
| flip_ratio_list = self.flip_ratio + [non_flip_ratio] | |||
| else: | |||
| non_flip_ratio = 1 - self.flip_ratio | |||
| # exclude non-flip | |||
| single_ratio = self.flip_ratio / (len(direction_list) - 1) | |||
| flip_ratio_list = [single_ratio] * (len(direction_list) | |||
| - 1) + [non_flip_ratio] | |||
| cur_dir = np.random.choice(direction_list, p=flip_ratio_list) | |||
| results['flip'] = cur_dir is not None | |||
| if 'flip_direction' not in results: | |||
| results['flip_direction'] = cur_dir | |||
| if results['flip']: | |||
| # flip image | |||
| for key in results.get('img_fields', ['img']): | |||
| results[key] = mmcv.imflip( | |||
| results[key], direction=results['flip_direction']) | |||
| # flip bboxes | |||
| for key in results.get('bbox_fields', []): | |||
| results[key] = self.bbox_flip(results[key], | |||
| results['img_shape'], | |||
| results['flip_direction']) | |||
| # flip kps | |||
| for key in results.get('keypoints_fields', []): | |||
| results[key] = self.keypoints_flip(results[key], | |||
| results['img_shape'], | |||
| results['flip_direction']) | |||
| # flip masks | |||
| for key in results.get('mask_fields', []): | |||
| results[key] = results[key].flip(results['flip_direction']) | |||
| # flip segs | |||
| for key in results.get('seg_fields', []): | |||
| results[key] = mmcv.imflip( | |||
| results[key], direction=results['flip_direction']) | |||
| return results | |||
| def __repr__(self): | |||
| return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' | |||
| @PIPELINES.register_module() | |||
| class RandomSquareCrop(object): | |||
| """Random crop the image & bboxes, the cropped patches have minimum IoU | |||
| requirement with original image & bboxes, the IoU threshold is randomly | |||
| selected from min_ious. | |||
| Args: | |||
| min_ious (tuple): minimum IoU threshold for all intersections with | |||
| bounding boxes | |||
| min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, | |||
| where a >= min_crop_size). | |||
| Note: | |||
| The keys for bboxes, labels and masks should be paired. That is, \ | |||
| `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ | |||
| `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. | |||
| """ | |||
| def __init__(self, | |||
| crop_ratio_range=None, | |||
| crop_choice=None, | |||
| bbox_clip_border=True, | |||
| big_face_ratio=0, | |||
| big_face_crop_choice=None): | |||
| self.crop_ratio_range = crop_ratio_range | |||
| self.crop_choice = crop_choice | |||
| self.big_face_crop_choice = big_face_crop_choice | |||
| self.bbox_clip_border = bbox_clip_border | |||
| assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) | |||
| if self.crop_ratio_range is not None: | |||
| self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range | |||
| self.bbox2label = { | |||
| 'gt_bboxes': 'gt_labels', | |||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||
| } | |||
| self.bbox2mask = { | |||
| 'gt_bboxes': 'gt_masks', | |||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||
| } | |||
| assert 0 <= big_face_ratio <= 1.0 | |||
| self.big_face_ratio = big_face_ratio | |||
| def __call__(self, results): | |||
| """Call function to crop images and bounding boxes with minimum IoU | |||
| constraint. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Result dict with images and bounding boxes cropped, \ | |||
| 'img_shape' key is updated. | |||
| """ | |||
| if 'img_fields' in results: | |||
| assert results['img_fields'] == ['img'], \ | |||
| 'Only single img_fields is allowed' | |||
| img = results['img'] | |||
| assert 'bbox_fields' in results | |||
| assert 'gt_bboxes' in results | |||
| # try to augment big-face images | |||
| find_bigface = False | |||
| if np.random.random() < self.big_face_ratio: | |||
| min_size = 100 # h and w | |||
| expand_ratio = 0.3 # expand ratio of the cropped face along both w and h | |||
| bbox = results['gt_bboxes'].copy() | |||
| lmks = results['gt_keypointss'].copy() | |||
| label = results['gt_labels'].copy() | |||
| # filter small faces | |||
| size_mask = ((bbox[:, 2] - bbox[:, 0]) > min_size) * ( | |||
| (bbox[:, 3] - bbox[:, 1]) > min_size) | |||
| bbox = bbox[size_mask] | |||
| lmks = lmks[size_mask] | |||
| label = label[size_mask] | |||
| # randomly choose a face that has no overlap with others | |||
| if len(bbox) > 0: | |||
| overlaps = bbox_overlaps(bbox, bbox) | |||
| overlaps -= np.eye(overlaps.shape[0]) | |||
| iou_mask = np.sum(overlaps, axis=1) == 0 | |||
| bbox = bbox[iou_mask] | |||
| lmks = lmks[iou_mask] | |||
| label = label[iou_mask] | |||
| if len(bbox) > 0: | |||
| choice = np.random.randint(len(bbox)) | |||
| bbox = bbox[choice] | |||
| lmks = lmks[choice] | |||
| label = [label[choice]] | |||
| w = bbox[2] - bbox[0] | |||
| h = bbox[3] - bbox[1] | |||
| x1 = bbox[0] - w * expand_ratio | |||
| x2 = bbox[2] + w * expand_ratio | |||
| y1 = bbox[1] - h * expand_ratio | |||
| y2 = bbox[3] + h * expand_ratio | |||
| x1, x2 = np.clip([x1, x2], 0, img.shape[1]) | |||
| y1, y2 = np.clip([y1, y2], 0, img.shape[0]) | |||
| bbox -= np.tile([x1, y1], 2) | |||
| lmks -= (x1, y1, 0) | |||
| find_bigface = True | |||
| img = img[int(y1):int(y2), int(x1):int(x2), :] | |||
| results['gt_bboxes'] = np.expand_dims(bbox, axis=0) | |||
| results['gt_keypointss'] = np.expand_dims(lmks, axis=0) | |||
| results['gt_labels'] = np.array(label) | |||
| results['img'] = img | |||
| boxes = results['gt_bboxes'] | |||
| h, w, c = img.shape | |||
| if self.crop_ratio_range is not None: | |||
| max_scale = self.crop_ratio_max | |||
| else: | |||
| max_scale = np.amax(self.crop_choice) | |||
| scale_retry = 0 | |||
| while True: | |||
| scale_retry += 1 | |||
| if scale_retry == 1 or max_scale > 1.0: | |||
| if self.crop_ratio_range is not None: | |||
| scale = np.random.uniform(self.crop_ratio_min, | |||
| self.crop_ratio_max) | |||
| elif self.crop_choice is not None: | |||
| scale = np.random.choice(self.crop_choice) | |||
| else: | |||
| scale = scale * 1.2 | |||
| if find_bigface: | |||
| # select a scale from big_face_crop_choice if in big_face mode | |||
| scale = np.random.choice(self.big_face_crop_choice) | |||
| for i in range(250): | |||
| long_side = max(w, h) | |||
| cw = int(scale * long_side) | |||
| ch = cw | |||
| # TODO +1 | |||
| if w == cw: | |||
| left = 0 | |||
| elif w > cw: | |||
| left = random.randint(0, w - cw) | |||
| else: | |||
| left = random.randint(w - cw, 0) | |||
| if h == ch: | |||
| top = 0 | |||
| elif h > ch: | |||
| top = random.randint(0, h - ch) | |||
| else: | |||
| top = random.randint(h - ch, 0) | |||
| patch = np.array( | |||
| (int(left), int(top), int(left + cw), int(top + ch)), | |||
| dtype=np.int32) | |||
| # the center of a box should be inside the cropped patch | |||
| # only adjust boxes and instance masks when the gt is not empty | |||
| # adjust boxes | |||
| def is_center_of_bboxes_in_patch(boxes, patch): | |||
| # TODO >= | |||
| center = (boxes[:, :2] + boxes[:, 2:]) / 2 | |||
| mask = \ | |||
| ((center[:, 0] > patch[0]) | |||
| * (center[:, 1] > patch[1]) | |||
| * (center[:, 0] < patch[2]) | |||
| * (center[:, 1] < patch[3])) | |||
| return mask | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| if not mask.any(): | |||
| continue | |||
| for key in results.get('bbox_fields', []): | |||
| boxes = results[key].copy() | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| boxes = boxes[mask] | |||
| if self.bbox_clip_border: | |||
| boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) | |||
| boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) | |||
| boxes -= np.tile(patch[:2], 2) | |||
| results[key] = boxes | |||
| # labels | |||
| label_key = self.bbox2label.get(key) | |||
| if label_key in results: | |||
| results[label_key] = results[label_key][mask] | |||
| # keypoints field | |||
| if key == 'gt_bboxes': | |||
| for kps_key in results.get('keypoints_fields', []): | |||
| keypointss = results[kps_key].copy() | |||
| keypointss = keypointss[mask, :, :] | |||
| if self.bbox_clip_border: | |||
| keypointss[:, :, : | |||
| 2] = keypointss[:, :, :2].clip( | |||
| max=patch[2:]) | |||
| keypointss[:, :, : | |||
| 2] = keypointss[:, :, :2].clip( | |||
| min=patch[:2]) | |||
| keypointss[:, :, 0] -= patch[0] | |||
| keypointss[:, :, 1] -= patch[1] | |||
| results[kps_key] = keypointss | |||
| # mask fields | |||
| mask_key = self.bbox2mask.get(key) | |||
| if mask_key in results: | |||
| results[mask_key] = results[mask_key][mask.nonzero() | |||
| [0]].crop(patch) | |||
| # adjust the img no matter whether the gt is empty before crop | |||
| rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 | |||
| patch_from = patch.copy() | |||
| patch_from[0] = max(0, patch_from[0]) | |||
| patch_from[1] = max(0, patch_from[1]) | |||
| patch_from[2] = min(img.shape[1], patch_from[2]) | |||
| patch_from[3] = min(img.shape[0], patch_from[3]) | |||
| patch_to = patch.copy() | |||
| patch_to[0] = max(0, patch_to[0] * -1) | |||
| patch_to[1] = max(0, patch_to[1] * -1) | |||
| patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) | |||
| patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) | |||
| rimg[patch_to[1]:patch_to[3], | |||
| patch_to[0]:patch_to[2], :] = img[ | |||
| patch_from[1]:patch_from[3], | |||
| patch_from[0]:patch_from[2], :] | |||
| img = rimg | |||
| results['img'] = img | |||
| results['img_shape'] = img.shape | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(crop_ratio_range={self.crop_ratio_range}, ' | |||
| repr_str += f'crop_choice={self.crop_choice}, ' | |||
| repr_str += f'big_face_ratio={self.big_face_ratio}, ' | |||
| repr_str += f'bbox_clip_border={self.bbox_clip_border})' | |||
| return repr_str | |||
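| # Hypothetical config for the new big-face branch: roughly big_face_ratio | |||
| # of samples crop tightly around a single isolated face and then draw the | |||
| # crop scale from big_face_crop_choice instead of crop_choice. | |||
| example_crop = dict( | |||
| type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0], | |||
| big_face_ratio=0.1, big_face_crop_choice=[1.5, 2.0]) | |||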
| @@ -13,7 +13,7 @@ class RetinaFaceDataset(CustomDataset): | |||
| CLASSES = ('FG', ) | |||
| def __init__(self, min_size=None, **kwargs): | |||
| self.NK = 5 | |||
| self.NK = kwargs.pop('num_kps', 5) | |||
| self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} | |||
| self.min_size = min_size | |||
| self.gt_path = kwargs.get('gt_path') | |||
| @@ -33,7 +33,8 @@ class RetinaFaceDataset(CustomDataset): | |||
| if len(values) > 4: | |||
| if len(values) > 5: | |||
| kps = np.array( | |||
| values[4:19], dtype=np.float32).reshape((self.NK, 3)) | |||
| values[4:4 + self.NK * 3], dtype=np.float32).reshape( | |||
| (self.NK, 3)) | |||
| for li in range(kps.shape[0]): | |||
| if (kps[li, :] == -1).all(): | |||
| kps[li][2] = 0.0 # weight = 0, ignore | |||
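| # Sketch of one annotation row under the generalized parsing (format | |||
| # assumed from the slice above): 4 bbox values followed by NK triplets | |||
| # of (x, y, visibility), where a row of -1s marks an ignored landmark. | |||
| # With NK=5 the keypoints occupy values[4:19]; with NK=4, values[4:16]. | |||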
| @@ -103,6 +103,7 @@ class SCRFDHead(AnchorHead): | |||
| scale_mode=1, | |||
| dw_conv=False, | |||
| use_kps=False, | |||
| num_kps=5, | |||
| loss_kps=dict( | |||
| type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), | |||
| **kwargs): | |||
| @@ -116,7 +117,7 @@ class SCRFDHead(AnchorHead): | |||
| self.scale_mode = scale_mode | |||
| self.use_dfl = True | |||
| self.dw_conv = dw_conv | |||
| self.NK = 5 | |||
| self.NK = num_kps | |||
| self.extra_flops = 0.0 | |||
| if loss_dfl is None or not loss_dfl: | |||
| self.use_dfl = False | |||
| @@ -323,8 +324,8 @@ class SCRFDHead(AnchorHead): | |||
| batch_size, -1, self.cls_out_channels).sigmoid() | |||
| bbox_pred = bbox_pred.permute(0, 2, 3, | |||
| 1).reshape(batch_size, -1, 4) | |||
| kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10) | |||
| kps_pred = kps_pred.permute(0, 2, 3, | |||
| 1).reshape(batch_size, -1, self.NK * 2) | |||
| return cls_score, bbox_pred, kps_pred | |||
| def forward_train(self, | |||
| @@ -788,7 +789,7 @@ class SCRFDHead(AnchorHead): | |||
| if self.use_dfl: | |||
| kps_pred = self.integral(kps_pred) * stride[0] | |||
| else: | |||
| kps_pred = kps_pred.reshape((-1, 10)) * stride[0] | |||
| kps_pred = kps_pred.reshape((-1, self.NK * 2)) * stride[0] | |||
| nms_pre = cfg.get('nms_pre', -1) | |||
| if nms_pre > 0 and scores.shape[0] > nms_pre: | |||
| @@ -815,7 +816,7 @@ class SCRFDHead(AnchorHead): | |||
| mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) | |||
| if mlvl_kps is not None: | |||
| scale_factor2 = torch.tensor( | |||
| [scale_factor[0], scale_factor[1]] * 5) | |||
| [scale_factor[0], scale_factor[1]] * self.NK) | |||
| mlvl_kps /= scale_factor2.to(mlvl_kps.device) | |||
| mlvl_scores = torch.cat(mlvl_scores) | |||
| @@ -54,7 +54,13 @@ class SCRFD(SingleStageDetector): | |||
| gt_bboxes_ignore) | |||
| return losses | |||
| def simple_test(self, img, img_metas, rescale=False): | |||
| def simple_test(self, | |||
| img, | |||
| img_metas, | |||
| rescale=False, | |||
| repeat_head=1, | |||
| output_kps_var=0, | |||
| output_results=1): | |||
| """Test function without test time augmentation. | |||
| Args: | |||
| @@ -62,6 +68,9 @@ class SCRFD(SingleStageDetector): | |||
| img_metas (list[dict]): List of image information. | |||
| rescale (bool, optional): Whether to rescale the results. | |||
| Defaults to False. | |||
| repeat_head (int): number of times to repeat inference in the head. | |||
| output_kps_var (int): whether to output the keypoint variance as a quality measure. | |||
| output_results (int): 0: return nothing; 1: return bbox only; 2: return both bbox and kps. | |||
| Returns: | |||
| list[list[np.ndarray]]: BBox results of each image and classes. | |||
| @@ -69,40 +78,71 @@ class SCRFD(SingleStageDetector): | |||
| corresponds to each class. | |||
| """ | |||
| x = self.extract_feat(img) | |||
| outs = self.bbox_head(x) | |||
| if torch.onnx.is_in_onnx_export(): | |||
| print('single_stage.py in-onnx-export') | |||
| print(outs.__class__) | |||
| cls_score, bbox_pred, kps_pred = outs | |||
| for c in cls_score: | |||
| print(c.shape) | |||
| for c in bbox_pred: | |||
| print(c.shape) | |||
| if self.bbox_head.use_kps: | |||
| for c in kps_pred: | |||
| assert repeat_head >= 1 | |||
| kps_out0 = [] | |||
| kps_out1 = [] | |||
| kps_out2 = [] | |||
| for i in range(repeat_head): | |||
| outs = self.bbox_head(x) | |||
| kps_out0 += [outs[2][0].detach().cpu().numpy()] | |||
| kps_out1 += [outs[2][1].detach().cpu().numpy()] | |||
| kps_out2 += [outs[2][2].detach().cpu().numpy()] | |||
| if output_kps_var: | |||
| var0 = np.var(np.vstack(kps_out0), axis=0).mean() | |||
| var1 = np.var(np.vstack(kps_out1), axis=0).mean() | |||
| var2 = np.var(np.vstack(kps_out2), axis=0).mean() | |||
| var = np.mean([var0, var1, var2]) | |||
| else: | |||
| var = None | |||
| if output_results > 0: | |||
| if torch.onnx.is_in_onnx_export(): | |||
| print('single_stage.py in-onnx-export') | |||
| print(outs.__class__) | |||
| cls_score, bbox_pred, kps_pred = outs | |||
| for c in cls_score: | |||
| print(c.shape) | |||
| for c in bbox_pred: | |||
| print(c.shape) | |||
| return (cls_score, bbox_pred, kps_pred) | |||
| else: | |||
| return (cls_score, bbox_pred) | |||
| bbox_list = self.bbox_head.get_bboxes( | |||
| *outs, img_metas, rescale=rescale) | |||
| if self.bbox_head.use_kps: | |||
| for c in kps_pred: | |||
| print(c.shape) | |||
| return (cls_score, bbox_pred, kps_pred) | |||
| else: | |||
| return (cls_score, bbox_pred) | |||
| bbox_list = self.bbox_head.get_bboxes( | |||
| *outs, img_metas, rescale=rescale) | |||
| # return kps if use_kps | |||
| if len(bbox_list[0]) == 2: | |||
| bbox_results = [ | |||
| bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) | |||
| for det_bboxes, det_labels in bbox_list | |||
| ] | |||
| elif len(bbox_list[0]) == 3: | |||
| bbox_results = [ | |||
| bbox2result( | |||
| det_bboxes, | |||
| det_labels, | |||
| self.bbox_head.num_classes, | |||
| kps=det_kps) | |||
| for det_bboxes, det_labels, det_kps in bbox_list | |||
| ] | |||
| return bbox_results | |||
| # return kps if use_kps | |||
| if len(bbox_list[0]) == 2: | |||
| bbox_results = [ | |||
| bbox2result(det_bboxes, det_labels, | |||
| self.bbox_head.num_classes) | |||
| for det_bboxes, det_labels in bbox_list | |||
| ] | |||
| elif len(bbox_list[0]) == 3: | |||
| if output_results == 2: | |||
| bbox_results = [ | |||
| bbox2result( | |||
| det_bboxes, | |||
| det_labels, | |||
| self.bbox_head.num_classes, | |||
| kps=det_kps, | |||
| num_kps=self.bbox_head.NK) | |||
| for det_bboxes, det_labels, det_kps in bbox_list | |||
| ] | |||
| elif output_results == 1: | |||
| bbox_results = [ | |||
| bbox2result(det_bboxes, det_labels, | |||
| self.bbox_head.num_classes) | |||
| for det_bboxes, det_labels, _ in bbox_list | |||
| ] | |||
| else: | |||
| bbox_results = None | |||
| if var is not None: | |||
| return bbox_results, var | |||
| else: | |||
| return bbox_results | |||
| def feature_test(self, img): | |||
| x = self.extract_feat(img) | |||
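The new `simple_test` flags allow probing keypoint stability by running the head several times and averaging the per-level variance of its keypoint outputs. A hedged usage sketch — `detector`, `img`, and `img_metas` are assumed to come from a standard mmdet test setup and are not part of this diff:

```python
# Sketch only: estimate keypoint stability via repeated head inference.
# Assumes the head was built with use_kps=True so it emits kps predictions.
bbox_results, kps_var = detector.simple_test(
    img,
    img_metas,
    rescale=True,
    repeat_head=5,     # run the bbox head 5 times on the same features
    output_kps_var=1,  # return the mean variance of the kps outputs
    output_results=1)  # 1: boxes only; 2: boxes and keypoints
print(f'keypoint variance (quality proxy): {kps_var:.6f}')
```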
| @@ -0,0 +1,71 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os.path as osp | |||
| from copy import deepcopy | |||
| from typing import Any, Dict | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| __all__ = ['ScrfdDetect'] | |||
| @MODELS.register_module(Tasks.face_detection, module_name=Models.scrfd) | |||
| class ScrfdDetect(TorchModel): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """initialize the face detection model from the `model_dir` path. | |||
| Args: | |||
| model_dir (str): the model path. | |||
| """ | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| from mmcv import Config | |||
| from mmcv.parallel import MMDataParallel | |||
| from mmcv.runner import load_checkpoint | |||
| from mmdet.models import build_detector | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead | |||
| from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD | |||
| cfg = Config.fromfile(osp.join(model_dir, 'mmcv_scrfd.py')) | |||
| ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) | |||
| cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3) | |||
| detector = build_detector(cfg.model) | |||
| logger.info(f'loading model from {ckpt_path}') | |||
| device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') | |||
| load_checkpoint(detector, ckpt_path, map_location=device) | |||
| detector = MMDataParallel(detector, device_ids=[0]) | |||
| detector.eval() | |||
| self.detector = detector | |||
| logger.info('load model done') | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| result = self.detector( | |||
| return_loss=False, | |||
| rescale=True, | |||
| img=[input['img'][0].unsqueeze(0)], | |||
| img_metas=[[dict(input['img_metas'][0].data)]], | |||
| output_results=2) | |||
| assert result is not None | |||
| result = result[0][0] | |||
| bboxes = result[:, :4].tolist() | |||
| kpss = result[:, 5:].tolist() | |||
| scores = result[:, 4].tolist() | |||
| return { | |||
| OutputKeys.SCORES: scores, | |||
| OutputKeys.BOXES: bboxes, | |||
| OutputKeys.KEYPOINTS: kpss | |||
| } | |||
| def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: | |||
| return input | |||
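For reference, a minimal invocation sketch of `ScrfdDetect`. The model directory layout follows the constants used above; `img_tensor` and `img_meta` (an mmcv DataContainer with a `.data` attribute) are assumed to be produced by the usual mmdet test pipeline:

```python
# Minimal sketch, not a verified snippet: model_dir must contain
# mmcv_scrfd.py and the torch checkpoint named in ModelFile.
from modelscope.outputs import OutputKeys

detector = ScrfdDetect(model_dir='/path/to/scrfd_model', score_thr=0.5)
outputs = detector({'img': [img_tensor], 'img_metas': [img_meta]})
print(outputs[OutputKeys.SCORES])     # one confidence per detected face
print(outputs[OutputKeys.BOXES])      # [x1, y1, x2, y2] per face
print(outputs[OutputKeys.KEYPOINTS])  # flattened landmark coordinates
```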
| @@ -0,0 +1,20 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .hand_2d_keypoints import Hand2dKeyPoints | |||
| else: | |||
| _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']} | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1,16 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from easycv.models.pose import TopDown | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||
| from modelscope.utils.constant import Tasks | |||
| @MODELS.register_module( | |||
| group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints) | |||
| class Hand2dKeyPoints(EasyCVBaseModel, TopDown): | |||
| def __init__(self, model_dir=None, *args, **kwargs): | |||
| EasyCVBaseModel.__init__(self, model_dir, args, kwargs) | |||
| TopDown.__init__(self, *args, **kwargs) | |||
| @@ -0,0 +1,22 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .human_wholebody_keypoint import HumanWholeBodyKeypoint | |||
| else: | |||
| _import_structure = { | |||
| 'human_wholebody_keypoint': ['HumanWholeBodyKeypoint'] | |||
| } | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1,17 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from easycv.models.pose.top_down import TopDown | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||
| from modelscope.utils.constant import Tasks | |||
| @MODELS.register_module( | |||
| group_key=Tasks.human_wholebody_keypoint, | |||
| module_name=Models.human_wholebody_keypoint) | |||
| class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown): | |||
| def __init__(self, model_dir=None, *args, **kwargs): | |||
| EasyCVBaseModel.__init__(self, model_dir, args, kwargs) | |||
| TopDown.__init__(self, *args, **kwargs) | |||
| @@ -0,0 +1,20 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .image_body_reshaping import ImageBodyReshaping | |||
| else: | |||
| _import_structure = {'image_body_reshaping': ['ImageBodyReshaping']} | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1,128 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| from typing import Any, Dict | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import Tensor, TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from .model import FlowGenerator | |||
| from .person_info import PersonInfo | |||
| from .pose_estimator.body import Body | |||
| from .slim_utils import image_warp_grid1, resize_on_long_side | |||
| logger = get_logger() | |||
| __all__ = ['ImageBodyReshaping'] | |||
| @MODELS.register_module( | |||
| Tasks.image_body_reshaping, module_name=Models.image_body_reshaping) | |||
| class ImageBodyReshaping(TorchModel): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """initialize the image body reshaping model from the `model_dir` path. | |||
| Args: | |||
| model_dir (str): the model path. | |||
| """ | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| if torch.cuda.is_available(): | |||
| self.device = torch.device('cuda') | |||
| else: | |||
| self.device = torch.device('cpu') | |||
| self.degree = 1.0 | |||
| self.reshape_model = FlowGenerator(n_channels=16).to(self.device) | |||
| model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
| checkpoints = torch.load(model_path, map_location=torch.device('cpu')) | |||
| self.reshape_model.load_state_dict( | |||
| checkpoints['state_dict'], strict=True) | |||
| self.reshape_model.eval() | |||
| logger.info('load body reshaping model done') | |||
| pose_model_ckpt = os.path.join(model_dir, 'body_pose_model.pth') | |||
| self.pose_esti = Body(pose_model_ckpt, self.device) | |||
| logger.info('load pose model done') | |||
| def pred_joints(self, img): | |||
| if img is None: | |||
| return None | |||
| small_src, resize_scale = resize_on_long_side(img, 300) | |||
| body_joints = self.pose_esti(small_src) | |||
| if body_joints.shape[0] >= 1: | |||
| body_joints[:, :, :2] = body_joints[:, :, :2] / resize_scale | |||
| return body_joints | |||
| def pred_flow(self, img): | |||
| body_joints = self.pred_joints(img) | |||
| small_size = 1200 | |||
| if img.shape[0] > small_size or img.shape[1] > small_size: | |||
| _img, _scale = resize_on_long_side(img, small_size) | |||
| body_joints[:, :, :2] = body_joints[:, :, :2] * _scale | |||
| else: | |||
| _img = img | |||
| # We only reshape one person | |||
| if body_joints.shape[0] != 1: | |||
| return None | |||
| person = PersonInfo(body_joints[0]) | |||
| with torch.no_grad(): | |||
| person_pred = person.pred_flow(_img, self.reshape_model, | |||
| self.device) | |||
| flow = np.dstack((person_pred['rDx'], person_pred['rDy'])) | |||
| scale = img.shape[0] * 1.0 / flow.shape[0] | |||
| flow = cv2.resize(flow, (img.shape[1], img.shape[0])) | |||
| flow *= scale | |||
| return flow | |||
| def warp(self, src_img, flow): | |||
| X_flow = flow[..., 0] | |||
| Y_flow = flow[..., 1] | |||
| X_flow = np.ascontiguousarray(X_flow) | |||
| Y_flow = np.ascontiguousarray(Y_flow) | |||
| pred = image_warp_grid1(X_flow, Y_flow, src_img, 1.0, 0, 0) | |||
| return pred | |||
| def inference(self, img): | |||
| img = img.cpu().numpy() | |||
| img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) | |||
| flow = self.pred_flow(img) | |||
| if flow is None: | |||
| return img | |||
| assert flow.shape[:2] == img.shape[:2] | |||
| mag, ang = cv2.cartToPolar(flow[..., 0] + 1e-8, flow[..., 1] + 1e-8) | |||
| mag -= 3 | |||
| mag[mag <= 0] = 0 | |||
| x, y = cv2.polarToCart(mag, ang, angleInDegrees=False) | |||
| flow = np.dstack((x, y)) | |||
| flow *= self.degree | |||
| pred = self.warp(img, flow) | |||
| out_img = np.clip(pred, 0, 255) | |||
| logger.info('model inference done') | |||
| return out_img.astype(np.uint8) | |||
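A hedged end-to-end sketch of the reshaping model. The checkpoint file names follow the constants loaded in `__init__` above; the paths are placeholders. The input is a BGR image tensor (the `inference` method converts BGR to RGB internally), and the output is an RGB uint8 array:

```python
# Illustrative usage only.
import cv2
import torch

model = ImageBodyReshaping(model_dir='/path/to/body_reshaping_model')
bgr = cv2.imread('person.jpg')                 # BGR uint8, as cv2 reads it
out_rgb = model.inference(torch.from_numpy(bgr))
cv2.imwrite('reshaped.jpg', cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR))
```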
| @@ -0,0 +1,189 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class ConvLayer(nn.Module): | |||
| def __init__(self, in_ch, out_ch): | |||
| super(ConvLayer, self).__init__() | |||
| self.conv = nn.Sequential( | |||
| nn.ReflectionPad2d(1), | |||
| nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=0), | |||
| nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True)) | |||
| def forward(self, x): | |||
| x = self.conv(x) | |||
| return x | |||
| class SASA(nn.Module): | |||
| def __init__(self, in_dim): | |||
| super(SASA, self).__init__() | |||
| self.chanel_in = in_dim | |||
| self.query_conv = nn.Conv2d( | |||
| in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) | |||
| self.key_conv = nn.Conv2d( | |||
| in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) | |||
| self.value_conv = nn.Conv2d( | |||
| in_channels=in_dim, out_channels=in_dim, kernel_size=1) | |||
| self.mag_conv = nn.Conv2d( | |||
| in_channels=5, out_channels=in_dim // 32, kernel_size=1) | |||
| self.gamma = nn.Parameter(torch.zeros(1)) | |||
| self.softmax = nn.Softmax(dim=-1) | |||
| self.sigmoid = nn.Sigmoid() | |||
| def structure_encoder(self, paf_mag, target_height, target_width): | |||
| torso_mask = torch.sum(paf_mag[:, 1:3, :, :], dim=1, keepdim=True) | |||
| torso_mask = torch.clamp(torso_mask, 0, 1) | |||
| arms_mask = torch.sum(paf_mag[:, 4:8, :, :], dim=1, keepdim=True) | |||
| arms_mask = torch.clamp(arms_mask, 0, 1) | |||
| legs_mask = torch.sum(paf_mag[:, 8:12, :, :], dim=1, keepdim=True) | |||
| legs_mask = torch.clamp(legs_mask, 0, 1) | |||
| fg_mask = paf_mag[:, 12, :, :].unsqueeze(1) | |||
| bg_mask = 1 - fg_mask | |||
| Y = torch.cat((arms_mask, torso_mask, legs_mask, fg_mask, bg_mask), | |||
| dim=1) | |||
| Y = F.interpolate(Y, size=(target_height, target_width), mode='area') | |||
| return Y | |||
| def forward(self, X, PAF_mag): | |||
| """extract self-attention features. | |||
| Args: | |||
| X : input feature maps (B x C x H x W) | |||
| PAF_mag : PAF magnitude maps (B x C x H x W); 1 denotes connectivity, 0 denotes non-connectivity | |||
| Returns: | |||
| out : self-attention value + input feature (same shape as X) | |||
| Y : structure maps (B x 5 x H x W) produced by the structure encoder | |||
| """ | |||
| m_batchsize, C, height, width = X.size() | |||
| Y = self.structure_encoder(PAF_mag, height, width) | |||
| connectivity_mask_vec = self.mag_conv(Y).view(m_batchsize, -1, | |||
| width * height) | |||
| affinity = torch.bmm( | |||
| connectivity_mask_vec.permute(0, 2, 1), connectivity_mask_vec) | |||
| affinity_centered = affinity - torch.mean(affinity) | |||
| affinity_sigmoid = self.sigmoid(affinity_centered) | |||
| proj_query = self.query_conv(X).view(m_batchsize, -1, | |||
| width * height).permute(0, 2, 1) | |||
| proj_key = self.key_conv(X).view(m_batchsize, -1, width * height) | |||
| selfatten_map = torch.bmm(proj_query, proj_key) | |||
| selfatten_centered = selfatten_map - torch.mean( | |||
| selfatten_map) # centering | |||
| selfatten_sigmoid = self.sigmoid(selfatten_centered) | |||
| SASA_map = selfatten_sigmoid * affinity_sigmoid | |||
| proj_value = self.value_conv(X).view(m_batchsize, -1, width * height) | |||
| out = torch.bmm(proj_value, SASA_map.permute(0, 2, 1)) | |||
| out = out.view(m_batchsize, C, height, width) | |||
| out = self.gamma * out + X | |||
| return out, Y | |||
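As a shape sanity check (illustrative tensors only): the SASA block preserves the feature-map shape, and the structure encoder collapses the PAF magnitudes into five masks (arms / torso / legs / foreground / background) at feature resolution:

```python
# Shape check with made-up tensors: X is the 1024-channel encoder output,
# PAF_mag a 13-channel part-affinity magnitude map at input resolution.
import torch

sasa = SASA(in_dim=1024)
X = torch.randn(1, 1024, 16, 16)
PAF_mag = torch.rand(1, 13, 64, 64)
out, Y = sasa(X, PAF_mag)
assert out.shape == X.shape        # attention residual keeps feature shape
assert Y.shape == (1, 5, 16, 16)   # arms / torso / legs / fg / bg maps
```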
| class FlowGenerator(nn.Module): | |||
| def __init__(self, n_channels, deep_supervision=False): | |||
| super(FlowGenerator, self).__init__() | |||
| self.deep_supervision = deep_supervision | |||
| self.Encoder = nn.Sequential( | |||
| ConvLayer(n_channels, 64), | |||
| ConvLayer(64, 64), | |||
| nn.MaxPool2d(2), | |||
| ConvLayer(64, 128), | |||
| ConvLayer(128, 128), | |||
| nn.MaxPool2d(2), | |||
| ConvLayer(128, 256), | |||
| ConvLayer(256, 256), | |||
| nn.MaxPool2d(2), | |||
| ConvLayer(256, 512), | |||
| ConvLayer(512, 512), | |||
| nn.MaxPool2d(2), | |||
| ConvLayer(512, 1024), | |||
| ConvLayer(1024, 1024), | |||
| ConvLayer(1024, 1024), | |||
| ConvLayer(1024, 1024), | |||
| ConvLayer(1024, 1024), | |||
| ) | |||
| self.SASA = SASA(in_dim=1024) | |||
| self.Decoder = nn.Sequential( | |||
| ConvLayer(1024, 1024), | |||
| nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), | |||
| ConvLayer(1024, 512), | |||
| ConvLayer(512, 512), | |||
| nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), | |||
| ConvLayer(512, 256), | |||
| ConvLayer(256, 256), | |||
| ConvLayer(256, 128), | |||
| ConvLayer(128, 64), | |||
| ConvLayer(64, 32), | |||
| nn.Conv2d(32, 2, kernel_size=1, padding=0), | |||
| nn.Tanh(), | |||
| nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True), | |||
| ) | |||
| dilation_ksize = 17 | |||
| self.dilation = torch.nn.MaxPool2d( | |||
| kernel_size=dilation_ksize, | |||
| stride=1, | |||
| padding=int((dilation_ksize - 1) / 2)) | |||
| def warp(self, x, flow, mode='bilinear', padding_mode='zeros', coff=0.2): | |||
| n, c, h, w = x.size() | |||
| yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)]) | |||
| xv = xv.float() / (w - 1) * 2.0 - 1 | |||
| yv = yv.float() / (h - 1) * 2.0 - 1 | |||
| grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0) | |||
| grid = grid.to(flow.device) | |||
| grid_x = grid + 2 * flow * coff | |||
| warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode) | |||
| return warp_x | |||
| def forward(self, img, skeleton_map, coef=0.2): | |||
| """extract self-attention features. | |||
| Args: | |||
| img : input numpy image | |||
| skeleton_map : skeleton map of input image | |||
| coef: warp degree | |||
| Returns: | |||
| warp_x : warped image | |||
| flow: predicted flow | |||
| """ | |||
| img_concat = torch.cat((img, skeleton_map), dim=1) | |||
| X = self.Encoder(img_concat) | |||
| _, _, height, width = X.size() | |||
| # directly get PAF magnitude from skeleton maps via dilation | |||
| PAF_mag = self.dilation((skeleton_map + 1.0) * 0.5) | |||
| out, Y = self.SASA(X, PAF_mag) | |||
| flow = self.Decoder(out) | |||
| flow = flow.permute(0, 2, 3, 1) # [n, 2, h, w] ==> [n, h, w, 2] | |||
| warp_x = self.warp(img, flow, coff=coef) | |||
| warp_x = torch.clamp(warp_x, min=-1.0, max=1.0) | |||
| return warp_x, flow | |||
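A quick forward-pass sketch with assumed shapes: three image channels plus a 13-channel skeleton map match `FlowGenerator(n_channels=16)` as the model is constructed above; inputs are expected in [-1, 1] as prepared in `PersonInfo`:

```python
# Assumed shapes only; eval() avoids batch-size-1 BatchNorm statistics.
import torch

gen = FlowGenerator(n_channels=16).eval()
img = torch.randn(1, 3, 256, 256)
skel = torch.randn(1, 13, 256, 256).clamp(-1, 1)
with torch.no_grad():
    warped, flow = gen(img, skel)
assert warped.shape == img.shape
assert flow.shape == (1, 256, 256, 2)  # per-pixel (dx, dy) offsets
```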
| @@ -0,0 +1,339 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import copy | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from .slim_utils import (enlarge_box_tblr, gen_skeleton_map, | |||
| get_map_fusion_map_cuda, get_mask_bbox, | |||
| resize_on_long_side) | |||
| class PersonInfo(object): | |||
| def __init__(self, joints): | |||
| self.joints = joints | |||
| self.flow = None | |||
| self.pad_boder = False | |||
| self.height_expand = 0 | |||
| self.width_expand = 0 | |||
| self.coeff = 0.2 | |||
| self.network_input_W = 256 | |||
| self.network_input_H = 256 | |||
| self.divider = 20 | |||
| self.flow_scales = ['upper_2'] | |||
| def update_attribute(self, pad_boder, height_expand, width_expand): | |||
| self.pad_boder = pad_boder | |||
| self.height_expand = height_expand | |||
| self.width_expand = width_expand | |||
| if pad_boder: | |||
| self.joints[:, 0] += width_expand | |||
| self.joints[:, 1] += height_expand | |||
| def pred_flow(self, img, flow_net, device): | |||
| with torch.no_grad(): | |||
| if img is None: | |||
| print('image is none') | |||
| self.flow = None | |||
| return None | |||
| if len(img.shape) == 2: | |||
| img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) | |||
| if self.pad_boder: | |||
| height_expand = self.height_expand | |||
| width_expand = self.width_expand | |||
| pad_img = cv2.copyMakeBorder( | |||
| img, | |||
| height_expand, | |||
| height_expand, | |||
| width_expand, | |||
| width_expand, | |||
| cv2.BORDER_CONSTANT, | |||
| value=(127, 127, 127)) | |||
| else: | |||
| height_expand = 0 | |||
| width_expand = 0 | |||
| pad_img = img.copy() | |||
| canvas = np.zeros( | |||
| shape=(pad_img.shape[0], pad_img.shape[1]), dtype=np.float32) | |||
| self.human_joint_box = self.__joint_to_body_box() | |||
| self.human_box = enlarge_box_tblr( | |||
| self.human_joint_box, pad_img, ratio=0.25) | |||
| human_box_height = self.human_box[1] - self.human_box[0] | |||
| human_box_width = self.human_box[3] - self.human_box[2] | |||
| self.leg_joint_box = self.__joint_to_leg_box() | |||
| self.leg_box = enlarge_box_tblr( | |||
| self.leg_joint_box, pad_img, ratio=0.25) | |||
| self.arm_joint_box = self.__joint_to_arm_box() | |||
| self.arm_box = enlarge_box_tblr( | |||
| self.arm_joint_box, pad_img, ratio=0.1) | |||
| x_flows = [] | |||
| y_flows = [] | |||
| multi_bbox = [] | |||
| for scale in self.flow_scales: # better for metric | |||
| scale_value = float(scale.split('_')[-1]) | |||
| arm_box = copy.deepcopy(self.arm_box) | |||
| if arm_box[0] is None: | |||
| arm_box = self.human_box | |||
| arm_box_height = arm_box[1] - arm_box[0] | |||
| arm_box_width = arm_box[3] - arm_box[2] | |||
| roi_bbox = None | |||
| if arm_box_width < human_box_width * 0.1 or arm_box_height < human_box_height * 0.1: | |||
| roi_bbox = self.human_box | |||
| else: | |||
| arm_box = enlarge_box_tblr( | |||
| arm_box, pad_img, ratio=scale_value) | |||
| if scale == 'upper_0.2': | |||
| arm_box[0] = min(arm_box[0], int(self.joints[0][1])) | |||
| if scale.startswith('upper'): | |||
| roi_bbox = [ | |||
| max(self.human_box[0], arm_box[0]), | |||
| min(self.human_box[1], arm_box[1]), | |||
| max(self.human_box[2], arm_box[2]), | |||
| min(self.human_box[3], arm_box[3]) | |||
| ] | |||
| if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ | |||
| 3] - roi_bbox[2] < 1: | |||
| continue | |||
| elif scale.startswith('lower'): | |||
| roi_bbox = [ | |||
| max(self.human_box[0], self.leg_box[0]), | |||
| min(self.human_box[1], self.leg_box[1]), | |||
| max(self.human_box[2], self.leg_box[2]), | |||
| min(self.human_box[3], self.leg_box[3]) | |||
| ] | |||
| if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ | |||
| 3] - roi_bbox[2] < 1: | |||
| continue | |||
| skel_map, roi_bbox = gen_skeleton_map( | |||
| self.joints, 'depth', input_roi_box=roi_bbox) | |||
| if roi_bbox is None: | |||
| continue | |||
| if skel_map.dtype != np.float32: | |||
| skel_map = skel_map.astype(np.float32) | |||
| skel_map -= 1.0 # [0,2] ->[-1,1] | |||
| multi_bbox.append(roi_bbox) | |||
| roi_bbox_height = roi_bbox[1] - roi_bbox[0] | |||
| roi_bbox_width = roi_bbox[3] - roi_bbox[2] | |||
| assert skel_map.shape[0] == roi_bbox_height | |||
| assert skel_map.shape[1] == roi_bbox_width | |||
| roi_height_pad = roi_bbox_height // self.divider | |||
| roi_width_pad = roi_bbox_width // self.divider | |||
| paded_roi_h = roi_bbox_height + 2 * roi_height_pad | |||
| paded_roi_w = roi_bbox_width + 2 * roi_width_pad | |||
| roi_height_pad_joint = skel_map.shape[0] // self.divider | |||
| roi_width_pad_joint = skel_map.shape[1] // self.divider | |||
| skel_map = np.pad( | |||
| skel_map, | |||
| ((roi_height_pad_joint, roi_height_pad_joint), | |||
| (roi_width_pad_joint, roi_width_pad_joint), (0, 0)), | |||
| 'constant', | |||
| constant_values=-1) | |||
| skel_map_resized = cv2.resize( | |||
| skel_map, (self.network_input_W, self.network_input_H)) | |||
| skel_map_resized[skel_map_resized < 0] = -1.0 | |||
| skel_map_resized[skel_map_resized > -0.5] = 1.0 | |||
| skel_map_transformed = torch.from_numpy( | |||
| skel_map_resized.transpose((2, 0, 1))) | |||
| roi_npy = pad_img[roi_bbox[0]:roi_bbox[1], | |||
| roi_bbox[2]:roi_bbox[3], :].copy() | |||
| if roi_npy.dtype != np.float32: | |||
| roi_npy = roi_npy.astype(np.float32) | |||
| roi_npy = np.pad(roi_npy, | |||
| ((roi_height_pad, roi_height_pad), | |||
| (roi_width_pad, roi_width_pad), (0, 0)), | |||
| 'edge') | |||
| roi_npy = roi_npy[:, :, ::-1] | |||
| roi_npy = cv2.resize( | |||
| roi_npy, (self.network_input_W, self.network_input_H)) | |||
| roi_npy *= 1.0 / 255 | |||
| roi_npy -= 0.5 | |||
| roi_npy *= 2 | |||
| rgb_tensor = torch.from_numpy(roi_npy.transpose((2, 0, 1))) | |||
| rgb_tensor = rgb_tensor.unsqueeze(0).to(device) | |||
| skel_map_tensor = skel_map_transformed.unsqueeze(0).to(device) | |||
| warped_img_val, flow_field_val = flow_net( | |||
| rgb_tensor, skel_map_tensor | |||
| ) # inference, connectivity_mask [1,12,16,16] | |||
| flow_field_val = flow_field_val.detach().squeeze().cpu().numpy() | |||
| flow_field_val = cv2.resize( | |||
| flow_field_val, (paded_roi_w, paded_roi_h), | |||
| interpolation=cv2.INTER_LINEAR) | |||
| flow_field_val[..., 0] = flow_field_val[ | |||
| ..., 0] * paded_roi_w * 0.5 * 2 * self.coeff | |||
| flow_field_val[..., 1] = flow_field_val[ | |||
| ..., 1] * paded_roi_h * 0.5 * 2 * self.coeff | |||
| # remove pad areas | |||
| flow_field_val = flow_field_val[ | |||
| roi_height_pad:flow_field_val.shape[0] - roi_height_pad, | |||
| roi_width_pad:flow_field_val.shape[1] - roi_width_pad, :] | |||
| diffuse_width = max(roi_bbox_width // 3, 1) | |||
| diffuse_height = max(roi_bbox_height // 3, 1) | |||
| assert roi_bbox_width == flow_field_val.shape[1] | |||
| assert roi_bbox_height == flow_field_val.shape[0] | |||
| origin_flow = np.zeros( | |||
| (pad_img.shape[0] + 2 * diffuse_height, | |||
| pad_img.shape[1] + 2 * diffuse_width, 2), | |||
| dtype=np.float32) | |||
| flow_field_val = np.pad(flow_field_val, | |||
| ((diffuse_height, diffuse_height), | |||
| (diffuse_width, diffuse_width), | |||
| (0, 0)), 'linear_ramp') | |||
| origin_flow[roi_bbox[0]:roi_bbox[1] + 2 * diffuse_height, | |||
| roi_bbox[2]:roi_bbox[3] | |||
| + 2 * diffuse_width] = flow_field_val | |||
| origin_flow = origin_flow[diffuse_height:-diffuse_height, | |||
| diffuse_width:-diffuse_width, :] | |||
| x_flows.append(origin_flow[..., 0]) | |||
| y_flows.append(origin_flow[..., 1]) | |||
| if len(x_flows) == 0: | |||
| return { | |||
| 'rDx': np.zeros(canvas.shape[:2], dtype=np.float32), | |||
| 'rDy': np.zeros(canvas.shape[:2], dtype=np.float32), | |||
| 'multi_bbox': multi_bbox, | |||
| 'x_fusion_map': np.ones(canvas.shape[:2], dtype=np.float32), | |||
| 'y_fusion_map': np.ones(canvas.shape[:2], dtype=np.float32) | |||
| } | |||
| else: | |||
| origin_rDx, origin_rDy, x_fusion_map, y_fusion_map = self.blend_multiscale_flow( | |||
| x_flows, y_flows, device=device) | |||
| return { | |||
| 'rDx': origin_rDx, | |||
| 'rDy': origin_rDy, | |||
| 'multi_bbox': multi_bbox, | |||
| 'x_fusion_map': x_fusion_map, | |||
| 'y_fusion_map': y_fusion_map | |||
| } | |||
| @staticmethod | |||
| def blend_multiscale_flow(x_flows, y_flows, device=None): | |||
| scale_num = len(x_flows) | |||
| if scale_num == 1: | |||
| return x_flows[0], y_flows[0], np.ones_like( | |||
| x_flows[0]), np.ones_like(x_flows[0]) | |||
| origin_rDx = np.zeros((x_flows[0].shape[0], x_flows[0].shape[1]), | |||
| dtype=np.float32) | |||
| origin_rDy = np.zeros((y_flows[0].shape[0], y_flows[0].shape[1]), | |||
| dtype=np.float32) | |||
| x_fusion_map, x_acc_map = get_map_fusion_map_cuda( | |||
| x_flows, 1, device=device) | |||
| y_fusion_map, y_acc_map = get_map_fusion_map_cuda( | |||
| y_flows, 1, device=device) | |||
| x_flow_map = 1.0 / x_fusion_map | |||
| y_flow_map = 1.0 / y_fusion_map | |||
| all_acc_map = x_acc_map + y_acc_map | |||
| all_acc_map = all_acc_map.astype(np.uint8) | |||
| roi_box = get_mask_bbox(all_acc_map, threshold=1) | |||
| if roi_box[0] is None or roi_box[1] - roi_box[0] <= 0 or roi_box[ | |||
| 3] - roi_box[2] <= 0: | |||
| roi_box = [0, x_flow_map.shape[0], 0, x_flow_map.shape[1]] | |||
| roi_x_flow_map = x_flow_map[roi_box[0]:roi_box[1], | |||
| roi_box[2]:roi_box[3]] | |||
| roi_y_flow_map = y_flow_map[roi_box[0]:roi_box[1], | |||
| roi_box[2]:roi_box[3]] | |||
| roi_width = roi_x_flow_map.shape[1] | |||
| roi_height = roi_x_flow_map.shape[0] | |||
| roi_x_flow_map, scale = resize_on_long_side(roi_x_flow_map, 320) | |||
| roi_y_flow_map, scale = resize_on_long_side(roi_y_flow_map, 320) | |||
| roi_x_flow_map = cv2.blur(roi_x_flow_map, (55, 55)) | |||
| roi_y_flow_map = cv2.blur(roi_y_flow_map, (55, 55)) | |||
| roi_x_flow_map = cv2.resize(roi_x_flow_map, (roi_width, roi_height)) | |||
| roi_y_flow_map = cv2.resize(roi_y_flow_map, (roi_width, roi_height)) | |||
| x_flow_map[roi_box[0]:roi_box[1], | |||
| roi_box[2]:roi_box[3]] = roi_x_flow_map | |||
| y_flow_map[roi_box[0]:roi_box[1], | |||
| roi_box[2]:roi_box[3]] = roi_y_flow_map | |||
| for i in range(scale_num): | |||
| origin_rDx += x_flows[i] | |||
| origin_rDy += y_flows[i] | |||
| origin_rDx *= x_flow_map | |||
| origin_rDy *= y_flow_map | |||
| return origin_rDx, origin_rDy, x_flow_map, y_flow_map | |||
| def __joint_to_body_box(self): | |||
| joint_left = int(np.min(self.joints, axis=0)[0]) | |||
| joint_right = int(np.max(self.joints, axis=0)[0]) | |||
| joint_top = int(np.min(self.joints, axis=0)[1]) | |||
| joint_bottom = int(np.max(self.joints, axis=0)[1]) | |||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||
| def __joint_to_leg_box(self): | |||
| leg_joints = self.joints[8:, :] | |||
| if np.max(leg_joints, axis=0)[2] < 0.05: | |||
| return [0, 0, 0, 0] | |||
| joint_left = int(np.min(leg_joints, axis=0)[0]) | |||
| joint_right = int(np.max(leg_joints, axis=0)[0]) | |||
| joint_top = int(np.min(leg_joints, axis=0)[1]) | |||
| joint_bottom = int(np.max(leg_joints, axis=0)[1]) | |||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||
| def __joint_to_arm_box(self): | |||
| arm_joints = self.joints[2:8, :] | |||
| if np.max(arm_joints, axis=0)[2] < 0.05: | |||
| return [0, 0, 0, 0] | |||
| joint_left = int(np.min(arm_joints, axis=0)[0]) | |||
| joint_right = int(np.max(arm_joints, axis=0)[0]) | |||
| joint_top = int(np.min(arm_joints, axis=0)[1]) | |||
| joint_bottom = int(np.max(arm_joints, axis=0)[1]) | |||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||
| @@ -0,0 +1,272 @@ | |||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||
| import math | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from scipy.ndimage.filters import gaussian_filter | |||
| from .model import BodyposeModel | |||
| from .util import pad_rightdown_corner, transfer | |||
| class Body(object): | |||
| def __init__(self, model_path, device): | |||
| self.model = BodyposeModel().to(device) | |||
| model_dict = transfer(self.model, torch.load(model_path)) | |||
| self.model.load_state_dict(model_dict) | |||
| self.model.eval() | |||
| def __call__(self, oriImg): | |||
| scale_search = [0.5] | |||
| boxsize = 368 | |||
| stride = 8 | |||
| padValue = 128 | |||
| thre1 = 0.1 | |||
| thre2 = 0.05 | |||
| bodyparts = 18 | |||
| multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] | |||
| heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19)) | |||
| paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) | |||
| for m in range(len(multiplier)): | |||
| scale = multiplier[m] | |||
| imageToTest = cv2.resize( | |||
| oriImg, (0, 0), | |||
| fx=scale, | |||
| fy=scale, | |||
| interpolation=cv2.INTER_CUBIC) | |||
| imageToTest_padded, pad = pad_rightdown_corner( | |||
| imageToTest, stride, padValue) | |||
| im = np.transpose( | |||
| np.float32(imageToTest_padded[:, :, :, np.newaxis]), | |||
| (3, 2, 0, 1)) / 256 - 0.5 | |||
| im = np.ascontiguousarray(im) | |||
| data = torch.from_numpy(im).float() | |||
| if torch.cuda.is_available(): | |||
| data = data.cuda() | |||
| with torch.no_grad(): | |||
| Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data) | |||
| Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy() | |||
| Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy() | |||
| # extract outputs, resize, and remove padding | |||
| heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), | |||
| (1, 2, 0)) # output 1 is heatmaps | |||
| heatmap = cv2.resize( | |||
| heatmap, (0, 0), | |||
| fx=stride, | |||
| fy=stride, | |||
| interpolation=cv2.INTER_CUBIC) | |||
| heatmap = heatmap[:imageToTest_padded.shape[0] | |||
| - pad[2], :imageToTest_padded.shape[1] | |||
| - pad[3], :] | |||
| heatmap = cv2.resize( | |||
| heatmap, (oriImg.shape[1], oriImg.shape[0]), | |||
| interpolation=cv2.INTER_CUBIC) | |||
| paf = np.transpose(np.squeeze(Mconv7_stage6_L1), | |||
| (1, 2, 0)) # output 0 is PAFs | |||
| paf = cv2.resize( | |||
| paf, (0, 0), | |||
| fx=stride, | |||
| fy=stride, | |||
| interpolation=cv2.INTER_CUBIC) | |||
| paf = paf[:imageToTest_padded.shape[0] | |||
| - pad[2], :imageToTest_padded.shape[1] - pad[3], :] | |||
| paf = cv2.resize( | |||
| paf, (oriImg.shape[1], oriImg.shape[0]), | |||
| interpolation=cv2.INTER_CUBIC) | |||
| heatmap_avg = heatmap_avg + heatmap / len(multiplier) | |||
| paf_avg = paf_avg + paf / len(multiplier) | |||
| all_peaks = [] | |||
| peak_counter = 0 | |||
| for part in range(bodyparts): | |||
| map_ori = heatmap_avg[:, :, part] | |||
| one_heatmap = gaussian_filter(map_ori, sigma=3) | |||
| map_left = np.zeros(one_heatmap.shape) | |||
| map_left[1:, :] = one_heatmap[:-1, :] | |||
| map_right = np.zeros(one_heatmap.shape) | |||
| map_right[:-1, :] = one_heatmap[1:, :] | |||
| map_up = np.zeros(one_heatmap.shape) | |||
| map_up[:, 1:] = one_heatmap[:, :-1] | |||
| map_down = np.zeros(one_heatmap.shape) | |||
| map_down[:, :-1] = one_heatmap[:, 1:] | |||
| peaks_binary = np.logical_and.reduce( | |||
| (one_heatmap >= map_left, one_heatmap >= map_right, | |||
| one_heatmap >= map_up, one_heatmap >= map_down, | |||
| one_heatmap > thre1)) | |||
| peaks = list( | |||
| zip(np.nonzero(peaks_binary)[1], | |||
| np.nonzero(peaks_binary)[0])) # note reverse | |||
| peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks] | |||
| peak_id = range(peak_counter, peak_counter + len(peaks)) | |||
| peaks_with_score_and_id = [ | |||
| peaks_with_score[i] + (peak_id[i], ) | |||
| for i in range(len(peak_id)) | |||
| ] | |||
| all_peaks.append(peaks_with_score_and_id) | |||
| peak_counter += len(peaks) | |||
| # find connection in the specified sequence, center 29 is in the position 15 | |||
| limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], | |||
| [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], | |||
| [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]] | |||
| # the middle joints heatmap correpondence | |||
| mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], | |||
| [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30], | |||
| [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38], | |||
| [45, 46]] | |||
| connection_all = [] | |||
| special_k = [] | |||
| mid_num = 10 | |||
| for k in range(len(mapIdx)): | |||
| score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]] | |||
| candA = all_peaks[limbSeq[k][0] - 1] | |||
| candB = all_peaks[limbSeq[k][1] - 1] | |||
| nA = len(candA) | |||
| nB = len(candB) | |||
| if (nA != 0 and nB != 0): | |||
| connection_candidate = [] | |||
| for i in range(nA): | |||
| for j in range(nB): | |||
| vec = np.subtract(candB[j][:2], candA[i][:2]) | |||
| norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1]) | |||
| norm = max(0.001, norm) | |||
| vec = np.divide(vec, norm) | |||
| startend = list( | |||
| zip( | |||
| np.linspace( | |||
| candA[i][0], candB[j][0], num=mid_num), | |||
| np.linspace( | |||
| candA[i][1], candB[j][1], num=mid_num))) | |||
| vec_x = np.array([ | |||
| score_mid[int(round(startend[item][1])), | |||
| int(round(startend[item][0])), 0] | |||
| for item in range(len(startend)) | |||
| ]) | |||
| vec_y = np.array([ | |||
| score_mid[int(round(startend[item][1])), | |||
| int(round(startend[item][0])), 1] | |||
| for item in range(len(startend)) | |||
| ]) | |||
| score_midpts = np.multiply( | |||
| vec_x, vec[0]) + np.multiply(vec_y, vec[1]) | |||
| temp1 = sum(score_midpts) / len(score_midpts) | |||
| temp2 = min(0.5 * oriImg.shape[0] / norm - 1, 0) | |||
| score_with_dist_prior = temp1 + temp2 | |||
| criterion1 = len(np.nonzero( | |||
| score_midpts > thre2)[0]) > 0.8 * len(score_midpts) | |||
| criterion2 = score_with_dist_prior > 0 | |||
| if criterion1 and criterion2: | |||
| connection_candidate.append([ | |||
| i, j, score_with_dist_prior, | |||
| score_with_dist_prior + candA[i][2] | |||
| + candB[j][2] | |||
| ]) | |||
| connection_candidate = sorted( | |||
| connection_candidate, key=lambda x: x[2], reverse=True) | |||
| connection = np.zeros((0, 5)) | |||
| for c in range(len(connection_candidate)): | |||
| i, j, s = connection_candidate[c][0:3] | |||
| if (i not in connection[:, 3] | |||
| and j not in connection[:, 4]): | |||
| connection = np.vstack( | |||
| [connection, [candA[i][3], candB[j][3], s, i, j]]) | |||
| if (len(connection) >= min(nA, nB)): | |||
| break | |||
| connection_all.append(connection) | |||
| else: | |||
| special_k.append(k) | |||
| connection_all.append([]) | |||
| # last number in each row is the total parts number of that person | |||
| # the second last number in each row is the score of the overall configuration | |||
| subset = -1 * np.ones((0, 20)) | |||
| candidate = np.array( | |||
| [item for sublist in all_peaks for item in sublist]) | |||
| for k in range(len(mapIdx)): | |||
| if k not in special_k: | |||
| partAs = connection_all[k][:, 0] | |||
| partBs = connection_all[k][:, 1] | |||
| indexA, indexB = np.array(limbSeq[k]) - 1 | |||
| for i in range(len(connection_all[k])): # = 1:size(temp,1) | |||
| found = 0 | |||
| subset_idx = [-1, -1] | |||
| for j in range(len(subset)): # 1:size(subset,1): | |||
| if subset[j][indexA] == partAs[i] or subset[j][ | |||
| indexB] == partBs[i]: | |||
| subset_idx[found] = j | |||
| found += 1 | |||
| if found == 1: | |||
| j = subset_idx[0] | |||
| if subset[j][indexB] != partBs[i]: | |||
| subset[j][indexB] = partBs[i] | |||
| subset[j][-1] += 1 | |||
| subset[j][-2] += candidate[ | |||
| partBs[i].astype(int), | |||
| 2] + connection_all[k][i][2] | |||
| elif found == 2: # if found 2 and disjoint, merge them | |||
| j1, j2 = subset_idx | |||
| tmp1 = (subset[j1] >= 0).astype(int) | |||
| tmp2 = (subset[j2] >= 0).astype(int) | |||
| membership = (tmp1 + tmp2)[:-2] | |||
| if len(np.nonzero(membership == 2)[0]) == 0: # merge | |||
| subset[j1][:-2] += (subset[j2][:-2] + 1) | |||
| subset[j1][-2:] += subset[j2][-2:] | |||
| subset[j1][-2] += connection_all[k][i][2] | |||
| subset = np.delete(subset, j2, 0) | |||
| else: # as like found == 1 | |||
| subset[j1][indexB] = partBs[i] | |||
| subset[j1][-1] += 1 | |||
| subset[j1][-2] += candidate[ | |||
| partBs[i].astype(int), | |||
| 2] + connection_all[k][i][2] | |||
| # if find no partA in the subset, create a new subset | |||
| elif not found and k < 17: | |||
| row = -1 * np.ones(20) | |||
| row[indexA] = partAs[i] | |||
| row[indexB] = partBs[i] | |||
| row[-1] = 2 | |||
| row[-2] = sum( | |||
| candidate[connection_all[k][i, :2].astype(int), | |||
| 2]) + connection_all[k][i][2] | |||
| subset = np.vstack([subset, row]) | |||
| # delete some rows of subset which has few parts occur | |||
| deleteIdx = [] | |||
| for i in range(len(subset)): | |||
| if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4: | |||
| deleteIdx.append(i) | |||
| subset = np.delete(subset, deleteIdx, axis=0) | |||
| # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts | |||
| # candidate: x, y, score, id | |||
| count = subset.shape[0] | |||
| joints = np.zeros(shape=(count, bodyparts, 3)) | |||
| for i in range(count): | |||
| for j in range(bodyparts): | |||
| joints[i, j, :3] = candidate[int(subset[i, j]), :3] | |||
| confidence = 1.0 if subset[i, j] >= 0 else 0.0 | |||
| joints[i, j, 2] *= confidence | |||
| return joints | |||
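Usage sketch for the pose estimator; the checkpoint name matches the one loaded in `ImageBodyReshaping` above, and the image path is a placeholder:

```python
# Illustrative: joints come back as (num_people, 18, 3) with x, y, score,
# and a zero score marks an undetected joint.
import cv2
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
body = Body('body_pose_model.pth', device)
joints = body(cv2.imread('person.jpg'))  # BGR input, as cv2.imread returns
for person in joints:
    detected = (person[:, 2] > 0).sum()
    print(f'{detected} of 18 joints detected')
```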
| @@ -0,0 +1,141 @@ | |||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||
| from collections import OrderedDict | |||
| import torch | |||
| import torch.nn as nn | |||
| def make_layers(block, no_relu_layers): | |||
| layers = [] | |||
| for layer_name, v in block.items(): | |||
| if 'pool' in layer_name: | |||
| layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2]) | |||
| layers.append((layer_name, layer)) | |||
| else: | |||
| conv2d = nn.Conv2d( | |||
| in_channels=v[0], | |||
| out_channels=v[1], | |||
| kernel_size=v[2], | |||
| stride=v[3], | |||
| padding=v[4]) | |||
| layers.append((layer_name, conv2d)) | |||
| if layer_name not in no_relu_layers: | |||
| layers.append(('relu_' + layer_name, nn.ReLU(inplace=True))) | |||
| return nn.Sequential(OrderedDict(layers)) | |||
| class BodyposeModel(nn.Module): | |||
| def __init__(self): | |||
| super(BodyposeModel, self).__init__() | |||
| # these layers have no relu layer | |||
| no_relu_layers = [ | |||
| 'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1', | |||
| 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2', | |||
| 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1', | |||
| 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2' | |||
| ] | |||
| blocks = {} | |||
| block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]), | |||
| ('conv1_2', [64, 64, 3, 1, 1]), | |||
| ('pool1_stage1', [2, 2, 0]), | |||
| ('conv2_1', [64, 128, 3, 1, 1]), | |||
| ('conv2_2', [128, 128, 3, 1, 1]), | |||
| ('pool2_stage1', [2, 2, 0]), | |||
| ('conv3_1', [128, 256, 3, 1, 1]), | |||
| ('conv3_2', [256, 256, 3, 1, 1]), | |||
| ('conv3_3', [256, 256, 3, 1, 1]), | |||
| ('conv3_4', [256, 256, 3, 1, 1]), | |||
| ('pool3_stage1', [2, 2, 0]), | |||
| ('conv4_1', [256, 512, 3, 1, 1]), | |||
| ('conv4_2', [512, 512, 3, 1, 1]), | |||
| ('conv4_3_CPM', [512, 256, 3, 1, 1]), | |||
| ('conv4_4_CPM', [256, 128, 3, 1, 1])]) | |||
| # Stage 1 | |||
| block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), | |||
| ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]), | |||
| ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), | |||
| ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]), | |||
| ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])]) | |||
| block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), | |||
| ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]), | |||
| ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), | |||
| ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]), | |||
| ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])]) | |||
| blocks['block1_1'] = block1_1 | |||
| blocks['block1_2'] = block1_2 | |||
| self.model0 = make_layers(block0, no_relu_layers) | |||
| # Stages 2 - 6 | |||
| for i in range(2, 7): | |||
| blocks['block%d_1' % i] = OrderedDict([ | |||
| ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]), | |||
| ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]), | |||
| ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0]) | |||
| ]) | |||
| blocks['block%d_2' % i] = OrderedDict([ | |||
| ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]), | |||
| ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]), | |||
| ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]), | |||
| ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0]) | |||
| ]) | |||
| for k in blocks.keys(): | |||
| blocks[k] = make_layers(blocks[k], no_relu_layers) | |||
| self.model1_1 = blocks['block1_1'] | |||
| self.model2_1 = blocks['block2_1'] | |||
| self.model3_1 = blocks['block3_1'] | |||
| self.model4_1 = blocks['block4_1'] | |||
| self.model5_1 = blocks['block5_1'] | |||
| self.model6_1 = blocks['block6_1'] | |||
| self.model1_2 = blocks['block1_2'] | |||
| self.model2_2 = blocks['block2_2'] | |||
| self.model3_2 = blocks['block3_2'] | |||
| self.model4_2 = blocks['block4_2'] | |||
| self.model5_2 = blocks['block5_2'] | |||
| self.model6_2 = blocks['block6_2'] | |||
| def forward(self, x): | |||
| out1 = self.model0(x) | |||
| out1_1 = self.model1_1(out1) | |||
| out1_2 = self.model1_2(out1) | |||
| out2 = torch.cat([out1_1, out1_2, out1], 1) | |||
| out2_1 = self.model2_1(out2) | |||
| out2_2 = self.model2_2(out2) | |||
| out3 = torch.cat([out2_1, out2_2, out1], 1) | |||
| out3_1 = self.model3_1(out3) | |||
| out3_2 = self.model3_2(out3) | |||
| out4 = torch.cat([out3_1, out3_2, out1], 1) | |||
| out4_1 = self.model4_1(out4) | |||
| out4_2 = self.model4_2(out4) | |||
| out5 = torch.cat([out4_1, out4_2, out1], 1) | |||
| out5_1 = self.model5_1(out5) | |||
| out5_2 = self.model5_2(out5) | |||
| out6 = torch.cat([out5_1, out5_2, out1], 1) | |||
| out6_1 = self.model6_1(out6) | |||
| out6_2 = self.model6_2(out6) | |||
| return out6_1, out6_2 | |||
| @@ -0,0 +1,33 @@ | |||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||
| import numpy as np | |||
| def pad_rightdown_corner(img, stride, padValue): | |||
| h = img.shape[0] | |||
| w = img.shape[1] | |||
| pad = 4 * [None] | |||
| pad[0] = 0 # up | |||
| pad[1] = 0 # left | |||
| pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down | |||
| pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right | |||
| img_padded = img | |||
| pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1)) | |||
| img_padded = np.concatenate((pad_up, img_padded), axis=0) | |||
| pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1)) | |||
| img_padded = np.concatenate((pad_left, img_padded), axis=1) | |||
| pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1)) | |||
| img_padded = np.concatenate((img_padded, pad_down), axis=0) | |||
| pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1)) | |||
| img_padded = np.concatenate((img_padded, pad_right), axis=1) | |||
| return img_padded, pad | |||
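The padding helper only grows the bottom/right edges, rounding each dimension up to the next multiple of `stride`. A small behavioral check (values illustrative):

```python
# pad = [up, left, down, right]; only down/right can be non-zero here.
import numpy as np

img = np.zeros((367, 639, 3), dtype=np.float32)
padded, pad = pad_rightdown_corner(img, stride=8, padValue=128)
assert padded.shape[:2] == (368, 640)
assert pad == [0, 0, 1, 1]
```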
| def transfer(model, model_weights): | |||
| transfered_model_weights = {} | |||
| for weights_name in model.state_dict().keys(): | |||
| transfered_model_weights[weights_name] = model_weights['.'.join( | |||
| weights_name.split('.')[1:])] | |||
| return transfered_model_weights | |||
| @@ -0,0 +1,507 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import math | |||
| import os | |||
| import random | |||
| import cv2 | |||
| import numba | |||
| import numpy as np | |||
| import torch | |||
| def resize_on_long_side(img, long_side=800): | |||
| src_height = img.shape[0] | |||
| src_width = img.shape[1] | |||
| if src_height > src_width: | |||
| scale = long_side * 1.0 / src_height | |||
| _img = cv2.resize( | |||
| img, (int(src_width * scale), long_side), | |||
| interpolation=cv2.INTER_LINEAR) | |||
| else: | |||
| scale = long_side * 1.0 / src_width | |||
| _img = cv2.resize( | |||
| img, (long_side, int(src_height * scale)), | |||
| interpolation=cv2.INTER_LINEAR) | |||
| return _img, scale | |||
| def point_in_box(pt, box): | |||
| pt_x = pt[0] | |||
| pt_y = pt[1] | |||
| if pt_x >= box[0] and pt_x <= box[0] + box[2] and pt_y >= box[ | |||
| 1] and pt_y <= box[1] + box[3]: | |||
| return True | |||
| else: | |||
| return False | |||
| def enlarge_box_tblr(roi_bbox, mask, ratio=0.4, use_long_side=True): | |||
| if roi_bbox is None or None in roi_bbox: | |||
| return [None, None, None, None] | |||
| top = roi_bbox[0] | |||
| bottom = roi_bbox[1] | |||
| left = roi_bbox[2] | |||
| right = roi_bbox[3] | |||
| roi_width = roi_bbox[3] - roi_bbox[2] | |||
| roi_height = roi_bbox[1] - roi_bbox[0] | |||
| right = left + roi_width | |||
| bottom = top + roi_height | |||
| long_side = roi_width if roi_width > roi_height else roi_height | |||
| if use_long_side: | |||
| new_left = left - int(long_side * ratio) | |||
| else: | |||
| new_left = left - int(roi_width * ratio) | |||
| new_left = 1 if new_left < 0 else new_left | |||
| if use_long_side: | |||
| new_top = top - int(long_side * ratio) | |||
| else: | |||
| new_top = top - int(roi_height * ratio) | |||
| new_top = 1 if new_top < 0 else new_top | |||
| if use_long_side: | |||
| new_right = right + int(long_side * ratio) | |||
| else: | |||
| new_right = right + int(roi_width * ratio) | |||
| new_right = mask.shape[1] - 2 if new_right > mask.shape[1] else new_right | |||
| if use_long_side: | |||
| new_bottom = bottom + int(long_side * ratio) | |||
| else: | |||
| new_bottom = bottom + int(roi_height * ratio) | |||
| new_bottom = mask.shape[0] - 2 if new_bottom > mask.shape[0] else new_bottom | |||
| bbox = [new_top, new_bottom, new_left, new_right] | |||
| return bbox | |||
| def gen_PAF(image, joints): | |||
| assert joints.shape[0] == 18 | |||
| assert joints.shape[1] == 3 | |||
| org_h = image.shape[0] | |||
| org_w = image.shape[1] | |||
| small_image, resize_scale = resize_on_long_side(image, 120) | |||
| joints[:, :2] = joints[:, :2] * resize_scale | |||
| joint_left = int(np.min(joints, axis=0)[0]) | |||
| joint_right = int(np.max(joints, axis=0)[0]) | |||
| joint_top = int(np.min(joints, axis=0)[1]) | |||
| joint_bottom = int(np.max(joints, axis=0)[1]) | |||
| limb_width = min( | |||
| abs(joint_right - joint_left), abs(joint_bottom - joint_top)) // 6 | |||
| if limb_width % 2 == 0: | |||
| limb_width += 1 | |||
| kernel_size = limb_width | |||
| part_orders = [(5, 11), (2, 8), (5, 6), (6, 7), (2, 3), (3, 4), (11, 12), | |||
| (12, 13), (8, 9), (9, 10)] | |||
| map_list = [] | |||
| mask_list = [] | |||
| PAF_all = np.zeros( | |||
| shape=(small_image.shape[0], small_image.shape[1], 2), | |||
| dtype=np.float32) | |||
| for c, pair in enumerate(part_orders): | |||
| idx_a_name = pair[0] | |||
| idx_b_name = pair[1] | |||
| jointa = joints[idx_a_name] | |||
| jointb = joints[idx_b_name] | |||
| confidence_threshold = 0.05 | |||
| if jointa[2] > confidence_threshold and jointb[ | |||
| 2] > confidence_threshold: | |||
| canvas = np.zeros( | |||
| shape=(small_image.shape[0], small_image.shape[1]), | |||
| dtype=np.uint8) | |||
| canvas = cv2.line(canvas, (int(jointa[0]), int(jointa[1])), | |||
| (int(jointb[0]), int(jointb[1])), | |||
| (255, 255, 255), 5) | |||
| kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, | |||
| (kernel_size, kernel_size)) | |||
| canvas = cv2.dilate(canvas, kernel, 1) | |||
| canvas = cv2.GaussianBlur(canvas, (kernel_size, kernel_size), 0) | |||
| canvas = canvas.astype(np.float32) / 255 | |||
| PAF = np.zeros( | |||
| shape=(small_image.shape[0], small_image.shape[1], 2), | |||
| dtype=np.float32) | |||
| PAF[..., 0] = jointb[0] - jointa[0] | |||
| PAF[..., 1] = jointb[1] - jointa[1] | |||
| mag, ang = cv2.cartToPolar(PAF[..., 0], PAF[..., 1]) | |||
| PAF /= (np.dstack((mag, mag)) + 1e-5) | |||
| single_PAF = PAF * np.dstack((canvas, canvas)) | |||
| map_list.append( | |||
| cv2.GaussianBlur(single_PAF, | |||
| (kernel_size * 3, kernel_size * 3), 0)) | |||
| mask_list.append( | |||
| cv2.GaussianBlur(canvas.copy(), | |||
| (kernel_size * 3, kernel_size * 3), 0)) | |||
| PAF_all = PAF_all * (1.0 - np.dstack( | |||
| (canvas, canvas))) + single_PAF | |||
| PAF_all = cv2.GaussianBlur(PAF_all, (kernel_size * 3, kernel_size * 3), 0) | |||
| PAF_all = cv2.resize( | |||
| PAF_all, (org_w, org_h), interpolation=cv2.INTER_LINEAR) | |||
| map_list.append(PAF_all) | |||
| return PAF_all, map_list, mask_list | |||
| def gen_skeleton_map(joints, stack_mode='column', input_roi_box=None): | |||
| if isinstance(joints, list): | |||
| joints = np.array(joints) | |||
| assert stack_mode == 'column' or stack_mode == 'depth' | |||
| part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), | |||
| (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] | |||
| def link(img, a, b, color, line_width, scale=1.0, x_offset=0, y_offset=0): | |||
| jointa = joints[a] | |||
| jointb = joints[b] | |||
| temp1 = int((jointa[0] - x_offset) * scale) | |||
| temp2 = int((jointa[1] - y_offset) * scale) | |||
| temp3 = int((jointb[0] - x_offset) * scale) | |||
| temp4 = int((jointb[1] - y_offset) * scale) | |||
| cv2.line(img, (temp1, temp2), (temp3, temp4), color, line_width) | |||
| roi_box = input_roi_box | |||
| roi_box_width = roi_box[3] - roi_box[2] | |||
| roi_box_height = roi_box[1] - roi_box[0] | |||
| short_side_length = min(roi_box_width, roi_box_height) | |||
| line_width = short_side_length // 30 | |||
| line_width = max(line_width, 2) | |||
| map_cube = np.zeros( | |||
| shape=(roi_box_height, roi_box_width, len(part_orders) + 1), | |||
| dtype=np.float32) | |||
| use_line_width = min(5, line_width) | |||
| fx = use_line_width * 1.0 / line_width # fx is at most 1 | |||
| if fx < 0.99: | |||
| map_cube = cv2.resize(map_cube, (0, 0), fx=fx, fy=fx) | |||
| for c, pair in enumerate(part_orders): | |||
| tmp = map_cube[..., c].copy() | |||
| link( | |||
| tmp, | |||
| pair[0], | |||
| pair[1], (2.0, 2.0, 2.0), | |||
| use_line_width, | |||
| scale=fx, | |||
| x_offset=roi_box[2], | |||
| y_offset=roi_box[0]) | |||
| map_cube[..., c] = tmp | |||
| tmp = map_cube[..., -1].copy() | |||
| link( | |||
| tmp, | |||
| pair[0], | |||
| pair[1], (2.0, 2.0, 2.0), | |||
| use_line_width, | |||
| scale=fx, | |||
| x_offset=roi_box[2], | |||
| y_offset=roi_box[0]) | |||
| map_cube[..., -1] = tmp | |||
| map_cube = cv2.resize(map_cube, (roi_box_width, roi_box_height)) | |||
| if stack_mode == 'depth': | |||
| return map_cube, roi_box | |||
| elif stack_mode == 'column': | |||
| joint_maps = [] | |||
| for c in range(len(part_orders) + 1): | |||
| joint_maps.append(map_cube[..., c]) | |||
| joint_map = np.column_stack(joint_maps) | |||
| return joint_map, roi_box | |||
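A hedged usage sketch of gen_skeleton_map, assuming the function above is in scope. The indexing in the body implies 18 OpenPose-style joints of (x, y, confidence) and an input_roi_box ordered [top, bottom, left, right]:

import numpy as np

joints = np.zeros((18, 3), dtype=np.float32)
joints[:, 0] = np.linspace(60, 180, 18)  # x
joints[:, 1] = np.linspace(40, 220, 18)  # y
joints[:, 2] = 1.0                       # confidence
joint_map, roi_box = gen_skeleton_map(
    joints, stack_mode='column', input_roi_box=[0, 256, 0, 256])
print(joint_map.shape)  # (256, 256 * 13): 12 limb channels plus one composite, stacked column-wise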
| def plot_one_box(x, img, color=None, label=None, line_thickness=None): | |||
| tl = line_thickness or round( | |||
| 0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness | |||
| color = color or [random.randint(0, 255) for _ in range(3)] | |||
| c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) | |||
| cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) | |||
| if label: | |||
| tf = max(tl - 1, 1) # font thickness | |||
| t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] | |||
| c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 | |||
| cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled | |||
| cv2.putText( | |||
| img, | |||
| label, (c1[0], c1[1] - 2), | |||
| 0, | |||
| tl / 3, [225, 255, 255], | |||
| thickness=tf, | |||
| lineType=cv2.LINE_AA) | |||
| def draw_line(im, points, color, stroke_size=2, closed=False): | |||
| points = points.astype(np.int32) | |||
| for i in range(len(points) - 1): | |||
| cv2.line(im, tuple(points[i]), tuple(points[i + 1]), color, | |||
| stroke_size) | |||
| if closed: | |||
| cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size) | |||
| def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2): | |||
| left = bbox[0] | |||
| top = bbox[1] | |||
| right = bbox[2] | |||
| bottom = bbox[3] | |||
| roi_width = right - left | |||
| roi_height = bottom - top | |||
| new_left = left - int(roi_width * enlarge_ratio) | |||
| new_left = 0 if new_left < 0 else new_left | |||
| new_top = top - int(roi_height * enlarge_ratio) | |||
| new_top = 0 if new_top < 0 else new_top | |||
| new_right = right + int(roi_width * enlarge_ratio) | |||
| new_right = img_width if new_right > img_width else new_right | |||
| new_bottom = bottom + int(roi_height * enlarge_ratio) | |||
| new_bottom = img_height if new_bottom > img_height else new_bottom | |||
| bbox = [new_left, new_top, new_right, new_bottom] | |||
| bbox = [int(x) for x in bbox] | |||
| return bbox | |||
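For example, enlarging a 100 x 100 box by the default 20% per side pushes every edge outward and clamps the result to the image bounds:

print(enlarged_bbox([10, 10, 110, 110], img_width=120, img_height=120))
# [0, 0, 120, 120]: each edge moves by 20 px and is clipped at the border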
| def get_map_fusion_map_cuda(map_list, threshold=1, device=torch.device('cpu')): | |||
| map_list_cuda = [torch.from_numpy(x).to(device) for x in map_list] | |||
| map_concat = torch.stack(tuple(map_list_cuda), dim=-1) | |||
| map_concat = torch.abs(map_concat) | |||
| map_concat[map_concat < threshold] = 0 | |||
| map_concat[map_concat > 1e-5] = 1.0 | |||
| sum_map = torch.sum(map_concat, dim=2) | |||
| a = torch.ones_like(sum_map) | |||
| acc_map = torch.where(sum_map > 0, a * 2.0, torch.zeros_like(sum_map)) | |||
| fusion_map = torch.where(sum_map < 0.5, a * 1.5, sum_map) | |||
| fusion_map = fusion_map.float() | |||
| acc_map = acc_map.float() | |||
| fusion_map = fusion_map.cpu().numpy().astype(np.float32) | |||
| acc_map = acc_map.cpu().numpy().astype(np.float32) | |||
| return fusion_map, acc_map | |||
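A small CPU check of the fusion logic above (a sketch, assuming the function is in scope): fused pixels carry the count of maps that fire there, pixels no map covers are set to 1.5, and acc_map marks any coverage with 2.0.

import numpy as np

m1 = np.zeros((4, 4), dtype=np.float32)
m1[0, 0] = 1.0
m2 = np.zeros((4, 4), dtype=np.float32)
m2[0, 0] = 1.0
m2[1, 1] = 1.0
fusion_map, acc_map = get_map_fusion_map_cuda([m1, m2], threshold=1)
print(fusion_map[0, 0], fusion_map[1, 1], fusion_map[2, 2])  # 2.0 1.0 1.5
print(acc_map[0, 0], acc_map[2, 2])                          # 2.0 0.0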
| def gen_border_shade(height, width, height_band, width_band): | |||
| height_ratio = height_band * 1.0 / height | |||
| width_ratio = width_band * 1.0 / width | |||
| _height_band = int(256 * height_ratio) | |||
| _width_band = int(256 * width_ratio) | |||
| canvas = np.zeros((256, 256), dtype=np.float32) | |||
| canvas[_height_band // 2:-_height_band // 2, | |||
| _width_band // 2:-_width_band // 2] = 1.0 | |||
| canvas = cv2.blur(canvas, (_height_band, _width_band)) | |||
| canvas = cv2.resize(canvas, (width, height)) | |||
| return canvas | |||
| def get_mask_bbox(mask, threshold=127): | |||
| ret, mask = cv2.threshold(mask, threshold, 1, 0) | |||
| if cv2.countNonZero(mask) == 0: | |||
| return [None, None, None, None] | |||
| col_acc = np.sum(mask, 0) | |||
| row_acc = np.sum(mask, 1) | |||
| col_acc = col_acc.tolist() | |||
| row_acc = row_acc.tolist() | |||
| for x in range(len(col_acc)): | |||
| if col_acc[x] > 0: | |||
| left = x | |||
| break | |||
| for x in range(1, len(col_acc)): | |||
| if col_acc[-x] > 0: | |||
| right = len(col_acc) - x | |||
| break | |||
| for x in range(len(row_acc)): | |||
| if row_acc[x] > 0: | |||
| top = x | |||
| break | |||
| for x in range(1, len(row_acc)): | |||
| if row_acc[-x] > 0: | |||
| bottom = len(row_acc) - x | |||
| break | |||
| return [top, bottom, left, right] | |||
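The scan loops above are equivalent to taking the min/max of the nonzero coordinates; a quick cross-check (a sketch, assuming get_mask_bbox is in scope):

import numpy as np

mask = np.zeros((64, 64), dtype=np.uint8)
mask[10:20, 30:40] = 255
top, bottom, left, right = get_mask_bbox(mask)
ys, xs = np.where(mask > 127)
assert (top, bottom, left, right) == (ys.min(), ys.max(), xs.min(), xs.max())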
| def visualize_flow(flow): | |||
| h, w = flow.shape[:2] | |||
| hsv = np.zeros((h, w, 3), np.uint8) | |||
| mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1]) | |||
| hsv[..., 0] = ang * 180 / np.pi / 2 | |||
| hsv[..., 1] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX) | |||
| hsv[..., 2] = 255 | |||
| bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) | |||
| bgr = bgr * 1.0 / 255 | |||
| return bgr.astype(np.float32) | |||
| def vis_joints(image, joints, color, show_text=True, confidence_threshold=0.1): | |||
| part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), | |||
| (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] | |||
| abandon_idxs = [0, 1, 14, 15, 16, 17] | |||
| # draw joints | |||
| for i, joint in enumerate(joints): | |||
| if i in abandon_idxs: | |||
| continue | |||
| if joint[-1] > confidence_threshold: | |||
| cv2.circle(image, (int(joint[0]), int(joint[1])), 1, color, 2) | |||
| if show_text: | |||
| cv2.putText(image, | |||
| str(i) + '[{:.2f}]'.format(joint[-1]), | |||
| (int(joint[0]), int(joint[1])), | |||
| cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) | |||
| # draw link | |||
| for pair in part_orders: | |||
| if joints[pair[0]][-1] > confidence_threshold and joints[ | |||
| pair[1]][-1] > confidence_threshold: | |||
| cv2.line(image, (int(joints[pair[0]][0]), int(joints[pair[0]][1])), | |||
| (int(joints[pair[1]][0]), int(joints[pair[1]][1])), color, | |||
| 2) | |||
| return image | |||
| def get_heatmap_cv(img, magn, max_flow_mag): | |||
| min_flow_mag = .5 | |||
| cv_magn = np.clip( | |||
| 255 * (magn - min_flow_mag) / (max_flow_mag - min_flow_mag + 1e-7), | |||
| a_min=0, | |||
| a_max=255).astype(np.uint8) | |||
| if img.dtype != np.uint8: | |||
| img = (255 * img).astype(np.uint8) | |||
| heatmap_img = cv2.applyColorMap(cv_magn, cv2.COLORMAP_JET) | |||
| heatmap_img = heatmap_img[..., ::-1] | |||
| h, w = magn.shape | |||
| img_alpha = np.ones((h, w), dtype=np.double)[:, :, None] | |||
| heatmap_alpha = np.clip( | |||
| magn / (max_flow_mag + 1e-7), a_min=1e-7, a_max=1)[:, :, None]**.7 | |||
| heatmap_alpha[heatmap_alpha < .2] **= .5 # lift the alpha of faint-flow regions in place | |||
| pm_hm = heatmap_img * heatmap_alpha | |||
| pm_img = img * img_alpha | |||
| cv_out = pm_hm + pm_img * (1 - heatmap_alpha) | |||
| cv_out = np.clip(cv_out, a_min=0, a_max=255).astype(np.uint8) | |||
| return cv_out | |||
| def save_heatmap_cv(img, flow, suppression=2): | |||
| flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2) | |||
| flow_magn -= suppression | |||
| flow_magn[flow_magn <= 0] = 0 | |||
| cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3) | |||
| return cv_out | |||
| @numba.jit(nopython=True, parallel=False) | |||
| def bilinear_interp(x, y, v11, v12, v21, v22): | |||
| temp1 = (v11 * (1 - y) + v12 * y) * (1 - x) | |||
| temp2 = (v21 * (1 - y) + v22 * y) * x | |||
| result = temp1 + temp2 | |||
| return result | |||
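Sanity check (a sketch): at a corner the interpolation returns that corner's value, and at the cell center the mean of all four. Note that image_warp_grid1 below passes the fractional row offset as x and the fractional column offset as y.

print(bilinear_interp(0.0, 0.0, 10.0, 20.0, 30.0, 40.0))  # 10.0 (v11)
print(bilinear_interp(0.5, 0.5, 10.0, 20.0, 30.0, 40.0))  # 25.0, the mean of the four corners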
| @numba.jit(nopython=True, parallel=False) | |||
| def image_warp_grid1(rDx, rDy, oriImg, transRatio, width_expand, | |||
| height_expand): | |||
| srcW = oriImg.shape[1] | |||
| srcH = oriImg.shape[0] | |||
| newImg = oriImg.copy() | |||
| for i in range(srcH): | |||
| for j in range(srcW): | |||
| _i = i | |||
| _j = j | |||
| deltaX = rDx[_i, _j] | |||
| deltaY = rDy[_i, _j] | |||
| nx = _j + deltaX * transRatio | |||
| ny = _i + deltaY * transRatio | |||
| if nx >= srcW - width_expand - 1: | |||
| if nx > srcW - 1: | |||
| nx = srcW - 1 | |||
| if ny >= srcH - height_expand - 1: | |||
| if ny > srcH - 1: | |||
| ny = srcH - 1 | |||
| if nx < width_expand: | |||
| if nx < 0: | |||
| nx = 0 | |||
| if ny < height_expand: | |||
| if ny < 0: | |||
| ny = 0 | |||
| nxi = int(math.floor(nx)) | |||
| nyi = int(math.floor(ny)) | |||
| nxi1 = int(math.ceil(nx)) | |||
| nyi1 = int(math.ceil(ny)) | |||
| for ll in range(3): | |||
| newImg[_i, _j, | |||
| ll] = bilinear_interp(ny - nyi, nx - nxi, | |||
| oriImg[nyi, nxi, | |||
| ll], oriImg[nyi, nxi1, ll], | |||
| oriImg[nyi1, nxi, | |||
| ll], oriImg[nyi1, nxi1, | |||
| ll]) | |||
| return newImg | |||
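A toy invocation (a sketch with assumed inputs): a constant +2 px horizontal displacement makes every output pixel sample two columns to its right, clamped at the border by the expand/bounds checks above.

import numpy as np

ori = np.arange(6 * 6 * 3, dtype=np.float64).reshape(6, 6, 3)
rDx = np.full((6, 6), 2.0)
rDy = np.zeros((6, 6))
warped = image_warp_grid1(rDx, rDy, ori, 1.0, 0, 0)
print(np.allclose(warped[0, 0], ori[0, 2]))  # True: pixel (0, 0) now samples (0, 2)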
| @@ -1,3 +1,6 @@ | |||
| # The implementation is adapted from CSRNet by Jingwen He, | |||
| # made publicly available at https://github.com/hejingwenhejingwen/CSRNet | |||
| import functools | |||
| import math | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os.path as osp | |||
| from copy import deepcopy | |||
| from typing import Dict, Union | |||
| @@ -1,3 +1,8 @@ | |||
| # ------------------------------------------------------------------------ | |||
| # Modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/models/archs/NAFNet_arch.py | |||
| # Copyright (c) 2022 megvii-model. All Rights Reserved. | |||
| # ------------------------------------------------------------------------ | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,3 +1,8 @@ | |||
| # ------------------------------------------------------------------------ | |||
| # Modified from BasicSR (https://github.com/xinntao/BasicSR) | |||
| # Copyright 2018-2020 BasicSR Authors | |||
| # ------------------------------------------------------------------------ | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,8 +1,8 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| from copy import deepcopy | |||
| from typing import Any, Dict, Union | |||
| import numpy as np | |||
| import torch.cuda | |||
| from torch.nn.parallel import DataParallel, DistributedDataParallel | |||
| @@ -77,13 +77,8 @@ class NAFNetForImageDenoise(TorchModel): | |||
| def _evaluate_postprocess(self, input: Tensor, | |||
| target: Tensor) -> Dict[str, list]: | |||
| preds = self.model(input) | |||
| preds = list(torch.split(preds, 1, 0)) | |||
| targets = list(torch.split(target, 1, 0)) | |||
| preds = [(pred.data * 255.).squeeze(0).permute( | |||
| 1, 2, 0).cpu().numpy().astype(np.uint8) for pred in preds] | |||
| targets = [(target.data * 255.).squeeze(0).permute( | |||
| 1, 2, 0).cpu().numpy().astype(np.uint8) for target in targets] | |||
| preds = list(torch.split(preds.clamp(0, 1), 1, 0)) | |||
| targets = list(torch.split(target.clamp(0, 1), 1, 0)) | |||
| return {'pred': preds, 'target': targets} | |||
| @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .image_denoise_dataset import PairedImageDataset | |||
| from .model import FFTInpainting | |||
| else: | |||
| _import_structure = { | |||
| 'image_denoise_dataset': ['PairedImageDataset'], | |||
| 'model': ['FFTInpainting'], | |||
| } | |||
| import sys | |||
| @@ -0,0 +1,75 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| from typing import Dict, Tuple | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from modelscope.utils.logger import get_logger | |||
| from .modules.adversarial import NonSaturatingWithR1 | |||
| from .modules.ffc import FFCResNetGenerator | |||
| from .modules.perceptual import ResNetPL | |||
| from .modules.pix2pixhd import NLayerDiscriminator | |||
| LOGGER = get_logger() | |||
| class BaseInpaintingTrainingModule(nn.Module): | |||
| def __init__(self, | |||
| model_dir='', | |||
| use_ddp=True, | |||
| predict_only=False, | |||
| visualize_each_iters=100, | |||
| average_generator=False, | |||
| generator_avg_beta=0.999, | |||
| average_generator_start_step=30000, | |||
| average_generator_period=10, | |||
| store_discr_outputs_for_vis=False, | |||
| **kwargs): | |||
| super().__init__() | |||
| LOGGER.info( | |||
| f'BaseInpaintingTrainingModule init called, predict_only is {predict_only}' | |||
| ) | |||
| self.generator = FFCResNetGenerator() | |||
| self.use_ddp = use_ddp | |||
| if not predict_only: | |||
| self.discriminator = NLayerDiscriminator() | |||
| self.adversarial_loss = NonSaturatingWithR1( | |||
| weight=10, | |||
| gp_coef=0.001, | |||
| mask_as_fake_target=True, | |||
| allow_scale_mask=True) | |||
| self.average_generator = average_generator | |||
| self.generator_avg_beta = generator_avg_beta | |||
| self.average_generator_start_step = average_generator_start_step | |||
| self.average_generator_period = average_generator_period | |||
| self.generator_average = None | |||
| self.last_generator_averaging_step = -1 | |||
| self.store_discr_outputs_for_vis = store_discr_outputs_for_vis | |||
| self.loss_l1 = nn.L1Loss(reduction='none') | |||
| self.loss_resnet_pl = ResNetPL(weight=30, weights_path=model_dir) | |||
| self.visualize_each_iters = visualize_each_iters | |||
| LOGGER.info('BaseInpaintingTrainingModule init done') | |||
| def forward(self, batch: Dict[str, | |||
| torch.Tensor]) -> Dict[str, torch.Tensor]: | |||
| """Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys""" | |||
| raise NotImplementedError() | |||
| def generator_loss(self, | |||
| batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| raise NotImplementedError() | |||
| def discriminator_loss( | |||
| self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| raise NotImplementedError() | |||
| @@ -0,0 +1,210 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import bisect | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from modelscope.utils.logger import get_logger | |||
| from .base import BaseInpaintingTrainingModule | |||
| from .modules.feature_matching import feature_matching_loss, masked_l1_loss | |||
| LOGGER = get_logger() | |||
| def set_requires_grad(module, value): | |||
| for param in module.parameters(): | |||
| param.requires_grad = value | |||
| def add_prefix_to_keys(dct, prefix): | |||
| return {prefix + k: v for k, v in dct.items()} | |||
| class LinearRamp: | |||
| def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): | |||
| self.start_value = start_value | |||
| self.end_value = end_value | |||
| self.start_iter = start_iter | |||
| self.end_iter = end_iter | |||
| def __call__(self, i): | |||
| if i < self.start_iter: | |||
| return self.start_value | |||
| if i >= self.end_iter: | |||
| return self.end_value | |||
| part = (i - self.start_iter) / (self.end_iter - self.start_iter) | |||
| return self.start_value * (1 - part) + self.end_value * part | |||
| class LadderRamp: | |||
| def __init__(self, start_iters, values): | |||
| self.start_iters = start_iters | |||
| self.values = values | |||
| assert len(values) == len(start_iters) + 1, (len(values), | |||
| len(start_iters)) | |||
| def __call__(self, i): | |||
| segment_i = bisect.bisect_right(self.start_iters, i) | |||
| return self.values[segment_i] | |||
| def get_ramp(kind='ladder', **kwargs): | |||
| if kind == 'linear': | |||
| return LinearRamp(**kwargs) | |||
| if kind == 'ladder': | |||
| return LadderRamp(**kwargs) | |||
| raise ValueError(f'Unexpected ramp kind: {kind}') | |||
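Both ramp flavors in action (a sketch, assuming the classes above are in scope): LinearRamp interpolates across an iteration window, while the ladder form returned by get_ramp steps through fixed values at the given start iterations.

ramp = LinearRamp(start_value=0, end_value=1, start_iter=100, end_iter=200)
print(ramp(50), ramp(150), ramp(300))         # 0 0.5 1
ladder = get_ramp(kind='ladder', start_iters=[1000, 5000], values=[0.0, 0.1, 0.3])
print(ladder(0), ladder(2000), ladder(9000))  # 0.0 0.1 0.3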
| class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule): | |||
| def __init__(self, | |||
| model_dir='', | |||
| predict_only=False, | |||
| concat_mask=True, | |||
| rescale_scheduler_kwargs=None, | |||
| image_to_discriminator='predicted_image', | |||
| add_noise_kwargs=None, | |||
| noise_fill_hole=False, | |||
| const_area_crop_kwargs=None, | |||
| distance_weighter_kwargs=None, | |||
| distance_weighted_mask_for_discr=False, | |||
| fake_fakes_proba=0, | |||
| fake_fakes_generator_kwargs=None, | |||
| **kwargs): | |||
| super().__init__(model_dir=model_dir, predict_only=predict_only) | |||
| self.concat_mask = concat_mask | |||
| self.rescale_size_getter = get_ramp( | |||
| **rescale_scheduler_kwargs | |||
| ) if rescale_scheduler_kwargs is not None else None | |||
| self.image_to_discriminator = image_to_discriminator | |||
| self.add_noise_kwargs = add_noise_kwargs | |||
| self.noise_fill_hole = noise_fill_hole | |||
| self.const_area_crop_kwargs = const_area_crop_kwargs | |||
| self.refine_mask_for_losses = None | |||
| self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr | |||
| self.feature_matching_weight = 100 | |||
| self.losses_l1_weight_known = 10 | |||
| self.losses_l1_weight_missing = 0 | |||
| self.fake_fakes_proba = fake_fakes_proba | |||
| def forward(self, batch): | |||
| img = batch['image'] | |||
| mask = batch['mask'] | |||
| masked_img = img * (1 - mask) | |||
| if self.concat_mask: | |||
| masked_img = torch.cat([masked_img, mask], dim=1) | |||
| batch['predicted_image'] = self.generator(masked_img) | |||
| batch['inpainted'] = mask * batch['predicted_image'] + ( | |||
| 1 - mask) * batch['image'] | |||
| batch['mask_for_losses'] = mask | |||
| return batch | |||
| def generator_loss(self, batch): | |||
| img = batch['image'] | |||
| predicted_img = batch[self.image_to_discriminator] | |||
| original_mask = batch['mask'] | |||
| supervised_mask = batch['mask_for_losses'] | |||
| # L1 | |||
| l1_value = masked_l1_loss(predicted_img, img, supervised_mask, | |||
| self.losses_l1_weight_known, | |||
| self.losses_l1_weight_missing) | |||
| total_loss = l1_value | |||
| metrics = dict(gen_l1=l1_value) | |||
| # discriminator | |||
| # adversarial_loss calls backward by itself | |||
| mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask | |||
| self.adversarial_loss.pre_generator_step( | |||
| real_batch=img, | |||
| fake_batch=predicted_img, | |||
| generator=self.generator, | |||
| discriminator=self.discriminator) | |||
| discr_real_pred, discr_real_features = self.discriminator(img) | |||
| discr_fake_pred, discr_fake_features = self.discriminator( | |||
| predicted_img) | |||
| adv_gen_loss, adv_metrics = self.adversarial_loss.generator_loss( | |||
| real_batch=img, | |||
| fake_batch=predicted_img, | |||
| discr_real_pred=discr_real_pred, | |||
| discr_fake_pred=discr_fake_pred, | |||
| mask=mask_for_discr) | |||
| total_loss = total_loss + adv_gen_loss | |||
| metrics['gen_adv'] = adv_gen_loss | |||
| metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) | |||
| # feature matching | |||
| if self.feature_matching_weight > 0: | |||
| need_mask_in_fm = False | |||
| mask_for_fm = supervised_mask if need_mask_in_fm else None | |||
| fm_value = feature_matching_loss( | |||
| discr_fake_features, discr_real_features, | |||
| mask=mask_for_fm) * self.feature_matching_weight | |||
| total_loss = total_loss + fm_value | |||
| metrics['gen_fm'] = fm_value | |||
| if self.loss_resnet_pl is not None: | |||
| resnet_pl_value = self.loss_resnet_pl(predicted_img, img) | |||
| total_loss = total_loss + resnet_pl_value | |||
| metrics['gen_resnet_pl'] = resnet_pl_value | |||
| return total_loss, metrics | |||
| def discriminator_loss(self, batch): | |||
| total_loss = 0 | |||
| metrics = {} | |||
| predicted_img = batch[self.image_to_discriminator].detach() | |||
| self.adversarial_loss.pre_discriminator_step( | |||
| real_batch=batch['image'], | |||
| fake_batch=predicted_img, | |||
| generator=self.generator, | |||
| discriminator=self.discriminator) | |||
| discr_real_pred, discr_real_features = self.discriminator( | |||
| batch['image']) | |||
| discr_fake_pred, discr_fake_features = self.discriminator( | |||
| predicted_img) | |||
| adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss( | |||
| real_batch=batch['image'], | |||
| fake_batch=predicted_img, | |||
| discr_real_pred=discr_real_pred, | |||
| discr_fake_pred=discr_fake_pred, | |||
| mask=batch['mask']) | |||
| total_loss = (total_loss + adv_discr_loss) * 0.1 | |||
| metrics['discr_adv'] = adv_discr_loss | |||
| metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) | |||
| return total_loss, metrics | |||
| def _do_step(self, batch, optimizer_idx=None): | |||
| if optimizer_idx == 0: # step for generator | |||
| set_requires_grad(self.generator, True) | |||
| set_requires_grad(self.discriminator, False) | |||
| elif optimizer_idx == 1: # step for discriminator | |||
| set_requires_grad(self.generator, False) | |||
| set_requires_grad(self.discriminator, True) | |||
| batch = self(batch) | |||
| total_loss = 0 | |||
| if optimizer_idx is None or optimizer_idx == 0: # step for generator | |||
| total_loss, metrics = self.generator_loss(batch) | |||
| elif optimizer_idx is None or optimizer_idx == 1: # step for discriminator | |||
| total_loss, metrics = self.discriminator_loss(batch) | |||
| result = dict(loss=total_loss) | |||
| return result | |||
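_do_step encodes the usual GAN alternation: optimizer_idx 0 freezes the discriminator and steps the generator, optimizer_idx 1 does the reverse. A self-contained toy version of that pattern (not this module's actual losses), reusing set_requires_grad from above:

import torch
import torch.nn as nn

gen, disc = nn.Linear(4, 4), nn.Linear(4, 1)
opts = [torch.optim.Adam(gen.parameters()), torch.optim.Adam(disc.parameters())]
x = torch.randn(8, 4)
for optimizer_idx in (0, 1):
    set_requires_grad(gen, optimizer_idx == 0)
    set_requires_grad(disc, optimizer_idx == 1)
    fake = gen(x)
    if optimizer_idx == 0:   # generator step: push D(fake) up
        loss = -disc(fake).mean()
    else:                    # discriminator step: push D(fake.detach()) down
        loss = disc(fake.detach()).mean()
    opts[optimizer_idx].zero_grad()
    loss.backward()
    opts[optimizer_idx].step()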
| @@ -0,0 +1,36 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| from typing import Any, Dict, Optional, Union | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base.base_torch_model import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| LOGGER = get_logger() | |||
| @MODELS.register_module( | |||
| Tasks.image_inpainting, module_name=Models.image_inpainting) | |||
| class FFTInpainting(TorchModel): | |||
| def __init__(self, model_dir: str, **kwargs): | |||
| super().__init__(model_dir, **kwargs) | |||
| from .default import DefaultInpaintingTrainingModule | |||
| pretrained = kwargs.get('pretrained', True) | |||
| predict_only = kwargs.get('predict_only', False) | |||
| net = DefaultInpaintingTrainingModule( | |||
| model_dir=model_dir, predict_only=predict_only) | |||
| if pretrained: | |||
| path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
| LOGGER.info(f'loading pretrained model from {path}') | |||
| state = torch.load(path, map_location='cpu') | |||
| net.load_state_dict(state, strict=False) | |||
| self.model = net | |||
| def forward(self, inputs): | |||
| return self.model(inputs) | |||
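A hedged end-to-end sketch through the ModelScope pipeline API; the model id and the input dict keys below are illustrative assumptions, not taken from this diff:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inpainting = pipeline(Tasks.image_inpainting, model='damo/cv_fft_inpainting_lama')  # assumed model id
result = inpainting({'img': 'input.png', 'mask': 'mask.png'})  # assumed input keys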
| @@ -0,0 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .base import ModelBuilder | |||
| @@ -0,0 +1,380 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import os | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torch.nn.modules import BatchNorm2d | |||
| from . import resnet | |||
| NUM_CLASS = 150 | |||
| # Model Builder | |||
| class ModelBuilder: | |||
| # custom weights initialization | |||
| @staticmethod | |||
| def weights_init(m): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| nn.init.kaiming_normal_(m.weight.data) | |||
| elif classname.find('BatchNorm') != -1: | |||
| m.weight.data.fill_(1.) | |||
| m.bias.data.fill_(1e-4) | |||
| @staticmethod | |||
| def build_encoder(arch='resnet50dilated', | |||
| fc_dim=512, | |||
| weights='', | |||
| model_dir=''): | |||
| pretrained = len(weights) == 0 | |||
| arch = arch.lower() | |||
| if arch == 'resnet50dilated': | |||
| orig_resnet = resnet.__dict__['resnet50']( | |||
| pretrained=pretrained, model_dir=model_dir) | |||
| net_encoder = ResnetDilated(orig_resnet, dilate_scale=8) | |||
| elif arch == 'resnet50': | |||
| orig_resnet = resnet.__dict__['resnet50']( | |||
| pretrained=pretrained, model_dir=model_dir) | |||
| net_encoder = Resnet(orig_resnet) | |||
| else: | |||
| raise Exception('Architecture undefined!') | |||
| # encoders are usually pretrained | |||
| # net_encoder.apply(ModelBuilder.weights_init) | |||
| if len(weights) > 0: | |||
| print('Loading weights for net_encoder') | |||
| net_encoder.load_state_dict( | |||
| torch.load(weights, map_location=lambda storage, loc: storage), | |||
| strict=False) | |||
| return net_encoder | |||
| @staticmethod | |||
| def build_decoder(arch='ppm_deepsup', | |||
| fc_dim=512, | |||
| num_class=NUM_CLASS, | |||
| weights='', | |||
| use_softmax=False, | |||
| drop_last_conv=False): | |||
| arch = arch.lower() | |||
| if arch == 'ppm_deepsup': | |||
| net_decoder = PPMDeepsup( | |||
| num_class=num_class, | |||
| fc_dim=fc_dim, | |||
| use_softmax=use_softmax, | |||
| drop_last_conv=drop_last_conv) | |||
| elif arch == 'c1_deepsup': | |||
| net_decoder = C1DeepSup( | |||
| num_class=num_class, | |||
| fc_dim=fc_dim, | |||
| use_softmax=use_softmax, | |||
| drop_last_conv=drop_last_conv) | |||
| else: | |||
| raise Exception('Architecture undefined!') | |||
| net_decoder.apply(ModelBuilder.weights_init) | |||
| if len(weights) > 0: | |||
| print('Loading weights for net_decoder') | |||
| net_decoder.load_state_dict( | |||
| torch.load(weights, map_location=lambda storage, loc: storage), | |||
| strict=False) | |||
| return net_decoder | |||
| @staticmethod | |||
| def get_decoder(weights_path, arch_encoder, arch_decoder, fc_dim, | |||
| drop_last_conv, *args, **kwargs): | |||
| path = os.path.join( | |||
| weights_path, 'ade20k', | |||
| f'ade20k-{arch_encoder}-{arch_decoder}/decoder_epoch_20.pth') | |||
| return ModelBuilder.build_decoder( | |||
| arch=arch_decoder, | |||
| fc_dim=fc_dim, | |||
| weights=path, | |||
| use_softmax=True, | |||
| drop_last_conv=drop_last_conv) | |||
| @staticmethod | |||
| def get_encoder(weights_path, arch_encoder, arch_decoder, fc_dim, | |||
| segmentation, *args, **kwargs): | |||
| if segmentation: | |||
| path = os.path.join( | |||
| weights_path, 'ade20k', | |||
| f'ade20k-{arch_encoder}-{arch_decoder}/encoder_epoch_20.pth') | |||
| else: | |||
| path = '' | |||
| return ModelBuilder.build_encoder( | |||
| arch=arch_encoder, | |||
| fc_dim=fc_dim, | |||
| weights=path, | |||
| model_dir=weights_path) | |||
| def conv3x3_bn_relu(in_planes, out_planes, stride=1): | |||
| return nn.Sequential( | |||
| nn.Conv2d( | |||
| in_planes, | |||
| out_planes, | |||
| kernel_size=3, | |||
| stride=stride, | |||
| padding=1, | |||
| bias=False), | |||
| BatchNorm2d(out_planes), | |||
| nn.ReLU(inplace=True), | |||
| ) | |||
| # pyramid pooling, deep supervision | |||
| class PPMDeepsup(nn.Module): | |||
| def __init__(self, | |||
| num_class=NUM_CLASS, | |||
| fc_dim=4096, | |||
| use_softmax=False, | |||
| pool_scales=(1, 2, 3, 6), | |||
| drop_last_conv=False): | |||
| super().__init__() | |||
| self.use_softmax = use_softmax | |||
| self.drop_last_conv = drop_last_conv | |||
| self.ppm = [] | |||
| for scale in pool_scales: | |||
| self.ppm.append( | |||
| nn.Sequential( | |||
| nn.AdaptiveAvgPool2d(scale), | |||
| nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False), | |||
| BatchNorm2d(512), nn.ReLU(inplace=True))) | |||
| self.ppm = nn.ModuleList(self.ppm) | |||
| self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) | |||
| self.conv_last = nn.Sequential( | |||
| nn.Conv2d( | |||
| fc_dim + len(pool_scales) * 512, | |||
| 512, | |||
| kernel_size=3, | |||
| padding=1, | |||
| bias=False), BatchNorm2d(512), nn.ReLU(inplace=True), | |||
| nn.Dropout2d(0.1), nn.Conv2d(512, num_class, kernel_size=1)) | |||
| self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) | |||
| self.dropout_deepsup = nn.Dropout2d(0.1) | |||
| def forward(self, conv_out, segSize=None): | |||
| conv5 = conv_out[-1] | |||
| input_size = conv5.size() | |||
| ppm_out = [conv5] | |||
| for pool_scale in self.ppm: | |||
| ppm_out.append( | |||
| nn.functional.interpolate( | |||
| pool_scale(conv5), (input_size[2], input_size[3]), | |||
| mode='bilinear', | |||
| align_corners=False)) | |||
| ppm_out = torch.cat(ppm_out, 1) | |||
| if self.drop_last_conv: | |||
| return ppm_out | |||
| else: | |||
| x = self.conv_last(ppm_out) | |||
| if self.use_softmax: # is True during inference | |||
| x = nn.functional.interpolate( | |||
| x, size=segSize, mode='bilinear', align_corners=False) | |||
| x = nn.functional.softmax(x, dim=1) | |||
| return x | |||
| # deep sup | |||
| conv4 = conv_out[-2] | |||
| _ = self.cbr_deepsup(conv4) | |||
| _ = self.dropout_deepsup(_) | |||
| _ = self.conv_last_deepsup(_) | |||
| x = nn.functional.log_softmax(x, dim=1) | |||
| _ = nn.functional.log_softmax(_, dim=1) | |||
| return (x, _) | |||
| class Resnet(nn.Module): | |||
| def __init__(self, orig_resnet): | |||
| super(Resnet, self).__init__() | |||
| # take pretrained resnet, except AvgPool and FC | |||
| self.conv1 = orig_resnet.conv1 | |||
| self.bn1 = orig_resnet.bn1 | |||
| self.relu1 = orig_resnet.relu1 | |||
| self.conv2 = orig_resnet.conv2 | |||
| self.bn2 = orig_resnet.bn2 | |||
| self.relu2 = orig_resnet.relu2 | |||
| self.conv3 = orig_resnet.conv3 | |||
| self.bn3 = orig_resnet.bn3 | |||
| self.relu3 = orig_resnet.relu3 | |||
| self.maxpool = orig_resnet.maxpool | |||
| self.layer1 = orig_resnet.layer1 | |||
| self.layer2 = orig_resnet.layer2 | |||
| self.layer3 = orig_resnet.layer3 | |||
| self.layer4 = orig_resnet.layer4 | |||
| def forward(self, x, return_feature_maps=False): | |||
| conv_out = [] | |||
| x = self.relu1(self.bn1(self.conv1(x))) | |||
| x = self.relu2(self.bn2(self.conv2(x))) | |||
| x = self.relu3(self.bn3(self.conv3(x))) | |||
| x = self.maxpool(x) | |||
| x = self.layer1(x) | |||
| conv_out.append(x) | |||
| x = self.layer2(x) | |||
| conv_out.append(x) | |||
| x = self.layer3(x) | |||
| conv_out.append(x) | |||
| x = self.layer4(x) | |||
| conv_out.append(x) | |||
| if return_feature_maps: | |||
| return conv_out | |||
| return [x] | |||
| # Resnet Dilated | |||
| class ResnetDilated(nn.Module): | |||
| def __init__(self, orig_resnet, dilate_scale=8): | |||
| super().__init__() | |||
| from functools import partial | |||
| if dilate_scale == 8: | |||
| orig_resnet.layer3.apply(partial(self._nostride_dilate, dilate=2)) | |||
| orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=4)) | |||
| elif dilate_scale == 16: | |||
| orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=2)) | |||
| # take pretrained resnet, except AvgPool and FC | |||
| self.conv1 = orig_resnet.conv1 | |||
| self.bn1 = orig_resnet.bn1 | |||
| self.relu1 = orig_resnet.relu1 | |||
| self.conv2 = orig_resnet.conv2 | |||
| self.bn2 = orig_resnet.bn2 | |||
| self.relu2 = orig_resnet.relu2 | |||
| self.conv3 = orig_resnet.conv3 | |||
| self.bn3 = orig_resnet.bn3 | |||
| self.relu3 = orig_resnet.relu3 | |||
| self.maxpool = orig_resnet.maxpool | |||
| self.layer1 = orig_resnet.layer1 | |||
| self.layer2 = orig_resnet.layer2 | |||
| self.layer3 = orig_resnet.layer3 | |||
| self.layer4 = orig_resnet.layer4 | |||
| def _nostride_dilate(self, m, dilate): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| # the convolution with stride | |||
| if m.stride == (2, 2): | |||
| m.stride = (1, 1) | |||
| if m.kernel_size == (3, 3): | |||
| m.dilation = (dilate // 2, dilate // 2) | |||
| m.padding = (dilate // 2, dilate // 2) | |||
| # other convolutions | |||
| else: | |||
| if m.kernel_size == (3, 3): | |||
| m.dilation = (dilate, dilate) | |||
| m.padding = (dilate, dilate) | |||
| def forward(self, x, return_feature_maps=False): | |||
| conv_out = [] | |||
| x = self.relu1(self.bn1(self.conv1(x))) | |||
| x = self.relu2(self.bn2(self.conv2(x))) | |||
| x = self.relu3(self.bn3(self.conv3(x))) | |||
| x = self.maxpool(x) | |||
| x = self.layer1(x) | |||
| conv_out.append(x) | |||
| x = self.layer2(x) | |||
| conv_out.append(x) | |||
| x = self.layer3(x) | |||
| conv_out.append(x) | |||
| x = self.layer4(x) | |||
| conv_out.append(x) | |||
| if return_feature_maps: | |||
| return conv_out | |||
| return [x] | |||
| # last conv, deep supervision | |||
| class C1DeepSup(nn.Module): | |||
| def __init__(self, | |||
| num_class=150, | |||
| fc_dim=2048, | |||
| use_softmax=False, | |||
| drop_last_conv=False): | |||
| super(C1DeepSup, self).__init__() | |||
| self.use_softmax = use_softmax | |||
| self.drop_last_conv = drop_last_conv | |||
| self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) | |||
| self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) | |||
| # last conv | |||
| self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) | |||
| self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) | |||
| def forward(self, conv_out, segSize=None): | |||
| conv5 = conv_out[-1] | |||
| x = self.cbr(conv5) | |||
| if self.drop_last_conv: | |||
| return x | |||
| else: | |||
| x = self.conv_last(x) | |||
| if self.use_softmax: # is True during inference | |||
| x = nn.functional.interpolate( | |||
| x, size=segSize, mode='bilinear', align_corners=False) | |||
| x = nn.functional.softmax(x, dim=1) | |||
| return x | |||
| # deep sup | |||
| conv4 = conv_out[-2] | |||
| _ = self.cbr_deepsup(conv4) | |||
| _ = self.conv_last_deepsup(_) | |||
| x = nn.functional.log_softmax(x, dim=1) | |||
| _ = nn.functional.log_softmax(_, dim=1) | |||
| return (x, _) | |||
| # last conv | |||
| class C1(nn.Module): | |||
| def __init__(self, num_class=150, fc_dim=2048, use_softmax=False): | |||
| super(C1, self).__init__() | |||
| self.use_softmax = use_softmax | |||
| self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) | |||
| # last conv | |||
| self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) | |||
| def forward(self, conv_out, segSize=None): | |||
| conv5 = conv_out[-1] | |||
| x = self.cbr(conv5) | |||
| x = self.conv_last(x) | |||
| if self.use_softmax: # is True during inference | |||
| x = nn.functional.interpolate( | |||
| x, size=segSize, mode='bilinear', align_corners=False) | |||
| x = nn.functional.softmax(x, dim=1) | |||
| else: | |||
| x = nn.functional.log_softmax(x, dim=1) | |||
| return x | |||
| @@ -0,0 +1,183 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import math | |||
| import os | |||
| import torch | |||
| import torch.nn as nn | |||
| from torch.nn import BatchNorm2d | |||
| __all__ = ['ResNet', 'resnet50'] | |||
| def conv3x3(in_planes, out_planes, stride=1): | |||
| '3x3 convolution with padding' | |||
| return nn.Conv2d( | |||
| in_planes, | |||
| out_planes, | |||
| kernel_size=3, | |||
| stride=stride, | |||
| padding=1, | |||
| bias=False) | |||
| class BasicBlock(nn.Module): | |||
| expansion = 1 | |||
| def __init__(self, inplanes, planes, stride=1, downsample=None): | |||
| super(BasicBlock, self).__init__() | |||
| self.conv1 = conv3x3(inplanes, planes, stride) | |||
| self.bn1 = BatchNorm2d(planes) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| self.conv2 = conv3x3(planes, planes) | |||
| self.bn2 = BatchNorm2d(planes) | |||
| self.downsample = downsample | |||
| self.stride = stride | |||
| def forward(self, x): | |||
| residual = x | |||
| out = self.conv1(x) | |||
| out = self.bn1(out) | |||
| out = self.relu(out) | |||
| out = self.conv2(out) | |||
| out = self.bn2(out) | |||
| if self.downsample is not None: | |||
| residual = self.downsample(x) | |||
| out += residual | |||
| out = self.relu(out) | |||
| return out | |||
| class Bottleneck(nn.Module): | |||
| expansion = 4 | |||
| def __init__(self, inplanes, planes, stride=1, downsample=None): | |||
| super(Bottleneck, self).__init__() | |||
| self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) | |||
| self.bn1 = BatchNorm2d(planes) | |||
| self.conv2 = nn.Conv2d( | |||
| planes, | |||
| planes, | |||
| kernel_size=3, | |||
| stride=stride, | |||
| padding=1, | |||
| bias=False) | |||
| self.bn2 = BatchNorm2d(planes) | |||
| self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) | |||
| self.bn3 = BatchNorm2d(planes * 4) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| self.downsample = downsample | |||
| self.stride = stride | |||
| def forward(self, x): | |||
| residual = x | |||
| out = self.conv1(x) | |||
| out = self.bn1(out) | |||
| out = self.relu(out) | |||
| out = self.conv2(out) | |||
| out = self.bn2(out) | |||
| out = self.relu(out) | |||
| out = self.conv3(out) | |||
| out = self.bn3(out) | |||
| if self.downsample is not None: | |||
| residual = self.downsample(x) | |||
| out += residual | |||
| out = self.relu(out) | |||
| return out | |||
| class ResNet(nn.Module): | |||
| def __init__(self, block, layers, num_classes=1000): | |||
| self.inplanes = 128 | |||
| super(ResNet, self).__init__() | |||
| self.conv1 = conv3x3(3, 64, stride=2) | |||
| self.bn1 = BatchNorm2d(64) | |||
| self.relu1 = nn.ReLU(inplace=True) | |||
| self.conv2 = conv3x3(64, 64) | |||
| self.bn2 = BatchNorm2d(64) | |||
| self.relu2 = nn.ReLU(inplace=True) | |||
| self.conv3 = conv3x3(64, 128) | |||
| self.bn3 = BatchNorm2d(128) | |||
| self.relu3 = nn.ReLU(inplace=True) | |||
| self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
| self.layer1 = self._make_layer(block, 64, layers[0]) | |||
| self.layer2 = self._make_layer(block, 128, layers[1], stride=2) | |||
| self.layer3 = self._make_layer(block, 256, layers[2], stride=2) | |||
| self.layer4 = self._make_layer(block, 512, layers[3], stride=2) | |||
| self.avgpool = nn.AvgPool2d(7, stride=1) | |||
| self.fc = nn.Linear(512 * block.expansion, num_classes) | |||
| for m in self.modules(): | |||
| if isinstance(m, nn.Conv2d): | |||
| n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | |||
| m.weight.data.normal_(0, math.sqrt(2. / n)) | |||
| elif isinstance(m, BatchNorm2d): | |||
| m.weight.data.fill_(1) | |||
| m.bias.data.zero_() | |||
| def _make_layer(self, block, planes, blocks, stride=1): | |||
| downsample = None | |||
| if stride != 1 or self.inplanes != planes * block.expansion: | |||
| downsample = nn.Sequential( | |||
| nn.Conv2d( | |||
| self.inplanes, | |||
| planes * block.expansion, | |||
| kernel_size=1, | |||
| stride=stride, | |||
| bias=False), | |||
| BatchNorm2d(planes * block.expansion), | |||
| ) | |||
| layers = [] | |||
| layers.append(block(self.inplanes, planes, stride, downsample)) | |||
| self.inplanes = planes * block.expansion | |||
| for i in range(1, blocks): | |||
| layers.append(block(self.inplanes, planes)) | |||
| return nn.Sequential(*layers) | |||
| def forward(self, x): | |||
| x = self.relu1(self.bn1(self.conv1(x))) | |||
| x = self.relu2(self.bn2(self.conv2(x))) | |||
| x = self.relu3(self.bn3(self.conv3(x))) | |||
| x = self.maxpool(x) | |||
| x = self.layer1(x) | |||
| x = self.layer2(x) | |||
| x = self.layer3(x) | |||
| x = self.layer4(x) | |||
| x = self.avgpool(x) | |||
| x = x.view(x.size(0), -1) | |||
| x = self.fc(x) | |||
| return x | |||
| def resnet50(pretrained=False, model_dir='', **kwargs): | |||
| """Constructs a ResNet-50 model. | |||
| Args: | |||
| pretrained (bool): If True, returns a model pre-trained on ImageNet | |||
| """ | |||
| model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) | |||
| if pretrained: | |||
| cached_file = os.path.join(model_dir, 'resnet50-imagenet.pth') | |||
| model.load_state_dict( | |||
| torch.load(cached_file, map_location='cpu'), strict=False) | |||
| return model | |||
| @@ -0,0 +1,167 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| from typing import Dict, Optional, Tuple | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class BaseAdversarialLoss: | |||
| def pre_generator_step(self, real_batch: torch.Tensor, | |||
| fake_batch: torch.Tensor, generator: nn.Module, | |||
| discriminator: nn.Module): | |||
| """ | |||
| Prepare for generator step | |||
| :param real_batch: Tensor, a batch of real samples | |||
| :param fake_batch: Tensor, a batch of samples produced by generator | |||
| :param generator: | |||
| :param discriminator: | |||
| :return: None | |||
| """ | |||
| def pre_discriminator_step(self, real_batch: torch.Tensor, | |||
| fake_batch: torch.Tensor, generator: nn.Module, | |||
| discriminator: nn.Module): | |||
| """ | |||
| Prepare for discriminator step | |||
| :param real_batch: Tensor, a batch of real samples | |||
| :param fake_batch: Tensor, a batch of samples produced by generator | |||
| :param generator: | |||
| :param discriminator: | |||
| :return: None | |||
| """ | |||
| def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, | |||
| discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, | |||
| mask: Optional[torch.Tensor] = None) \ | |||
| -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| """ | |||
| Calculate generator loss | |||
| :param real_batch: Tensor, a batch of real samples | |||
| :param fake_batch: Tensor, a batch of samples produced by generator | |||
| :param discr_real_pred: Tensor, discriminator output for real_batch | |||
| :param discr_fake_pred: Tensor, discriminator output for fake_batch | |||
| :param mask: Tensor, actual mask, which was at input of generator when making fake_batch | |||
| :return: total generator loss along with some values that might be interesting to log | |||
| """ | |||
| raise NotImplementedError | |||
| def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, | |||
| discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, | |||
| mask: Optional[torch.Tensor] = None) \ | |||
| -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| """ | |||
| Calculate discriminator loss and call .backward() on it | |||
| :param real_batch: Tensor, a batch of real samples | |||
| :param fake_batch: Tensor, a batch of samples produced by generator | |||
| :param discr_real_pred: Tensor, discriminator output for real_batch | |||
| :param discr_fake_pred: Tensor, discriminator output for fake_batch | |||
| :param mask: Tensor, actual mask, which was at input of generator when making fake_batch | |||
| :return: total discriminator loss along with some values that might be interesting to log | |||
| """ | |||
| raise NotImplementedError | |||
| def interpolate_mask(self, mask, shape): | |||
| assert mask is not None | |||
| assert self.allow_scale_mask or shape == mask.shape[-2:] | |||
| if shape != mask.shape[-2:] and self.allow_scale_mask: | |||
| if self.mask_scale_mode == 'maxpool': | |||
| mask = F.adaptive_max_pool2d(mask, shape) | |||
| else: | |||
| mask = F.interpolate( | |||
| mask, size=shape, mode=self.mask_scale_mode) | |||
| return mask | |||
| def make_r1_gp(discr_real_pred, real_batch): | |||
| if torch.is_grad_enabled(): | |||
| grad_real = torch.autograd.grad( | |||
| outputs=discr_real_pred.sum(), | |||
| inputs=real_batch, | |||
| create_graph=True)[0] | |||
| grad_penalty = (grad_real.view(grad_real.shape[0], | |||
| -1).norm(2, dim=1)**2).mean() | |||
| else: | |||
| grad_penalty = 0 | |||
| real_batch.requires_grad = False | |||
| return grad_penalty | |||
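make_r1_gp is the R1 regularizer: the mean squared gradient norm of the real logits with respect to the real inputs. For a linear discriminator D(x) = w·x the penalty is exactly ||w||^2, which gives a quick check (a sketch, assuming make_r1_gp is in scope):

import torch

w = torch.tensor([[1.0, 2.0]])
real = torch.randn(4, 2, requires_grad=True)
pred = real @ w.t()                       # toy discriminator output
gp = make_r1_gp(pred, real)
print(gp.item(), (w.norm() ** 2).item())  # both 5.0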
| class NonSaturatingWithR1(BaseAdversarialLoss): | |||
| def __init__(self, | |||
| gp_coef=5, | |||
| weight=1, | |||
| mask_as_fake_target=False, | |||
| allow_scale_mask=False, | |||
| mask_scale_mode='nearest', | |||
| extra_mask_weight_for_gen=0, | |||
| use_unmasked_for_gen=True, | |||
| use_unmasked_for_discr=True): | |||
| self.gp_coef = gp_coef | |||
| self.weight = weight | |||
| # use for discr => use for gen; | |||
| # otherwise we teach only the discr to pay attention to very small difference | |||
| assert use_unmasked_for_gen or (not use_unmasked_for_discr) | |||
| # mask as target => use unmasked for discr: | |||
| # if we don't care about unmasked regions at all | |||
| # then it doesn't matter if the value of mask_as_fake_target is true or false | |||
| assert use_unmasked_for_discr or (not mask_as_fake_target) | |||
| self.use_unmasked_for_gen = use_unmasked_for_gen | |||
| self.use_unmasked_for_discr = use_unmasked_for_discr | |||
| self.mask_as_fake_target = mask_as_fake_target | |||
| self.allow_scale_mask = allow_scale_mask | |||
| self.mask_scale_mode = mask_scale_mode | |||
| self.extra_mask_weight_for_gen = extra_mask_weight_for_gen | |||
| def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, | |||
| discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, | |||
| mask=None) \ | |||
| -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| fake_loss = F.softplus(-discr_fake_pred) | |||
| if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \ | |||
| not self.use_unmasked_for_gen: # == if masked region should be treated differently | |||
| mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) | |||
| if not self.use_unmasked_for_gen: | |||
| fake_loss = fake_loss * mask | |||
| else: | |||
| pixel_weights = 1 + mask * self.extra_mask_weight_for_gen | |||
| fake_loss = fake_loss * pixel_weights | |||
| return fake_loss.mean() * self.weight, dict() | |||
| def pre_discriminator_step(self, real_batch: torch.Tensor, | |||
| fake_batch: torch.Tensor, generator: nn.Module, | |||
| discriminator: nn.Module): | |||
| real_batch.requires_grad = True | |||
| def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, | |||
| discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, | |||
| mask=None) \ | |||
| -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: | |||
| real_loss = F.softplus(-discr_real_pred) | |||
| grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef | |||
| fake_loss = F.softplus(discr_fake_pred) | |||
| if not self.use_unmasked_for_discr or self.mask_as_fake_target: | |||
| # == if masked region should be treated differently | |||
| mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) | |||
| # use_unmasked_for_discr=False only makes sense for fakes; | |||
| # for reals there is no difference between the two regions | |||
| fake_loss = fake_loss * mask | |||
| if self.mask_as_fake_target: | |||
| fake_loss = fake_loss + (1 | |||
| - mask) * F.softplus(-discr_fake_pred) | |||
| sum_discr_loss = real_loss + grad_penalty + fake_loss | |||
| metrics = dict( | |||
| discr_real_out=discr_real_pred.mean(), | |||
| discr_fake_out=discr_fake_pred.mean(), | |||
| discr_real_gp=grad_penalty) | |||
| return sum_discr_loss.mean(), metrics | |||
| @@ -0,0 +1,45 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| from typing import List | |||
| import torch | |||
| import torch.nn.functional as F | |||
| def masked_l2_loss(pred, target, mask, weight_known, weight_missing): | |||
| per_pixel_l2 = F.mse_loss(pred, target, reduction='none') | |||
| pixel_weights = mask * weight_missing + (1 - mask) * weight_known | |||
| return (pixel_weights * per_pixel_l2).mean() | |||
| def masked_l1_loss(pred, target, mask, weight_known, weight_missing): | |||
| per_pixel_l1 = F.l1_loss(pred, target, reduction='none') | |||
| pixel_weights = mask * weight_missing + (1 - mask) * weight_known | |||
| return (pixel_weights * per_pixel_l1).mean() | |||
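A worked check (a sketch) with the weights the training module above actually uses (weight_known=10, weight_missing=0): only errors outside the hole contribute.

import torch

pred = torch.ones(1, 1, 2, 2)
target = torch.zeros(1, 1, 2, 2)
mask = torch.tensor([[[[1.0, 0.0], [0.0, 0.0]]]])  # 1 marks the hole
loss = masked_l1_loss(pred, target, mask, weight_known=10, weight_missing=0)
print(loss.item())  # 7.5 = (0 + 10 + 10 + 10) / 4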
| def feature_matching_loss(fake_features: List[torch.Tensor], | |||
| target_features: List[torch.Tensor], | |||
| mask=None): | |||
| if mask is None: | |||
| res = torch.stack([ | |||
| F.mse_loss(fake_feat, target_feat) | |||
| for fake_feat, target_feat in zip(fake_features, target_features) | |||
| ]).mean() | |||
| else: | |||
| res = 0 | |||
| norm = 0 | |||
| for fake_feat, target_feat in zip(fake_features, target_features): | |||
| cur_mask = F.interpolate( | |||
| mask, | |||
| size=fake_feat.shape[-2:], | |||
| mode='bilinear', | |||
| align_corners=False) | |||
| error_weights = 1 - cur_mask | |||
| cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean() | |||
| res = res + cur_val | |||
| norm += 1 | |||
| res = res / norm | |||
| return res | |||
| @@ -0,0 +1,588 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from kornia.geometry.transform import rotate | |||
| def get_activation(kind='tanh'): | |||
| if kind == 'tanh': | |||
| return nn.Tanh() | |||
| if kind == 'sigmoid': | |||
| return nn.Sigmoid() | |||
| if kind is False: | |||
| return nn.Identity() | |||
| raise ValueError(f'Unknown activation kind {kind}') | |||
| class SELayer(nn.Module): | |||
| def __init__(self, channel, reduction=16): | |||
| super(SELayer, self).__init__() | |||
| self.avg_pool = nn.AdaptiveAvgPool2d(1) | |||
| self.fc = nn.Sequential( | |||
| nn.Linear(channel, channel // reduction, bias=False), | |||
| nn.ReLU(inplace=True), | |||
| nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid()) | |||
| def forward(self, x): | |||
| b, c, _, _ = x.size() | |||
| y = self.avg_pool(x).view(b, c) | |||
| y = self.fc(y).view(b, c, 1, 1) | |||
| res = x * y.expand_as(x) | |||
| return res | |||
| class FourierUnit(nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| groups=1, | |||
| spatial_scale_factor=None, | |||
| spatial_scale_mode='bilinear', | |||
| spectral_pos_encoding=False, | |||
| use_se=False, | |||
| se_kwargs=None, | |||
| ffc3d=False, | |||
| fft_norm='ortho'): | |||
| # bn_layer not used | |||
| super(FourierUnit, self).__init__() | |||
| self.groups = groups | |||
| self.conv_layer = torch.nn.Conv2d( | |||
| in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0), | |||
| out_channels=out_channels * 2, | |||
| kernel_size=1, | |||
| stride=1, | |||
| padding=0, | |||
| groups=self.groups, | |||
| bias=False) | |||
| self.bn = torch.nn.BatchNorm2d(out_channels * 2) | |||
| self.relu = torch.nn.ReLU(inplace=True) | |||
| # squeeze and excitation block | |||
| self.use_se = use_se | |||
| if use_se: | |||
| if se_kwargs is None: | |||
| se_kwargs = {} | |||
| self.se = SELayer(self.conv_layer.in_channels, **se_kwargs) | |||
| self.spatial_scale_factor = spatial_scale_factor | |||
| self.spatial_scale_mode = spatial_scale_mode | |||
| self.spectral_pos_encoding = spectral_pos_encoding | |||
| self.ffc3d = ffc3d | |||
| self.fft_norm = fft_norm | |||
| def forward(self, x): | |||
| batch = x.shape[0] | |||
| if self.spatial_scale_factor is not None: | |||
| orig_size = x.shape[-2:] | |||
| x = F.interpolate( | |||
| x, | |||
| scale_factor=self.spatial_scale_factor, | |||
| mode=self.spatial_scale_mode, | |||
| align_corners=False) | |||
| # (batch, c, h, w/2+1, 2) | |||
| fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1) | |||
| ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm) | |||
| ffted = torch.stack((ffted.real, ffted.imag), dim=-1) | |||
| ffted = ffted.permute(0, 1, 4, 2, | |||
| 3).contiguous() # (batch, c, 2, h, w/2+1) | |||
| ffted = ffted.view(( | |||
| batch, | |||
| -1, | |||
| ) + ffted.size()[3:]) | |||
| if self.spectral_pos_encoding: | |||
| height, width = ffted.shape[-2:] | |||
| coords_vert = torch.linspace(0, 1, | |||
| height)[None, None, :, None].expand( | |||
| batch, 1, height, width).to(ffted) | |||
| coords_hor = torch.linspace(0, 1, | |||
| width)[None, None, None, :].expand( | |||
| batch, 1, height, width).to(ffted) | |||
| ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1) | |||
| if self.use_se: | |||
| ffted = self.se(ffted) | |||
| ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1) | |||
| ffted = self.relu(self.bn(ffted)) | |||
| ffted = ffted.view(( | |||
| batch, | |||
| -1, | |||
| 2, | |||
| ) + ffted.size()[2:]).permute( | |||
| 0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2) | |||
| ffted = torch.complex(ffted[..., 0], ffted[..., 1]) | |||
| ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:] | |||
| output = torch.fft.irfftn( | |||
| ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm) | |||
| if self.spatial_scale_factor is not None: | |||
| output = F.interpolate( | |||
| output, | |||
| size=orig_size, | |||
| mode=self.spatial_scale_mode, | |||
| align_corners=False) | |||
| return output | |||
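Shape-wise the unit is a 1x1 convolution in the rfft2 domain: real and imaginary parts are stacked into channels, convolved, and transformed back to the input resolution. A quick check (a sketch, assuming FourierUnit is in scope):

import torch

fu = FourierUnit(in_channels=8, out_channels=8)
x = torch.randn(2, 8, 16, 16)
print(fu(x).shape)  # torch.Size([2, 8, 16, 16])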
| class SpectralTransform(nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| stride=1, | |||
| groups=1, | |||
| enable_lfu=True, | |||
| **fu_kwargs): | |||
| # bn_layer not used | |||
| super(SpectralTransform, self).__init__() | |||
| self.enable_lfu = enable_lfu | |||
| if stride == 2: | |||
| self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2) | |||
| else: | |||
| self.downsample = nn.Identity() | |||
| self.stride = stride | |||
| self.conv1 = nn.Sequential( | |||
| nn.Conv2d( | |||
| in_channels, | |||
| out_channels // 2, | |||
| kernel_size=1, | |||
| groups=groups, | |||
| bias=False), nn.BatchNorm2d(out_channels // 2), | |||
| nn.ReLU(inplace=True)) | |||
| self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups, | |||
| **fu_kwargs) | |||
| if self.enable_lfu: | |||
| self.lfu = FourierUnit(out_channels // 2, out_channels // 2, | |||
| groups) | |||
| self.conv2 = torch.nn.Conv2d( | |||
| out_channels // 2, | |||
| out_channels, | |||
| kernel_size=1, | |||
| groups=groups, | |||
| bias=False) | |||
| def forward(self, x): | |||
| x = self.downsample(x) | |||
| x = self.conv1(x) | |||
| output = self.fu(x) | |||
| if self.enable_lfu: | |||
| n, c, h, w = x.shape | |||
| split_no = 2 | |||
| split_s = h // split_no | |||
| xs = torch.cat( | |||
| torch.split(x[:, :c // 4], split_s, dim=-2), | |||
| dim=1).contiguous() | |||
| xs = torch.cat( | |||
| torch.split(xs, split_s, dim=-1), dim=1).contiguous() | |||
| xs = self.lfu(xs) | |||
| xs = xs.repeat(1, 1, split_no, split_no).contiguous() | |||
| else: | |||
| xs = 0 | |||
| output = self.conv2(x + output + xs) | |||
| return output | |||
| class LearnableSpatialTransformWrapper(nn.Module): | |||
| def __init__(self, | |||
| impl, | |||
| pad_coef=0.5, | |||
| angle_init_range=80, | |||
| train_angle=True): | |||
| super().__init__() | |||
| self.impl = impl | |||
| self.angle = torch.rand(1) * angle_init_range | |||
| if train_angle: | |||
| self.angle = nn.Parameter(self.angle, requires_grad=True) | |||
| self.pad_coef = pad_coef | |||
| def forward(self, x): | |||
| if torch.is_tensor(x): | |||
| return self.inverse_transform(self.impl(self.transform(x)), x) | |||
| elif isinstance(x, tuple): | |||
| x_trans = tuple(self.transform(elem) for elem in x) | |||
| y_trans = self.impl(x_trans) | |||
| return tuple( | |||
| self.inverse_transform(elem, orig_x) | |||
| for elem, orig_x in zip(y_trans, x)) | |||
| else: | |||
| raise ValueError(f'Unexpected input type {type(x)}') | |||
| def transform(self, x): | |||
| height, width = x.shape[2:] | |||
| pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) | |||
| x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect') | |||
| x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded)) | |||
| return x_padded_rotated | |||
| def inverse_transform(self, y_padded_rotated, orig_x): | |||
| height, width = orig_x.shape[2:] | |||
| pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) | |||
| y_padded = rotate( | |||
| y_padded_rotated, angle=-self.angle.to(y_padded_rotated)) | |||
| y_height, y_width = y_padded.shape[2:] | |||
| y = y_padded[:, :, pad_h:y_height - pad_h, pad_w:y_width - pad_w] | |||
| return y | |||
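The wrapper is a pad-rotate-apply-unrotate-crop sandwich, so the output shape matches the input shape; for example with an identity inner module (a sketch, assuming kornia is installed as imported above):

import torch
import torch.nn as nn

wrapper = LearnableSpatialTransformWrapper(nn.Identity(), train_angle=False)
x = torch.randn(1, 3, 32, 32)
print(wrapper(x).shape)  # torch.Size([1, 3, 32, 32])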
| class FFC(nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| ratio_gin, | |||
| ratio_gout, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| groups=1, | |||
| bias=False, | |||
| enable_lfu=True, | |||
| padding_type='reflect', | |||
| gated=False, | |||
| **spectral_kwargs): | |||
| super(FFC, self).__init__() | |||
| assert stride == 1 or stride == 2, 'Stride should be 1 or 2.' | |||
| self.stride = stride | |||
| in_cg = int(in_channels * ratio_gin) | |||
| in_cl = in_channels - in_cg | |||
| out_cg = int(out_channels * ratio_gout) | |||
| out_cl = out_channels - out_cg | |||
| self.ratio_gin = ratio_gin | |||
| self.ratio_gout = ratio_gout | |||
| self.global_in_num = in_cg | |||
| module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d | |||
| self.convl2l = module( | |||
| in_cl, | |||
| out_cl, | |||
| kernel_size, | |||
| stride, | |||
| padding, | |||
| dilation, | |||
| groups, | |||
| bias, | |||
| padding_mode=padding_type) | |||
| module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d | |||
| self.convl2g = module( | |||
| in_cl, | |||
| out_cg, | |||
| kernel_size, | |||
| stride, | |||
| padding, | |||
| dilation, | |||
| groups, | |||
| bias, | |||
| padding_mode=padding_type) | |||
| module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d | |||
| self.convg2l = module( | |||
| in_cg, | |||
| out_cl, | |||
| kernel_size, | |||
| stride, | |||
| padding, | |||
| dilation, | |||
| groups, | |||
| bias, | |||
| padding_mode=padding_type) | |||
| module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform | |||
| self.convg2g = module(in_cg, out_cg, stride, | |||
| 1 if groups == 1 else groups // 2, enable_lfu, | |||
| **spectral_kwargs) | |||
| self.gated = gated | |||
| module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d | |||
| self.gate = module(in_channels, 2, 1) | |||
| def forward(self, x): | |||
| x_l, x_g = x if type(x) is tuple else (x, 0) | |||
| out_xl, out_xg = 0, 0 | |||
| if self.gated: | |||
| total_input_parts = [x_l] | |||
| if torch.is_tensor(x_g): | |||
| total_input_parts.append(x_g) | |||
| total_input = torch.cat(total_input_parts, dim=1) | |||
| gates = torch.sigmoid(self.gate(total_input)) | |||
| g2l_gate, l2g_gate = gates.chunk(2, dim=1) | |||
| else: | |||
| g2l_gate, l2g_gate = 1, 1 | |||
| if self.ratio_gout != 1: | |||
| out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate | |||
| if self.ratio_gout != 0: | |||
| out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g) | |||
| return out_xl, out_xg | |||
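# Channel-split arithmetic for the FFC above, using LaMa's typical bottleneck
# setting (illustrative numbers only):
in_channels, ratio_gin = 512, 0.75
in_cg = int(in_channels * ratio_gin)  # 384 channels take the global/spectral path
in_cl = in_channels - in_cg           # 128 channels stay on the local conv path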
| class FFC_BN_ACT(nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| ratio_gin, | |||
| ratio_gout, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| groups=1, | |||
| bias=False, | |||
| norm_layer=nn.BatchNorm2d, | |||
| activation_layer=nn.Identity, | |||
| padding_type='reflect', | |||
| enable_lfu=True, | |||
| **kwargs): | |||
| super(FFC_BN_ACT, self).__init__() | |||
| self.ffc = FFC( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| ratio_gin, | |||
| ratio_gout, | |||
| stride, | |||
| padding, | |||
| dilation, | |||
| groups, | |||
| bias, | |||
| enable_lfu, | |||
| padding_type=padding_type, | |||
| **kwargs) | |||
| lnorm = nn.Identity if ratio_gout == 1 else norm_layer | |||
| gnorm = nn.Identity if ratio_gout == 0 else norm_layer | |||
| global_channels = int(out_channels * ratio_gout) | |||
| self.bn_l = lnorm(out_channels - global_channels) | |||
| self.bn_g = gnorm(global_channels) | |||
| lact = nn.Identity if ratio_gout == 1 else activation_layer | |||
| gact = nn.Identity if ratio_gout == 0 else activation_layer | |||
| self.act_l = lact(inplace=True) | |||
| self.act_g = gact(inplace=True) | |||
| def forward(self, x): | |||
| x_l, x_g = self.ffc(x) | |||
| x_l = self.act_l(self.bn_l(x_l)) | |||
| x_g = self.act_g(self.bn_g(x_g)) | |||
| return x_l, x_g | |||
| class FFCResnetBlock(nn.Module): | |||
| def __init__(self, | |||
| dim, | |||
| padding_type, | |||
| norm_layer, | |||
| activation_layer=nn.ReLU, | |||
| dilation=1, | |||
| spatial_transform_kwargs=None, | |||
| inline=False, | |||
| **conv_kwargs): | |||
| super().__init__() | |||
| self.conv1 = FFC_BN_ACT( | |||
| dim, | |||
| dim, | |||
| kernel_size=3, | |||
| padding=dilation, | |||
| dilation=dilation, | |||
| norm_layer=norm_layer, | |||
| activation_layer=activation_layer, | |||
| padding_type=padding_type, | |||
| **conv_kwargs) | |||
| self.conv2 = FFC_BN_ACT( | |||
| dim, | |||
| dim, | |||
| kernel_size=3, | |||
| padding=dilation, | |||
| dilation=dilation, | |||
| norm_layer=norm_layer, | |||
| activation_layer=activation_layer, | |||
| padding_type=padding_type, | |||
| **conv_kwargs) | |||
| if spatial_transform_kwargs is not None: | |||
| self.conv1 = LearnableSpatialTransformWrapper( | |||
| self.conv1, **spatial_transform_kwargs) | |||
| self.conv2 = LearnableSpatialTransformWrapper( | |||
| self.conv2, **spatial_transform_kwargs) | |||
| self.inline = inline | |||
| def forward(self, x): | |||
| if self.inline: | |||
| x_l = x[:, :-self.conv1.ffc.global_in_num] | |||
| x_g = x[:, -self.conv1.ffc.global_in_num:] | |||
| else: | |||
| x_l, x_g = x if type(x) is tuple else (x, 0) | |||
| id_l, id_g = x_l, x_g | |||
| x_l, x_g = self.conv1((x_l, x_g)) | |||
| x_l, x_g = self.conv2((x_l, x_g)) | |||
| x_l, x_g = id_l + x_l, id_g + x_g | |||
| out = x_l, x_g | |||
| if self.inline: | |||
| out = torch.cat(out, dim=1) | |||
| return out | |||
| class ConcatTupleLayer(nn.Module): | |||
| def forward(self, x): | |||
| assert isinstance(x, tuple) | |||
| x_l, x_g = x | |||
| assert torch.is_tensor(x_l) or torch.is_tensor(x_g) | |||
| if not torch.is_tensor(x_g): | |||
| return x_l | |||
| return torch.cat(x, dim=1) | |||
| class FFCResNetGenerator(nn.Module): | |||
| def __init__(self, | |||
| input_nc=4, | |||
| output_nc=3, | |||
| ngf=64, | |||
| n_downsampling=3, | |||
| n_blocks=18, | |||
| norm_layer=nn.BatchNorm2d, | |||
| padding_type='reflect', | |||
| activation_layer=nn.ReLU, | |||
| up_norm_layer=nn.BatchNorm2d, | |||
| up_activation=nn.ReLU(True), | |||
| init_conv_kwargs={ | |||
| 'ratio_gin': 0, | |||
| 'ratio_gout': 0, | |||
| 'enable_lfu': False | |||
| }, | |||
| downsample_conv_kwargs={ | |||
| 'ratio_gin': 0, | |||
| 'ratio_gout': 0, | |||
| 'enable_lfu': False | |||
| }, | |||
| resnet_conv_kwargs={ | |||
| 'ratio_gin': 0.75, | |||
| 'ratio_gout': 0.75, | |||
| 'enable_lfu': False | |||
| }, | |||
| spatial_transform_layers=None, | |||
| spatial_transform_kwargs={}, | |||
| add_out_act='sigmoid', | |||
| max_features=1024, | |||
| out_ffc=False, | |||
| out_ffc_kwargs={}): | |||
| assert (n_blocks >= 0) | |||
| super().__init__() | |||
| model = [ | |||
| nn.ReflectionPad2d(3), | |||
| FFC_BN_ACT( | |||
| input_nc, | |||
| ngf, | |||
| kernel_size=7, | |||
| padding=0, | |||
| norm_layer=norm_layer, | |||
| activation_layer=activation_layer, | |||
| **init_conv_kwargs) | |||
| ] | |||
| # downsample | |||
| for i in range(n_downsampling): | |||
| mult = 2**i | |||
| if i == n_downsampling - 1: | |||
| cur_conv_kwargs = dict(downsample_conv_kwargs) | |||
| cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get( | |||
| 'ratio_gin', 0) | |||
| else: | |||
| cur_conv_kwargs = downsample_conv_kwargs | |||
| model += [ | |||
| FFC_BN_ACT( | |||
| min(max_features, ngf * mult), | |||
| min(max_features, ngf * mult * 2), | |||
| kernel_size=3, | |||
| stride=2, | |||
| padding=1, | |||
| norm_layer=norm_layer, | |||
| activation_layer=activation_layer, | |||
| **cur_conv_kwargs) | |||
| ] | |||
| mult = 2**n_downsampling | |||
| feats_num_bottleneck = min(max_features, ngf * mult) | |||
| # resnet blocks | |||
| for i in range(n_blocks): | |||
| cur_resblock = FFCResnetBlock( | |||
| feats_num_bottleneck, | |||
| padding_type=padding_type, | |||
| activation_layer=activation_layer, | |||
| norm_layer=norm_layer, | |||
| **resnet_conv_kwargs) | |||
| if spatial_transform_layers is not None and i in spatial_transform_layers: | |||
| cur_resblock = LearnableSpatialTransformWrapper( | |||
| cur_resblock, **spatial_transform_kwargs) | |||
| model += [cur_resblock] | |||
| model += [ConcatTupleLayer()] | |||
| # upsample | |||
| for i in range(n_downsampling): | |||
| mult = 2**(n_downsampling - i) | |||
| model += [ | |||
| nn.ConvTranspose2d( | |||
| min(max_features, ngf * mult), | |||
| min(max_features, int(ngf * mult / 2)), | |||
| kernel_size=3, | |||
| stride=2, | |||
| padding=1, | |||
| output_padding=1), | |||
| up_norm_layer(min(max_features, int(ngf * mult / 2))), | |||
| up_activation | |||
| ] | |||
| if out_ffc: | |||
| model += [ | |||
| FFCResnetBlock( | |||
| ngf, | |||
| padding_type=padding_type, | |||
| activation_layer=activation_layer, | |||
| norm_layer=norm_layer, | |||
| inline=True, | |||
| **out_ffc_kwargs) | |||
| ] | |||
| model += [ | |||
| nn.ReflectionPad2d(3), | |||
| nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0) | |||
| ] | |||
| if add_out_act: | |||
| model.append( | |||
| get_activation('tanh' if add_out_act is True else add_out_act)) | |||
| self.model = nn.Sequential(*model) | |||
| def forward(self, input): | |||
| return self.model(input) | |||
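# Smoke-test sketch for the generator above (assumption: the 4 input channels
# are RGB + mask, matching the default input_nc=4):
import torch
gen = FFCResNetGenerator()
out = gen(torch.randn(1, 4, 256, 256))  # -> (1, 3, 256, 256), sigmoid output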
| @@ -0,0 +1,324 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torchvision import models | |||
| from modelscope.utils.logger import get_logger | |||
| try: | |||
| from torchvision.models.utils import load_state_dict_from_url | |||
| except ImportError: | |||
| from torch.utils.model_zoo import load_url as load_state_dict_from_url | |||
| # Inception weights ported to Pytorch from | |||
| # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz | |||
| FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/' \ | |||
| 'fid_weights/pt_inception-2015-12-05-6726825d.pth' | |||
| LOGGER = get_logger() | |||
| class InceptionV3(nn.Module): | |||
| """Pretrained InceptionV3 network returning feature maps""" | |||
| # Index of default block of inception to return, | |||
| # corresponds to output of final average pooling | |||
| DEFAULT_BLOCK_INDEX = 3 | |||
| # Maps feature dimensionality to their output blocks indices | |||
| BLOCK_INDEX_BY_DIM = { | |||
| 64: 0, # First max pooling features | |||
| 192: 1, # Second max pooling features | |||
| 768: 2, # Pre-aux classifier features | |||
| 2048: 3 # Final average pooling features | |||
| } | |||
| def __init__(self, | |||
| output_blocks=[DEFAULT_BLOCK_INDEX], | |||
| resize_input=True, | |||
| normalize_input=True, | |||
| requires_grad=False, | |||
| use_fid_inception=True): | |||
| """Build pretrained InceptionV3 | |||
| Parameters | |||
| ---------- | |||
| output_blocks : list of int | |||
| Indices of blocks to return features of. Possible values are: | |||
| - 0: corresponds to output of first max pooling | |||
| - 1: corresponds to output of second max pooling | |||
| - 2: corresponds to output which is fed to aux classifier | |||
| - 3: corresponds to output of final average pooling | |||
| resize_input : bool | |||
| If true, bilinearly resizes input to width and height 299 before | |||
| feeding input to model. As the network without fully connected | |||
| layers is fully convolutional, it should be able to handle inputs | |||
| of arbitrary size, so resizing might not be strictly needed | |||
| normalize_input : bool | |||
| If true, scales the input from range (0, 1) to the range the | |||
| pretrained Inception network expects, namely (-1, 1) | |||
| requires_grad : bool | |||
| If true, parameters of the model require gradients. Possibly useful | |||
| for finetuning the network | |||
| use_fid_inception : bool | |||
| If true, uses the pretrained Inception model used in Tensorflow's | |||
| FID implementation. If false, uses the pretrained Inception model | |||
| available in torchvision. The FID Inception model has different | |||
| weights and a slightly different structure from torchvision's | |||
| Inception model. If you want to compute FID scores, you are | |||
| strongly advised to set this parameter to true to get comparable | |||
| results. | |||
| """ | |||
| super(InceptionV3, self).__init__() | |||
| self.resize_input = resize_input | |||
| self.normalize_input = normalize_input | |||
| self.output_blocks = sorted(output_blocks) | |||
| self.last_needed_block = max(output_blocks) | |||
| assert self.last_needed_block <= 3, \ | |||
| 'Last possible output block index is 3' | |||
| self.blocks = nn.ModuleList() | |||
| if use_fid_inception: | |||
| inception = fid_inception_v3() | |||
| else: | |||
| inception = models.inception_v3(pretrained=True) | |||
| # Block 0: input to maxpool1 | |||
| block0 = [ | |||
| inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3, | |||
| inception.Conv2d_2b_3x3, | |||
| nn.MaxPool2d(kernel_size=3, stride=2) | |||
| ] | |||
| self.blocks.append(nn.Sequential(*block0)) | |||
| # Block 1: maxpool1 to maxpool2 | |||
| if self.last_needed_block >= 1: | |||
| block1 = [ | |||
| inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3, | |||
| nn.MaxPool2d(kernel_size=3, stride=2) | |||
| ] | |||
| self.blocks.append(nn.Sequential(*block1)) | |||
| # Block 2: maxpool2 to aux classifier | |||
| if self.last_needed_block >= 2: | |||
| block2 = [ | |||
| inception.Mixed_5b, | |||
| inception.Mixed_5c, | |||
| inception.Mixed_5d, | |||
| inception.Mixed_6a, | |||
| inception.Mixed_6b, | |||
| inception.Mixed_6c, | |||
| inception.Mixed_6d, | |||
| inception.Mixed_6e, | |||
| ] | |||
| self.blocks.append(nn.Sequential(*block2)) | |||
| # Block 3: aux classifier to final avgpool | |||
| if self.last_needed_block >= 3: | |||
| block3 = [ | |||
| inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c, | |||
| nn.AdaptiveAvgPool2d(output_size=(1, 1)) | |||
| ] | |||
| self.blocks.append(nn.Sequential(*block3)) | |||
| for param in self.parameters(): | |||
| param.requires_grad = requires_grad | |||
| def forward(self, inp): | |||
| """Get Inception feature maps | |||
| Parameters | |||
| ---------- | |||
| inp : torch.autograd.Variable | |||
| Input tensor of shape Bx3xHxW. Values are expected to be in | |||
| range (0, 1) | |||
| Returns | |||
| ------- | |||
| List of torch.autograd.Variable, corresponding to the selected output | |||
| block, sorted ascending by index | |||
| """ | |||
| outp = [] | |||
| x = inp | |||
| if self.resize_input: | |||
| x = F.interpolate( | |||
| x, size=(299, 299), mode='bilinear', align_corners=False) | |||
| if self.normalize_input: | |||
| x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) | |||
| for idx, block in enumerate(self.blocks): | |||
| x = block(x) | |||
| if idx in self.output_blocks: | |||
| outp.append(x) | |||
| if idx == self.last_needed_block: | |||
| break | |||
| return outp | |||
| def fid_inception_v3(): | |||
| """Build pretrained Inception model for FID computation | |||
| The Inception model for FID computation uses a different set of weights | |||
| and has a slightly different structure than torchvision's Inception. | |||
| This method first constructs torchvision's Inception and then patches the | |||
| necessary parts that are different in the FID Inception model. | |||
| """ | |||
| LOGGER.info('fid_inception_v3 called') | |||
| inception = models.inception_v3( | |||
| num_classes=1008, aux_logits=False, pretrained=False) | |||
| LOGGER.info('models.inception_v3 done') | |||
| inception.Mixed_5b = FIDInceptionA(192, pool_features=32) | |||
| inception.Mixed_5c = FIDInceptionA(256, pool_features=64) | |||
| inception.Mixed_5d = FIDInceptionA(288, pool_features=64) | |||
| inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128) | |||
| inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160) | |||
| inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160) | |||
| inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192) | |||
| inception.Mixed_7b = FIDInceptionE_1(1280) | |||
| inception.Mixed_7c = FIDInceptionE_2(2048) | |||
| LOGGER.info('fid_inception_v3 patching done') | |||
| state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True) | |||
| LOGGER.info('fid_inception_v3 weights downloaded') | |||
| inception.load_state_dict(state_dict) | |||
| LOGGER.info('fid_inception_v3 weights loaded into model') | |||
| return inception | |||
| class FIDInceptionA(models.inception.InceptionA): | |||
| """InceptionA block patched for FID computation""" | |||
| def __init__(self, in_channels, pool_features): | |||
| super(FIDInceptionA, self).__init__(in_channels, pool_features) | |||
| def forward(self, x): | |||
| branch1x1 = self.branch1x1(x) | |||
| branch5x5 = self.branch5x5_1(x) | |||
| branch5x5 = self.branch5x5_2(branch5x5) | |||
| branch3x3dbl = self.branch3x3dbl_1(x) | |||
| branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) | |||
| branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) | |||
| # Patch: Tensorflow's average pool does not use the padded zeros in | |||
| # its average calculation | |||
| branch_pool = F.avg_pool2d( | |||
| x, kernel_size=3, stride=1, padding=1, count_include_pad=False) | |||
| branch_pool = self.branch_pool(branch_pool) | |||
| outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] | |||
| return torch.cat(outputs, 1) | |||
| class FIDInceptionC(models.inception.InceptionC): | |||
| """InceptionC block patched for FID computation""" | |||
| def __init__(self, in_channels, channels_7x7): | |||
| super(FIDInceptionC, self).__init__(in_channels, channels_7x7) | |||
| def forward(self, x): | |||
| branch1x1 = self.branch1x1(x) | |||
| branch7x7 = self.branch7x7_1(x) | |||
| branch7x7 = self.branch7x7_2(branch7x7) | |||
| branch7x7 = self.branch7x7_3(branch7x7) | |||
| branch7x7dbl = self.branch7x7dbl_1(x) | |||
| branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) | |||
| branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) | |||
| branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) | |||
| branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) | |||
| # Patch: Tensorflow's average pool does not use the padded zeros in | |||
| # its average calculation | |||
| branch_pool = F.avg_pool2d( | |||
| x, kernel_size=3, stride=1, padding=1, count_include_pad=False) | |||
| branch_pool = self.branch_pool(branch_pool) | |||
| outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] | |||
| return torch.cat(outputs, 1) | |||
| class FIDInceptionE_1(models.inception.InceptionE): | |||
| """First InceptionE block patched for FID computation""" | |||
| def __init__(self, in_channels): | |||
| super(FIDInceptionE_1, self).__init__(in_channels) | |||
| def forward(self, x): | |||
| branch1x1 = self.branch1x1(x) | |||
| branch3x3 = self.branch3x3_1(x) | |||
| branch3x3 = [ | |||
| self.branch3x3_2a(branch3x3), | |||
| self.branch3x3_2b(branch3x3), | |||
| ] | |||
| branch3x3 = torch.cat(branch3x3, 1) | |||
| branch3x3dbl = self.branch3x3dbl_1(x) | |||
| branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) | |||
| branch3x3dbl = [ | |||
| self.branch3x3dbl_3a(branch3x3dbl), | |||
| self.branch3x3dbl_3b(branch3x3dbl), | |||
| ] | |||
| branch3x3dbl = torch.cat(branch3x3dbl, 1) | |||
| # Patch: Tensorflow's average pool does not use the padded zeros in | |||
| # its average calculation | |||
| branch_pool = F.avg_pool2d( | |||
| x, kernel_size=3, stride=1, padding=1, count_include_pad=False) | |||
| branch_pool = self.branch_pool(branch_pool) | |||
| outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] | |||
| return torch.cat(outputs, 1) | |||
| class FIDInceptionE_2(models.inception.InceptionE): | |||
| """Second InceptionE block patched for FID computation""" | |||
| def __init__(self, in_channels): | |||
| super(FIDInceptionE_2, self).__init__(in_channels) | |||
| def forward(self, x): | |||
| branch1x1 = self.branch1x1(x) | |||
| branch3x3 = self.branch3x3_1(x) | |||
| branch3x3 = [ | |||
| self.branch3x3_2a(branch3x3), | |||
| self.branch3x3_2b(branch3x3), | |||
| ] | |||
| branch3x3 = torch.cat(branch3x3, 1) | |||
| branch3x3dbl = self.branch3x3dbl_1(x) | |||
| branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) | |||
| branch3x3dbl = [ | |||
| self.branch3x3dbl_3a(branch3x3dbl), | |||
| self.branch3x3dbl_3b(branch3x3dbl), | |||
| ] | |||
| branch3x3dbl = torch.cat(branch3x3dbl, 1) | |||
| # Patch: The FID Inception model uses max pooling instead of average | |||
| # pooling. This is likely an error in this specific Inception | |||
| # implementation, as other Inception models use average pooling here | |||
| # (which matches the description in the paper). | |||
| branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) | |||
| branch_pool = self.branch_pool(branch_pool) | |||
| outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] | |||
| return torch.cat(outputs, 1) | |||
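# Sketch of the intended FID usage (illustrative; fetches the ported weights
# on first call): extract the 2048-d final-pool features for a batch of
# images in [0, 1].
import torch
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]
model = InceptionV3(output_blocks=[block_idx]).eval()
feats = model(torch.rand(2, 3, 299, 299))[0]  # -> (2, 2048, 1, 1)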
| @@ -0,0 +1,47 @@ | |||
| """ | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| """ | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import torchvision | |||
| from .ade20k import ModelBuilder | |||
| IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None] | |||
| IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None] | |||
| class ResNetPL(nn.Module): | |||
| def __init__(self, | |||
| weight=1, | |||
| weights_path=None, | |||
| arch_encoder='resnet50dilated', | |||
| segmentation=True): | |||
| super().__init__() | |||
| self.impl = ModelBuilder.get_encoder( | |||
| weights_path=weights_path, | |||
| arch_encoder=arch_encoder, | |||
| arch_decoder='ppm_deepsup', | |||
| fc_dim=2048, | |||
| segmentation=segmentation) | |||
| self.impl.eval() | |||
| for w in self.impl.parameters(): | |||
| w.requires_grad_(False) | |||
| self.weight = weight | |||
| def forward(self, pred, target): | |||
| pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred) | |||
| target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target) | |||
| pred_feats = self.impl(pred, return_feature_maps=True) | |||
| target_feats = self.impl(target, return_feature_maps=True) | |||
| result = torch.stack([ | |||
| F.mse_loss(cur_pred, cur_target) | |||
| for cur_pred, cur_target in zip(pred_feats, target_feats) | |||
| ]).sum() * self.weight | |||
| return result | |||
| @@ -0,0 +1,75 @@ | |||
| """ | |||
| The implementation is adopted from | |||
| https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py | |||
| """ | |||
| import collections | |||
| import functools | |||
| import logging | |||
| from collections import defaultdict | |||
| from functools import partial | |||
| import numpy as np | |||
| import torch.nn as nn | |||
| # Defines the PatchGAN discriminator with the specified arguments. | |||
| class NLayerDiscriminator(nn.Module): | |||
| def __init__( | |||
| self, | |||
| input_nc=3, | |||
| ndf=64, | |||
| n_layers=4, | |||
| norm_layer=nn.BatchNorm2d, | |||
| ): | |||
| super().__init__() | |||
| self.n_layers = n_layers | |||
| kw = 4 | |||
| padw = int(np.ceil((kw - 1.0) / 2)) | |||
| sequence = [[ | |||
| nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), | |||
| nn.LeakyReLU(0.2, True) | |||
| ]] | |||
| nf = ndf | |||
| for n in range(1, n_layers): | |||
| nf_prev = nf | |||
| nf = min(nf * 2, 512) | |||
| cur_model = [] | |||
| cur_model += [ | |||
| nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw), | |||
| norm_layer(nf), | |||
| nn.LeakyReLU(0.2, True) | |||
| ] | |||
| sequence.append(cur_model) | |||
| nf_prev = nf | |||
| nf = min(nf * 2, 512) | |||
| cur_model = [] | |||
| cur_model += [ | |||
| nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw), | |||
| norm_layer(nf), | |||
| nn.LeakyReLU(0.2, True) | |||
| ] | |||
| sequence.append(cur_model) | |||
| sequence += [[ | |||
| nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw) | |||
| ]] | |||
| for n in range(len(sequence)): | |||
| setattr(self, 'model' + str(n), nn.Sequential(*sequence[n])) | |||
| def get_all_activations(self, x): | |||
| res = [x] | |||
| for n in range(self.n_layers + 2): | |||
| model = getattr(self, 'model' + str(n)) | |||
| res.append(model(res[-1])) | |||
| return res[1:] | |||
| def forward(self, x): | |||
| act = self.get_all_activations(x) | |||
| return act[-1], act[:-1] | |||
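# Usage sketch (illustrative): the discriminator returns the final patch
# logits plus every intermediate activation, which LaMa consumes for its
# feature-matching loss.
import torch
disc = NLayerDiscriminator(input_nc=3)
logits, feats = disc(torch.randn(1, 3, 256, 256))
# logits: (1, 1, h', w') patch map; feats: list of n_layers + 1 feature maps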
| @@ -0,0 +1,393 @@ | |||
| ''' | |||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||
| https://github.com/saic-mdal/lama | |||
| ''' | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| from kornia.filters import gaussian_blur2d | |||
| from kornia.geometry.transform import resize | |||
| from kornia.morphology import erosion | |||
| from torch.nn import functional as F | |||
| from torch.optim import SGD, Adam | |||
| from tqdm import tqdm | |||
| from .modules.ffc import FFCResnetBlock | |||
| def move_to_device(obj, device): | |||
| if isinstance(obj, nn.Module): | |||
| return obj.to(device) | |||
| if torch.is_tensor(obj): | |||
| return obj.to(device) | |||
| if isinstance(obj, (tuple, list)): | |||
| return [move_to_device(el, device) for el in obj] | |||
| if isinstance(obj, dict): | |||
| return {name: move_to_device(val, device) for name, val in obj.items()} | |||
| raise ValueError(f'Unexpected type {type(obj)}') | |||
| def ceil_modulo(x, mod): | |||
| if x % mod == 0: | |||
| return x | |||
| return (x // mod + 1) * mod | |||
| def pad_tensor_to_modulo(img, mod): | |||
| batch_size, channels, height, width = img.shape | |||
| out_height = ceil_modulo(height, mod) | |||
| out_width = ceil_modulo(width, mod) | |||
| return F.pad( | |||
| img, | |||
| pad=(0, out_width - width, 0, out_height - height), | |||
| mode='reflect') | |||
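# Sketch (assumed toy size): pad a 250x333 image so both sides are divisible
# by 8; the padding is reflected on the bottom/right only, matching the crop
# performed later in refine_predict.
import torch
y = pad_tensor_to_modulo(torch.rand(1, 3, 250, 333), 8)  # -> (1, 3, 256, 336)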
| def _pyrdown(im: torch.Tensor, downsize: tuple = None): | |||
| """downscale the image""" | |||
| if downsize is None: | |||
| downsize = (im.shape[2] // 2, im.shape[3] // 2) | |||
| assert im.shape[1] == 3, 'Expected shape for the input to be (n, 3, height, width)' | |||
| im = gaussian_blur2d(im, kernel_size=(5, 5), sigma=(1.0, 1.0)) | |||
| im = F.interpolate(im, size=downsize, mode='bilinear', align_corners=False) | |||
| return im | |||
| def _pyrdown_mask(mask: torch.Tensor, | |||
| downsize: tuple = None, | |||
| eps: float = 1e-8, | |||
| blur_mask: bool = True, | |||
| round_up: bool = True): | |||
| """downscale the mask tensor | |||
| Parameters | |||
| ---------- | |||
| mask : torch.Tensor | |||
| mask of size (B, 1, H, W) | |||
| downsize : tuple, optional | |||
| size to downscale to. If None, image is downscaled to half, by default None | |||
| eps : float, optional | |||
| threshold value for binarizing the mask, by default 1e-8 | |||
| blur_mask : bool, optional | |||
| if True, apply gaussian filter before downscaling, by default True | |||
| round_up : bool, optional | |||
| if True, values above eps are rounded up to 1; otherwise, values below 1 - eps are rounded down to 0, by default True | |||
| Returns | |||
| ------- | |||
| torch.Tensor | |||
| downscaled mask | |||
| """ | |||
| if downsize is None: | |||
| downsize = (mask.shape[2] // 2, mask.shape[3] // 2) | |||
| assert mask.shape[1] == 1, 'Expected shape for the input to be (n, 1, height, width)' | |||
| if blur_mask is True: | |||
| mask = gaussian_blur2d(mask, kernel_size=(5, 5), sigma=(1.0, 1.0)) | |||
| mask = F.interpolate( | |||
| mask, size=downsize, mode='bilinear', align_corners=False) | |||
| else: | |||
| mask = F.interpolate( | |||
| mask, size=downsize, mode='bilinear', align_corners=False) | |||
| if round_up: | |||
| mask[mask >= eps] = 1 | |||
| mask[mask < eps] = 0 | |||
| else: | |||
| mask[mask >= 1.0 - eps] = 1 | |||
| mask[mask < 1.0 - eps] = 0 | |||
| return mask | |||
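# Sketch (assumed toy mask): halving a hard 0/1 mask with round_up=True is
# conservative, so any downscaled pixel touching a hole stays marked as hole.
import torch
m = torch.zeros(1, 1, 8, 8)
m[..., 2:6, 2:6] = 1.0
m_small = _pyrdown_mask(m)  # -> (1, 1, 4, 4), still strictly binary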
| def _erode_mask(mask: torch.Tensor, | |||
| ekernel: torch.Tensor = None, | |||
| eps: float = 1e-8): | |||
| """erode the mask, and set gray pixels to 0""" | |||
| if ekernel is not None: | |||
| mask = erosion(mask, ekernel) | |||
| mask[mask >= 1.0 - eps] = 1 | |||
| mask[mask < 1.0 - eps] = 0 | |||
| return mask | |||
| def _l1_loss(pred: torch.Tensor, | |||
| pred_downscaled: torch.Tensor, | |||
| ref: torch.Tensor, | |||
| mask: torch.Tensor, | |||
| mask_downscaled: torch.Tensor, | |||
| image: torch.Tensor, | |||
| on_pred: bool = True): | |||
| """l1 loss on src pixels, and downscaled predictions if on_pred=True""" | |||
| loss = torch.mean(torch.abs(pred[mask < 1e-8] - image[mask < 1e-8])) | |||
| if on_pred: | |||
| loss += torch.mean( | |||
| torch.abs(pred_downscaled[mask_downscaled >= 1e-8] | |||
| - ref[mask_downscaled >= 1e-8])) | |||
| return loss | |||
| def _infer(image: torch.Tensor, | |||
| mask: torch.Tensor, | |||
| forward_front: nn.Module, | |||
| forward_rears: nn.Module, | |||
| ref_lower_res: torch.Tensor, | |||
| orig_shape: tuple, | |||
| devices: list, | |||
| scale_ind: int, | |||
| n_iters: int = 15, | |||
| lr: float = 0.002): | |||
| """Performs inference with refinement at a given scale. | |||
| Parameters | |||
| ---------- | |||
| image : torch.Tensor | |||
| input image to be inpainted, of size (1,3,H,W) | |||
| mask : torch.Tensor | |||
| input inpainting mask, of size (1,1,H,W) | |||
| forward_front : nn.Module | |||
| the front part of the inpainting network | |||
| forward_rears : nn.Module | |||
| the rear part of the inpainting network | |||
| ref_lower_res : torch.Tensor | |||
| the inpainting at previous scale, used as reference image | |||
| orig_shape : tuple | |||
| shape of the original input image before padding | |||
| devices : list | |||
| list of available devices | |||
| scale_ind : int | |||
| the scale index | |||
| n_iters : int, optional | |||
| number of iterations of refinement, by default 15 | |||
| lr : float, optional | |||
| learning rate, by default 0.002 | |||
| Returns | |||
| ------- | |||
| torch.Tensor | |||
| inpainted image | |||
| """ | |||
| masked_image = image * (1 - mask) | |||
| masked_image = torch.cat([masked_image, mask], dim=1) | |||
| mask = mask.repeat(1, 3, 1, 1) | |||
| if ref_lower_res is not None: | |||
| ref_lower_res = ref_lower_res.detach() | |||
| with torch.no_grad(): | |||
| z1, z2 = forward_front(masked_image) | |||
| # Inference | |||
| mask = mask.to(devices[-1]) | |||
| ekernel = torch.from_numpy( | |||
| cv2.getStructuringElement(cv2.MORPH_ELLIPSE, | |||
| (15, 15)).astype(bool)).float() | |||
| ekernel = ekernel.to(devices[-1]) | |||
| image = image.to(devices[-1]) | |||
| z1, z2 = z1.detach().to(devices[0]), z2.detach().to(devices[0]) | |||
| z1.requires_grad, z2.requires_grad = True, True | |||
| optimizer = Adam([z1, z2], lr=lr) | |||
| pbar = tqdm(range(n_iters), leave=False) | |||
| for idi in pbar: | |||
| optimizer.zero_grad() | |||
| input_feat = (z1, z2) | |||
| for idd, forward_rear in enumerate(forward_rears): | |||
| output_feat = forward_rear(input_feat) | |||
| if idd < len(devices) - 1: | |||
| midz1, midz2 = output_feat | |||
| midz1, midz2 = midz1.to(devices[idd + 1]), midz2.to( | |||
| devices[idd + 1]) | |||
| input_feat = (midz1, midz2) | |||
| else: | |||
| pred = output_feat | |||
| if ref_lower_res is None: | |||
| break | |||
| losses = {} | |||
| # scaled loss with downsampler | |||
| pred_downscaled = _pyrdown(pred[:, :, :orig_shape[0], :orig_shape[1]]) | |||
| mask_downscaled = _pyrdown_mask( | |||
| mask[:, :1, :orig_shape[0], :orig_shape[1]], | |||
| blur_mask=False, | |||
| round_up=False) | |||
| mask_downscaled = _erode_mask(mask_downscaled, ekernel=ekernel) | |||
| mask_downscaled = mask_downscaled.repeat(1, 3, 1, 1) | |||
| losses['ms_l1'] = _l1_loss( | |||
| pred, | |||
| pred_downscaled, | |||
| ref_lower_res, | |||
| mask, | |||
| mask_downscaled, | |||
| image, | |||
| on_pred=True) | |||
| loss = sum(losses.values()) | |||
| pbar.set_description( | |||
| 'Refining scale {} using scale {} ...current loss: {:.4f}'.format( | |||
| scale_ind + 1, scale_ind, loss.item())) | |||
| if idi < n_iters - 1: | |||
| loss.backward() | |||
| optimizer.step() | |||
| del pred_downscaled | |||
| del loss | |||
| del pred | |||
| # "pred" is the prediction after Plug-n-Play module | |||
| inpainted = mask * pred + (1 - mask) * image | |||
| inpainted = inpainted.detach().cpu() | |||
| return inpainted | |||
| def _get_image_mask_pyramid(batch: dict, min_side: int, max_scales: int, | |||
| px_budget: int): | |||
| """Build the image mask pyramid | |||
| Parameters | |||
| ---------- | |||
| batch : dict | |||
| batch containing image, mask, etc | |||
| min_side : int | |||
| minimum side length to limit the number of scales of the pyramid | |||
| max_scales : int | |||
| maximum number of scales allowed | |||
| px_budget : int | |||
| the product H*W cannot exceed this budget, because of resource constraints | |||
| Returns | |||
| ------- | |||
| tuple | |||
| image-mask pyramid in the form of list of images and list of masks | |||
| """ | |||
| assert batch['image'].shape[0] == 1, 'refiner only works on batches of size 1!' | |||
| h, w = batch['unpad_to_size'] | |||
| h, w = h[0].item(), w[0].item() | |||
| image = batch['image'][..., :h, :w] | |||
| mask = batch['mask'][..., :h, :w] | |||
| if h * w > px_budget: | |||
| # resize | |||
| ratio = np.sqrt(px_budget / float(h * w)) | |||
| h_orig, w_orig = h, w | |||
| h, w = int(h * ratio), int(w * ratio) | |||
| print(f'Original image too large for refinement! ' | |||
| f'Resizing {(h_orig, w_orig)} to {(h, w)}...') | |||
| image = resize( | |||
| image, (h, w), interpolation='bilinear', align_corners=False) | |||
| mask = resize( | |||
| mask, (h, w), interpolation='bilinear', align_corners=False) | |||
| mask[mask > 1e-8] = 1 | |||
| breadth = min(h, w) | |||
| n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))), | |||
| max_scales) | |||
| ls_images = [] | |||
| ls_masks = [] | |||
| ls_images.append(image) | |||
| ls_masks.append(mask) | |||
| for _ in range(n_scales - 1): | |||
| image_p = _pyrdown(ls_images[-1]) | |||
| mask_p = _pyrdown_mask(ls_masks[-1]) | |||
| ls_images.append(image_p) | |||
| ls_masks.append(mask_p) | |||
| # reverse the lists because we want the lowest resolution image as index 0 | |||
| return ls_images[::-1], ls_masks[::-1] | |||
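# Scale-count sketch for the pyramid above (illustrative numbers): a
# 2048x1536 input with min_side=512 yields three scales before hitting the
# max_scales cap.
import numpy as np
breadth, min_side, max_scales = 1536, 512, 8
n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))), max_scales)  # -> 3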
| def refine_predict(batch: dict, inpainter: nn.Module, gpu_ids: str, | |||
| modulo: int, n_iters: int, lr: float, min_side: int, | |||
| max_scales: int, px_budget: int): | |||
| """Refines the inpainting of the network | |||
| Parameters | |||
| ---------- | |||
| batch : dict | |||
| image-mask batch, currently we assume the batchsize to be 1 | |||
| inpainter : nn.Module | |||
| the inpainting neural network | |||
| gpu_ids : str | |||
| the GPU ids of the machine to use. If only single GPU, use: "0," | |||
| modulo : int | |||
| pad the image to ensure dimension % modulo == 0 | |||
| n_iters : int | |||
| number of iterations of refinement for each scale | |||
| lr : float | |||
| learning rate | |||
| min_side : int | |||
| all sides of image on all scales should be >= min_side / sqrt(2) | |||
| max_scales : int | |||
| max number of downscaling scales for the image-mask pyramid | |||
| px_budget : int | |||
| pixels budget. Any image will be resized to satisfy height*width <= px_budget | |||
| Returns | |||
| ------- | |||
| torch.Tensor | |||
| inpainted image of size (1,3,H,W) | |||
| """ | |||
| inpainter = inpainter.model | |||
| assert not inpainter.training | |||
| assert not inpainter.add_noise_kwargs | |||
| assert inpainter.concat_mask | |||
| gpu_ids = [ | |||
| f'cuda:{gpuid}' for gpuid in gpu_ids.replace(' ', '').split(',') | |||
| if gpuid.isdigit() | |||
| ] | |||
| n_resnet_blocks = 0 | |||
| first_resblock_ind = 0 | |||
| found_first_resblock = False | |||
| for idl in range(len(inpainter.generator.model)): | |||
| if isinstance(inpainter.generator.model[idl], FFCResnetBlock): | |||
| n_resnet_blocks += 1 | |||
| found_first_resblock = True | |||
| elif not found_first_resblock: | |||
| first_resblock_ind += 1 | |||
| resblocks_per_gpu = n_resnet_blocks // len(gpu_ids) | |||
| devices = [torch.device(gpu_id) for gpu_id in gpu_ids] | |||
| # split the model into front, and rear parts | |||
| forward_front = inpainter.generator.model[0:first_resblock_ind] | |||
| forward_front.to(devices[0]) | |||
| forward_rears = [] | |||
| for idd in range(len(gpu_ids)): | |||
| if idd < len(gpu_ids) - 1: | |||
| forward_rears.append( | |||
| inpainter.generator.model[first_resblock_ind | |||
| + resblocks_per_gpu | |||
| * (idd):first_resblock_ind | |||
| + resblocks_per_gpu * (idd + 1)]) | |||
| else: | |||
| forward_rears.append( | |||
| inpainter.generator.model[first_resblock_ind | |||
| + resblocks_per_gpu * (idd):]) | |||
| forward_rears[idd].to(devices[idd]) | |||
| ls_images, ls_masks = _get_image_mask_pyramid(batch, min_side, max_scales, | |||
| px_budget) | |||
| image_inpainted = None | |||
| for ids, (image, mask) in enumerate(zip(ls_images, ls_masks)): | |||
| orig_shape = image.shape[2:] | |||
| image = pad_tensor_to_modulo(image, modulo) | |||
| mask = pad_tensor_to_modulo(mask, modulo) | |||
| mask[mask >= 1e-8] = 1.0 | |||
| mask[mask < 1e-8] = 0.0 | |||
| image, mask = move_to_device(image, devices[0]), move_to_device( | |||
| mask, devices[0]) | |||
| if image_inpainted is not None: | |||
| image_inpainted = move_to_device(image_inpainted, devices[-1]) | |||
| image_inpainted = _infer(image, mask, forward_front, forward_rears, | |||
| image_inpainted, orig_shape, devices, ids, | |||
| n_iters, lr) | |||
| image_inpainted = image_inpainted[:, :, :orig_shape[0], :orig_shape[1]] | |||
| # detach everything to save resources | |||
| image = image.detach().cpu() | |||
| mask = mask.detach().cpu() | |||
| return image_inpainted | |||
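# Call sketch for the refiner (hyper-parameter values follow the LaMa
# refinement defaults; `batch` and `inpainter` are assumed to be the loaded
# image-mask batch and inpainting model wrapper, so this is not runnable
# as-is):
# refined = refine_predict(batch, inpainter, gpu_ids='0,', modulo=8,
#                          n_iters=15, lr=0.002, min_side=512,
#                          max_scales=3, px_budget=1800000)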
| @@ -10,7 +10,7 @@ if TYPE_CHECKING: | |||
| else: | |||
| _import_structure = { | |||
| 'mmdet_model': ['DetectionModel'], | |||
| 'yolox_pai': ['YOLOX'] | |||
| 'yolox_pai': ['YOLOX'], | |||
| } | |||
| import sys | |||
| @@ -9,6 +9,9 @@ from modelscope.utils.constant import Tasks | |||
| @MODELS.register_module( | |||
| group_key=Tasks.image_object_detection, module_name=Models.yolox) | |||
| @MODELS.register_module( | |||
| group_key=Tasks.image_object_detection, | |||
| module_name=Models.image_object_detection_auto) | |||
| class YOLOX(EasyCVBaseModel, _YOLOX): | |||
| def __init__(self, model_dir=None, *args, **kwargs): | |||
| @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .realtime_detector import RealtimeDetector | |||
| from .realtime_video_detector import RealtimeVideoDetector | |||
| else: | |||
| _import_structure = { | |||
| 'realtime_detector': ['RealtimeDetector'], | |||
| 'realtime_video_detector': ['RealtimeVideoDetector'], | |||
| } | |||
| import sys | |||
| @@ -0,0 +1,117 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import argparse | |||
| import logging as logger | |||
| import os | |||
| import os.path as osp | |||
| import time | |||
| import cv2 | |||
| import json | |||
| import torch | |||
| from tqdm import tqdm | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base.base_torch_model import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.preprocessors import LoadImage | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from .yolox.data.data_augment import ValTransform | |||
| from .yolox.exp import get_exp_by_name | |||
| from .yolox.utils import postprocess | |||
| @MODELS.register_module( | |||
| group_key=Tasks.video_object_detection, | |||
| module_name=Models.realtime_video_object_detection) | |||
| class RealtimeVideoDetector(TorchModel): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| self.config = Config.from_file( | |||
| os.path.join(self.model_dir, ModelFile.CONFIGURATION)) | |||
| # model type | |||
| self.exp = get_exp_by_name(self.config.model_type) | |||
| # build model | |||
| self.model = self.exp.get_model() | |||
| model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) | |||
| ckpt = torch.load(model_path, map_location='cpu') | |||
| # load the model state dict | |||
| self.model.load_state_dict(ckpt['model']) | |||
| self.model.eval() | |||
| # params setting | |||
| self.exp.num_classes = self.config.num_classes | |||
| self.confthre = self.config.conf_thr | |||
| self.num_classes = self.exp.num_classes | |||
| self.nmsthre = self.exp.nmsthre | |||
| self.test_size = self.exp.test_size | |||
| self.preproc = ValTransform(legacy=False) | |||
| self.current_buffer = None | |||
| self.label_mapping = self.config['labels'] | |||
| def inference(self, img): | |||
| with torch.no_grad(): | |||
| outputs, self.current_buffer = self.model( | |||
| img, buffer=self.current_buffer, mode='on_pipe') | |||
| return outputs | |||
| def forward(self, inputs): | |||
| return self.inference_video(inputs) | |||
| def preprocess(self, img): | |||
| img = LoadImage.convert_to_ndarray(img) | |||
| height, width = img.shape[:2] | |||
| self.ratio = min(self.test_size[0] / img.shape[0], | |||
| self.test_size[1] / img.shape[1]) | |||
| img, _ = self.preproc(img, None, self.test_size) | |||
| img = torch.from_numpy(img).unsqueeze(0) | |||
| img = img.float() | |||
| # Automatic video decoding and preprocessing are not supported by Pipeline/Model, | |||
| # so the preprocessed frame tensor is moved to the model's device here | |||
| if next(self.model.parameters()).is_cuda: | |||
| img = img.to(next(self.model.parameters()).device) | |||
| return img | |||
| def postprocess(self, input): | |||
| outputs = postprocess( | |||
| input, | |||
| self.num_classes, | |||
| self.confthre, | |||
| self.nmsthre, | |||
| class_agnostic=True) | |||
| if len(outputs) == 1: | |||
| bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio | |||
| scores = outputs[0][:, 5].cpu().numpy() | |||
| labels = outputs[0][:, 6].cpu().int().numpy() | |||
| pred_label_names = [] | |||
| for lab in labels: | |||
| pred_label_names.append(self.label_mapping[lab]) | |||
| return bboxes, scores, pred_label_names | |||
| def inference_video(self, v_path): | |||
| outputs = [] | |||
| desc = 'Detecting video: {}'.format(v_path) | |||
| for frame, result in tqdm( | |||
| self.inference_video_iter(v_path), desc=desc): | |||
| outputs.append(result) | |||
| return outputs | |||
| def inference_video_iter(self, v_path): | |||
| capture = cv2.VideoCapture(v_path) | |||
| while capture.isOpened(): | |||
| ret, frame = capture.read() | |||
| if not ret: | |||
| break | |||
| output = self.preprocess(frame) | |||
| output = self.inference(output) | |||
| output = self.postprocess(output) | |||
| yield frame, output | |||
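# Usage sketch (illustrative; '<model_dir>' is a placeholder for a downloaded
# model directory): each element of `outputs` is a (bboxes, scores,
# label_names) tuple for one decoded frame.
# detector = RealtimeVideoDetector('<model_dir>')
# outputs = detector('input_video.mp4')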
| @@ -13,6 +13,8 @@ def get_exp_by_name(exp_name): | |||
| from .default import YoloXNanoExp as YoloXExp | |||
| elif exp == 'yolox_tiny': | |||
| from .default import YoloXTinyExp as YoloXExp | |||
| elif exp == 'streamyolo': | |||
| from .default import StreamYoloExp as YoloXExp | |||
| else: | |||
| pass | |||
| return YoloXExp() | |||
| @@ -1,5 +1,5 @@ | |||
| # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | |||
| from .streamyolo import StreamYoloExp | |||
| from .yolox_nano import YoloXNanoExp | |||
| from .yolox_s import YoloXSExp | |||
| from .yolox_tiny import YoloXTinyExp | |||
| @@ -0,0 +1,43 @@ | |||
| # The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO | |||
| import os | |||
| import sys | |||
| import torch | |||
| import torch.nn as nn | |||
| from ..yolox_base import Exp as YoloXExp | |||
| class StreamYoloExp(YoloXExp): | |||
| def __init__(self): | |||
| super(StreamYoloExp, self).__init__() | |||
| self.depth = 1.0 | |||
| self.width = 1.0 | |||
| self.num_classes = 8 | |||
| self.test_size = (600, 960) | |||
| self.test_conf = 0.3 | |||
| self.nmsthre = 0.65 | |||
| def get_model(self): | |||
| from ...models import StreamYOLO, DFPPAFPN, TALHead | |||
| def init_yolo(M): | |||
| for m in M.modules(): | |||
| if isinstance(m, nn.BatchNorm2d): | |||
| m.eps = 1e-3 | |||
| m.momentum = 0.03 | |||
| if getattr(self, 'model', None) is None: | |||
| in_channels = [256, 512, 1024] | |||
| backbone = DFPPAFPN( | |||
| self.depth, self.width, in_channels=in_channels) | |||
| head = TALHead( | |||
| self.num_classes, | |||
| self.width, | |||
| in_channels=in_channels, | |||
| gamma=1.0, | |||
| ignore_thr=0.5, | |||
| ignore_value=1.6) | |||
| self.model = StreamYOLO(backbone, head) | |||
| return self.model | |||
| @@ -1,5 +1,4 @@ | |||
| # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | |||
| import os | |||
| import random | |||