# Conflicts: # modelscope/metrics/__init__.pymaster
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:e8d653a9a1ee49789c3df38e8da96af7118e0d8336d6ed12cd6458efa015071d | |||||
| size 2327764 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:c589d77404ea17d4d24daeb8624dce7e1ac919dc75e6bed44ea9d116f0514150 | |||||
| size 68524 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:76bf84536edbaf192a8a699efc62ba2b06056bac12c426ecfcc2e003d91fbd32 | |||||
| size 53219 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:ecbc9d0827cfb92e93e7d75868b1724142685dc20d3b32023c3c657a7b688a9c | |||||
| size 254845 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:d510ab26ddc58ffea882c8ef850c1f9bd4444772f2bce7ebea3e76944536c3ae | |||||
| size 48909 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:b2c1119e3d521cf2e583b1e85fc9c9afd1d44954b433135039a98050a730932d | |||||
| size 1127557 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:46db348eae61448f1668ce282caec21375e96c3268d53da44aa67ec32cbf4fa5 | |||||
| size 2747938 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:709c1828ed2d56badf2f19a40194da9a5e5e6db2fb73ef55d047407f49bc7a15 | |||||
| size 27616 | |||||
| @@ -1,3 +0,0 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:379e11d7fc3734d3ec95afd0d86460b4653fbf4bb1f57f993610d6a6fd30fd3d | |||||
| size 1702339 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:dec0fbb931cb609bf481e56b89cd2fbbab79839f22832c3bbe69a8fae2769cdd | |||||
| size 167407 | |||||
| @@ -1,3 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | version https://git-lfs.github.com/spec/v1 | ||||
| oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572 | |||||
| size 60801 | |||||
| oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c | |||||
| size 61239 | |||||
| @@ -1,3 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | version https://git-lfs.github.com/spec/v1 | ||||
| oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c | |||||
| size 60801 | |||||
| oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1 | |||||
| size 61115 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:a49c9bc74a60860c360a4bf4509fe9db915279aaabd953f354f2c38e9be1e6cb | |||||
| size 2924691 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:f58df1d25590c158ae0a04b3999bd44b610cdaddb17d78afd84c34b3f00d4e87 | |||||
| size 4068783 | |||||
| @@ -76,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ | |||||
| ENV SHELL=/bin/bash | ENV SHELL=/bin/bash | ||||
| # install special package | # install special package | ||||
| RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq | |||||
| RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl | |||||
| RUN if [ "$USE_GPU" = "True" ] ; then \ | RUN if [ "$USE_GPU" = "True" ] ; then \ | ||||
| pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ | pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ | ||||
| @@ -24,20 +24,17 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, | |||||
| DownloadMode) | DownloadMode) | ||||
| from modelscope.utils.logger import get_logger | from modelscope.utils.logger import get_logger | ||||
| from .errors import (InvalidParameter, NotExistError, RequestError, | from .errors import (InvalidParameter, NotExistError, RequestError, | ||||
| datahub_raise_on_error, handle_http_response, is_ok, | |||||
| raise_on_error) | |||||
| from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, | |||||
| model_id_to_group_owner_name) | |||||
| datahub_raise_on_error, handle_http_post_error, | |||||
| handle_http_response, is_ok, raise_on_error) | |||||
| from .utils.utils import get_endpoint, model_id_to_group_owner_name | |||||
| logger = get_logger() | logger = get_logger() | ||||
| class HubApi: | class HubApi: | ||||
| def __init__(self, endpoint=None, dataset_endpoint=None): | |||||
| def __init__(self, endpoint=None): | |||||
| self.endpoint = endpoint if endpoint is not None else get_endpoint() | self.endpoint = endpoint if endpoint is not None else get_endpoint() | ||||
| self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint( | |||||
| ) | |||||
| def login( | def login( | ||||
| self, | self, | ||||
| @@ -105,17 +102,15 @@ class HubApi: | |||||
| path = f'{self.endpoint}/api/v1/models' | path = f'{self.endpoint}/api/v1/models' | ||||
| owner_or_group, name = model_id_to_group_owner_name(model_id) | owner_or_group, name = model_id_to_group_owner_name(model_id) | ||||
| r = requests.post( | |||||
| path, | |||||
| json={ | |||||
| 'Path': owner_or_group, | |||||
| 'Name': name, | |||||
| 'ChineseName': chinese_name, | |||||
| 'Visibility': visibility, # server check | |||||
| 'License': license | |||||
| }, | |||||
| cookies=cookies) | |||||
| r.raise_for_status() | |||||
| body = { | |||||
| 'Path': owner_or_group, | |||||
| 'Name': name, | |||||
| 'ChineseName': chinese_name, | |||||
| 'Visibility': visibility, # server check | |||||
| 'License': license | |||||
| } | |||||
| r = requests.post(path, json=body, cookies=cookies) | |||||
| handle_http_post_error(r, path, body) | |||||
| raise_on_error(r.json()) | raise_on_error(r.json()) | ||||
| model_repo_url = f'{get_endpoint()}/{model_id}' | model_repo_url = f'{get_endpoint()}/{model_id}' | ||||
| return model_repo_url | return model_repo_url | ||||
| @@ -290,7 +285,7 @@ class HubApi: | |||||
| return files | return files | ||||
| def list_datasets(self): | def list_datasets(self): | ||||
| path = f'{self.dataset_endpoint}/api/v1/datasets' | |||||
| path = f'{self.endpoint}/api/v1/datasets' | |||||
| headers = None | headers = None | ||||
| params = {} | params = {} | ||||
| r = requests.get(path, params=params, headers=headers) | r = requests.get(path, params=params, headers=headers) | ||||
| @@ -317,13 +312,13 @@ class HubApi: | |||||
| cache_dir): | cache_dir): | ||||
| shutil.rmtree(cache_dir) | shutil.rmtree(cache_dir) | ||||
| os.makedirs(cache_dir, exist_ok=True) | os.makedirs(cache_dir, exist_ok=True) | ||||
| datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}' | |||||
| datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' | |||||
| r = requests.get(datahub_url) | r = requests.get(datahub_url) | ||||
| resp = r.json() | resp = r.json() | ||||
| datahub_raise_on_error(datahub_url, resp) | datahub_raise_on_error(datahub_url, resp) | ||||
| dataset_id = resp['Data']['Id'] | dataset_id = resp['Data']['Id'] | ||||
| dataset_type = resp['Data']['Type'] | dataset_type = resp['Data']['Type'] | ||||
| datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' | |||||
| datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' | |||||
| r = requests.get(datahub_url) | r = requests.get(datahub_url) | ||||
| resp = r.json() | resp = r.json() | ||||
| datahub_raise_on_error(datahub_url, resp) | datahub_raise_on_error(datahub_url, resp) | ||||
| @@ -341,7 +336,7 @@ class HubApi: | |||||
| file_path = file_info['Path'] | file_path = file_info['Path'] | ||||
| extension = os.path.splitext(file_path)[-1] | extension = os.path.splitext(file_path)[-1] | ||||
| if extension in dataset_meta_format: | if extension in dataset_meta_format: | ||||
| datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||||
| datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||||
| f'Revision={revision}&FilePath={file_path}' | f'Revision={revision}&FilePath={file_path}' | ||||
| r = requests.get(datahub_url) | r = requests.get(datahub_url) | ||||
| r.raise_for_status() | r.raise_for_status() | ||||
| @@ -365,7 +360,7 @@ class HubApi: | |||||
| namespace: str, | namespace: str, | ||||
| revision: Optional[str] = DEFAULT_DATASET_REVISION): | revision: Optional[str] = DEFAULT_DATASET_REVISION): | ||||
| if file_name.endswith('.csv'): | if file_name.endswith('.csv'): | ||||
| file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||||
| file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||||
| f'Revision={revision}&FilePath={file_name}' | f'Revision={revision}&FilePath={file_name}' | ||||
| return file_name | return file_name | ||||
| @@ -374,7 +369,7 @@ class HubApi: | |||||
| dataset_name: str, | dataset_name: str, | ||||
| namespace: str, | namespace: str, | ||||
| revision: Optional[str] = DEFAULT_DATASET_REVISION): | revision: Optional[str] = DEFAULT_DATASET_REVISION): | ||||
| datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ | |||||
| datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ | |||||
| f'ststoken?Revision={revision}' | f'ststoken?Revision={revision}' | ||||
| return self.datahub_remote_call(datahub_url) | return self.datahub_remote_call(datahub_url) | ||||
| @@ -385,7 +380,7 @@ class HubApi: | |||||
| namespace: str, | namespace: str, | ||||
| revision: Optional[str] = DEFAULT_DATASET_REVISION): | revision: Optional[str] = DEFAULT_DATASET_REVISION): | ||||
| datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ | |||||
| datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ | |||||
| f'ststoken?Revision={revision}' | f'ststoken?Revision={revision}' | ||||
| cookies = requests.utils.dict_from_cookiejar(cookies) | cookies = requests.utils.dict_from_cookiejar(cookies) | ||||
| @@ -394,6 +389,19 @@ class HubApi: | |||||
| raise_on_error(resp) | raise_on_error(resp) | ||||
| return resp['Data'] | return resp['Data'] | ||||
| def list_oss_dataset_objects(self, dataset_name, namespace, max_limit, | |||||
| is_recursive, is_filter_dir, revision, | |||||
| cookies): | |||||
| url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \ | |||||
| f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}' | |||||
| cookies = requests.utils.dict_from_cookiejar(cookies) | |||||
| resp = requests.get(url=url, cookies=cookies) | |||||
| resp = resp.json() | |||||
| raise_on_error(resp) | |||||
| resp = resp['Data'] | |||||
| return resp | |||||
| def on_dataset_download(self, dataset_name: str, namespace: str) -> None: | def on_dataset_download(self, dataset_name: str, namespace: str) -> None: | ||||
| url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' | url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' | ||||
| r = requests.post(url) | r = requests.post(url) | ||||
| @@ -4,6 +4,10 @@ from http import HTTPStatus | |||||
| from requests.exceptions import HTTPError | from requests.exceptions import HTTPError | ||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
| class NotExistError(Exception): | class NotExistError(Exception): | ||||
| pass | pass | ||||
| @@ -45,15 +49,24 @@ def is_ok(rsp): | |||||
| return rsp['Code'] == HTTPStatus.OK and rsp['Success'] | return rsp['Code'] == HTTPStatus.OK and rsp['Success'] | ||||
| def handle_http_post_error(response, url, request_body): | |||||
| try: | |||||
| response.raise_for_status() | |||||
| except HTTPError as error: | |||||
| logger.error('Request %s with body: %s exception' % | |||||
| (url, request_body)) | |||||
| raise error | |||||
| def handle_http_response(response, logger, cookies, model_id): | def handle_http_response(response, logger, cookies, model_id): | ||||
| try: | try: | ||||
| response.raise_for_status() | response.raise_for_status() | ||||
| except HTTPError: | |||||
| except HTTPError as error: | |||||
| if cookies is None: # code in [403] and | if cookies is None: # code in [403] and | ||||
| logger.error( | logger.error( | ||||
| f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \ | f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \ | ||||
| private. Please login first.') | private. Please login first.') | ||||
| raise | |||||
| raise error | |||||
| def raise_on_error(rsp): | def raise_on_error(rsp): | ||||
| @@ -1,6 +1,7 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | # Copyright (c) Alibaba, Inc. and its affiliates. | ||||
| import os | import os | ||||
| import re | |||||
| import subprocess | import subprocess | ||||
| from typing import List | from typing import List | ||||
| from xmlrpc.client import Boolean | from xmlrpc.client import Boolean | ||||
| @@ -138,8 +139,8 @@ class GitCommandWrapper(metaclass=Singleton): | |||||
| repo_base_dir, repo_name, user_name) | repo_base_dir, repo_name, user_name) | ||||
| response = self._run_git_command(*config_user_name_args.split(' ')) | response = self._run_git_command(*config_user_name_args.split(' ')) | ||||
| logger.debug(response.stdout.decode('utf8')) | logger.debug(response.stdout.decode('utf8')) | ||||
| config_user_email_args = '-C %s/%s config user.name %s' % ( | |||||
| repo_base_dir, repo_name, user_name) | |||||
| config_user_email_args = '-C %s/%s config user.email %s' % ( | |||||
| repo_base_dir, repo_name, user_email) | |||||
| response = self._run_git_command( | response = self._run_git_command( | ||||
| *config_user_email_args.split(' ')) | *config_user_email_args.split(' ')) | ||||
| logger.debug(response.stdout.decode('utf8')) | logger.debug(response.stdout.decode('utf8')) | ||||
| @@ -177,6 +178,15 @@ class GitCommandWrapper(metaclass=Singleton): | |||||
| cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision] | cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision] | ||||
| return self._run_git_command(*cmds) | return self._run_git_command(*cmds) | ||||
| def get_remote_branches(self, repo_dir: str): | |||||
| cmds = ['-C', '%s' % repo_dir, 'branch', '-r'] | |||||
| rsp = self._run_git_command(*cmds) | |||||
| info = [ | |||||
| line.strip() | |||||
| for line in rsp.stdout.decode('utf8').strip().split(os.linesep) | |||||
| ][1:] | |||||
| return ['/'.join(line.split('/')[1:]) for line in info] | |||||
| def pull(self, repo_dir: str): | def pull(self, repo_dir: str): | ||||
| cmds = ['-C', repo_dir, 'pull'] | cmds = ['-C', repo_dir, 'pull'] | ||||
| return self._run_git_command(*cmds) | return self._run_git_command(*cmds) | ||||
| @@ -0,0 +1,117 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import datetime | |||||
| import os | |||||
| import shutil | |||||
| import tempfile | |||||
| import uuid | |||||
| from typing import Dict, Optional | |||||
| from uuid import uuid4 | |||||
| from filelock import FileLock | |||||
| from modelscope import __version__ | |||||
| from modelscope.hub.api import HubApi, ModelScopeConfig | |||||
| from modelscope.hub.errors import InvalidParameter, NotLoginException | |||||
| from modelscope.hub.git import GitCommandWrapper | |||||
| from modelscope.hub.repository import Repository | |||||
| from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile | |||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
| def upload_folder(model_id: str, | |||||
| model_dir: str, | |||||
| visibility: int = 0, | |||||
| license: str = None, | |||||
| chinese_name: Optional[str] = None, | |||||
| commit_message: Optional[str] = None, | |||||
| revision: Optional[str] = DEFAULT_MODEL_REVISION): | |||||
| """ | |||||
| Upload model from a given directory to given repository. A valid model directory | |||||
| must contain a configuration.json file. | |||||
| This function upload the files in given directory to given repository. If the | |||||
| given repository is not exists in remote, it will automatically create it with | |||||
| given visibility, license and chinese_name parameters. If the revision is also | |||||
| not exists in remote repository, it will create a new branch for it. | |||||
| This function must be called before calling HubApi's login with a valid token | |||||
| which can be obtained from ModelScope's website. | |||||
| Args: | |||||
| model_id (`str`): | |||||
| The model id to be uploaded, caller must have write permission for it. | |||||
| model_dir(`str`): | |||||
| The Absolute Path of the finetune result. | |||||
| visibility(`int`, defaults to `0`): | |||||
| Visibility of the new created model(1-private, 5-public). If the model is | |||||
| not exists in ModelScope, this function will create a new model with this | |||||
| visibility and this parameter is required. You can ignore this parameter | |||||
| if you make sure the model's existence. | |||||
| license(`str`, defaults to `None`): | |||||
| License of the new created model(see License). If the model is not exists | |||||
| in ModelScope, this function will create a new model with this license | |||||
| and this parameter is required. You can ignore this parameter if you | |||||
| make sure the model's existence. | |||||
| chinese_name(`str`, *optional*, defaults to `None`): | |||||
| chinese name of the new created model. | |||||
| commit_message(`str`, *optional*, defaults to `None`): | |||||
| commit message of the push request. | |||||
| revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION): | |||||
| which branch to push. If the branch is not exists, It will create a new | |||||
| branch and push to it. | |||||
| """ | |||||
| if model_id is None: | |||||
| raise InvalidParameter('model_id cannot be empty!') | |||||
| if model_dir is None: | |||||
| raise InvalidParameter('model_dir cannot be empty!') | |||||
| if not os.path.exists(model_dir) or os.path.isfile(model_dir): | |||||
| raise InvalidParameter('model_dir must be a valid directory.') | |||||
| cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) | |||||
| if not os.path.exists(cfg_file): | |||||
| raise ValueError(f'{model_dir} must contain a configuration.json.') | |||||
| cookies = ModelScopeConfig.get_cookies() | |||||
| if cookies is None: | |||||
| raise NotLoginException('Must login before upload!') | |||||
| files_to_save = os.listdir(model_dir) | |||||
| api = HubApi() | |||||
| try: | |||||
| api.get_model(model_id=model_id) | |||||
| except Exception: | |||||
| if visibility is None or license is None: | |||||
| raise InvalidParameter( | |||||
| 'visibility and license cannot be empty if want to create new repo' | |||||
| ) | |||||
| logger.info('Create new model %s' % model_id) | |||||
| api.create_model( | |||||
| model_id=model_id, | |||||
| visibility=visibility, | |||||
| license=license, | |||||
| chinese_name=chinese_name) | |||||
| tmp_dir = tempfile.mkdtemp() | |||||
| git_wrapper = GitCommandWrapper() | |||||
| try: | |||||
| repo = Repository(model_dir=tmp_dir, clone_from=model_id) | |||||
| branches = git_wrapper.get_remote_branches(tmp_dir) | |||||
| if revision not in branches: | |||||
| logger.info('Create new branch %s' % revision) | |||||
| git_wrapper.new_branch(tmp_dir, revision) | |||||
| git_wrapper.checkout(tmp_dir, revision) | |||||
| for f in files_to_save: | |||||
| if f[0] != '.': | |||||
| src = os.path.join(model_dir, f) | |||||
| if os.path.isdir(src): | |||||
| shutil.copytree(src, os.path.join(tmp_dir, f)) | |||||
| else: | |||||
| shutil.copy(src, tmp_dir) | |||||
| if not commit_message: | |||||
| date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') | |||||
| commit_message = '[automsg] push model %s to hub at %s' % ( | |||||
| model_id, date) | |||||
| repo.push(commit_message=commit_message, branch=revision) | |||||
| except Exception: | |||||
| raise | |||||
| finally: | |||||
| shutil.rmtree(tmp_dir, ignore_errors=True) | |||||
| @@ -4,8 +4,7 @@ import hashlib | |||||
| import os | import os | ||||
| from typing import Optional | from typing import Optional | ||||
| from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT, | |||||
| DEFAULT_MODELSCOPE_DOMAIN, | |||||
| from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, | |||||
| DEFAULT_MODELSCOPE_GROUP, | DEFAULT_MODELSCOPE_GROUP, | ||||
| MODEL_ID_SEPARATOR, | MODEL_ID_SEPARATOR, | ||||
| MODELSCOPE_URL_SCHEME) | MODELSCOPE_URL_SCHEME) | ||||
| @@ -44,11 +43,6 @@ def get_endpoint(): | |||||
| return MODELSCOPE_URL_SCHEME + modelscope_domain | return MODELSCOPE_URL_SCHEME + modelscope_domain | ||||
| def get_dataset_hub_endpoint(): | |||||
| return os.environ.get('HUB_DATASET_ENDPOINT', | |||||
| DEFAULT_MODELSCOPE_DATA_ENDPOINT) | |||||
| def compute_hash(file_path): | def compute_hash(file_path): | ||||
| BUFFER_SIZE = 1024 * 64 # 64k buffer size | BUFFER_SIZE = 1024 * 64 # 64k buffer size | ||||
| sha256_hash = hashlib.sha256() | sha256_hash = hashlib.sha256() | ||||
| @@ -14,6 +14,7 @@ class Models(object): | |||||
| # vision models | # vision models | ||||
| detection = 'detection' | detection = 'detection' | ||||
| realtime_object_detection = 'realtime-object-detection' | realtime_object_detection = 'realtime-object-detection' | ||||
| realtime_video_object_detection = 'realtime-video-object-detection' | |||||
| scrfd = 'scrfd' | scrfd = 'scrfd' | ||||
| classification_model = 'ClassificationModel' | classification_model = 'ClassificationModel' | ||||
| nafnet = 'nafnet' | nafnet = 'nafnet' | ||||
| @@ -27,11 +28,13 @@ class Models(object): | |||||
| face_2d_keypoints = 'face-2d-keypoints' | face_2d_keypoints = 'face-2d-keypoints' | ||||
| panoptic_segmentation = 'swinL-panoptic-segmentation' | panoptic_segmentation = 'swinL-panoptic-segmentation' | ||||
| image_reid_person = 'passvitb' | image_reid_person = 'passvitb' | ||||
| image_inpainting = 'FFTInpainting' | |||||
| video_summarization = 'pgl-video-summarization' | video_summarization = 'pgl-video-summarization' | ||||
| swinL_semantic_segmentation = 'swinL-semantic-segmentation' | swinL_semantic_segmentation = 'swinL-semantic-segmentation' | ||||
| vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' | vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' | ||||
| text_driven_segmentation = 'text-driven-segmentation' | text_driven_segmentation = 'text-driven-segmentation' | ||||
| resnet50_bert = 'resnet50-bert' | resnet50_bert = 'resnet50-bert' | ||||
| referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' | |||||
| fer = 'fer' | fer = 'fer' | ||||
| retinaface = 'retinaface' | retinaface = 'retinaface' | ||||
| shop_segmentation = 'shop-segmentation' | shop_segmentation = 'shop-segmentation' | ||||
| @@ -39,14 +42,18 @@ class Models(object): | |||||
| mtcnn = 'mtcnn' | mtcnn = 'mtcnn' | ||||
| ulfd = 'ulfd' | ulfd = 'ulfd' | ||||
| video_inpainting = 'video-inpainting' | video_inpainting = 'video-inpainting' | ||||
| human_wholebody_keypoint = 'human-wholebody-keypoint' | |||||
| hand_static = 'hand-static' | hand_static = 'hand-static' | ||||
| face_human_hand_detection = 'face-human-hand-detection' | face_human_hand_detection = 'face-human-hand-detection' | ||||
| face_emotion = 'face-emotion' | face_emotion = 'face-emotion' | ||||
| product_segmentation = 'product-segmentation' | product_segmentation = 'product-segmentation' | ||||
| image_body_reshaping = 'image-body-reshaping' | |||||
| # EasyCV models | # EasyCV models | ||||
| yolox = 'YOLOX' | yolox = 'YOLOX' | ||||
| segformer = 'Segformer' | segformer = 'Segformer' | ||||
| hand_2d_keypoints = 'HRNet-Hand2D-Keypoints' | |||||
| image_object_detection_auto = 'image-object-detection-auto' | |||||
| # nlp models | # nlp models | ||||
| bert = 'bert' | bert = 'bert' | ||||
| @@ -66,6 +73,7 @@ class Models(object): | |||||
| gcnncrf = 'gcnn-crf' | gcnncrf = 'gcnn-crf' | ||||
| bart = 'bart' | bart = 'bart' | ||||
| gpt3 = 'gpt3' | gpt3 = 'gpt3' | ||||
| gpt_neo = 'gpt-neo' | |||||
| plug = 'plug' | plug = 'plug' | ||||
| bert_for_ds = 'bert-for-document-segmentation' | bert_for_ds = 'bert-for-document-segmentation' | ||||
| ponet = 'ponet' | ponet = 'ponet' | ||||
| @@ -96,6 +104,7 @@ class TaskModels(object): | |||||
| information_extraction = 'information-extraction' | information_extraction = 'information-extraction' | ||||
| fill_mask = 'fill-mask' | fill_mask = 'fill-mask' | ||||
| feature_extraction = 'feature-extraction' | feature_extraction = 'feature-extraction' | ||||
| text_generation = 'text-generation' | |||||
| class Heads(object): | class Heads(object): | ||||
| @@ -111,6 +120,8 @@ class Heads(object): | |||||
| token_classification = 'token-classification' | token_classification = 'token-classification' | ||||
| # extraction | # extraction | ||||
| information_extraction = 'information-extraction' | information_extraction = 'information-extraction' | ||||
| # text gen | |||||
| text_generation = 'text-generation' | |||||
| class Pipelines(object): | class Pipelines(object): | ||||
| @@ -144,6 +155,7 @@ class Pipelines(object): | |||||
| salient_detection = 'u2net-salient-detection' | salient_detection = 'u2net-salient-detection' | ||||
| image_classification = 'image-classification' | image_classification = 'image-classification' | ||||
| face_detection = 'resnet-face-detection-scrfd10gkps' | face_detection = 'resnet-face-detection-scrfd10gkps' | ||||
| card_detection = 'resnet-card-detection-scrfd34gkps' | |||||
| ulfd_face_detection = 'manual-face-detection-ulfd' | ulfd_face_detection = 'manual-face-detection-ulfd' | ||||
| facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' | facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' | ||||
| retina_face_detection = 'resnet50-face-detection-retinaface' | retina_face_detection = 'resnet50-face-detection-retinaface' | ||||
| @@ -160,6 +172,7 @@ class Pipelines(object): | |||||
| face_image_generation = 'gan-face-image-generation' | face_image_generation = 'gan-face-image-generation' | ||||
| product_retrieval_embedding = 'resnet50-product-retrieval-embedding' | product_retrieval_embedding = 'resnet50-product-retrieval-embedding' | ||||
| realtime_object_detection = 'cspnet_realtime-object-detection_yolox' | realtime_object_detection = 'cspnet_realtime-object-detection_yolox' | ||||
| realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo' | |||||
| face_recognition = 'ir101-face-recognition-cfglint' | face_recognition = 'ir101-face-recognition-cfglint' | ||||
| image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' | image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' | ||||
| image2image_translation = 'image-to-image-translation' | image2image_translation = 'image-to-image-translation' | ||||
| @@ -168,6 +181,7 @@ class Pipelines(object): | |||||
| ocr_recognition = 'convnextTiny-ocr-recognition' | ocr_recognition = 'convnextTiny-ocr-recognition' | ||||
| image_portrait_enhancement = 'gpen-image-portrait-enhancement' | image_portrait_enhancement = 'gpen-image-portrait-enhancement' | ||||
| image_to_image_generation = 'image-to-image-generation' | image_to_image_generation = 'image-to-image-generation' | ||||
| image_object_detection_auto = 'yolox_image-object-detection-auto' | |||||
| skin_retouching = 'unet-skin-retouching' | skin_retouching = 'unet-skin-retouching' | ||||
| tinynas_classification = 'tinynas-classification' | tinynas_classification = 'tinynas-classification' | ||||
| tinynas_detection = 'tinynas-detection' | tinynas_detection = 'tinynas-detection' | ||||
| @@ -178,15 +192,19 @@ class Pipelines(object): | |||||
| video_summarization = 'googlenet_pgl_video_summarization' | video_summarization = 'googlenet_pgl_video_summarization' | ||||
| image_semantic_segmentation = 'image-semantic-segmentation' | image_semantic_segmentation = 'image-semantic-segmentation' | ||||
| image_reid_person = 'passvitb-image-reid-person' | image_reid_person = 'passvitb-image-reid-person' | ||||
| image_inpainting = 'fft-inpainting' | |||||
| text_driven_segmentation = 'text-driven-segmentation' | text_driven_segmentation = 'text-driven-segmentation' | ||||
| movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | ||||
| shop_segmentation = 'shop-segmentation' | shop_segmentation = 'shop-segmentation' | ||||
| video_inpainting = 'video-inpainting' | video_inpainting = 'video-inpainting' | ||||
| human_wholebody_keypoint = 'hrnetw48_human-wholebody-keypoint_image' | |||||
| pst_action_recognition = 'patchshift-action-recognition' | pst_action_recognition = 'patchshift-action-recognition' | ||||
| hand_static = 'hand-static' | hand_static = 'hand-static' | ||||
| face_human_hand_detection = 'face-human-hand-detection' | face_human_hand_detection = 'face-human-hand-detection' | ||||
| face_emotion = 'face-emotion' | face_emotion = 'face-emotion' | ||||
| product_segmentation = 'product-segmentation' | product_segmentation = 'product-segmentation' | ||||
| image_body_reshaping = 'flow-based-body-reshaping' | |||||
| referring_video_object_segmentation = 'referring-video-object-segmentation' | |||||
| # nlp tasks | # nlp tasks | ||||
| automatic_post_editing = 'automatic-post-editing' | automatic_post_editing = 'automatic-post-editing' | ||||
| @@ -211,6 +229,7 @@ class Pipelines(object): | |||||
| zero_shot_classification = 'zero-shot-classification' | zero_shot_classification = 'zero-shot-classification' | ||||
| text_error_correction = 'text-error-correction' | text_error_correction = 'text-error-correction' | ||||
| plug_generation = 'plug-generation' | plug_generation = 'plug-generation' | ||||
| gpt3_generation = 'gpt3-generation' | |||||
| faq_question_answering = 'faq-question-answering' | faq_question_answering = 'faq-question-answering' | ||||
| conversational_text_to_sql = 'conversational-text-to-sql' | conversational_text_to_sql = 'conversational-text-to-sql' | ||||
| table_question_answering_pipeline = 'table-question-answering-pipeline' | table_question_answering_pipeline = 'table-question-answering-pipeline' | ||||
| @@ -219,6 +238,9 @@ class Pipelines(object): | |||||
| relation_extraction = 'relation-extraction' | relation_extraction = 'relation-extraction' | ||||
| document_segmentation = 'document-segmentation' | document_segmentation = 'document-segmentation' | ||||
| feature_extraction = 'feature-extraction' | feature_extraction = 'feature-extraction' | ||||
| translation_en_to_de = 'translation_en_to_de' # keep it underscore | |||||
| translation_en_to_ro = 'translation_en_to_ro' # keep it underscore | |||||
| translation_en_to_fr = 'translation_en_to_fr' # keep it underscore | |||||
| # audio tasks | # audio tasks | ||||
| sambert_hifigan_tts = 'sambert-hifigan-tts' | sambert_hifigan_tts = 'sambert-hifigan-tts' | ||||
| @@ -263,6 +285,9 @@ class Trainers(object): | |||||
| image_portrait_enhancement = 'image-portrait-enhancement' | image_portrait_enhancement = 'image-portrait-enhancement' | ||||
| video_summarization = 'video-summarization' | video_summarization = 'video-summarization' | ||||
| movie_scene_segmentation = 'movie-scene-segmentation' | movie_scene_segmentation = 'movie-scene-segmentation' | ||||
| face_detection_scrfd = 'face-detection-scrfd' | |||||
| card_detection_scrfd = 'card-detection-scrfd' | |||||
| image_inpainting = 'image-inpainting' | |||||
| # nlp trainers | # nlp trainers | ||||
| bert_sentiment_analysis = 'bert-sentiment-analysis' | bert_sentiment_analysis = 'bert-sentiment-analysis' | ||||
| @@ -274,6 +299,7 @@ class Trainers(object): | |||||
| # audio trainers | # audio trainers | ||||
| speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' | speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' | ||||
| speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' | |||||
| class Preprocessors(object): | class Preprocessors(object): | ||||
| @@ -302,6 +328,8 @@ class Preprocessors(object): | |||||
| bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' | bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' | ||||
| text_gen_tokenizer = 'text-gen-tokenizer' | text_gen_tokenizer = 'text-gen-tokenizer' | ||||
| text2text_gen_preprocessor = 'text2text-gen-preprocessor' | text2text_gen_preprocessor = 'text2text-gen-preprocessor' | ||||
| text_gen_jieba_tokenizer = 'text-gen-jieba-tokenizer' | |||||
| text2text_translate_preprocessor = 'text2text-translate-preprocessor' | |||||
| token_cls_tokenizer = 'token-cls-tokenizer' | token_cls_tokenizer = 'token-cls-tokenizer' | ||||
| ner_tokenizer = 'ner-tokenizer' | ner_tokenizer = 'ner-tokenizer' | ||||
| nli_tokenizer = 'nli-tokenizer' | nli_tokenizer = 'nli-tokenizer' | ||||
| @@ -324,6 +352,7 @@ class Preprocessors(object): | |||||
| re_tokenizer = 're-tokenizer' | re_tokenizer = 're-tokenizer' | ||||
| document_segmentation = 'document-segmentation' | document_segmentation = 'document-segmentation' | ||||
| feature_extraction = 'feature-extraction' | feature_extraction = 'feature-extraction' | ||||
| sentence_piece = 'sentence-piece' | |||||
| # audio preprocessor | # audio preprocessor | ||||
| linear_aec_fbank = 'linear-aec-fbank' | linear_aec_fbank = 'linear-aec-fbank' | ||||
| @@ -365,6 +394,8 @@ class Metrics(object): | |||||
| video_summarization_metric = 'video-summarization-metric' | video_summarization_metric = 'video-summarization-metric' | ||||
| # metric for movie-scene-segmentation task | # metric for movie-scene-segmentation task | ||||
| movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' | movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' | ||||
| # metric for inpainting task | |||||
| image_inpainting_metric = 'image-inpainting-metric' | |||||
| class Optimizers(object): | class Optimizers(object): | ||||
| @@ -406,6 +437,9 @@ class Hooks(object): | |||||
| IterTimerHook = 'IterTimerHook' | IterTimerHook = 'IterTimerHook' | ||||
| EvaluationHook = 'EvaluationHook' | EvaluationHook = 'EvaluationHook' | ||||
| # Compression | |||||
| SparsityHook = 'SparsityHook' | |||||
| class LR_Schedulers(object): | class LR_Schedulers(object): | ||||
| """learning rate scheduler is defined here | """learning rate scheduler is defined here | ||||
| @@ -421,6 +455,8 @@ class Datasets(object): | |||||
| """ | """ | ||||
| ClsDataset = 'ClsDataset' | ClsDataset = 'ClsDataset' | ||||
| Face2dKeypointsDataset = 'Face2dKeypointsDataset' | Face2dKeypointsDataset = 'Face2dKeypointsDataset' | ||||
| HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset' | |||||
| HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset' | |||||
| SegDataset = 'SegDataset' | SegDataset = 'SegDataset' | ||||
| DetDataset = 'DetDataset' | DetDataset = 'DetDataset' | ||||
| DetImagesMixDataset = 'DetImagesMixDataset' | DetImagesMixDataset = 'DetImagesMixDataset' | ||||
| @@ -19,6 +19,7 @@ if TYPE_CHECKING: | |||||
| from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric | from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric | ||||
| from .accuracy_metric import AccuracyMetric | from .accuracy_metric import AccuracyMetric | ||||
| from .bleu_metric import BleuMetric | from .bleu_metric import BleuMetric | ||||
| from .image_inpainting_metric import ImageInpaintingMetric | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| @@ -36,6 +37,7 @@ else: | |||||
| 'token_classification_metric': ['TokenClassificationMetric'], | 'token_classification_metric': ['TokenClassificationMetric'], | ||||
| 'video_summarization_metric': ['VideoSummarizationMetric'], | 'video_summarization_metric': ['VideoSummarizationMetric'], | ||||
| 'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], | 'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], | ||||
| 'image_inpainting_metric': ['ImageInpaintingMetric'], | |||||
| 'accuracy_metric': ['AccuracyMetric'], | 'accuracy_metric': ['AccuracyMetric'], | ||||
| 'bleu_metric': ['BleuMetric'], | 'bleu_metric': ['BleuMetric'], | ||||
| } | } | ||||
| @@ -35,6 +35,8 @@ class AudioNoiseMetric(Metric): | |||||
| total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr | total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr | ||||
| return { | return { | ||||
| 'total_loss': total_loss.item(), | 'total_loss': total_loss.item(), | ||||
| 'avg_sisnr': avg_sisnr.item(), | |||||
| # model use opposite number of sisnr as a calculation shortcut. | |||||
| # revert it in evaluation result | |||||
| 'avg_sisnr': -avg_sisnr.item(), | |||||
| MetricKeys.AVERAGE_LOSS: avg_loss.item() | MetricKeys.AVERAGE_LOSS: avg_loss.item() | ||||
| } | } | ||||
| @@ -18,6 +18,7 @@ class MetricKeys(object): | |||||
| SSIM = 'ssim' | SSIM = 'ssim' | ||||
| AVERAGE_LOSS = 'avg_loss' | AVERAGE_LOSS = 'avg_loss' | ||||
| FScore = 'fscore' | FScore = 'fscore' | ||||
| FID = 'fid' | |||||
| BLEU_1 = 'bleu-1' | BLEU_1 = 'bleu-1' | ||||
| BLEU_4 = 'bleu-4' | BLEU_4 = 'bleu-4' | ||||
| ROUGE_1 = 'rouge-1' | ROUGE_1 = 'rouge-1' | ||||
| @@ -39,6 +40,7 @@ task_default_metrics = { | |||||
| Tasks.image_captioning: [Metrics.text_gen_metric], | Tasks.image_captioning: [Metrics.text_gen_metric], | ||||
| Tasks.visual_question_answering: [Metrics.text_gen_metric], | Tasks.visual_question_answering: [Metrics.text_gen_metric], | ||||
| Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric], | Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric], | ||||
| Tasks.image_inpainting: [Metrics.image_inpainting_metric], | |||||
| } | } | ||||
| @@ -1,12 +1,16 @@ | |||||
| # ------------------------------------------------------------------------ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # ------------------------------------------------------------------------ | |||||
| # modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/metrics/psnr_ssim.py | |||||
| # ------------------------------------------------------------------------ | |||||
| from typing import Dict | from typing import Dict | ||||
| import cv2 | |||||
| import numpy as np | import numpy as np | ||||
| from skimage.metrics import peak_signal_noise_ratio, structural_similarity | |||||
| import torch | |||||
| from modelscope.metainfo import Metrics | from modelscope.metainfo import Metrics | ||||
| from modelscope.utils.registry import default_group | from modelscope.utils.registry import default_group | ||||
| from modelscope.utils.tensor_utils import (torch_nested_detach, | |||||
| torch_nested_numpify) | |||||
| from .base import Metric | from .base import Metric | ||||
| from .builder import METRICS, MetricKeys | from .builder import METRICS, MetricKeys | ||||
| @@ -20,26 +24,249 @@ class ImageDenoiseMetric(Metric): | |||||
| label_name = 'target' | label_name = 'target' | ||||
| def __init__(self): | def __init__(self): | ||||
| super(ImageDenoiseMetric, self).__init__() | |||||
| self.preds = [] | self.preds = [] | ||||
| self.labels = [] | self.labels = [] | ||||
| def add(self, outputs: Dict, inputs: Dict): | def add(self, outputs: Dict, inputs: Dict): | ||||
| ground_truths = outputs[ImageDenoiseMetric.label_name] | ground_truths = outputs[ImageDenoiseMetric.label_name] | ||||
| eval_results = outputs[ImageDenoiseMetric.pred_name] | eval_results = outputs[ImageDenoiseMetric.pred_name] | ||||
| self.preds.append( | |||||
| torch_nested_numpify(torch_nested_detach(eval_results))) | |||||
| self.labels.append( | |||||
| torch_nested_numpify(torch_nested_detach(ground_truths))) | |||||
| self.preds.append(eval_results) | |||||
| self.labels.append(ground_truths) | |||||
| def evaluate(self): | def evaluate(self): | ||||
| psnr_list, ssim_list = [], [] | psnr_list, ssim_list = [], [] | ||||
| for (pred, label) in zip(self.preds, self.labels): | for (pred, label) in zip(self.preds, self.labels): | ||||
| psnr_list.append( | |||||
| peak_signal_noise_ratio(label[0], pred[0], data_range=255)) | |||||
| ssim_list.append( | |||||
| structural_similarity( | |||||
| label[0], pred[0], multichannel=True, data_range=255)) | |||||
| psnr_list.append(calculate_psnr(label[0], pred[0], crop_border=0)) | |||||
| ssim_list.append(calculate_ssim(label[0], pred[0], crop_border=0)) | |||||
| return { | return { | ||||
| MetricKeys.PSNR: np.mean(psnr_list), | MetricKeys.PSNR: np.mean(psnr_list), | ||||
| MetricKeys.SSIM: np.mean(ssim_list) | MetricKeys.SSIM: np.mean(ssim_list) | ||||
| } | } | ||||
def reorder_image(img, input_order='HWC'):
    """Reorder an image array to 'HWC' order.

    If the input is (h, w), a trailing channel axis is appended first;
    a 'CHW' input is then transposed to (h, w, c); an 'HWC' input is
    returned as-is.

    Args:
        img (ndarray): Input image.
        input_order (str): Layout of `img`, either 'HWC' or 'CHW'.
            Default: 'HWC'.

    Returns:
        ndarray: Image in (h, w, c) order.

    Raises:
        ValueError: If `input_order` is neither 'HWC' nor 'CHW'.
    """
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'"
        )
    if len(img.shape) == 2:
        img = img[..., None]
    if input_order == 'CHW':
        img = img.transpose(1, 2, 0)
    return img


def calculate_psnr(img1, img2, crop_border, input_order='HWC'):
    """Calculate PSNR (Peak Signal-to-Noise Ratio).

    Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio

    The peak value is inferred from `img1`: 1.0 when ``img1.max() <= 1``,
    otherwise 255.0.

    Args:
        img1 (ndarray/tensor): Images with range [0, 255] or [0, 1].
        img2 (ndarray/tensor): Images with range [0, 255] or [0, 1].
        crop_border (int): Cropped pixels in each edge of an image. These
            pixels are not involved in the PSNR calculation.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            Default: 'HWC'.

    Returns:
        float: psnr result (``inf`` for identical images).

    Raises:
        ValueError: If `input_order` is neither 'HWC' nor 'CHW'.
    """
    assert img1.shape == img2.shape, (
        f'Image shapes are different: {img1.shape}, {img2.shape}.')
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f'Wrong input_order {input_order}. Supported input_orders are '
            '"HWC" and "CHW"')
    # Accept torch tensors (optionally batched NCHW); convert to HWC numpy.
    # Fix: use isinstance instead of `type(x) == torch.Tensor` so tensor
    # subclasses are handled too.
    if isinstance(img1, torch.Tensor):
        if len(img1.shape) == 4:
            img1 = img1.squeeze(0)
        img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
    if isinstance(img2, torch.Tensor):
        if len(img2.shape) == 4:
            img2 = img2.squeeze(0)
        img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)

    img1 = reorder_image(img1, input_order=input_order)
    img2 = reorder_image(img2, input_order=input_order)
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)

    if crop_border != 0:
        img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
        img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]

    def _psnr(img1, img2):
        # mse == 0 means identical images; PSNR is infinite by convention.
        mse = np.mean((img1 - img2)**2)
        if mse == 0:
            return float('inf')
        max_value = 1. if img1.max() <= 1 else 255.
        return 20. * np.log10(max_value / np.sqrt(mse))

    return _psnr(img1, img2)
def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True):
    """Calculate SSIM (structural similarity).

    Ref:
    Image quality assessment: From error visibility to structural similarity

    The results are the same as that of the official released MATLAB code in
    https://ece.uwaterloo.ca/~z70wang/research/ssim/.

    The dynamic range is inferred from the data: 1 when ``img1.max() <= 1``,
    otherwise 255.

    Args:
        img1 (ndarray/tensor): Images with range [0, 255] or [0, 1].
        img2 (ndarray/tensor): Images with range [0, 255] or [0, 1].
        crop_border (int): Cropped pixels in each edge of an image. These
            pixels are not involved in the SSIM calculation.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            Default: 'HWC'.
        ssim3d (bool): If True (default), treat the (h, w, c) image as a 3-D
            volume with a 3-D Gaussian window (`_ssim_3d`); otherwise use the
            classic 2-D SSIM (`_ssim`).

    Returns:
        float: ssim result.

    Raises:
        ValueError: If `input_order` is neither 'HWC' nor 'CHW'.
    """
    assert img1.shape == img2.shape, (
        f'Image shapes are different: {img1.shape}, {img2.shape}.')
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f'Wrong input_order {input_order}. Supported input_orders are '
            '"HWC" and "CHW"')
    # Accept torch tensors (optionally batched NCHW); convert to HWC numpy.
    # Fix: use isinstance instead of `type(x) == torch.Tensor`.
    if isinstance(img1, torch.Tensor):
        if len(img1.shape) == 4:
            img1 = img1.squeeze(0)
        img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
    if isinstance(img2, torch.Tensor):
        if len(img2.shape) == 4:
            img2 = img2.squeeze(0)
        img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)

    img1 = reorder_image(img1, input_order=input_order)
    img2 = reorder_image(img2, input_order=input_order)
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)

    if crop_border != 0:
        img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
        img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]

    max_value = 1 if img1.max() <= 1 else 255
    with torch.no_grad():
        final_ssim = _ssim_3d(img1, img2, max_value) if ssim3d else _ssim(
            img1, img2, max_value)
    # Preserve the original return type (numpy scalar from a 1-element mean);
    # the dead `ssims` accumulator around a single value has been removed.
    return np.array([final_ssim]).mean()
def _ssim(img, img2, max_value):
    """Calculate SSIM (structural similarity) for one-channel images.

    It is called by func:`calculate_ssim` when ``ssim3d=False``. Uses an
    11x11 Gaussian window (sigma=1.5) and the standard SSIM constants
    C1=(0.01*L)^2, C2=(0.03*L)^2 with L = `max_value`.

    Args:
        img (ndarray): Image with order 'HWC', range matching `max_value`.
        img2 (ndarray): Image with order 'HWC', range matching `max_value`.
        max_value (float): Dynamic range of the images (1 or 255).

    Returns:
        float: mean SSIM over the valid (border-cropped) region.
    """
    c1 = (0.01 * max_value)**2
    c2 = (0.03 * max_value)**2
    img = img.astype(np.float64)
    img2 = img2.astype(np.float64)
    # Separable Gaussian built as an outer product of two 1-D kernels.
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())

    # The [5:-5, 5:-5] crop keeps only pixels with full window support,
    # emulating 'valid'-mode filtering for the 11-tap window.
    mu1 = cv2.filter2D(img, -1, window)[5:-5,
                                        5:-5]  # valid mode for window size 11
    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    # Local (co)variances: E[x^2] - E[x]^2 under the Gaussian window.
    sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq
    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2

    tmp1 = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2)
    tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)
    ssim_map = tmp1 / tmp2
    return ssim_map.mean()
| def _3d_gaussian_calculator(img, conv3d): | |||||
| out = conv3d(img.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0) | |||||
| return out | |||||
def _generate_3d_gaussian_kernel():
    """Build a fixed (non-trainable) 11x11x11 Gaussian Conv3d.

    The 3-D kernel is the outer product of three 1-D Gaussian kernels
    (size 11, sigma 1.5): a 2-D window is formed first, then scaled by
    each tap of a third 1-D kernel along the depth axis.

    Returns:
        torch.nn.Conv3d: single-channel conv with 'replicate' padding of 5
        on every side, weights frozen to the Gaussian kernel.
    """
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())
    kernel_3 = cv2.getGaussianKernel(11, 1.5)
    # Stack depth-scaled copies of the 2-D window into an (11, 11, 11) kernel.
    kernel = torch.tensor(np.stack([window * k for k in kernel_3], axis=0))
    conv3d = torch.nn.Conv3d(
        1,
        1, (11, 11, 11),
        stride=1,
        padding=(5, 5, 5),
        bias=False,
        padding_mode='replicate')
    # The kernel is a fixed filter, not a learnable parameter.
    conv3d.weight.requires_grad = False
    conv3d.weight[0, 0, :, :, :] = kernel
    return conv3d
def _ssim_3d(img1, img2, max_value):
    """Calculate SSIM treating the (h, w, c) image as a 3-D volume.

    It is called by func:`calculate_ssim` when ``ssim3d=True``. Local
    statistics are computed with an 11x11x11 Gaussian window.

    Args:
        img1 (ndarray): Image with range [0, 255]/[0, 1] with order 'HWC'.
        img2 (ndarray): Image with range [0, 255]/[0, 1] with order 'HWC'.
        max_value (float): Dynamic range of the images (1 or 255).

    Returns:
        float: ssim result.
    """
    assert len(img1.shape) == 3 and len(img2.shape) == 3
    C1 = (0.01 * max_value)**2
    C2 = (0.03 * max_value)**2
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)

    # Fix: the original unconditionally called .cuda(), which crashes on
    # CPU-only hosts; select the best available device instead.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    kernel = _generate_3d_gaussian_kernel().to(device)
    img1 = torch.tensor(img1).float().to(device)
    img2 = torch.tensor(img2).float().to(device)

    mu1 = _3d_gaussian_calculator(img1, kernel)
    mu2 = _3d_gaussian_calculator(img2, kernel)
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    # Local (co)variances: E[x^2] - E[x]^2 under the Gaussian window.
    sigma1_sq = _3d_gaussian_calculator(img1**2, kernel) - mu1_sq
    sigma2_sq = _3d_gaussian_calculator(img2**2, kernel) - mu2_sq
    sigma12 = _3d_gaussian_calculator(img1 * img2, kernel) - mu1_mu2

    tmp1 = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
    tmp2 = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
    ssim_map = tmp1 / tmp2
    return float(ssim_map.mean())
| @@ -0,0 +1,210 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| from typing import Dict | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn.functional as F | |||||
| from scipy import linalg | |||||
| from modelscope.metainfo import Metrics | |||||
| from modelscope.models.cv.image_inpainting.modules.inception import InceptionV3 | |||||
| from modelscope.utils.registry import default_group | |||||
| from modelscope.utils.tensor_utils import (torch_nested_detach, | |||||
| torch_nested_numpify) | |||||
| from .base import Metric | |||||
| from .builder import METRICS, MetricKeys | |||||
def fid_calculate_activation_statistics(act):
    """Return the sample mean and covariance of an (N, D) activation matrix."""
    mean_vec = np.mean(act, axis=0)
    cov_mat = np.cov(act, rowvar=False)
    return mean_vec, cov_mat


def calculate_frechet_distance(activations_pred, activations_target, eps=1e-6):
    """Frechet distance between Gaussians fitted to two activation sets.

    d^2 = |mu1 - mu2|^2 + Tr(S1 + S2 - 2*sqrt(S1*S2)), with a small
    diagonal `eps` offset retried when the matrix square root is not finite.
    """
    mu1, sigma1 = fid_calculate_activation_statistics(activations_pred)
    mu2, sigma2 = fid_calculate_activation_statistics(activations_target)
    diff = mu1 - mu2

    # Product might be almost singular; retry with a jittered diagonal.
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # Numerical error might give a slight imaginary component; tolerate a
    # small one, reject anything larger.
    if np.iscomplexobj(covmean):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-2):
            m = np.max(np.abs(covmean.imag))
            raise ValueError('Imaginary component {}'.format(m))
        covmean = covmean.real

    trace_term = np.trace(sigma1) + np.trace(sigma2) - 2 * np.trace(covmean)
    return diff.dot(diff) + trace_term
class FIDScore(torch.nn.Module):
    """Accumulator for the Frechet Inception Distance (FID).

    `forward` collects InceptionV3 activations for prediction/target
    batches; `get_value` computes the Frechet distance over everything
    collected so far and resets the accumulator.
    """

    def __init__(self, dims=2048, eps=1e-6):
        super().__init__()
        # Cache one InceptionV3 instance on the class so repeated metric
        # construction does not reload the (large) backbone.
        if getattr(FIDScore, '_MODEL', None) is None:
            block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
            FIDScore._MODEL = InceptionV3([block_idx]).eval()
        self.model = FIDScore._MODEL
        # eps: diagonal jitter forwarded to calculate_frechet_distance.
        self.eps = eps
        self.reset()

    def forward(self, pred_batch, target_batch, mask=None):
        # NOTE(review): `mask` is accepted but unused here.
        activations_pred = self._get_activations(pred_batch)
        activations_target = self._get_activations(target_batch)
        # Move to CPU immediately so accumulation does not hold GPU memory.
        self.activations_pred.append(activations_pred.detach().cpu())
        self.activations_target.append(activations_target.detach().cpu())

    def get_value(self):
        """Compute FID over all accumulated batches, then reset."""
        activations_pred, activations_target = (self.activations_pred,
                                                self.activations_target)
        activations_pred = torch.cat(activations_pred).cpu().numpy()
        activations_target = torch.cat(activations_target).cpu().numpy()
        total_distance = calculate_frechet_distance(
            activations_pred, activations_target, eps=self.eps)
        self.reset()
        return total_distance

    def reset(self):
        """Drop all accumulated activations."""
        self.activations_pred = []
        self.activations_target = []

    def _get_activations(self, batch):
        # The backbone returns a list of feature maps; take the requested
        # block and flatten its 1x1 spatial dims to (N, dims).
        activations = self.model(batch)[0]
        if activations.shape[2] != 1 or activations.shape[3] != 1:
            assert False, \
                'We should not have got here, because Inception always scales inputs to 299x299'
        activations = activations.squeeze(-1).squeeze(-1)
        return activations
class SSIM(torch.nn.Module):
    """SSIM as a torch module over NCHW batches. Modified from:
    https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py

    A Gaussian window (sigma 1.5) is registered as a buffer and lazily
    rebuilt in `forward` if the incoming batch's channel count or dtype
    differs from the cached window.
    """

    def __init__(self, window_size=11, size_average=True):
        super().__init__()
        self.window_size = window_size
        # size_average: True -> scalar mean SSIM; False -> per-image SSIM.
        self.size_average = size_average
        self.channel = 1
        self.register_buffer('window',
                             self._create_window(window_size, self.channel))

    def forward(self, img1, img2):
        assert len(img1.shape) == 4
        channel = img1.size()[1]
        # Reuse the cached window only if both the channel count and the
        # tensor type still match; otherwise rebuild and re-cache it.
        if channel == self.channel and self.window.data.type(
        ) == img1.data.type():
            window = self.window
        else:
            window = self._create_window(self.window_size, channel)
            window = window.type_as(img1)
            self.window = window
            self.channel = channel
        return self._ssim(img1, img2, window, self.window_size, channel,
                          self.size_average)

    def _gaussian(self, window_size, sigma):
        # 1-D Gaussian taps centered at window_size // 2, normalized to sum 1.
        gauss = torch.Tensor([
            np.exp(-(x - (window_size // 2))**2 / float(2 * sigma**2))
            for x in range(window_size)
        ])
        return gauss / gauss.sum()

    def _create_window(self, window_size, channel):
        # 2-D window = outer product of the 1-D Gaussian with itself,
        # expanded to one filter per channel for grouped conv2d.
        _1D_window = self._gaussian(window_size, 1.5).unsqueeze(1)
        _2D_window = _1D_window.mm(
            _1D_window.t()).float().unsqueeze(0).unsqueeze(0)
        return _2D_window.expand(channel, 1, window_size,
                                 window_size).contiguous()

    def _ssim(self,
              img1,
              img2,
              window,
              window_size,
              channel,
              size_average=True):
        # Local means via grouped Gaussian filtering (one group per channel).
        mu1 = F.conv2d(
            img1, window, padding=(window_size // 2), groups=channel)
        mu2 = F.conv2d(
            img2, window, padding=(window_size // 2), groups=channel)

        mu1_sq = mu1.pow(2)
        mu2_sq = mu2.pow(2)
        mu1_mu2 = mu1 * mu2

        # Local (co)variances: E[x^2] - E[x]^2 under the Gaussian window.
        sigma1_sq = F.conv2d(
            img1 * img1, window, padding=(window_size // 2),
            groups=channel) - mu1_sq
        sigma2_sq = F.conv2d(
            img2 * img2, window, padding=(window_size // 2),
            groups=channel) - mu2_sq
        sigma12 = F.conv2d(
            img1 * img2, window, padding=(window_size // 2),
            groups=channel) - mu1_mu2

        # Constants assume inputs in [0, 1] (dynamic range L = 1).
        C1 = 0.01**2
        C2 = 0.03**2

        ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \
            ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

        if size_average:
            return ssim_map.mean()
        return ssim_map.mean(1).mean(1).mean(1)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        # Intentionally a no-op: the window buffer is derived, not learned,
        # so nothing should be loaded from (or required in) checkpoints.
        return
@METRICS.register_module(
    group_key=default_group, module_name=Metrics.image_inpainting_metric)
class ImageInpaintingMetric(Metric):
    """The metric computation class for image inpainting tasks.

    Accumulates predicted ('inpainted') and ground-truth ('image') batches
    and reports mean SSIM and FID over the whole evaluation set.
    """

    def __init__(self):
        # Fix: the base Metric initializer was never invoked; the sibling
        # ImageDenoiseMetric does call it, so do the same here.
        super().__init__()
        self.preds = []
        self.targets = []
        # size_average=False keeps per-image SSIM values; eval() freezes
        # the module (no dropout/batchnorm updates).
        self.SSIM = SSIM(window_size=11, size_average=False).eval()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.FID = FIDScore().to(device)

    def add(self, outputs: Dict, inputs: Dict):
        """Collect one batch: prediction from outputs['inpainted'],
        ground truth from inputs['image']."""
        pred = outputs['inpainted']
        target = inputs['image']
        self.preds.append(torch_nested_detach(pred))
        self.targets.append(torch_nested_detach(target))

    def evaluate(self):
        """Compute mean SSIM and the FID over all collected batches."""
        ssim_list = []
        for (pred, target) in zip(self.preds, self.targets):
            ssim_list.append(self.SSIM(pred, target))
            # FID accumulates activations internally; value read out below.
            self.FID(pred, target)
        ssim_list = torch_nested_numpify(ssim_list)
        fid = self.FID.get_value()
        return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid}
| @@ -1,3 +1,6 @@ | |||||
| # Part of the implementation is borrowed and modified from PGL-SUM, | |||||
| # publicly available at https://github.com/e-apostolidis/PGL-SUM | |||||
| from typing import Dict | from typing import Dict | ||||
| import numpy as np | import numpy as np | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | import os | ||||
| from typing import Any, Dict | from typing import Any, Dict | ||||
| @@ -1,15 +1,14 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | # Copyright (c) Alibaba, Inc. and its affiliates. | ||||
| import os | import os | ||||
| from typing import Dict | |||||
| import torch | |||||
| from typing import Dict, Optional | |||||
| from modelscope.metainfo import Models | from modelscope.metainfo import Models | ||||
| from modelscope.models import TorchModel | from modelscope.models import TorchModel | ||||
| from modelscope.models.base import Tensor | from modelscope.models.base import Tensor | ||||
| from modelscope.models.builder import MODELS | from modelscope.models.builder import MODELS | ||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.audio.audio_utils import update_conf | |||||
| from modelscope.utils.constant import Tasks | |||||
| from .fsmn_sele_v2 import FSMNSeleNetV2 | from .fsmn_sele_v2 import FSMNSeleNetV2 | ||||
| @@ -20,48 +19,38 @@ class FSMNSeleNetV2Decorator(TorchModel): | |||||
| MODEL_TXT = 'model.txt' | MODEL_TXT = 'model.txt' | ||||
| SC_CONFIG = 'sound_connect.conf' | SC_CONFIG = 'sound_connect.conf' | ||||
| SC_CONF_ITEM_KWS_MODEL = '${kws_model}' | |||||
| def __init__(self, model_dir: str, *args, **kwargs): | |||||
| def __init__(self, | |||||
| model_dir: str, | |||||
| training: Optional[bool] = False, | |||||
| *args, | |||||
| **kwargs): | |||||
| """initialize the dfsmn model from the `model_dir` path. | """initialize the dfsmn model from the `model_dir` path. | ||||
| Args: | Args: | ||||
| model_dir (str): the model path. | model_dir (str): the model path. | ||||
| """ | """ | ||||
| super().__init__(model_dir, *args, **kwargs) | super().__init__(model_dir, *args, **kwargs) | ||||
| sc_config_file = os.path.join(model_dir, self.SC_CONFIG) | |||||
| model_txt_file = os.path.join(model_dir, self.MODEL_TXT) | |||||
| model_bin_file = os.path.join(model_dir, | |||||
| ModelFile.TORCH_MODEL_BIN_FILE) | |||||
| self._model = None | |||||
| if os.path.exists(model_bin_file): | |||||
| kwargs.pop('device') | |||||
| self._model = FSMNSeleNetV2(*args, **kwargs) | |||||
| checkpoint = torch.load(model_bin_file) | |||||
| self._model.load_state_dict(checkpoint, strict=False) | |||||
| self._sc = None | |||||
| if os.path.exists(model_txt_file): | |||||
| with open(sc_config_file) as f: | |||||
| lines = f.readlines() | |||||
| with open(sc_config_file, 'w') as f: | |||||
| for line in lines: | |||||
| if self.SC_CONF_ITEM_KWS_MODEL in line: | |||||
| line = line.replace(self.SC_CONF_ITEM_KWS_MODEL, | |||||
| model_txt_file) | |||||
| f.write(line) | |||||
| import py_sound_connect | |||||
| self._sc = py_sound_connect.SoundConnect(sc_config_file) | |||||
| self.size_in = self._sc.bytesPerBlockIn() | |||||
| self.size_out = self._sc.bytesPerBlockOut() | |||||
| if self._model is None and self._sc is None: | |||||
| raise Exception( | |||||
| f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.' | |||||
| ) | |||||
| if training: | |||||
| self.model = FSMNSeleNetV2(*args, **kwargs) | |||||
| else: | |||||
| sc_config_file = os.path.join(model_dir, self.SC_CONFIG) | |||||
| model_txt_file = os.path.join(model_dir, self.MODEL_TXT) | |||||
| self._sc = None | |||||
| if os.path.exists(model_txt_file): | |||||
| conf_dict = dict(mode=56542, kws_model=model_txt_file) | |||||
| update_conf(sc_config_file, sc_config_file, conf_dict) | |||||
| import py_sound_connect | |||||
| self._sc = py_sound_connect.SoundConnect(sc_config_file) | |||||
| self.size_in = self._sc.bytesPerBlockIn() | |||||
| self.size_out = self._sc.bytesPerBlockOut() | |||||
| else: | |||||
| raise Exception( | |||||
| f'Invalid model directory! Failed to load model file: {model_txt_file}.' | |||||
| ) | |||||
| def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | ||||
| ... | |||||
| return self.model.forward(input) | |||||
| def forward_decode(self, data: bytes): | def forward_decode(self, data: bytes): | ||||
| result = {'pcm': self._sc.process(data, self.size_out)} | result = {'pcm': self._sc.process(data, self.size_out)} | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | import os | ||||
| from typing import Any, Dict | from typing import Any, Dict | ||||
| @@ -4,14 +4,16 @@ | |||||
| from . import (action_recognition, animal_recognition, body_2d_keypoints, | from . import (action_recognition, animal_recognition, body_2d_keypoints, | ||||
| body_3d_keypoints, cartoon, cmdssl_video_embedding, | body_3d_keypoints, cartoon, cmdssl_video_embedding, | ||||
| crowd_counting, face_2d_keypoints, face_detection, | crowd_counting, face_2d_keypoints, face_detection, | ||||
| face_generation, image_classification, image_color_enhance, | |||||
| image_colorization, image_denoise, image_instance_segmentation, | |||||
| face_generation, human_wholebody_keypoint, image_classification, | |||||
| image_color_enhance, image_colorization, image_denoise, | |||||
| image_inpainting, image_instance_segmentation, | |||||
| image_panoptic_segmentation, image_portrait_enhancement, | image_panoptic_segmentation, image_portrait_enhancement, | ||||
| image_reid_person, image_semantic_segmentation, | image_reid_person, image_semantic_segmentation, | ||||
| image_to_image_generation, image_to_image_translation, | image_to_image_generation, image_to_image_translation, | ||||
| movie_scene_segmentation, object_detection, | movie_scene_segmentation, object_detection, | ||||
| product_retrieval_embedding, realtime_object_detection, | product_retrieval_embedding, realtime_object_detection, | ||||
| salient_detection, shop_segmentation, super_resolution, | |||||
| referring_video_object_segmentation, salient_detection, | |||||
| shop_segmentation, super_resolution, | |||||
| video_single_object_tracking, video_summarization, virual_tryon) | video_single_object_tracking, video_summarization, virual_tryon) | ||||
| # yapf: enable | # yapf: enable | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | import os | ||||
| from typing import Any, Dict, Optional, Union | from typing import Any, Dict, Optional, Union | ||||
| @@ -1,10 +1,10 @@ | |||||
| # ------------------------------------------------------------------------------ | |||||
| # Copyright (c) Microsoft | |||||
| # Licensed under the MIT License. | |||||
| # Written by Bin Xiao (Bin.Xiao@microsoft.com) | |||||
| # Modified by Ke Sun (sunk@mail.ustc.edu.cn) | |||||
| # https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py | |||||
| # ------------------------------------------------------------------------------ | |||||
| """ | |||||
| Copyright (c) Microsoft | |||||
| Licensed under the MIT License. | |||||
| Written by Bin Xiao (Bin.Xiao@microsoft.com) | |||||
| Modified by Ke Sun (sunk@mail.ustc.edu.cn) | |||||
| https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py | |||||
| """ | |||||
| import functools | import functools | ||||
| import logging | import logging | ||||
| @@ -8,12 +8,14 @@ if TYPE_CHECKING: | |||||
| from .mtcnn import MtcnnFaceDetector | from .mtcnn import MtcnnFaceDetector | ||||
| from .retinaface import RetinaFaceDetection | from .retinaface import RetinaFaceDetection | ||||
| from .ulfd_slim import UlfdFaceDetector | from .ulfd_slim import UlfdFaceDetector | ||||
| from .scrfd import ScrfdDetect | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'ulfd_slim': ['UlfdFaceDetector'], | 'ulfd_slim': ['UlfdFaceDetector'], | ||||
| 'retinaface': ['RetinaFaceDetection'], | 'retinaface': ['RetinaFaceDetection'], | ||||
| 'mtcnn': ['MtcnnFaceDetector'], | 'mtcnn': ['MtcnnFaceDetector'], | ||||
| 'mogface': ['MogFaceDetector'] | |||||
| 'mogface': ['MogFaceDetector'], | |||||
| 'scrfd': ['ScrfdDetect'] | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -1,189 +0,0 @@ | |||||
| """ | |||||
| The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at | |||||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py | |||||
| """ | |||||
| import numpy as np | |||||
| from mmdet.datasets.builder import PIPELINES | |||||
| from numpy import random | |||||
| @PIPELINES.register_module() | |||||
| class RandomSquareCrop(object): | |||||
| """Random crop the image & bboxes, the cropped patches have minimum IoU | |||||
| requirement with original image & bboxes, the IoU threshold is randomly | |||||
| selected from min_ious. | |||||
| Args: | |||||
| min_ious (tuple): minimum IoU threshold for all intersections with | |||||
| bounding boxes | |||||
| min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, | |||||
| where a >= min_crop_size). | |||||
| Note: | |||||
| The keys for bboxes, labels and masks should be paired. That is, \ | |||||
| `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ | |||||
| `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. | |||||
| """ | |||||
| def __init__(self, | |||||
| crop_ratio_range=None, | |||||
| crop_choice=None, | |||||
| bbox_clip_border=True): | |||||
| self.crop_ratio_range = crop_ratio_range | |||||
| self.crop_choice = crop_choice | |||||
| self.bbox_clip_border = bbox_clip_border | |||||
| assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) | |||||
| if self.crop_ratio_range is not None: | |||||
| self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range | |||||
| self.bbox2label = { | |||||
| 'gt_bboxes': 'gt_labels', | |||||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||||
| } | |||||
| self.bbox2mask = { | |||||
| 'gt_bboxes': 'gt_masks', | |||||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||||
| } | |||||
| def __call__(self, results): | |||||
| """Call function to crop images and bounding boxes with minimum IoU | |||||
| constraint. | |||||
| Args: | |||||
| results (dict): Result dict from loading pipeline. | |||||
| Returns: | |||||
| dict: Result dict with images and bounding boxes cropped, \ | |||||
| 'img_shape' key is updated. | |||||
| """ | |||||
| if 'img_fields' in results: | |||||
| assert results['img_fields'] == ['img'], \ | |||||
| 'Only single img_fields is allowed' | |||||
| img = results['img'] | |||||
| assert 'bbox_fields' in results | |||||
| assert 'gt_bboxes' in results | |||||
| boxes = results['gt_bboxes'] | |||||
| h, w, c = img.shape | |||||
| scale_retry = 0 | |||||
| if self.crop_ratio_range is not None: | |||||
| max_scale = self.crop_ratio_max | |||||
| else: | |||||
| max_scale = np.amax(self.crop_choice) | |||||
| while True: | |||||
| scale_retry += 1 | |||||
| if scale_retry == 1 or max_scale > 1.0: | |||||
| if self.crop_ratio_range is not None: | |||||
| scale = np.random.uniform(self.crop_ratio_min, | |||||
| self.crop_ratio_max) | |||||
| elif self.crop_choice is not None: | |||||
| scale = np.random.choice(self.crop_choice) | |||||
| else: | |||||
| scale = scale * 1.2 | |||||
| for i in range(250): | |||||
| short_side = min(w, h) | |||||
| cw = int(scale * short_side) | |||||
| ch = cw | |||||
| # TODO +1 | |||||
| if w == cw: | |||||
| left = 0 | |||||
| elif w > cw: | |||||
| left = random.randint(0, w - cw) | |||||
| else: | |||||
| left = random.randint(w - cw, 0) | |||||
| if h == ch: | |||||
| top = 0 | |||||
| elif h > ch: | |||||
| top = random.randint(0, h - ch) | |||||
| else: | |||||
| top = random.randint(h - ch, 0) | |||||
| patch = np.array( | |||||
| (int(left), int(top), int(left + cw), int(top + ch)), | |||||
| dtype=np.int) | |||||
| # center of boxes should inside the crop img | |||||
| # only adjust boxes and instance masks when the gt is not empty | |||||
| # adjust boxes | |||||
| def is_center_of_bboxes_in_patch(boxes, patch): | |||||
| # TODO >= | |||||
| center = (boxes[:, :2] + boxes[:, 2:]) / 2 | |||||
| mask = \ | |||||
| ((center[:, 0] > patch[0]) | |||||
| * (center[:, 1] > patch[1]) | |||||
| * (center[:, 0] < patch[2]) | |||||
| * (center[:, 1] < patch[3])) | |||||
| return mask | |||||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||||
| if not mask.any(): | |||||
| continue | |||||
| for key in results.get('bbox_fields', []): | |||||
| boxes = results[key].copy() | |||||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||||
| boxes = boxes[mask] | |||||
| if self.bbox_clip_border: | |||||
| boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) | |||||
| boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) | |||||
| boxes -= np.tile(patch[:2], 2) | |||||
| results[key] = boxes | |||||
| # labels | |||||
| label_key = self.bbox2label.get(key) | |||||
| if label_key in results: | |||||
| results[label_key] = results[label_key][mask] | |||||
| # keypoints field | |||||
| if key == 'gt_bboxes': | |||||
| for kps_key in results.get('keypoints_fields', []): | |||||
| keypointss = results[kps_key].copy() | |||||
| keypointss = keypointss[mask, :, :] | |||||
| if self.bbox_clip_border: | |||||
| keypointss[:, :, : | |||||
| 2] = keypointss[:, :, :2].clip( | |||||
| max=patch[2:]) | |||||
| keypointss[:, :, : | |||||
| 2] = keypointss[:, :, :2].clip( | |||||
| min=patch[:2]) | |||||
| keypointss[:, :, 0] -= patch[0] | |||||
| keypointss[:, :, 1] -= patch[1] | |||||
| results[kps_key] = keypointss | |||||
| # mask fields | |||||
| mask_key = self.bbox2mask.get(key) | |||||
| if mask_key in results: | |||||
| results[mask_key] = results[mask_key][mask.nonzero() | |||||
| [0]].crop(patch) | |||||
| # adjust the img no matter whether the gt is empty before crop | |||||
| rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 | |||||
| patch_from = patch.copy() | |||||
| patch_from[0] = max(0, patch_from[0]) | |||||
| patch_from[1] = max(0, patch_from[1]) | |||||
| patch_from[2] = min(img.shape[1], patch_from[2]) | |||||
| patch_from[3] = min(img.shape[0], patch_from[3]) | |||||
| patch_to = patch.copy() | |||||
| patch_to[0] = max(0, patch_to[0] * -1) | |||||
| patch_to[1] = max(0, patch_to[1] * -1) | |||||
| patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) | |||||
| patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) | |||||
| rimg[patch_to[1]:patch_to[3], | |||||
| patch_to[0]:patch_to[2], :] = img[ | |||||
| patch_from[1]:patch_from[3], | |||||
| patch_from[0]:patch_from[2], :] | |||||
| img = rimg | |||||
| results['img'] = img | |||||
| results['img_shape'] = img.shape | |||||
| return results | |||||
| def __repr__(self): | |||||
| repr_str = self.__class__.__name__ | |||||
| repr_str += f'(min_ious={self.min_iou}, ' | |||||
| repr_str += f'crop_size={self.crop_size})' | |||||
| return repr_str | |||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is based on MogFace, available at | |||||
| # https://github.com/damo-cv/MogFace | |||||
| import os | import os | ||||
| import cv2 | import cv2 | ||||
| @@ -0,0 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .scrfd_detect import ScrfdDetect | |||||
| @@ -6,7 +6,7 @@ import numpy as np | |||||
| import torch | import torch | ||||
| def bbox2result(bboxes, labels, num_classes, kps=None): | |||||
| def bbox2result(bboxes, labels, num_classes, kps=None, num_kps=5): | |||||
| """Convert detection results to a list of numpy arrays. | """Convert detection results to a list of numpy arrays. | ||||
| Args: | Args: | ||||
| @@ -17,7 +17,7 @@ def bbox2result(bboxes, labels, num_classes, kps=None): | |||||
| Returns: | Returns: | ||||
| list(ndarray): bbox results of each class | list(ndarray): bbox results of each class | ||||
| """ | """ | ||||
| bbox_len = 5 if kps is None else 5 + 10 # if has kps, add 10 kps into bbox | |||||
| bbox_len = 5 if kps is None else 5 + num_kps * 2 # if has kps, add num_kps*2 into bbox | |||||
| if bboxes.shape[0] == 0: | if bboxes.shape[0] == 0: | ||||
| return [ | return [ | ||||
| np.zeros((0, bbox_len), dtype=np.float32) | np.zeros((0, bbox_len), dtype=np.float32) | ||||
| @@ -17,6 +17,7 @@ def multiclass_nms(multi_bboxes, | |||||
| Args: | Args: | ||||
| multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) | multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) | ||||
| multi_kps (Tensor): shape (n, #class*num_kps*2) or (n, num_kps*2) | |||||
| multi_scores (Tensor): shape (n, #class), where the last column | multi_scores (Tensor): shape (n, #class), where the last column | ||||
| contains scores of the background class, but this will be ignored. | contains scores of the background class, but this will be ignored. | ||||
| score_thr (float): bbox threshold, bboxes with scores lower than it | score_thr (float): bbox threshold, bboxes with scores lower than it | ||||
| @@ -36,16 +37,18 @@ def multiclass_nms(multi_bboxes, | |||||
| num_classes = multi_scores.size(1) - 1 | num_classes = multi_scores.size(1) - 1 | ||||
| # exclude background category | # exclude background category | ||||
| kps = None | kps = None | ||||
| if multi_kps is not None: | |||||
| num_kps = int((multi_kps.shape[1] / num_classes) / 2) | |||||
| if multi_bboxes.shape[1] > 4: | if multi_bboxes.shape[1] > 4: | ||||
| bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) | bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) | ||||
| if multi_kps is not None: | if multi_kps is not None: | ||||
| kps = multi_kps.view(multi_scores.size(0), -1, 10) | |||||
| kps = multi_kps.view(multi_scores.size(0), -1, num_kps * 2) | |||||
| else: | else: | ||||
| bboxes = multi_bboxes[:, None].expand( | bboxes = multi_bboxes[:, None].expand( | ||||
| multi_scores.size(0), num_classes, 4) | multi_scores.size(0), num_classes, 4) | ||||
| if multi_kps is not None: | if multi_kps is not None: | ||||
| kps = multi_kps[:, None].expand( | kps = multi_kps[:, None].expand( | ||||
| multi_scores.size(0), num_classes, 10) | |||||
| multi_scores.size(0), num_classes, num_kps * 2) | |||||
| scores = multi_scores[:, :-1] | scores = multi_scores[:, :-1] | ||||
| if score_factors is not None: | if score_factors is not None: | ||||
| @@ -56,7 +59,7 @@ def multiclass_nms(multi_bboxes, | |||||
| bboxes = bboxes.reshape(-1, 4) | bboxes = bboxes.reshape(-1, 4) | ||||
| if kps is not None: | if kps is not None: | ||||
| kps = kps.reshape(-1, 10) | |||||
| kps = kps.reshape(-1, num_kps * 2) | |||||
| scores = scores.reshape(-1) | scores = scores.reshape(-1) | ||||
| labels = labels.reshape(-1) | labels = labels.reshape(-1) | ||||
| @@ -2,6 +2,12 @@ | |||||
| The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at | The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at | ||||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines | https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines | ||||
| """ | """ | ||||
| from .auto_augment import RotateV2 | |||||
| from .formating import DefaultFormatBundleV2 | |||||
| from .loading import LoadAnnotationsV2 | |||||
| from .transforms import RandomSquareCrop | from .transforms import RandomSquareCrop | ||||
| __all__ = ['RandomSquareCrop'] | |||||
| __all__ = [ | |||||
| 'RandomSquareCrop', 'LoadAnnotationsV2', 'RotateV2', | |||||
| 'DefaultFormatBundleV2' | |||||
| ] | |||||
| @@ -0,0 +1,271 @@ | |||||
| """ | |||||
| The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at | |||||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py | |||||
| """ | |||||
| import copy | |||||
| import cv2 | |||||
| import mmcv | |||||
| import numpy as np | |||||
| from mmdet.datasets.builder import PIPELINES | |||||
| _MAX_LEVEL = 10 | |||||
| def level_to_value(level, max_value): | |||||
| """Map from level to values based on max_value.""" | |||||
| return (level / _MAX_LEVEL) * max_value | |||||
| def random_negative(value, random_negative_prob): | |||||
| """Randomly negate value based on random_negative_prob.""" | |||||
| return -value if np.random.rand() < random_negative_prob else value | |||||
| def bbox2fields(): | |||||
| """The key correspondence from bboxes to labels, masks and | |||||
| segmentations.""" | |||||
| bbox2label = { | |||||
| 'gt_bboxes': 'gt_labels', | |||||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||||
| } | |||||
| bbox2mask = { | |||||
| 'gt_bboxes': 'gt_masks', | |||||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||||
| } | |||||
| bbox2seg = { | |||||
| 'gt_bboxes': 'gt_semantic_seg', | |||||
| } | |||||
| return bbox2label, bbox2mask, bbox2seg | |||||
| @PIPELINES.register_module() | |||||
| class RotateV2(object): | |||||
| """Apply Rotate Transformation to image (and its corresponding bbox, mask, | |||||
| segmentation). | |||||
| Args: | |||||
| level (int | float): The level should be in range (0,_MAX_LEVEL]. | |||||
| scale (int | float): Isotropic scale factor. Same in | |||||
| ``mmcv.imrotate``. | |||||
| center (int | float | tuple[float]): Center point (w, h) of the | |||||
| rotation in the source image. If None, the center of the | |||||
| image will be used. Same in ``mmcv.imrotate``. | |||||
| img_fill_val (int | float | tuple): The fill value for image border. | |||||
| If float, the same value will be used for all the three | |||||
| channels of image. If tuple, the should be 3 elements (e.g. | |||||
| equals the number of channels for image). | |||||
| seg_ignore_label (int): The fill value used for segmentation map. | |||||
| Note this value must equals ``ignore_label`` in ``semantic_head`` | |||||
| of the corresponding config. Default 255. | |||||
| prob (float): The probability for perform transformation and | |||||
| should be in range 0 to 1. | |||||
| max_rotate_angle (int | float): The maximum angles for rotate | |||||
| transformation. | |||||
| random_negative_prob (float): The probability that turns the | |||||
| offset negative. | |||||
| """ | |||||
| def __init__(self, | |||||
| level, | |||||
| scale=1, | |||||
| center=None, | |||||
| img_fill_val=128, | |||||
| seg_ignore_label=255, | |||||
| prob=0.5, | |||||
| max_rotate_angle=30, | |||||
| random_negative_prob=0.5): | |||||
| assert isinstance(level, (int, float)), \ | |||||
| f'The level must be type int or float. got {type(level)}.' | |||||
| assert 0 <= level <= _MAX_LEVEL, \ | |||||
| f'The level should be in range (0,{_MAX_LEVEL}]. got {level}.' | |||||
| assert isinstance(scale, (int, float)), \ | |||||
| f'The scale must be type int or float. got type {type(scale)}.' | |||||
| if isinstance(center, (int, float)): | |||||
| center = (center, center) | |||||
| elif isinstance(center, tuple): | |||||
| assert len(center) == 2, 'center with type tuple must have '\ | |||||
| f'2 elements. got {len(center)} elements.' | |||||
| else: | |||||
| assert center is None, 'center must be None or type int, '\ | |||||
| f'float or tuple, got type {type(center)}.' | |||||
| if isinstance(img_fill_val, (float, int)): | |||||
| img_fill_val = tuple([float(img_fill_val)] * 3) | |||||
| elif isinstance(img_fill_val, tuple): | |||||
| assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\ | |||||
| f'have 3 elements. got {len(img_fill_val)}.' | |||||
| img_fill_val = tuple([float(val) for val in img_fill_val]) | |||||
| else: | |||||
| raise ValueError( | |||||
| 'img_fill_val must be float or tuple with 3 elements.') | |||||
| assert np.all([0 <= val <= 255 for val in img_fill_val]), \ | |||||
| 'all elements of img_fill_val should between range [0,255]. '\ | |||||
| f'got {img_fill_val}.' | |||||
| assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\ | |||||
| f'got {prob}.' | |||||
| assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\ | |||||
| f'should be type int or float. got type {type(max_rotate_angle)}.' | |||||
| self.level = level | |||||
| self.scale = scale | |||||
| # Rotation angle in degrees. Positive values mean | |||||
| # clockwise rotation. | |||||
| self.angle = level_to_value(level, max_rotate_angle) | |||||
| self.center = center | |||||
| self.img_fill_val = img_fill_val | |||||
| self.seg_ignore_label = seg_ignore_label | |||||
| self.prob = prob | |||||
| self.max_rotate_angle = max_rotate_angle | |||||
| self.random_negative_prob = random_negative_prob | |||||
| def _rotate_img(self, results, angle, center=None, scale=1.0): | |||||
| """Rotate the image. | |||||
| Args: | |||||
| results (dict): Result dict from loading pipeline. | |||||
| angle (float): Rotation angle in degrees, positive values | |||||
| mean clockwise rotation. Same in ``mmcv.imrotate``. | |||||
| center (tuple[float], optional): Center point (w, h) of the | |||||
| rotation. Same in ``mmcv.imrotate``. | |||||
| scale (int | float): Isotropic scale factor. Same in | |||||
| ``mmcv.imrotate``. | |||||
| """ | |||||
| for key in results.get('img_fields', ['img']): | |||||
| img = results[key].copy() | |||||
| img_rotated = mmcv.imrotate( | |||||
| img, angle, center, scale, border_value=self.img_fill_val) | |||||
| results[key] = img_rotated.astype(img.dtype) | |||||
| results['img_shape'] = results[key].shape | |||||
| def _rotate_bboxes(self, results, rotate_matrix): | |||||
| """Rotate the bboxes.""" | |||||
| h, w, c = results['img_shape'] | |||||
| for key in results.get('bbox_fields', []): | |||||
| min_x, min_y, max_x, max_y = np.split( | |||||
| results[key], results[key].shape[-1], axis=-1) | |||||
| coordinates = np.stack([[min_x, min_y], [max_x, min_y], | |||||
| [min_x, max_y], | |||||
| [max_x, max_y]]) # [4, 2, nb_bbox, 1] | |||||
| # pad 1 to convert from format [x, y] to homogeneous | |||||
| # coordinates format [x, y, 1] | |||||
| coordinates = np.concatenate( | |||||
| (coordinates, | |||||
| np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)), | |||||
| axis=1) # [4, 3, nb_bbox, 1] | |||||
| coordinates = coordinates.transpose( | |||||
| (2, 0, 1, 3)) # [nb_bbox, 4, 3, 1] | |||||
| rotated_coords = np.matmul(rotate_matrix, | |||||
| coordinates) # [nb_bbox, 4, 2, 1] | |||||
| rotated_coords = rotated_coords[..., 0] # [nb_bbox, 4, 2] | |||||
| min_x, min_y = np.min( | |||||
| rotated_coords[:, :, 0], axis=1), np.min( | |||||
| rotated_coords[:, :, 1], axis=1) | |||||
| max_x, max_y = np.max( | |||||
| rotated_coords[:, :, 0], axis=1), np.max( | |||||
| rotated_coords[:, :, 1], axis=1) | |||||
| results[key] = np.stack([min_x, min_y, max_x, max_y], | |||||
| axis=-1).astype(results[key].dtype) | |||||
| def _rotate_keypoints90(self, results, angle): | |||||
| """Rotate the keypoints, only valid when angle in [-90,90,-180,180]""" | |||||
| if angle not in [-90, 90, 180, -180 | |||||
| ] or self.scale != 1 or self.center is not None: | |||||
| return | |||||
| for key in results.get('keypoints_fields', []): | |||||
| k = results[key] | |||||
| if angle == 90: | |||||
| w, h, c = results['img'].shape | |||||
| new = np.stack([h - k[..., 1], k[..., 0], k[..., 2]], axis=-1) | |||||
| elif angle == -90: | |||||
| w, h, c = results['img'].shape | |||||
| new = np.stack([k[..., 1], w - k[..., 0], k[..., 2]], axis=-1) | |||||
| else: | |||||
| h, w, c = results['img'].shape | |||||
| new = np.stack([w - k[..., 0], h - k[..., 1], k[..., 2]], | |||||
| axis=-1) | |||||
| # a kps is invalid if thrid value is -1 | |||||
| kps_invalid = new[..., -1][:, -1] == -1 | |||||
| new[kps_invalid] = np.zeros(new.shape[1:]) - 1 | |||||
| results[key] = new | |||||
| def _rotate_masks(self, | |||||
| results, | |||||
| angle, | |||||
| center=None, | |||||
| scale=1.0, | |||||
| fill_val=0): | |||||
| """Rotate the masks.""" | |||||
| h, w, c = results['img_shape'] | |||||
| for key in results.get('mask_fields', []): | |||||
| masks = results[key] | |||||
| results[key] = masks.rotate((h, w), angle, center, scale, fill_val) | |||||
| def _rotate_seg(self, | |||||
| results, | |||||
| angle, | |||||
| center=None, | |||||
| scale=1.0, | |||||
| fill_val=255): | |||||
| """Rotate the segmentation map.""" | |||||
| for key in results.get('seg_fields', []): | |||||
| seg = results[key].copy() | |||||
| results[key] = mmcv.imrotate( | |||||
| seg, angle, center, scale, | |||||
| border_value=fill_val).astype(seg.dtype) | |||||
| def _filter_invalid(self, results, min_bbox_size=0): | |||||
| """Filter bboxes and corresponding masks too small after rotate | |||||
| augmentation.""" | |||||
| bbox2label, bbox2mask, _ = bbox2fields() | |||||
| for key in results.get('bbox_fields', []): | |||||
| bbox_w = results[key][:, 2] - results[key][:, 0] | |||||
| bbox_h = results[key][:, 3] - results[key][:, 1] | |||||
| valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size) | |||||
| valid_inds = np.nonzero(valid_inds)[0] | |||||
| results[key] = results[key][valid_inds] | |||||
| # label fields. e.g. gt_labels and gt_labels_ignore | |||||
| label_key = bbox2label.get(key) | |||||
| if label_key in results: | |||||
| results[label_key] = results[label_key][valid_inds] | |||||
| # mask fields, e.g. gt_masks and gt_masks_ignore | |||||
| mask_key = bbox2mask.get(key) | |||||
| if mask_key in results: | |||||
| results[mask_key] = results[mask_key][valid_inds] | |||||
| def __call__(self, results): | |||||
| """Call function to rotate images, bounding boxes, masks and semantic | |||||
| segmentation maps. | |||||
| Args: | |||||
| results (dict): Result dict from loading pipeline. | |||||
| Returns: | |||||
| dict: Rotated results. | |||||
| """ | |||||
| if np.random.rand() > self.prob: | |||||
| return results | |||||
| h, w = results['img'].shape[:2] | |||||
| center = self.center | |||||
| if center is None: | |||||
| center = ((w - 1) * 0.5, (h - 1) * 0.5) | |||||
| angle = random_negative(self.angle, self.random_negative_prob) | |||||
| self._rotate_img(results, angle, center, self.scale) | |||||
| rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale) | |||||
| self._rotate_bboxes(results, rotate_matrix) | |||||
| self._rotate_keypoints90(results, angle) | |||||
| self._rotate_masks(results, angle, center, self.scale, fill_val=0) | |||||
| self._rotate_seg( | |||||
| results, angle, center, self.scale, fill_val=self.seg_ignore_label) | |||||
| self._filter_invalid(results) | |||||
| return results | |||||
| def __repr__(self): | |||||
| repr_str = self.__class__.__name__ | |||||
| repr_str += f'(level={self.level}, ' | |||||
| repr_str += f'scale={self.scale}, ' | |||||
| repr_str += f'center={self.center}, ' | |||||
| repr_str += f'img_fill_val={self.img_fill_val}, ' | |||||
| repr_str += f'seg_ignore_label={self.seg_ignore_label}, ' | |||||
| repr_str += f'prob={self.prob}, ' | |||||
| repr_str += f'max_rotate_angle={self.max_rotate_angle}, ' | |||||
| repr_str += f'random_negative_prob={self.random_negative_prob})' | |||||
| return repr_str | |||||
| @@ -0,0 +1,113 @@ | |||||
| """ | |||||
| The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at | |||||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/formating.py | |||||
| """ | |||||
| import numpy as np | |||||
| import torch | |||||
| from mmcv.parallel import DataContainer as DC | |||||
| from mmdet.datasets.builder import PIPELINES | |||||
| def to_tensor(data): | |||||
| """Convert objects of various python types to :obj:`torch.Tensor`. | |||||
| Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, | |||||
| :class:`Sequence`, :class:`int` and :class:`float`. | |||||
| Args: | |||||
| data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to | |||||
| be converted. | |||||
| """ | |||||
| if isinstance(data, torch.Tensor): | |||||
| return data | |||||
| elif isinstance(data, np.ndarray): | |||||
| return torch.from_numpy(data) | |||||
| elif isinstance(data, Sequence) and not mmcv.is_str(data): | |||||
| return torch.tensor(data) | |||||
| elif isinstance(data, int): | |||||
| return torch.LongTensor([data]) | |||||
| elif isinstance(data, float): | |||||
| return torch.FloatTensor([data]) | |||||
| else: | |||||
| raise TypeError(f'type {type(data)} cannot be converted to tensor.') | |||||
| @PIPELINES.register_module() | |||||
| class DefaultFormatBundleV2(object): | |||||
| """Default formatting bundle. | |||||
| It simplifies the pipeline of formatting common fields, including "img", | |||||
| "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". | |||||
| These fields are formatted as follows. | |||||
| - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) | |||||
| - proposals: (1)to tensor, (2)to DataContainer | |||||
| - gt_bboxes: (1)to tensor, (2)to DataContainer | |||||
| - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer | |||||
| - gt_labels: (1)to tensor, (2)to DataContainer | |||||
| - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) | |||||
| - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ | |||||
| (3)to DataContainer (stack=True) | |||||
| """ | |||||
| def __call__(self, results): | |||||
| """Call function to transform and format common fields in results. | |||||
| Args: | |||||
| results (dict): Result dict contains the data to convert. | |||||
| Returns: | |||||
| dict: The result dict contains the data that is formatted with \ | |||||
| default bundle. | |||||
| """ | |||||
| if 'img' in results: | |||||
| img = results['img'] | |||||
| # add default meta keys | |||||
| results = self._add_default_meta_keys(results) | |||||
| if len(img.shape) < 3: | |||||
| img = np.expand_dims(img, -1) | |||||
| img = np.ascontiguousarray(img.transpose(2, 0, 1)) | |||||
| results['img'] = DC(to_tensor(img), stack=True) | |||||
| for key in [ | |||||
| 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss', | |||||
| 'gt_labels' | |||||
| ]: | |||||
| if key not in results: | |||||
| continue | |||||
| results[key] = DC(to_tensor(results[key])) | |||||
| if 'gt_masks' in results: | |||||
| results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) | |||||
| if 'gt_semantic_seg' in results: | |||||
| results['gt_semantic_seg'] = DC( | |||||
| to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) | |||||
| return results | |||||
| def _add_default_meta_keys(self, results): | |||||
| """Add default meta keys. | |||||
| We set default meta keys including `pad_shape`, `scale_factor` and | |||||
| `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and | |||||
| `Pad` are implemented during the whole pipeline. | |||||
| Args: | |||||
| results (dict): Result dict contains the data to convert. | |||||
| Returns: | |||||
| results (dict): Updated result dict contains the data to convert. | |||||
| """ | |||||
| img = results['img'] | |||||
| results.setdefault('pad_shape', img.shape) | |||||
| results.setdefault('scale_factor', 1.0) | |||||
| num_channels = 1 if len(img.shape) < 3 else img.shape[2] | |||||
| results.setdefault( | |||||
| 'img_norm_cfg', | |||||
| dict( | |||||
| mean=np.zeros(num_channels, dtype=np.float32), | |||||
| std=np.ones(num_channels, dtype=np.float32), | |||||
| to_rgb=False)) | |||||
| return results | |||||
| def __repr__(self): | |||||
| return self.__class__.__name__ | |||||
| @@ -0,0 +1,225 @@ | |||||
| """ | |||||
| The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at | |||||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/loading.py | |||||
| """ | |||||
| import os.path as osp | |||||
| import numpy as np | |||||
| import pycocotools.mask as maskUtils | |||||
| from mmdet.core import BitmapMasks, PolygonMasks | |||||
| from mmdet.datasets.builder import PIPELINES | |||||
| @PIPELINES.register_module() | |||||
| class LoadAnnotationsV2(object): | |||||
| """Load mutiple types of annotations. | |||||
| Args: | |||||
| with_bbox (bool): Whether to parse and load the bbox annotation. | |||||
| Default: True. | |||||
| with_label (bool): Whether to parse and load the label annotation. | |||||
| Default: True. | |||||
| with_keypoints (bool): Whether to parse and load the keypoints annotation. | |||||
| Default: False. | |||||
| with_mask (bool): Whether to parse and load the mask annotation. | |||||
| Default: False. | |||||
| with_seg (bool): Whether to parse and load the semantic segmentation | |||||
| annotation. Default: False. | |||||
| poly2mask (bool): Whether to convert the instance masks from polygons | |||||
| to bitmaps. Default: True. | |||||
| file_client_args (dict): Arguments to instantiate a FileClient. | |||||
| See :class:`mmcv.fileio.FileClient` for details. | |||||
| Defaults to ``dict(backend='disk')``. | |||||
| """ | |||||
| def __init__(self, | |||||
| with_bbox=True, | |||||
| with_label=True, | |||||
| with_keypoints=False, | |||||
| with_mask=False, | |||||
| with_seg=False, | |||||
| poly2mask=True, | |||||
| file_client_args=dict(backend='disk')): | |||||
| self.with_bbox = with_bbox | |||||
| self.with_label = with_label | |||||
| self.with_keypoints = with_keypoints | |||||
| self.with_mask = with_mask | |||||
| self.with_seg = with_seg | |||||
| self.poly2mask = poly2mask | |||||
| self.file_client_args = file_client_args.copy() | |||||
| self.file_client = None | |||||
| def _load_bboxes(self, results): | |||||
| """Private function to load bounding box annotations. | |||||
| Args: | |||||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||||
| Returns: | |||||
| dict: The dict contains loaded bounding box annotations. | |||||
| """ | |||||
| ann_info = results['ann_info'] | |||||
| results['gt_bboxes'] = ann_info['bboxes'].copy() | |||||
| gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) | |||||
| if gt_bboxes_ignore is not None: | |||||
| results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() | |||||
| results['bbox_fields'].append('gt_bboxes_ignore') | |||||
| results['bbox_fields'].append('gt_bboxes') | |||||
| return results | |||||
| def _load_keypoints(self, results): | |||||
| """Private function to load bounding box annotations. | |||||
| Args: | |||||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||||
| Returns: | |||||
| dict: The dict contains loaded bounding box annotations. | |||||
| """ | |||||
| ann_info = results['ann_info'] | |||||
| results['gt_keypointss'] = ann_info['keypointss'].copy() | |||||
| results['keypoints_fields'] = ['gt_keypointss'] | |||||
| return results | |||||
| def _load_labels(self, results): | |||||
| """Private function to load label annotations. | |||||
| Args: | |||||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||||
| Returns: | |||||
| dict: The dict contains loaded label annotations. | |||||
| """ | |||||
| results['gt_labels'] = results['ann_info']['labels'].copy() | |||||
| return results | |||||
| def _poly2mask(self, mask_ann, img_h, img_w): | |||||
| """Private function to convert masks represented with polygon to | |||||
| bitmaps. | |||||
| Args: | |||||
| mask_ann (list | dict): Polygon mask annotation input. | |||||
| img_h (int): The height of output mask. | |||||
| img_w (int): The width of output mask. | |||||
| Returns: | |||||
| numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). | |||||
| """ | |||||
| if isinstance(mask_ann, list): | |||||
| # polygon -- a single object might consist of multiple parts | |||||
| # we merge all parts into one mask rle code | |||||
| rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) | |||||
| rle = maskUtils.merge(rles) | |||||
| elif isinstance(mask_ann['counts'], list): | |||||
| # uncompressed RLE | |||||
| rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) | |||||
| else: | |||||
| # rle | |||||
| rle = mask_ann | |||||
| mask = maskUtils.decode(rle) | |||||
| return mask | |||||
| def process_polygons(self, polygons): | |||||
| """Convert polygons to list of ndarray and filter invalid polygons. | |||||
| Args: | |||||
| polygons (list[list]): Polygons of one instance. | |||||
| Returns: | |||||
| list[numpy.ndarray]: Processed polygons. | |||||
| """ | |||||
| polygons = [np.array(p) for p in polygons] | |||||
| valid_polygons = [] | |||||
| for polygon in polygons: | |||||
| if len(polygon) % 2 == 0 and len(polygon) >= 6: | |||||
| valid_polygons.append(polygon) | |||||
| return valid_polygons | |||||
| def _load_masks(self, results): | |||||
| """Private function to load mask annotations. | |||||
| Args: | |||||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||||
| Returns: | |||||
| dict: The dict contains loaded mask annotations. | |||||
| If ``self.poly2mask`` is set ``True``, `gt_mask` will contain | |||||
| :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. | |||||
| """ | |||||
| h, w = results['img_info']['height'], results['img_info']['width'] | |||||
| gt_masks = results['ann_info']['masks'] | |||||
| if self.poly2mask: | |||||
| gt_masks = BitmapMasks( | |||||
| [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) | |||||
| else: | |||||
| gt_masks = PolygonMasks( | |||||
| [self.process_polygons(polygons) for polygons in gt_masks], h, | |||||
| w) | |||||
| results['gt_masks'] = gt_masks | |||||
| results['mask_fields'].append('gt_masks') | |||||
| return results | |||||
| def _load_semantic_seg(self, results): | |||||
| """Private function to load semantic segmentation annotations. | |||||
| Args: | |||||
| results (dict): Result dict from :obj:`dataset`. | |||||
| Returns: | |||||
| dict: The dict contains loaded semantic segmentation annotations. | |||||
| """ | |||||
| import mmcv | |||||
| if self.file_client is None: | |||||
| self.file_client = mmcv.FileClient(**self.file_client_args) | |||||
| filename = osp.join(results['seg_prefix'], | |||||
| results['ann_info']['seg_map']) | |||||
| img_bytes = self.file_client.get(filename) | |||||
| results['gt_semantic_seg'] = mmcv.imfrombytes( | |||||
| img_bytes, flag='unchanged').squeeze() | |||||
| results['seg_fields'].append('gt_semantic_seg') | |||||
| return results | |||||
| def __call__(self, results): | |||||
| """Call function to load multiple types annotations. | |||||
| Args: | |||||
| results (dict): Result dict from :obj:`mmdet.CustomDataset`. | |||||
| Returns: | |||||
| dict: The dict contains loaded bounding box, label, mask and | |||||
| semantic segmentation annotations. | |||||
| """ | |||||
| if self.with_bbox: | |||||
| results = self._load_bboxes(results) | |||||
| if results is None: | |||||
| return None | |||||
| if self.with_label: | |||||
| results = self._load_labels(results) | |||||
| if self.with_keypoints: | |||||
| results = self._load_keypoints(results) | |||||
| if self.with_mask: | |||||
| results = self._load_masks(results) | |||||
| if self.with_seg: | |||||
| results = self._load_semantic_seg(results) | |||||
| return results | |||||
| def __repr__(self): | |||||
| repr_str = self.__class__.__name__ | |||||
| repr_str += f'(with_bbox={self.with_bbox}, ' | |||||
| repr_str += f'with_label={self.with_label}, ' | |||||
| repr_str += f'with_keypoints={self.with_keypoints}, ' | |||||
| repr_str += f'with_mask={self.with_mask}, ' | |||||
| repr_str += f'with_seg={self.with_seg})' | |||||
| repr_str += f'poly2mask={self.poly2mask})' | |||||
| repr_str += f'poly2mask={self.file_client_args})' | |||||
| return repr_str | |||||
| @@ -0,0 +1,737 @@ | |||||
| """ | |||||
The implementation here is modified based on insightface, originally MIT license and publicly available at
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py | |||||
| """ | |||||
| import mmcv | |||||
| import numpy as np | |||||
| from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps | |||||
| from mmdet.datasets.builder import PIPELINES | |||||
| from numpy import random | |||||
@PIPELINES.register_module()
class ResizeV2(object):
    """Resize images & bbox & mask & kps.

    This transform resizes the input image to some scale. Bboxes and masks are
    then resized with the same scale factor. If the input dict contains the key
    "scale", then the scale in the input dict is used, otherwise the specified
    scale in the init method is used. If the input dict contains the key
    "scale_factor" (if MultiScaleFlipAug does not give img_scale but
    scale_factor), the actual scale will be computed by image shape and
    scale_factor.

    `img_scale` can either be a tuple (single-scale) or a list of tuple
    (multi-scale). There are 3 multiscale modes:

    - ``ratio_range is not None``: randomly sample a ratio from the ratio \
      range and multiply it with the image scale.
    - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
      sample a scale from the multiscale range.
    - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
      sample a scale from multiple scales.

    Args:
        img_scale (tuple or list[tuple]): Images scales for resizing.
        multiscale_mode (str): Either "range" or "value".
        ratio_range (tuple[float]): (min_ratio, max_ratio)
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image.
        bbox_clip_border (bool, optional): Whether clip the objects outside
            the border of the image. Defaults to True.
        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
            These two backends generates slightly different results. Defaults
            to 'cv2'.
        override (bool, optional): Whether to override `scale` and
            `scale_factor` so as to call resize twice. Default False. If True,
            after the first resizing, the existed `scale` and `scale_factor`
            will be ignored so the second resizing can be allowed.
            This option is a work-around for multiple times of resize in DETR.
            Defaults to False.
    """

    def __init__(self,
                 img_scale=None,
                 multiscale_mode='range',
                 ratio_range=None,
                 keep_ratio=True,
                 bbox_clip_border=True,
                 backend='cv2',
                 override=False):
        if img_scale is None:
            self.img_scale = None
        else:
            # Normalize to a list of tuples for uniform handling below.
            if isinstance(img_scale, list):
                self.img_scale = img_scale
            else:
                self.img_scale = [img_scale]
            assert mmcv.is_list_of(self.img_scale, tuple)

        if ratio_range is not None:
            # mode 1: given a scale and a range of image ratio
            assert len(self.img_scale) == 1
        else:
            # mode 2: given multiple scales or a range of scales
            assert multiscale_mode in ['value', 'range']

        self.backend = backend
        self.multiscale_mode = multiscale_mode
        self.ratio_range = ratio_range
        self.keep_ratio = keep_ratio
        # TODO: refactor the override option in Resize
        self.override = override
        self.bbox_clip_border = bbox_clip_border

    @staticmethod
    def random_select(img_scales):
        """Randomly select an img_scale from given candidates.

        Args:
            img_scales (list[tuple]): Images scales for selection.

        Returns:
            (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \
                where ``img_scale`` is the selected image scale and \
                ``scale_idx`` is the selected index in the given candidates.
        """
        assert mmcv.is_list_of(img_scales, tuple)
        scale_idx = np.random.randint(len(img_scales))
        img_scale = img_scales[scale_idx]
        return img_scale, scale_idx

    @staticmethod
    def random_sample(img_scales):
        """Randomly sample an img_scale when ``multiscale_mode=='range'``.

        Args:
            img_scales (list[tuple]): Images scale range for sampling.
                There must be two tuples in img_scales, which specify the lower
                and uper bound of image scales.

        Returns:
            (tuple, None): Returns a tuple ``(img_scale, None)``, where \
                ``img_scale`` is sampled scale and None is just a placeholder \
                to be consistent with :func:`random_select`.
        """
        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
        # Sample the long and short edge independently within their bounds.
        img_scale_long = [max(s) for s in img_scales]
        img_scale_short = [min(s) for s in img_scales]
        long_edge = np.random.randint(
            min(img_scale_long),
            max(img_scale_long) + 1)
        short_edge = np.random.randint(
            min(img_scale_short),
            max(img_scale_short) + 1)
        img_scale = (long_edge, short_edge)
        return img_scale, None

    @staticmethod
    def random_sample_ratio(img_scale, ratio_range):
        """Randomly sample an img_scale when ``ratio_range`` is specified.

        A ratio will be randomly sampled from the range specified by
        ``ratio_range``. Then it would be multiplied with ``img_scale`` to
        generate sampled scale.

        Args:
            img_scale (tuple): Images scale base to multiply with ratio.
            ratio_range (tuple[float]): The minimum and maximum ratio to scale
                the ``img_scale``.

        Returns:
            (tuple, None): Returns a tuple ``(scale, None)``, where \
                ``scale`` is sampled ratio multiplied with ``img_scale`` and \
                None is just a placeholder to be consistent with \
                :func:`random_select`.
        """
        assert isinstance(img_scale, tuple) and len(img_scale) == 2
        min_ratio, max_ratio = ratio_range
        assert min_ratio <= max_ratio
        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
        return scale, None

    def _random_scale(self, results):
        """Randomly sample an img_scale according to ``ratio_range`` and
        ``multiscale_mode``.

        If ``ratio_range`` is specified, a ratio will be sampled and be
        multiplied with ``img_scale``.
        If multiple scales are specified by ``img_scale``, a scale will be
        sampled according to ``multiscale_mode``.
        Otherwise, single scale will be used.

        Args:
            results (dict): Result dict from :obj:`dataset`.

        Returns:
            dict: Two new keys 'scale` and 'scale_idx` are added into \
                ``results``, which would be used by subsequent pipelines.
        """
        if self.ratio_range is not None:
            scale, scale_idx = self.random_sample_ratio(
                self.img_scale[0], self.ratio_range)
        elif len(self.img_scale) == 1:
            scale, scale_idx = self.img_scale[0], 0
        elif self.multiscale_mode == 'range':
            scale, scale_idx = self.random_sample(self.img_scale)
        elif self.multiscale_mode == 'value':
            scale, scale_idx = self.random_select(self.img_scale)
        else:
            raise NotImplementedError

        results['scale'] = scale
        results['scale_idx'] = scale_idx

    def _resize_img(self, results):
        """Resize images with ``results['scale']``."""
        for key in results.get('img_fields', ['img']):
            if self.keep_ratio:
                img, scale_factor = mmcv.imrescale(
                    results[key],
                    results['scale'],
                    return_scale=True,
                    backend=self.backend)
                # the w_scale and h_scale has minor difference
                # a real fix should be done in the mmcv.imrescale in the future
                new_h, new_w = img.shape[:2]
                h, w = results[key].shape[:2]
                w_scale = new_w / w
                h_scale = new_h / h
            else:
                img, w_scale, h_scale = mmcv.imresize(
                    results[key],
                    results['scale'],
                    return_scale=True,
                    backend=self.backend)
            results[key] = img

            scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
                                    dtype=np.float32)
            results['img_shape'] = img.shape
            # in case that there is no padding
            results['pad_shape'] = img.shape
            results['scale_factor'] = scale_factor
            results['keep_ratio'] = self.keep_ratio

    def _resize_bboxes(self, results):
        """Resize bounding boxes with ``results['scale_factor']``."""
        for key in results.get('bbox_fields', []):
            bboxes = results[key] * results['scale_factor']
            if self.bbox_clip_border:
                img_shape = results['img_shape']
                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            results[key] = bboxes

    def _resize_keypoints(self, results):
        """Resize keypoints with ``results['scale_factor']``."""
        for key in results.get('keypoints_fields', []):
            keypointss = results[key].copy()
            factors = results['scale_factor']
            # scale_factor is (w, h, w, h); keypoints need a uniform pair.
            assert factors[0] == factors[2]
            assert factors[1] == factors[3]
            keypointss[:, :, 0] *= factors[0]
            keypointss[:, :, 1] *= factors[1]
            if self.bbox_clip_border:
                img_shape = results['img_shape']
                keypointss[:, :, 0] = np.clip(keypointss[:, :, 0], 0,
                                              img_shape[1])
                keypointss[:, :, 1] = np.clip(keypointss[:, :, 1], 0,
                                              img_shape[0])
            results[key] = keypointss

    def _resize_masks(self, results):
        """Resize masks with ``results['scale']``"""
        for key in results.get('mask_fields', []):
            if results[key] is None:
                continue
            if self.keep_ratio:
                results[key] = results[key].rescale(results['scale'])
            else:
                results[key] = results[key].resize(results['img_shape'][:2])

    def _resize_seg(self, results):
        """Resize semantic segmentation map with ``results['scale']``."""
        for key in results.get('seg_fields', []):
            if self.keep_ratio:
                gt_seg = mmcv.imrescale(
                    results[key],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            else:
                gt_seg = mmcv.imresize(
                    results[key],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            # Fix: write back under the field's own key; the previous code
            # hard-coded 'gt_semantic_seg', which is wrong whenever
            # seg_fields contains a different key (same fix as upstream
            # mmdetection).
            results[key] = gt_seg

    def __call__(self, results):
        """Call function to resize images, bounding boxes, masks, semantic
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
                'keep_ratio' keys are added into result dict.
        """
        if 'scale' not in results:
            if 'scale_factor' in results:
                img_shape = results['img'].shape[:2]
                scale_factor = results['scale_factor']
                assert isinstance(scale_factor, float)
                results['scale'] = tuple(
                    [int(x * scale_factor) for x in img_shape][::-1])
            else:
                self._random_scale(results)
        else:
            if not self.override:
                assert 'scale_factor' not in results, (
                    'scale and scale_factor cannot be both set.')
            else:
                results.pop('scale')
                if 'scale_factor' in results:
                    results.pop('scale_factor')
                self._random_scale(results)

        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_keypoints(results)
        self._resize_masks(results)
        self._resize_seg(results)
        return results

    def __repr__(self):
        # Fix: previous implementation closed the parenthesis after
        # keep_ratio and then appended another fragment, yielding a
        # malformed string like '...keep_ratio=True)bbox_clip_border=True)'.
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
        repr_str += f'multiscale_mode={self.multiscale_mode}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'keep_ratio={self.keep_ratio}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str
@PIPELINES.register_module()
class RandomFlipV2(object):
    """Flip the image & bbox & mask & kps.
    If the input dict contains the key "flip", then the flag will be used,
    otherwise it will be randomly decided by a ratio specified in the init
    method.
    When random flip is enabled, ``flip_ratio``/``direction`` can either be a
    float/string or tuple of float/string. There are 3 flip modes:
    - ``flip_ratio`` is float, ``direction`` is string: the image will be
    ``direction``ly flipped with probability of ``flip_ratio`` .
    E.g., ``flip_ratio=0.5``, ``direction='horizontal'``,
    then image will be horizontally flipped with probability of 0.5.
    - ``flip_ratio`` is float, ``direction`` is list of string: the image wil
    be ``direction[i]``ly flipped with probability of
    ``flip_ratio/len(direction)``.
    E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``,
    then image will be horizontally flipped with probability of 0.25,
    vertically with probability of 0.25.
    - ``flip_ratio`` is list of float, ``direction`` is list of string:
    given ``len(flip_ratio) == len(direction)``, the image wil
    be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``.
    E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal',
    'vertical']``, then image will be horizontally flipped with probability
    of 0.3, vertically with probability of 0.5
    Args:
        flip_ratio (float | list[float], optional): The flipping probability.
            Default: None.
        direction(str | list[str], optional): The flipping direction. Options
            are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'.
            If input is a list, the length must equal ``flip_ratio``. Each
            element in ``flip_ratio`` indicates the flip probability of
            corresponding direction.
    """
    def __init__(self, flip_ratio=None, direction='horizontal'):
        # Validate flip_ratio: either a list of floats with total <= 1,
        # a single float in [0, 1], or None.
        if isinstance(flip_ratio, list):
            assert mmcv.is_list_of(flip_ratio, float)
            assert 0 <= sum(flip_ratio) <= 1
        elif isinstance(flip_ratio, float):
            assert 0 <= flip_ratio <= 1
        elif flip_ratio is None:
            pass
        else:
            raise ValueError('flip_ratios must be None, float, '
                             'or list of float')
        # NOTE(review): __call__ computes ``1 - self.flip_ratio`` when 'flip'
        # is not already in results, so flip_ratio=None raises a TypeError at
        # call time unless the caller pre-sets 'flip' — confirm intended.
        self.flip_ratio = flip_ratio
        # Validate direction(s) against the supported set.
        valid_directions = ['horizontal', 'vertical', 'diagonal']
        if isinstance(direction, str):
            assert direction in valid_directions
        elif isinstance(direction, list):
            assert mmcv.is_list_of(direction, str)
            assert set(direction).issubset(set(valid_directions))
        else:
            raise ValueError('direction must be either str or list of str')
        self.direction = direction
        # List-form ratios must pair one probability with one direction.
        if isinstance(flip_ratio, list):
            assert len(self.flip_ratio) == len(self.direction)
        # NOTE(review): 'count' is never read or incremented within this
        # class; it appears to be dead state.
        self.count = 0
    def bbox_flip(self, bboxes, img_shape, direction):
        """Flip bboxes horizontally.
        Args:
            bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
            img_shape (tuple[int]): Image shape (height, width)
            direction (str): Flip direction. Options are 'horizontal',
                'vertical'.
        Returns:
            numpy.ndarray: Flipped bounding boxes.
        """
        assert bboxes.shape[-1] % 4 == 0
        flipped = bboxes.copy()
        if direction == 'horizontal':
            # Mirror x-coordinates; x1/x2 swap roles so boxes stay valid.
            w = img_shape[1]
            flipped[..., 0::4] = w - bboxes[..., 2::4]
            flipped[..., 2::4] = w - bboxes[..., 0::4]
        elif direction == 'vertical':
            # Mirror y-coordinates; y1/y2 swap roles.
            h = img_shape[0]
            flipped[..., 1::4] = h - bboxes[..., 3::4]
            flipped[..., 3::4] = h - bboxes[..., 1::4]
        elif direction == 'diagonal':
            # Both axes mirrored (equivalent to a 180-degree rotation).
            w = img_shape[1]
            h = img_shape[0]
            flipped[..., 0::4] = w - bboxes[..., 2::4]
            flipped[..., 1::4] = h - bboxes[..., 3::4]
            flipped[..., 2::4] = w - bboxes[..., 0::4]
            flipped[..., 3::4] = h - bboxes[..., 1::4]
        else:
            raise ValueError(f"Invalid flipping direction '{direction}'")
        return flipped
    def keypoints_flip(self, keypointss, img_shape, direction):
        """Flip keypoints horizontally.

        Args:
            keypointss (numpy.ndarray): Keypoints of shape
                (num_instances, num_kps, 3); only 4 or 5 keypoints per
                instance are supported.
            img_shape (tuple[int]): Image shape (height, width).
            direction (str): Must be 'horizontal'.
        Returns:
            numpy.ndarray: Flipped keypoints.
        """
        assert direction == 'horizontal'
        assert keypointss.shape[-1] == 3
        num_kps = keypointss.shape[1]
        assert num_kps in [4, 5], f'Only Support num_kps=4 or 5, got:{num_kps}'
        assert keypointss.ndim == 3
        flipped = keypointss.copy()
        # Reorder keypoints so left/right pairs swap under a horizontal
        # mirror; presumably landmark indices follow the face-landmark
        # convention (eyes/nose/mouth corners) — confirm against the dataset.
        if num_kps == 5:
            flip_order = [1, 0, 2, 4, 3]
        elif num_kps == 4:
            flip_order = [3, 2, 1, 0]
        for idx, a in enumerate(flip_order):
            flipped[:, idx, :] = keypointss[:, a, :]
        # Mirror the x-coordinate across the image width.
        w = img_shape[1]
        flipped[..., 0] = w - flipped[..., 0]
        return flipped
    def __call__(self, results):
        """Call function to flip bounding boxes, masks, semantic segmentation
        maps.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Flipped results, 'flip', 'flip_direction' keys are added \
                into result dict.
        """
        if 'flip' not in results:
            if isinstance(self.direction, list):
                # None means non-flip
                direction_list = self.direction + [None]
            else:
                # None means non-flip
                direction_list = [self.direction, None]
            if isinstance(self.flip_ratio, list):
                # One probability per direction; remainder is "no flip".
                non_flip_ratio = 1 - sum(self.flip_ratio)
                flip_ratio_list = self.flip_ratio + [non_flip_ratio]
            else:
                non_flip_ratio = 1 - self.flip_ratio
                # exclude non-flip
                single_ratio = self.flip_ratio / (len(direction_list) - 1)
                flip_ratio_list = [single_ratio] * (len(direction_list)
                                                    - 1) + [non_flip_ratio]
            # Draw one direction (or None for no flip) by the configured odds.
            cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
            results['flip'] = cur_dir is not None
        if 'flip_direction' not in results:
            results['flip_direction'] = cur_dir
        if results['flip']:
            # flip image
            for key in results.get('img_fields', ['img']):
                results[key] = mmcv.imflip(
                    results[key], direction=results['flip_direction'])
            # flip bboxes
            for key in results.get('bbox_fields', []):
                results[key] = self.bbox_flip(results[key],
                                              results['img_shape'],
                                              results['flip_direction'])
            # flip kps
            for key in results.get('keypoints_fields', []):
                results[key] = self.keypoints_flip(results[key],
                                                   results['img_shape'],
                                                   results['flip_direction'])
            # flip masks
            for key in results.get('mask_fields', []):
                results[key] = results[key].flip(results['flip_direction'])
            # flip segs
            for key in results.get('seg_fields', []):
                results[key] = mmcv.imflip(
                    results[key], direction=results['flip_direction'])
        return results
    def __repr__(self):
        # Summarize only the flip probability, mirroring mmdet's RandomFlip.
        return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})'
| @PIPELINES.register_module() | |||||
| class RandomSquareCrop(object): | |||||
| """Random crop the image & bboxes, the cropped patches have minimum IoU | |||||
| requirement with original image & bboxes, the IoU threshold is randomly | |||||
| selected from min_ious. | |||||
| Args: | |||||
| min_ious (tuple): minimum IoU threshold for all intersections with | |||||
| bounding boxes | |||||
| min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, | |||||
| where a >= min_crop_size). | |||||
| Note: | |||||
| The keys for bboxes, labels and masks should be paired. That is, \ | |||||
| `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ | |||||
| `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. | |||||
| """ | |||||
| def __init__(self, | |||||
| crop_ratio_range=None, | |||||
| crop_choice=None, | |||||
| bbox_clip_border=True, | |||||
| big_face_ratio=0, | |||||
| big_face_crop_choice=None): | |||||
| self.crop_ratio_range = crop_ratio_range | |||||
| self.crop_choice = crop_choice | |||||
| self.big_face_crop_choice = big_face_crop_choice | |||||
| self.bbox_clip_border = bbox_clip_border | |||||
| assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) | |||||
| if self.crop_ratio_range is not None: | |||||
| self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range | |||||
| self.bbox2label = { | |||||
| 'gt_bboxes': 'gt_labels', | |||||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||||
| } | |||||
| self.bbox2mask = { | |||||
| 'gt_bboxes': 'gt_masks', | |||||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||||
| } | |||||
| assert big_face_ratio >= 0 and big_face_ratio <= 1.0 | |||||
| self.big_face_ratio = big_face_ratio | |||||
| def __call__(self, results): | |||||
| """Call function to crop images and bounding boxes with minimum IoU | |||||
| constraint. | |||||
| Args: | |||||
| results (dict): Result dict from loading pipeline. | |||||
| Returns: | |||||
| dict: Result dict with images and bounding boxes cropped, \ | |||||
| 'img_shape' key is updated. | |||||
| """ | |||||
| if 'img_fields' in results: | |||||
| assert results['img_fields'] == ['img'], \ | |||||
| 'Only single img_fields is allowed' | |||||
| img = results['img'] | |||||
| assert 'bbox_fields' in results | |||||
| assert 'gt_bboxes' in results | |||||
| # try augment big face images | |||||
| find_bigface = False | |||||
| if np.random.random() < self.big_face_ratio: | |||||
| min_size = 100 # h and w | |||||
| expand_ratio = 0.3 # expand ratio of croped face alongwith both w and h | |||||
| bbox = results['gt_bboxes'].copy() | |||||
| lmks = results['gt_keypointss'].copy() | |||||
| label = results['gt_labels'].copy() | |||||
| # filter small faces | |||||
| size_mask = ((bbox[:, 2] - bbox[:, 0]) > min_size) * ( | |||||
| (bbox[:, 3] - bbox[:, 1]) > min_size) | |||||
| bbox = bbox[size_mask] | |||||
| lmks = lmks[size_mask] | |||||
| label = label[size_mask] | |||||
| # randomly choose a face that has no overlap with others | |||||
| if len(bbox) > 0: | |||||
| overlaps = bbox_overlaps(bbox, bbox) | |||||
| overlaps -= np.eye(overlaps.shape[0]) | |||||
| iou_mask = np.sum(overlaps, axis=1) == 0 | |||||
| bbox = bbox[iou_mask] | |||||
| lmks = lmks[iou_mask] | |||||
| label = label[iou_mask] | |||||
| if len(bbox) > 0: | |||||
| choice = np.random.randint(len(bbox)) | |||||
| bbox = bbox[choice] | |||||
| lmks = lmks[choice] | |||||
| label = [label[choice]] | |||||
| w = bbox[2] - bbox[0] | |||||
| h = bbox[3] - bbox[1] | |||||
| x1 = bbox[0] - w * expand_ratio | |||||
| x2 = bbox[2] + w * expand_ratio | |||||
| y1 = bbox[1] - h * expand_ratio | |||||
| y2 = bbox[3] + h * expand_ratio | |||||
| x1, x2 = np.clip([x1, x2], 0, img.shape[1]) | |||||
| y1, y2 = np.clip([y1, y2], 0, img.shape[0]) | |||||
| bbox -= np.tile([x1, y1], 2) | |||||
| lmks -= (x1, y1, 0) | |||||
| find_bigface = True | |||||
| img = img[int(y1):int(y2), int(x1):int(x2), :] | |||||
| results['gt_bboxes'] = np.expand_dims(bbox, axis=0) | |||||
| results['gt_keypointss'] = np.expand_dims(lmks, axis=0) | |||||
| results['gt_labels'] = np.array(label) | |||||
| results['img'] = img | |||||
| boxes = results['gt_bboxes'] | |||||
| h, w, c = img.shape | |||||
| if self.crop_ratio_range is not None: | |||||
| max_scale = self.crop_ratio_max | |||||
| else: | |||||
| max_scale = np.amax(self.crop_choice) | |||||
| scale_retry = 0 | |||||
| while True: | |||||
| scale_retry += 1 | |||||
| if scale_retry == 1 or max_scale > 1.0: | |||||
| if self.crop_ratio_range is not None: | |||||
| scale = np.random.uniform(self.crop_ratio_min, | |||||
| self.crop_ratio_max) | |||||
| elif self.crop_choice is not None: | |||||
| scale = np.random.choice(self.crop_choice) | |||||
| else: | |||||
| scale = scale * 1.2 | |||||
| if find_bigface: | |||||
| # select a scale from big_face_crop_choice if in big_face mode | |||||
| scale = np.random.choice(self.big_face_crop_choice) | |||||
| for i in range(250): | |||||
| long_side = max(w, h) | |||||
| cw = int(scale * long_side) | |||||
| ch = cw | |||||
| # TODO +1 | |||||
| if w == cw: | |||||
| left = 0 | |||||
| elif w > cw: | |||||
| left = random.randint(0, w - cw) | |||||
| else: | |||||
| left = random.randint(w - cw, 0) | |||||
| if h == ch: | |||||
| top = 0 | |||||
| elif h > ch: | |||||
| top = random.randint(0, h - ch) | |||||
| else: | |||||
| top = random.randint(h - ch, 0) | |||||
| patch = np.array( | |||||
| (int(left), int(top), int(left + cw), int(top + ch)), | |||||
| dtype=np.int32) | |||||
| # center of boxes should inside the crop img | |||||
| # only adjust boxes and instance masks when the gt is not empty | |||||
| # adjust boxes | |||||
| def is_center_of_bboxes_in_patch(boxes, patch): | |||||
| # TODO >= | |||||
| center = (boxes[:, :2] + boxes[:, 2:]) / 2 | |||||
| mask = \ | |||||
| ((center[:, 0] > patch[0]) | |||||
| * (center[:, 1] > patch[1]) | |||||
| * (center[:, 0] < patch[2]) | |||||
| * (center[:, 1] < patch[3])) | |||||
| return mask | |||||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||||
| if not mask.any(): | |||||
| continue | |||||
| for key in results.get('bbox_fields', []): | |||||
| boxes = results[key].copy() | |||||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||||
| boxes = boxes[mask] | |||||
| if self.bbox_clip_border: | |||||
| boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) | |||||
| boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) | |||||
| boxes -= np.tile(patch[:2], 2) | |||||
| results[key] = boxes | |||||
| # labels | |||||
| label_key = self.bbox2label.get(key) | |||||
| if label_key in results: | |||||
| results[label_key] = results[label_key][mask] | |||||
| # keypoints field | |||||
| if key == 'gt_bboxes': | |||||
| for kps_key in results.get('keypoints_fields', []): | |||||
| keypointss = results[kps_key].copy() | |||||
| keypointss = keypointss[mask, :, :] | |||||
| if self.bbox_clip_border: | |||||
| keypointss[:, :, : | |||||
| 2] = keypointss[:, :, :2].clip( | |||||
| max=patch[2:]) | |||||
| keypointss[:, :, : | |||||
| 2] = keypointss[:, :, :2].clip( | |||||
| min=patch[:2]) | |||||
| keypointss[:, :, 0] -= patch[0] | |||||
| keypointss[:, :, 1] -= patch[1] | |||||
| results[kps_key] = keypointss | |||||
| # mask fields | |||||
| mask_key = self.bbox2mask.get(key) | |||||
| if mask_key in results: | |||||
| results[mask_key] = results[mask_key][mask.nonzero() | |||||
| [0]].crop(patch) | |||||
| # adjust the img no matter whether the gt is empty before crop | |||||
| rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 | |||||
| patch_from = patch.copy() | |||||
| patch_from[0] = max(0, patch_from[0]) | |||||
| patch_from[1] = max(0, patch_from[1]) | |||||
| patch_from[2] = min(img.shape[1], patch_from[2]) | |||||
| patch_from[3] = min(img.shape[0], patch_from[3]) | |||||
| patch_to = patch.copy() | |||||
| patch_to[0] = max(0, patch_to[0] * -1) | |||||
| patch_to[1] = max(0, patch_to[1] * -1) | |||||
| patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) | |||||
| patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) | |||||
| rimg[patch_to[1]:patch_to[3], | |||||
| patch_to[0]:patch_to[2], :] = img[ | |||||
| patch_from[1]:patch_from[3], | |||||
| patch_from[0]:patch_from[2], :] | |||||
| img = rimg | |||||
| results['img'] = img | |||||
| results['img_shape'] = img.shape | |||||
| return results | |||||
| def __repr__(self): | |||||
| repr_str = self.__class__.__name__ | |||||
| repr_str += f'(min_ious={self.min_iou}, ' | |||||
| repr_str += f'crop_size={self.crop_size})' | |||||
| return repr_str | |||||
| @@ -13,7 +13,7 @@ class RetinaFaceDataset(CustomDataset): | |||||
| CLASSES = ('FG', ) | CLASSES = ('FG', ) | ||||
| def __init__(self, min_size=None, **kwargs): | def __init__(self, min_size=None, **kwargs): | ||||
| self.NK = 5 | |||||
| self.NK = kwargs.pop('num_kps', 5) | |||||
| self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} | self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} | ||||
| self.min_size = min_size | self.min_size = min_size | ||||
| self.gt_path = kwargs.get('gt_path') | self.gt_path = kwargs.get('gt_path') | ||||
| @@ -33,7 +33,8 @@ class RetinaFaceDataset(CustomDataset): | |||||
| if len(values) > 4: | if len(values) > 4: | ||||
| if len(values) > 5: | if len(values) > 5: | ||||
| kps = np.array( | kps = np.array( | ||||
| values[4:19], dtype=np.float32).reshape((self.NK, 3)) | |||||
| values[4:4 + self.NK * 3], dtype=np.float32).reshape( | |||||
| (self.NK, 3)) | |||||
| for li in range(kps.shape[0]): | for li in range(kps.shape[0]): | ||||
| if (kps[li, :] == -1).all(): | if (kps[li, :] == -1).all(): | ||||
| kps[li][2] = 0.0 # weight = 0, ignore | kps[li][2] = 0.0 # weight = 0, ignore | ||||
| @@ -103,6 +103,7 @@ class SCRFDHead(AnchorHead): | |||||
| scale_mode=1, | scale_mode=1, | ||||
| dw_conv=False, | dw_conv=False, | ||||
| use_kps=False, | use_kps=False, | ||||
| num_kps=5, | |||||
| loss_kps=dict( | loss_kps=dict( | ||||
| type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), | type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), | ||||
| **kwargs): | **kwargs): | ||||
| @@ -116,7 +117,7 @@ class SCRFDHead(AnchorHead): | |||||
| self.scale_mode = scale_mode | self.scale_mode = scale_mode | ||||
| self.use_dfl = True | self.use_dfl = True | ||||
| self.dw_conv = dw_conv | self.dw_conv = dw_conv | ||||
| self.NK = 5 | |||||
| self.NK = num_kps | |||||
| self.extra_flops = 0.0 | self.extra_flops = 0.0 | ||||
| if loss_dfl is None or not loss_dfl: | if loss_dfl is None or not loss_dfl: | ||||
| self.use_dfl = False | self.use_dfl = False | ||||
| @@ -323,8 +324,8 @@ class SCRFDHead(AnchorHead): | |||||
| batch_size, -1, self.cls_out_channels).sigmoid() | batch_size, -1, self.cls_out_channels).sigmoid() | ||||
| bbox_pred = bbox_pred.permute(0, 2, 3, | bbox_pred = bbox_pred.permute(0, 2, 3, | ||||
| 1).reshape(batch_size, -1, 4) | 1).reshape(batch_size, -1, 4) | ||||
| kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10) | |||||
| kps_pred = kps_pred.permute(0, 2, 3, | |||||
| 1).reshape(batch_size, -1, self.NK * 2) | |||||
| return cls_score, bbox_pred, kps_pred | return cls_score, bbox_pred, kps_pred | ||||
| def forward_train(self, | def forward_train(self, | ||||
| @@ -788,7 +789,7 @@ class SCRFDHead(AnchorHead): | |||||
| if self.use_dfl: | if self.use_dfl: | ||||
| kps_pred = self.integral(kps_pred) * stride[0] | kps_pred = self.integral(kps_pred) * stride[0] | ||||
| else: | else: | ||||
| kps_pred = kps_pred.reshape((-1, 10)) * stride[0] | |||||
| kps_pred = kps_pred.reshape((-1, self.NK * 2)) * stride[0] | |||||
| nms_pre = cfg.get('nms_pre', -1) | nms_pre = cfg.get('nms_pre', -1) | ||||
| if nms_pre > 0 and scores.shape[0] > nms_pre: | if nms_pre > 0 and scores.shape[0] > nms_pre: | ||||
| @@ -815,7 +816,7 @@ class SCRFDHead(AnchorHead): | |||||
| mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) | mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) | ||||
| if mlvl_kps is not None: | if mlvl_kps is not None: | ||||
| scale_factor2 = torch.tensor( | scale_factor2 = torch.tensor( | ||||
| [scale_factor[0], scale_factor[1]] * 5) | |||||
| [scale_factor[0], scale_factor[1]] * self.NK) | |||||
| mlvl_kps /= scale_factor2.to(mlvl_kps.device) | mlvl_kps /= scale_factor2.to(mlvl_kps.device) | ||||
| mlvl_scores = torch.cat(mlvl_scores) | mlvl_scores = torch.cat(mlvl_scores) | ||||
| @@ -54,7 +54,13 @@ class SCRFD(SingleStageDetector): | |||||
| gt_bboxes_ignore) | gt_bboxes_ignore) | ||||
| return losses | return losses | ||||
| def simple_test(self, img, img_metas, rescale=False): | |||||
| def simple_test(self, | |||||
| img, | |||||
| img_metas, | |||||
| rescale=False, | |||||
| repeat_head=1, | |||||
| output_kps_var=0, | |||||
| output_results=1): | |||||
| """Test function without test time augmentation. | """Test function without test time augmentation. | ||||
| Args: | Args: | ||||
| @@ -62,6 +68,9 @@ class SCRFD(SingleStageDetector): | |||||
| img_metas (list[dict]): List of image information. | img_metas (list[dict]): List of image information. | ||||
| rescale (bool, optional): Whether to rescale the results. | rescale (bool, optional): Whether to rescale the results. | ||||
| Defaults to False. | Defaults to False. | ||||
| repeat_head (int): repeat inference times in head | |||||
| output_kps_var (int): whether output kps var to calculate quality | |||||
| output_results (int): 0: nothing 1: bbox 2: both bbox and kps | |||||
| Returns: | Returns: | ||||
| list[list[np.ndarray]]: BBox results of each image and classes. | list[list[np.ndarray]]: BBox results of each image and classes. | ||||
| @@ -69,40 +78,71 @@ class SCRFD(SingleStageDetector): | |||||
| corresponds to each class. | corresponds to each class. | ||||
| """ | """ | ||||
| x = self.extract_feat(img) | x = self.extract_feat(img) | ||||
| outs = self.bbox_head(x) | |||||
| if torch.onnx.is_in_onnx_export(): | |||||
| print('single_stage.py in-onnx-export') | |||||
| print(outs.__class__) | |||||
| cls_score, bbox_pred, kps_pred = outs | |||||
| for c in cls_score: | |||||
| print(c.shape) | |||||
| for c in bbox_pred: | |||||
| print(c.shape) | |||||
| if self.bbox_head.use_kps: | |||||
| for c in kps_pred: | |||||
| assert repeat_head >= 1 | |||||
| kps_out0 = [] | |||||
| kps_out1 = [] | |||||
| kps_out2 = [] | |||||
| for i in range(repeat_head): | |||||
| outs = self.bbox_head(x) | |||||
| kps_out0 += [outs[2][0].detach().cpu().numpy()] | |||||
| kps_out1 += [outs[2][1].detach().cpu().numpy()] | |||||
| kps_out2 += [outs[2][2].detach().cpu().numpy()] | |||||
| if output_kps_var: | |||||
| var0 = np.var(np.vstack(kps_out0), axis=0).mean() | |||||
| var1 = np.var(np.vstack(kps_out1), axis=0).mean() | |||||
| var2 = np.var(np.vstack(kps_out2), axis=0).mean() | |||||
| var = np.mean([var0, var1, var2]) | |||||
| else: | |||||
| var = None | |||||
| if output_results > 0: | |||||
| if torch.onnx.is_in_onnx_export(): | |||||
| print('single_stage.py in-onnx-export') | |||||
| print(outs.__class__) | |||||
| cls_score, bbox_pred, kps_pred = outs | |||||
| for c in cls_score: | |||||
| print(c.shape) | |||||
| for c in bbox_pred: | |||||
| print(c.shape) | print(c.shape) | ||||
| return (cls_score, bbox_pred, kps_pred) | |||||
| else: | |||||
| return (cls_score, bbox_pred) | |||||
| bbox_list = self.bbox_head.get_bboxes( | |||||
| *outs, img_metas, rescale=rescale) | |||||
| if self.bbox_head.use_kps: | |||||
| for c in kps_pred: | |||||
| print(c.shape) | |||||
| return (cls_score, bbox_pred, kps_pred) | |||||
| else: | |||||
| return (cls_score, bbox_pred) | |||||
| bbox_list = self.bbox_head.get_bboxes( | |||||
| *outs, img_metas, rescale=rescale) | |||||
| # return kps if use_kps | |||||
| if len(bbox_list[0]) == 2: | |||||
| bbox_results = [ | |||||
| bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) | |||||
| for det_bboxes, det_labels in bbox_list | |||||
| ] | |||||
| elif len(bbox_list[0]) == 3: | |||||
| bbox_results = [ | |||||
| bbox2result( | |||||
| det_bboxes, | |||||
| det_labels, | |||||
| self.bbox_head.num_classes, | |||||
| kps=det_kps) | |||||
| for det_bboxes, det_labels, det_kps in bbox_list | |||||
| ] | |||||
| return bbox_results | |||||
| # return kps if use_kps | |||||
| if len(bbox_list[0]) == 2: | |||||
| bbox_results = [ | |||||
| bbox2result(det_bboxes, det_labels, | |||||
| self.bbox_head.num_classes) | |||||
| for det_bboxes, det_labels in bbox_list | |||||
| ] | |||||
| elif len(bbox_list[0]) == 3: | |||||
| if output_results == 2: | |||||
| bbox_results = [ | |||||
| bbox2result( | |||||
| det_bboxes, | |||||
| det_labels, | |||||
| self.bbox_head.num_classes, | |||||
| kps=det_kps, | |||||
| num_kps=self.bbox_head.NK) | |||||
| for det_bboxes, det_labels, det_kps in bbox_list | |||||
| ] | |||||
| elif output_results == 1: | |||||
| bbox_results = [ | |||||
| bbox2result(det_bboxes, det_labels, | |||||
| self.bbox_head.num_classes) | |||||
| for det_bboxes, det_labels, _ in bbox_list | |||||
| ] | |||||
| else: | |||||
| bbox_results = None | |||||
| if var is not None: | |||||
| return bbox_results, var | |||||
| else: | |||||
| return bbox_results | |||||
| def feature_test(self, img): | def feature_test(self, img): | ||||
| x = self.extract_feat(img) | x = self.extract_feat(img) | ||||
| @@ -0,0 +1,71 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os.path as osp | |||||
| from copy import deepcopy | |||||
| from typing import Any, Dict | |||||
| import torch | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
| __all__ = ['ScrfdDetect'] | |||||
@MODELS.register_module(Tasks.face_detection, module_name=Models.scrfd)
class ScrfdDetect(TorchModel):
    """SCRFD face detector wrapped as a ModelScope ``TorchModel``.

    Builds an mmdet SCRFD detector from the ``mmcv_scrfd.py`` config found
    in ``model_dir``, loads the checkpoint and exposes single-image
    inference via :meth:`forward`.
    """

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the face detection model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)
        from mmcv import Config
        from mmcv.parallel import MMDataParallel
        from mmcv.runner import load_checkpoint
        from mmdet.models import build_detector
        # NOTE: the following imports look unused but are required for their
        # side effect of registering the patched datasets / pipelines /
        # backbones / heads / detectors with the mmdet registries before
        # `build_detector` runs.
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD
        cfg = Config.fromfile(osp.join(model_dir, 'mmcv_scrfd.py'))
        ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
        # Detection score threshold; callers may override via kwargs.
        cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3)
        detector = build_detector(cfg.model)
        logger.info(f'loading model from {ckpt_path}')
        device = torch.device(
            f'cuda:{0}' if torch.cuda.is_available() else 'cpu')
        load_checkpoint(detector, ckpt_path, map_location=device)
        # NOTE(review): the model is wrapped in MMDataParallel even on
        # CPU-only hosts — confirm this is intended for CPU environments.
        detector = MMDataParallel(detector, device_ids=[0])
        detector.eval()
        self.detector = detector
        logger.info('load model done')

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run face detection on one preprocessed image.

        Args:
            input: dict with 'img' (list holding one preprocessed image
                tensor) and 'img_metas' (list of mmdet-style image meta
                containers exposing a ``.data`` dict).

        Returns:
            Dict with ``OutputKeys.SCORES``, ``OutputKeys.BOXES`` and
            ``OutputKeys.KEYPOINTS`` for the detected faces.
        """
        result = self.detector(
            return_loss=False,
            rescale=True,
            img=[input['img'][0].unsqueeze(0)],
            img_metas=[[dict(input['img_metas'][0].data)]],
            output_results=2)  # 2 -> return both bboxes and keypoints
        assert result is not None
        # result[0][0]: detections of the first image / first class, rows
        # laid out as [x1, y1, x2, y2, score, keypoints...].
        result = result[0][0]
        bboxes = result[:, :4].tolist()
        kpss = result[:, 5:].tolist()
        scores = result[:, 4].tolist()
        return {
            OutputKeys.SCORES: scores,
            OutputKeys.BOXES: bboxes,
            OutputKeys.KEYPOINTS: kpss
        }

    def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        """Identity post-processing; results are already final in `forward`."""
        return input
| @@ -0,0 +1,20 @@ | |||||
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Real import for static type checkers only; at runtime the module is
    # replaced by a LazyImportModule below.
    from .hand_2d_keypoints import Hand2dKeyPoints

else:
    # Map of submodule name -> exported symbols, used for lazy resolution.
    _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']}

    import sys

    # Defer the (heavy) submodule import until an attribute is first
    # accessed, following the modelscope-wide lazy-import convention.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
| @@ -0,0 +1,16 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from easycv.models.pose import TopDown | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||||
| from modelscope.utils.constant import Tasks | |||||
@MODELS.register_module(
    group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints)
class Hand2dKeyPoints(EasyCVBaseModel, TopDown):
    """EasyCV TopDown pose model registered for 2D hand keypoint detection."""

    def __init__(self, model_dir=None, *args, **kwargs):
        # NOTE(review): args/kwargs are passed positionally (not unpacked) —
        # confirm EasyCVBaseModel.__init__ expects the packed tuple/dict.
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        TopDown.__init__(self, *args, **kwargs)
| @@ -0,0 +1,22 @@ | |||||
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Real import for static type checkers only; at runtime the module is
    # replaced by a LazyImportModule below.
    from .human_wholebody_keypoint import HumanWholeBodyKeypoint

else:
    # Map of submodule name -> exported symbols, used for lazy resolution.
    _import_structure = {
        'human_wholebody_keypoint': ['HumanWholeBodyKeypoint']
    }

    import sys

    # Defer the (heavy) submodule import until an attribute is first
    # accessed, following the modelscope-wide lazy-import convention.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
| @@ -0,0 +1,17 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from easycv.models.pose.top_down import TopDown | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||||
| from modelscope.utils.constant import Tasks | |||||
@MODELS.register_module(
    group_key=Tasks.human_wholebody_keypoint,
    module_name=Models.human_wholebody_keypoint)
class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown):
    """EasyCV TopDown pose model registered for human whole-body keypoints."""

    def __init__(self, model_dir=None, *args, **kwargs):
        # NOTE(review): args/kwargs are passed positionally (not unpacked) —
        # confirm EasyCVBaseModel.__init__ expects the packed tuple/dict.
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        TopDown.__init__(self, *args, **kwargs)
| @@ -0,0 +1,20 @@ | |||||
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Real import for static type checkers only; at runtime the module is
    # replaced by a LazyImportModule below.
    from .image_body_reshaping import ImageBodyReshaping

else:
    # Map of submodule name -> exported symbols, used for lazy resolution.
    _import_structure = {'image_body_reshaping': ['ImageBodyReshaping']}

    import sys

    # Defer the (heavy) submodule import until an attribute is first
    # accessed, following the modelscope-wide lazy-import convention.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
| @@ -0,0 +1,128 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| from typing import Any, Dict | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import Tensor, TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| from .model import FlowGenerator | |||||
| from .person_info import PersonInfo | |||||
| from .pose_estimator.body import Body | |||||
| from .slim_utils import image_warp_grid1, resize_on_long_side | |||||
| logger = get_logger() | |||||
| __all__ = ['ImageBodyReshaping'] | |||||
@MODELS.register_module(
    Tasks.image_body_reshaping, module_name=Models.image_body_reshaping)
class ImageBodyReshaping(TorchModel):
    """Body reshaping model: estimates a 2D pose, predicts a dense warp
    flow for the (single) detected person, and warps the image with it."""

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the image body reshaping model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        # Global strength multiplier applied to the predicted flow.
        self.degree = 1.0
        self.reshape_model = FlowGenerator(n_channels=16).to(self.device)
        model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
        checkpoints = torch.load(model_path, map_location=torch.device('cpu'))
        self.reshape_model.load_state_dict(
            checkpoints['state_dict'], strict=True)
        self.reshape_model.eval()
        logger.info('load body reshaping model done')

        # Separate OpenPose-style body pose estimator with its own weights.
        pose_model_ckpt = os.path.join(model_dir, 'body_pose_model.pth')
        self.pose_esti = Body(pose_model_ckpt, self.device)
        logger.info('load pose model done')

    def pred_joints(self, img):
        """Estimate body joints on a long-side-300 downscaled copy of `img`
        and map the joint coordinates back to the original resolution.

        Returns:
            None when `img` is None; otherwise the pose estimator's joints
            array (persons along axis 0, possibly empty).
        """
        if img is None:
            return None

        small_src, resize_scale = resize_on_long_side(img, 300)
        body_joints = self.pose_esti(small_src)

        if body_joints.shape[0] >= 1:
            # Undo the downscale so joints are in original-image coordinates.
            body_joints[:, :, :2] = body_joints[:, :, :2] / resize_scale

        return body_joints

    def pred_flow(self, img):
        """Predict a dense (H, W, 2) warp flow for the single person in
        `img`; returns None when zero or multiple persons are detected."""
        body_joints = self.pred_joints(img)

        small_size = 1200

        # Bound the resolution fed to the flow network for speed/memory.
        if img.shape[0] > small_size or img.shape[1] > small_size:
            _img, _scale = resize_on_long_side(img, small_size)
            body_joints[:, :, :2] = body_joints[:, :, :2] * _scale
        else:
            _img = img

        # We only reshape one person
        if body_joints.shape[0] < 1 or body_joints.shape[0] > 1:
            return None

        person = PersonInfo(body_joints[0])

        with torch.no_grad():
            person_pred = person.pred_flow(_img, self.reshape_model,
                                           self.device)

        flow = np.dstack((person_pred['rDx'], person_pred['rDy']))
        # Resize the flow field to the original image size and rescale its
        # magnitudes by the same factor.
        scale = img.shape[0] * 1.0 / flow.shape[0]
        flow = cv2.resize(flow, (img.shape[1], img.shape[0]))
        flow *= scale

        return flow

    def warp(self, src_img, flow):
        """Warp `src_img` by the per-pixel displacements in `flow` (H, W, 2)."""
        X_flow = flow[..., 0]
        Y_flow = flow[..., 1]
        # image_warp_grid1 expects contiguous arrays.
        X_flow = np.ascontiguousarray(X_flow)
        Y_flow = np.ascontiguousarray(Y_flow)
        pred = image_warp_grid1(X_flow, Y_flow, src_img, 1.0, 0, 0)
        return pred

    def inference(self, img):
        """Reshape the body in `img` and return the warped uint8 image.

        `img` is a tensor that is moved to CPU and color-converted with
        BGR2RGB first (assumes BGR input — TODO confirm against callers).
        Returns the converted input unchanged when no flow is predicted.
        """
        img = img.cpu().numpy()
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        flow = self.pred_flow(img)
        if flow is None:
            return img
        assert flow.shape[:2] == img.shape[:2]

        # Suppress small flow magnitudes (<= 3 px) so near-static regions
        # are left untouched; the epsilon avoids atan2(0, 0).
        mag, ang = cv2.cartToPolar(flow[..., 0] + 1e-8, flow[..., 1] + 1e-8)
        mag -= 3
        mag[mag <= 0] = 0
        x, y = cv2.polarToCart(mag, ang, angleInDegrees=False)
        flow = np.dstack((x, y))
        flow *= self.degree

        pred = self.warp(img, flow)
        out_img = np.clip(pred, 0, 255)
        logger.info('model inference done')
        return out_img.astype(np.uint8)
| @@ -0,0 +1,189 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
class ConvLayer(nn.Module):
    """3x3 convolution block: ReflectionPad -> Conv2d -> BatchNorm -> ReLU.

    Spatial size is preserved (pad 1, kernel 3, no conv padding).
    """

    def __init__(self, in_ch, out_ch):
        super(ConvLayer, self).__init__()
        # Keep the exact submodule order (indices 0..3) so existing
        # checkpoints keyed on `conv.N.*` still load.
        stages = [
            nn.ReflectionPad2d(1),
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=0),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the padded conv block to `x` (B, in_ch, H, W)."""
        return self.conv(x)
class SASA(nn.Module):
    """Structure-Aware Self-Attention.

    Query/key/value self-attention over spatial positions whose attention
    map is gated element-wise by a body-structure affinity map built from
    PAF (part affinity field) magnitude masks.
    """

    def __init__(self, in_dim):
        super(SASA, self).__init__()
        self.chanel_in = in_dim

        # Query/key project to in_dim // 8 channels; value keeps in_dim.
        self.query_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.key_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.value_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        # Embeds the 5-channel structure masks from structure_encoder.
        self.mag_conv = nn.Conv2d(
            in_channels=5, out_channels=in_dim // 32, kernel_size=1)
        # Learnable residual weight, zero-initialized so the module starts
        # out as an identity mapping.
        self.gamma = nn.Parameter(torch.zeros(1))

        # NOTE(review): softmax is defined but unused in forward.
        self.softmax = nn.Softmax(dim=-1)  #
        self.sigmoid = nn.Sigmoid()

    def structure_encoder(self, paf_mag, target_height, target_width):
        """Build a 5-channel mask (arms, torso, legs, fg, bg) from PAF
        magnitudes, resized to (target_height, target_width).

        NOTE(review): the channel slices (1:3 torso, 4:8 arms, 8:12 legs,
        12 foreground) encode the skeleton-map channel layout — confirm
        against the skeleton-map generator.
        """
        torso_mask = torch.sum(paf_mag[:, 1:3, :, :], dim=1, keepdim=True)
        torso_mask = torch.clamp(torso_mask, 0, 1)

        arms_mask = torch.sum(paf_mag[:, 4:8, :, :], dim=1, keepdim=True)
        arms_mask = torch.clamp(arms_mask, 0, 1)

        legs_mask = torch.sum(paf_mag[:, 8:12, :, :], dim=1, keepdim=True)
        legs_mask = torch.clamp(legs_mask, 0, 1)

        fg_mask = paf_mag[:, 12, :, :].unsqueeze(1)
        bg_mask = 1 - fg_mask

        Y = torch.cat((arms_mask, torso_mask, legs_mask, fg_mask, bg_mask),
                      dim=1)
        Y = F.interpolate(Y, size=(target_height, target_width), mode='area')
        return Y

    def forward(self, X, PAF_mag):
        """extract self-attention features.

        Args:
            X : input feature maps( B x C x H x W)
            PAF_mag : ( B x C x H x W), 1 denotes connectivity, 0 denotes non-connectivity

        Returns:
            out : self attention value + input feature
            Y: B X N X N (N is Width*Height)
        """
        m_batchsize, C, height, width = X.size()
        Y = self.structure_encoder(PAF_mag, height, width)
        # Pairwise affinity between positions in structure-embedding space.
        connectivity_mask_vec = self.mag_conv(Y).view(m_batchsize, -1,
                                                      width * height)
        affinity = torch.bmm(
            connectivity_mask_vec.permute(0, 2, 1), connectivity_mask_vec)
        # Center before the sigmoid so gating is relative to the mean.
        affinity_centered = affinity - torch.mean(affinity)
        affinity_sigmoid = self.sigmoid(affinity_centered)

        proj_query = self.query_conv(X).view(m_batchsize, -1,
                                             width * height).permute(0, 2, 1)
        proj_key = self.key_conv(X).view(m_batchsize, -1, width * height)
        selfatten_map = torch.bmm(proj_query, proj_key)
        selfatten_centered = selfatten_map - torch.mean(
            selfatten_map)  # centering
        selfatten_sigmoid = self.sigmoid(selfatten_centered)

        # Element-wise gate: self-attention weighted by structural affinity.
        SASA_map = selfatten_sigmoid * affinity_sigmoid
        proj_value = self.value_conv(X).view(m_batchsize, -1, width * height)

        out = torch.bmm(proj_value, SASA_map.permute(0, 2, 1))
        out = out.view(m_batchsize, C, height, width)

        # Learnable residual connection (gamma starts at 0).
        out = self.gamma * out + X
        return out, Y
class FlowGenerator(nn.Module):
    """Encoder/decoder network that predicts a 2-channel warp flow from an
    image concatenated with its skeleton map, with a SASA bottleneck."""

    def __init__(self, n_channels, deep_supervision=False):
        super(FlowGenerator, self).__init__()
        # NOTE(review): stored but not read anywhere in this class.
        self.deep_supervision = deep_supervision

        # Four 2x max-pool downsamplings: bottleneck at 1/16 resolution.
        self.Encoder = nn.Sequential(
            ConvLayer(n_channels, 64),
            ConvLayer(64, 64),
            nn.MaxPool2d(2),
            ConvLayer(64, 128),
            ConvLayer(128, 128),
            nn.MaxPool2d(2),
            ConvLayer(128, 256),
            ConvLayer(256, 256),
            nn.MaxPool2d(2),
            ConvLayer(256, 512),
            ConvLayer(512, 512),
            nn.MaxPool2d(2),
            ConvLayer(512, 1024),
            ConvLayer(1024, 1024),
            ConvLayer(1024, 1024),
            ConvLayer(1024, 1024),
            ConvLayer(1024, 1024),
        )

        # Structure-aware self-attention at the bottleneck.
        self.SASA = SASA(in_dim=1024)

        # Two 2x upsamples plus a final 4x upsample restore full resolution;
        # Tanh bounds the raw flow to [-1, 1].
        self.Decoder = nn.Sequential(
            ConvLayer(1024, 1024),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            ConvLayer(1024, 512),
            ConvLayer(512, 512),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            ConvLayer(512, 256),
            ConvLayer(256, 256),
            ConvLayer(256, 128),
            ConvLayer(128, 64),
            ConvLayer(64, 32),
            nn.Conv2d(32, 2, kernel_size=1, padding=0),
            nn.Tanh(),
            nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True),
        )

        # Max-pool dilation thickens thin skeleton lines into PAF-magnitude
        # style masks; stride 1 + symmetric padding keeps the spatial size.
        dilation_ksize = 17
        self.dilation = torch.nn.MaxPool2d(
            kernel_size=dilation_ksize,
            stride=1,
            padding=int((dilation_ksize - 1) / 2))

    def warp(self, x, flow, mode='bilinear', padding_mode='zeros', coff=0.2):
        """Sample `x` at an identity grid displaced by `2 * flow * coff`.

        `flow` is (n, h, w, 2) in grid_sample's normalized coordinates.
        """
        n, c, h, w = x.size()
        # NOTE(review): torch.meshgrid without indexing= relies on the legacy
        # 'ij' default and warns on newer torch — confirm target version.
        yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)])
        # Normalize pixel coordinates to [-1, 1] as grid_sample expects.
        xv = xv.float() / (w - 1) * 2.0 - 1
        yv = yv.float() / (h - 1) * 2.0 - 1
        grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0)
        grid = grid.to(flow.device)
        grid_x = grid + 2 * flow * coff
        # NOTE(review): align_corners is left at grid_sample's default
        # (False) while the normalization above matches align_corners=True —
        # behavior is baked into the trained weights; do not "fix" silently.
        warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode)
        return warp_x

    def forward(self, img, skeleton_map, coef=0.2):
        """Predict a warp flow and apply it to `img`.

        Args:
            img : input image tensor (B x C x H x W), values in [-1, 1]
            skeleton_map : skeleton map tensor of the input image, in [-1, 1]
            coef: warp degree

        Returns:
            warp_x : warped image, clamped to [-1, 1]
            flow: predicted flow, shape (B, H, W, 2)
        """
        img_concat = torch.cat((img, skeleton_map), dim=1)
        X = self.Encoder(img_concat)
        _, _, height, width = X.size()

        # directly get PAF magnitude from skeleton maps via dilation
        # (skeleton_map is mapped from [-1, 1] to [0, 1] first)
        PAF_mag = self.dilation((skeleton_map + 1.0) * 0.5)
        out, Y = self.SASA(X, PAF_mag)
        flow = self.Decoder(out)
        flow = flow.permute(0, 2, 3, 1)  # [n, 2, h, w] ==> [n, h, w, 2]
        warp_x = self.warp(img, flow, coff=coef)
        warp_x = torch.clamp(warp_x, min=-1.0, max=1.0)
        return warp_x, flow
| @@ -0,0 +1,339 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import copy | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| from .slim_utils import (enlarge_box_tblr, gen_skeleton_map, | |||||
| get_map_fusion_map_cuda, get_mask_bbox, | |||||
| resize_on_long_side) | |||||
| class PersonInfo(object): | |||||
| def __init__(self, joints): | |||||
| self.joints = joints | |||||
| self.flow = None | |||||
| self.pad_boder = False | |||||
| self.height_expand = 0 | |||||
| self.width_expand = 0 | |||||
| self.coeff = 0.2 | |||||
| self.network_input_W = 256 | |||||
| self.network_input_H = 256 | |||||
| self.divider = 20 | |||||
| self.flow_scales = ['upper_2'] | |||||
| def update_attribute(self, pad_boder, height_expand, width_expand): | |||||
| self.pad_boder = pad_boder | |||||
| self.height_expand = height_expand | |||||
| self.width_expand = width_expand | |||||
| if pad_boder: | |||||
| self.joints[:, 0] += width_expand | |||||
| self.joints[:, 1] += height_expand | |||||
| def pred_flow(self, img, flow_net, device): | |||||
| with torch.no_grad(): | |||||
| if img is None: | |||||
| print('image is none') | |||||
| self.flow = None | |||||
| if len(img.shape) == 2: | |||||
| img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) | |||||
| if self.pad_boder: | |||||
| height_expand = self.height_expand | |||||
| width_expand = self.width_expand | |||||
| pad_img = cv2.copyMakeBorder( | |||||
| img, | |||||
| height_expand, | |||||
| height_expand, | |||||
| width_expand, | |||||
| width_expand, | |||||
| cv2.BORDER_CONSTANT, | |||||
| value=(127, 127, 127)) | |||||
| else: | |||||
| height_expand = 0 | |||||
| width_expand = 0 | |||||
| pad_img = img.copy() | |||||
| canvas = np.zeros( | |||||
| shape=(pad_img.shape[0], pad_img.shape[1]), dtype=np.float32) | |||||
| self.human_joint_box = self.__joint_to_body_box() | |||||
| self.human_box = enlarge_box_tblr( | |||||
| self.human_joint_box, pad_img, ratio=0.25) | |||||
| human_box_height = self.human_box[1] - self.human_box[0] | |||||
| human_box_width = self.human_box[3] - self.human_box[2] | |||||
| self.leg_joint_box = self.__joint_to_leg_box() | |||||
| self.leg_box = enlarge_box_tblr( | |||||
| self.leg_joint_box, pad_img, ratio=0.25) | |||||
| self.arm_joint_box = self.__joint_to_arm_box() | |||||
| self.arm_box = enlarge_box_tblr( | |||||
| self.arm_joint_box, pad_img, ratio=0.1) | |||||
| x_flows = [] | |||||
| y_flows = [] | |||||
| multi_bbox = [] | |||||
| for scale in self.flow_scales: # better for metric | |||||
| scale_value = float(scale.split('_')[-1]) | |||||
| arm_box = copy.deepcopy(self.arm_box) | |||||
| if arm_box[0] is None: | |||||
| arm_box = self.human_box | |||||
| arm_box_height = arm_box[1] - arm_box[0] | |||||
| arm_box_width = arm_box[3] - arm_box[2] | |||||
| roi_bbox = None | |||||
| if arm_box_width < human_box_width * 0.1 or arm_box_height < human_box_height * 0.1: | |||||
| roi_bbox = self.human_box | |||||
| else: | |||||
| arm_box = enlarge_box_tblr( | |||||
| arm_box, pad_img, ratio=scale_value) | |||||
| if scale == 'upper_0.2': | |||||
| arm_box[0] = min(arm_box[0], int(self.joints[0][1])) | |||||
| if scale.startswith('upper'): | |||||
| roi_bbox = [ | |||||
| max(self.human_box[0], arm_box[0]), | |||||
| min(self.human_box[1], arm_box[1]), | |||||
| max(self.human_box[2], arm_box[2]), | |||||
| min(self.human_box[3], arm_box[3]) | |||||
| ] | |||||
| if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ | |||||
| 3] - roi_bbox[2] < 1: | |||||
| continue | |||||
| elif scale.startswith('lower'): | |||||
| roi_bbox = [ | |||||
| max(self.human_box[0], self.leg_box[0]), | |||||
| min(self.human_box[1], self.leg_box[1]), | |||||
| max(self.human_box[2], self.leg_box[2]), | |||||
| min(self.human_box[3], self.leg_box[3]) | |||||
| ] | |||||
| if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ | |||||
| 3] - roi_bbox[2] < 1: | |||||
| continue | |||||
| skel_map, roi_bbox = gen_skeleton_map( | |||||
| self.joints, 'depth', input_roi_box=roi_bbox) | |||||
| if roi_bbox is None: | |||||
| continue | |||||
| if skel_map.dtype != np.float32: | |||||
| skel_map = skel_map.astype(np.float32) | |||||
| skel_map -= 1.0 # [0,2] ->[-1,1] | |||||
| multi_bbox.append(roi_bbox) | |||||
| roi_bbox_height = roi_bbox[1] - roi_bbox[0] | |||||
| roi_bbox_width = roi_bbox[3] - roi_bbox[2] | |||||
| assert skel_map.shape[0] == roi_bbox_height | |||||
| assert skel_map.shape[1] == roi_bbox_width | |||||
| roi_height_pad = roi_bbox_height // self.divider | |||||
| roi_width_pad = roi_bbox_width // self.divider | |||||
| paded_roi_h = roi_bbox_height + 2 * roi_height_pad | |||||
| paded_roi_w = roi_bbox_width + 2 * roi_width_pad | |||||
| roi_height_pad_joint = skel_map.shape[0] // self.divider | |||||
| roi_width_pad_joint = skel_map.shape[1] // self.divider | |||||
| skel_map = np.pad( | |||||
| skel_map, | |||||
| ((roi_height_pad_joint, roi_height_pad_joint), | |||||
| (roi_width_pad_joint, roi_width_pad_joint), (0, 0)), | |||||
| 'constant', | |||||
| constant_values=-1) | |||||
| skel_map_resized = cv2.resize( | |||||
| skel_map, (self.network_input_W, self.network_input_H)) | |||||
| skel_map_resized[skel_map_resized < 0] = -1.0 | |||||
| skel_map_resized[skel_map_resized > -0.5] = 1.0 | |||||
| skel_map_transformed = torch.from_numpy( | |||||
| skel_map_resized.transpose((2, 0, 1))) | |||||
| roi_npy = pad_img[roi_bbox[0]:roi_bbox[1], | |||||
| roi_bbox[2]:roi_bbox[3], :].copy() | |||||
| if roi_npy.dtype != np.float32: | |||||
| roi_npy = roi_npy.astype(np.float32) | |||||
| roi_npy = np.pad(roi_npy, | |||||
| ((roi_height_pad, roi_height_pad), | |||||
| (roi_width_pad, roi_width_pad), (0, 0)), | |||||
| 'edge') | |||||
| roi_npy = roi_npy[:, :, ::-1] | |||||
| roi_npy = cv2.resize( | |||||
| roi_npy, (self.network_input_W, self.network_input_H)) | |||||
| roi_npy *= 1.0 / 255 | |||||
| roi_npy -= 0.5 | |||||
| roi_npy *= 2 | |||||
| rgb_tensor = torch.from_numpy(roi_npy.transpose((2, 0, 1))) | |||||
| rgb_tensor = rgb_tensor.unsqueeze(0).to(device) | |||||
| skel_map_tensor = skel_map_transformed.unsqueeze(0).to(device) | |||||
| warped_img_val, flow_field_val = flow_net( | |||||
| rgb_tensor, skel_map_tensor | |||||
| ) # inference, connectivity_mask [1,12,16,16] | |||||
| flow_field_val = flow_field_val.detach().squeeze().cpu().numpy( | |||||
| ) | |||||
| flow_field_val = cv2.resize( | |||||
| flow_field_val, (paded_roi_w, paded_roi_h), | |||||
| interpolation=cv2.INTER_LINEAR) | |||||
| flow_field_val[..., 0] = flow_field_val[ | |||||
| ..., 0] * paded_roi_w * 0.5 * 2 * self.coeff | |||||
| flow_field_val[..., 1] = flow_field_val[ | |||||
| ..., 1] * paded_roi_h * 0.5 * 2 * self.coeff | |||||
| # remove pad areas | |||||
| flow_field_val = flow_field_val[ | |||||
| roi_height_pad:flow_field_val.shape[0] - roi_height_pad, | |||||
| roi_width_pad:flow_field_val.shape[1] - roi_width_pad, :] | |||||
| diffuse_width = max(roi_bbox_width // 3, 1) | |||||
| diffuse_height = max(roi_bbox_height // 3, 1) | |||||
| assert roi_bbox_width == flow_field_val.shape[1] | |||||
| assert roi_bbox_height == flow_field_val.shape[0] | |||||
| origin_flow = np.zeros( | |||||
| (pad_img.shape[0] + 2 * diffuse_height, | |||||
| pad_img.shape[1] + 2 * diffuse_width, 2), | |||||
| dtype=np.float32) | |||||
| flow_field_val = np.pad(flow_field_val, | |||||
| ((diffuse_height, diffuse_height), | |||||
| (diffuse_width, diffuse_width), | |||||
| (0, 0)), 'linear_ramp') | |||||
| origin_flow[roi_bbox[0]:roi_bbox[1] + 2 * diffuse_height, | |||||
| roi_bbox[2]:roi_bbox[3] | |||||
| + 2 * diffuse_width] = flow_field_val | |||||
| origin_flow = origin_flow[diffuse_height:-diffuse_height, | |||||
| diffuse_width:-diffuse_width, :] | |||||
| x_flows.append(origin_flow[..., 0]) | |||||
| y_flows.append(origin_flow[..., 1]) | |||||
| if len(x_flows) == 0: | |||||
| return { | |||||
| 'rDx': np.zeros(canvas.shape[:2], dtype=np.float32), | |||||
| 'rDy': np.zeros(canvas.shape[:2], dtype=np.float32), | |||||
| 'multi_bbox': multi_bbox, | |||||
| 'x_fusion_map': | |||||
| np.ones(canvas.shape[:2], dtype=np.float32), | |||||
| 'y_fusion_map': | |||||
| np.ones(canvas.shape[:2], dtype=np.float32) | |||||
| } | |||||
| else: | |||||
| origin_rDx, origin_rDy, x_fusion_map, y_fusion_map = self.blend_multiscale_flow( | |||||
| x_flows, y_flows, device=device) | |||||
| return { | |||||
| 'rDx': origin_rDx, | |||||
| 'rDy': origin_rDy, | |||||
| 'multi_bbox': multi_bbox, | |||||
| 'x_fusion_map': x_fusion_map, | |||||
| 'y_fusion_map': y_fusion_map | |||||
| } | |||||
| @staticmethod | |||||
| def blend_multiscale_flow(x_flows, y_flows, device=None): | |||||
| scale_num = len(x_flows) | |||||
| if scale_num == 1: | |||||
| return x_flows[0], y_flows[0], np.ones_like( | |||||
| x_flows[0]), np.ones_like(x_flows[0]) | |||||
| origin_rDx = np.zeros((x_flows[0].shape[0], x_flows[0].shape[1]), | |||||
| dtype=np.float32) | |||||
| origin_rDy = np.zeros((y_flows[0].shape[0], y_flows[0].shape[1]), | |||||
| dtype=np.float32) | |||||
| x_fusion_map, x_acc_map = get_map_fusion_map_cuda( | |||||
| x_flows, 1, device=device) | |||||
| y_fusion_map, y_acc_map = get_map_fusion_map_cuda( | |||||
| y_flows, 1, device=device) | |||||
| x_flow_map = 1.0 / x_fusion_map | |||||
| y_flow_map = 1.0 / y_fusion_map | |||||
| all_acc_map = x_acc_map + y_acc_map | |||||
| all_acc_map = all_acc_map.astype(np.uint8) | |||||
| roi_box = get_mask_bbox(all_acc_map, threshold=1) | |||||
| if roi_box[0] is None or roi_box[1] - roi_box[0] <= 0 or roi_box[ | |||||
| 3] - roi_box[2] <= 0: | |||||
| roi_box = [0, x_flow_map.shape[0], 0, x_flow_map.shape[1]] | |||||
| roi_x_flow_map = x_flow_map[roi_box[0]:roi_box[1], | |||||
| roi_box[2]:roi_box[3]] | |||||
| roi_y_flow_map = y_flow_map[roi_box[0]:roi_box[1], | |||||
| roi_box[2]:roi_box[3]] | |||||
| roi_width = roi_x_flow_map.shape[1] | |||||
| roi_height = roi_x_flow_map.shape[0] | |||||
| roi_x_flow_map, scale = resize_on_long_side(roi_x_flow_map, 320) | |||||
| roi_y_flow_map, scale = resize_on_long_side(roi_y_flow_map, 320) | |||||
| roi_x_flow_map = cv2.blur(roi_x_flow_map, (55, 55)) | |||||
| roi_y_flow_map = cv2.blur(roi_y_flow_map, (55, 55)) | |||||
| roi_x_flow_map = cv2.resize(roi_x_flow_map, (roi_width, roi_height)) | |||||
| roi_y_flow_map = cv2.resize(roi_y_flow_map, (roi_width, roi_height)) | |||||
| x_flow_map[roi_box[0]:roi_box[1], | |||||
| roi_box[2]:roi_box[3]] = roi_x_flow_map | |||||
| y_flow_map[roi_box[0]:roi_box[1], | |||||
| roi_box[2]:roi_box[3]] = roi_y_flow_map | |||||
| for i in range(scale_num): | |||||
| origin_rDx += x_flows[i] | |||||
| origin_rDy += y_flows[i] | |||||
| origin_rDx *= x_flow_map | |||||
| origin_rDy *= y_flow_map | |||||
| return origin_rDx, origin_rDy, x_flow_map, y_flow_map | |||||
| def __joint_to_body_box(self): | |||||
| joint_left = int(np.min(self.joints, axis=0)[0]) | |||||
| joint_right = int(np.max(self.joints, axis=0)[0]) | |||||
| joint_top = int(np.min(self.joints, axis=0)[1]) | |||||
| joint_bottom = int(np.max(self.joints, axis=0)[1]) | |||||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||||
| def __joint_to_leg_box(self): | |||||
| leg_joints = self.joints[8:, :] | |||||
| if np.max(leg_joints, axis=0)[2] < 0.05: | |||||
| return [0, 0, 0, 0] | |||||
| joint_left = int(np.min(leg_joints, axis=0)[0]) | |||||
| joint_right = int(np.max(leg_joints, axis=0)[0]) | |||||
| joint_top = int(np.min(leg_joints, axis=0)[1]) | |||||
| joint_bottom = int(np.max(leg_joints, axis=0)[1]) | |||||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||||
| def __joint_to_arm_box(self): | |||||
| arm_joints = self.joints[2:8, :] | |||||
| if np.max(arm_joints, axis=0)[2] < 0.05: | |||||
| return [0, 0, 0, 0] | |||||
| joint_left = int(np.min(arm_joints, axis=0)[0]) | |||||
| joint_right = int(np.max(arm_joints, axis=0)[0]) | |||||
| joint_top = int(np.min(arm_joints, axis=0)[1]) | |||||
| joint_bottom = int(np.max(arm_joints, axis=0)[1]) | |||||
| return [joint_top, joint_bottom, joint_left, joint_right] | |||||
| @@ -0,0 +1,272 @@ | |||||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||||
| import math | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| from scipy.ndimage.filters import gaussian_filter | |||||
| from .model import BodyposeModel | |||||
| from .util import pad_rightdown_corner, transfer | |||||
class Body(object):
    """OpenPose-style 2D body keypoint estimator.

    Wraps ``BodyposeModel`` with the multi-scale heatmap/PAF inference
    and greedy limb-grouping post-processing from
    https://github.com/Hzzone/pytorch-openpose.
    """

    def __init__(self, model_path, device):
        """Load weights from *model_path* and move the model to *device*."""
        self.device = device
        self.model = BodyposeModel().to(device)
        model_dict = transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def __call__(self, oriImg):
        """Detect body keypoints in an (H, W, 3) image.

        Args:
            oriImg: (H, W, 3) image array.

        Returns:
            joints: (num_people, 18, 3) array; each row is (x, y, score)
                with the score zeroed for parts that were not detected
                (x/y of such rows are meaningless — check score first).
        """
        scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre1 = 0.1  # heatmap peak threshold
        thre2 = 0.05  # PAF mid-point sample threshold
        bodyparts = 18
        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
        paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
        # --- multi-scale forward passes, averaged into heatmap_avg/paf_avg
        for m in range(len(multiplier)):
            scale = multiplier[m]
            imageToTest = cv2.resize(
                oriImg, (0, 0),
                fx=scale,
                fy=scale,
                interpolation=cv2.INTER_CUBIC)
            imageToTest_padded, pad = pad_rightdown_corner(
                imageToTest, stride, padValue)
            # HWC uint8 -> NCHW float, roughly in [-0.5, 0.5)
            im = np.transpose(
                np.float32(imageToTest_padded[:, :, :, np.newaxis]),
                (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)
            data = torch.from_numpy(im).float()
            # BUGFIX: keep the input on the model's device instead of
            # unconditionally calling .cuda() whenever CUDA is available
            # (the model itself was moved to `device` in __init__)
            data = data.to(self.device)
            with torch.no_grad():
                Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
            Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
            Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
            # extract outputs, resize, and remove padding
            heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2),
                                   (1, 2, 0))  # output 1 is heatmaps
            heatmap = cv2.resize(
                heatmap, (0, 0),
                fx=stride,
                fy=stride,
                interpolation=cv2.INTER_CUBIC)
            heatmap = heatmap[:imageToTest_padded.shape[0]
                              - pad[2], :imageToTest_padded.shape[1]
                              - pad[3], :]
            heatmap = cv2.resize(
                heatmap, (oriImg.shape[1], oriImg.shape[0]),
                interpolation=cv2.INTER_CUBIC)
            paf = np.transpose(np.squeeze(Mconv7_stage6_L1),
                               (1, 2, 0))  # output 0 is PAFs
            paf = cv2.resize(
                paf, (0, 0),
                fx=stride,
                fy=stride,
                interpolation=cv2.INTER_CUBIC)
            paf = paf[:imageToTest_padded.shape[0]
                      - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            paf = cv2.resize(
                paf, (oriImg.shape[1], oriImg.shape[0]),
                interpolation=cv2.INTER_CUBIC)
            # BUGFIX: plain running average. The previous
            # ``heatmap_avg += heatmap_avg + ...`` doubled the accumulator
            # for every scale after the first (harmless only because
            # scale_search currently has a single entry).
            heatmap_avg += heatmap / len(multiplier)
            paf_avg += paf / len(multiplier)
        # --- peak extraction: 4-neighbour local maxima per part heatmap
        all_peaks = []
        peak_counter = 0
        for part in range(bodyparts):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)
            map_left = np.zeros(one_heatmap.shape)
            map_left[1:, :] = one_heatmap[:-1, :]
            map_right = np.zeros(one_heatmap.shape)
            map_right[:-1, :] = one_heatmap[1:, :]
            map_up = np.zeros(one_heatmap.shape)
            map_up[:, 1:] = one_heatmap[:, :-1]
            map_down = np.zeros(one_heatmap.shape)
            map_down[:, :-1] = one_heatmap[:, 1:]
            peaks_binary = np.logical_and.reduce(
                (one_heatmap >= map_left, one_heatmap >= map_right,
                 one_heatmap >= map_up, one_heatmap >= map_down,
                 one_heatmap > thre1))
            peaks = list(
                zip(np.nonzero(peaks_binary)[1],
                    np.nonzero(peaks_binary)[0]))  # note reverse
            peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks]
            peak_id = range(peak_counter, peak_counter + len(peaks))
            peaks_with_score_and_id = [
                peaks_with_score[i] + (peak_id[i], )
                for i in range(len(peak_id))
            ]
            all_peaks.append(peaks_with_score_and_id)
            peak_counter += len(peaks)
        # find connection in the specified sequence, center 29 is in the position 15
        limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9],
                   [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1],
                   [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]]
        # the middle joints heatmap correspondence
        mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44],
                  [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30],
                  [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38],
                  [45, 46]]
        # --- limb scoring: integrate the PAF along each candidate pair
        connection_all = []
        special_k = []
        mid_num = 10
        for k in range(len(mapIdx)):
            score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
            candA = all_peaks[limbSeq[k][0] - 1]
            candB = all_peaks[limbSeq[k][1] - 1]
            nA = len(candA)
            nB = len(candB)
            if (nA != 0 and nB != 0):
                connection_candidate = []
                for i in range(nA):
                    for j in range(nB):
                        vec = np.subtract(candB[j][:2], candA[i][:2])
                        norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                        norm = max(0.001, norm)  # guard zero-length limbs
                        vec = np.divide(vec, norm)
                        startend = list(
                            zip(
                                np.linspace(
                                    candA[i][0], candB[j][0], num=mid_num),
                                np.linspace(
                                    candA[i][1], candB[j][1], num=mid_num)))
                        vec_x = np.array([
                            score_mid[int(round(startend[item][1])),
                                      int(round(startend[item][0])), 0]
                            for item in range(len(startend))
                        ])
                        vec_y = np.array([
                            score_mid[int(round(startend[item][1])),
                                      int(round(startend[item][0])), 1]
                            for item in range(len(startend))
                        ])
                        # dot product of sampled PAF with the limb direction
                        score_midpts = np.multiply(
                            vec_x, vec[0]) + np.multiply(vec_y, vec[1])
                        temp1 = sum(score_midpts) / len(score_midpts)
                        # penalty for limbs longer than half the image height
                        temp2 = min(0.5 * oriImg.shape[0] / norm - 1, 0)
                        score_with_dist_prior = temp1 + temp2
                        criterion1 = len(np.nonzero(
                            score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
                        criterion2 = score_with_dist_prior > 0
                        if criterion1 and criterion2:
                            connection_candidate.append([
                                i, j, score_with_dist_prior,
                                score_with_dist_prior + candA[i][2]
                                + candB[j][2]
                            ])
                # greedily keep the best-scoring, non-conflicting pairs
                connection_candidate = sorted(
                    connection_candidate, key=lambda x: x[2], reverse=True)
                connection = np.zeros((0, 5))
                for c in range(len(connection_candidate)):
                    i, j, s = connection_candidate[c][0:3]
                    if (i not in connection[:, 3]
                            and j not in connection[:, 4]):
                        connection = np.vstack(
                            [connection, [candA[i][3], candB[j][3], s, i, j]])
                        if (len(connection) >= min(nA, nB)):
                            break
                connection_all.append(connection)
            else:
                special_k.append(k)
                connection_all.append([])
        # --- assemble limbs into people
        # last number in each row is the total parts number of that person
        # the second last number in each row is the score of the overall configuration
        subset = -1 * np.ones((0, 20))
        candidate = np.array(
            [item for sublist in all_peaks for item in sublist])
        for k in range(len(mapIdx)):
            if k not in special_k:
                partAs = connection_all[k][:, 0]
                partBs = connection_all[k][:, 1]
                indexA, indexB = np.array(limbSeq[k]) - 1
                for i in range(len(connection_all[k])):  # = 1:size(temp,1)
                    found = 0
                    subset_idx = [-1, -1]
                    for j in range(len(subset)):  # 1:size(subset,1):
                        if subset[j][indexA] == partAs[i] or subset[j][
                                indexB] == partBs[i]:
                            subset_idx[found] = j
                            found += 1
                    if found == 1:
                        j = subset_idx[0]
                        if subset[j][indexB] != partBs[i]:
                            subset[j][indexB] = partBs[i]
                            subset[j][-1] += 1
                            subset[j][-2] += candidate[
                                partBs[i].astype(int),
                                2] + connection_all[k][i][2]
                    elif found == 2:  # if found 2 and disjoint, merge them
                        j1, j2 = subset_idx
                        tmp1 = (subset[j1] >= 0).astype(int)
                        tmp2 = (subset[j2] >= 0).astype(int)
                        membership = (tmp1 + tmp2)[:-2]
                        if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                            subset[j1][:-2] += (subset[j2][:-2] + 1)
                            subset[j1][-2:] += subset[j2][-2:]
                            subset[j1][-2] += connection_all[k][i][2]
                            subset = np.delete(subset, j2, 0)
                        else:  # as like found == 1
                            subset[j1][indexB] = partBs[i]
                            subset[j1][-1] += 1
                            subset[j1][-2] += candidate[
                                partBs[i].astype(int),
                                2] + connection_all[k][i][2]
                    # if find no partA in the subset, create a new subset
                    elif not found and k < 17:
                        row = -1 * np.ones(20)
                        row[indexA] = partAs[i]
                        row[indexB] = partBs[i]
                        row[-1] = 2
                        row[-2] = sum(
                            candidate[connection_all[k][i, :2].astype(int),
                                      2]) + connection_all[k][i][2]
                        subset = np.vstack([subset, row])
        # delete some rows of subset which has few parts occur
        deleteIdx = []
        for i in range(len(subset)):
            if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
                deleteIdx.append(i)
        subset = np.delete(subset, deleteIdx, axis=0)
        # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
        # candidate: x, y, score, id
        count = subset.shape[0]
        joints = np.zeros(shape=(count, bodyparts, 3))
        for i in range(count):
            for j in range(bodyparts):
                # subset entry -1 means "part missing": candidate[-1] then
                # yields garbage x/y, masked by zeroing the score below
                joints[i, j, :3] = candidate[int(subset[i, j]), :3]
                confidence = 1.0 if subset[i, j] >= 0 else 0.0
                joints[i, j, 2] *= confidence
        return joints
| @@ -0,0 +1,141 @@ | |||||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||||
| from collections import OrderedDict | |||||
| import torch | |||||
| import torch.nn as nn | |||||
def make_layers(block, no_relu_layers):
    """Build an ``nn.Sequential`` from an ordered layer specification.

    Entries whose name contains ``'pool'`` are ``[kernel, stride, padding]``
    max-pool specs; every other entry is an
    ``[in_ch, out_ch, kernel, stride, padding]`` conv spec.  A
    ``relu_<name>`` ReLU is appended after each conv whose name is not in
    *no_relu_layers*.
    """
    modules = []
    for name, spec in block.items():
        if 'pool' in name:
            modules.append((name,
                            nn.MaxPool2d(
                                kernel_size=spec[0],
                                stride=spec[1],
                                padding=spec[2])))
            continue
        in_ch, out_ch, kernel, stride, padding = spec
        modules.append((name,
                        nn.Conv2d(
                            in_channels=in_ch,
                            out_channels=out_ch,
                            kernel_size=kernel,
                            stride=stride,
                            padding=padding)))
        if name not in no_relu_layers:
            modules.append(('relu_' + name, nn.ReLU(inplace=True)))
    return nn.Sequential(OrderedDict(modules))
class BodyposeModel(nn.Module):
    """CMU OpenPose body-pose CNN: VGG-style backbone + 6 refinement stages.

    ``forward`` returns ``(out6_1, out6_2)`` — the stage-6 PAF map
    (38 channels) and heatmap (19 channels), both at 1/8 of the input
    resolution.
    """

    def __init__(self):
        super(BodyposeModel, self).__init__()
        # Final prediction layers of each stage get no trailing ReLU.
        # BUGFIX: the list previously contained 'Mconv7_stage6_L1' twice;
        # the missing 'Mconv7_stage6_L2' entry caused a spurious ReLU on
        # the final stage-6 heatmap head (every other stage lists its
        # L1/L2 pair).
        no_relu_layers = [
            'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',
            'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',
            'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',
            'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2'
        ]
        blocks = {}
        # shared VGG-like feature extractor (three 2x pools -> stride 8)
        block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]),
                              ('conv1_2', [64, 64, 3, 1, 1]),
                              ('pool1_stage1', [2, 2, 0]),
                              ('conv2_1', [64, 128, 3, 1, 1]),
                              ('conv2_2', [128, 128, 3, 1, 1]),
                              ('pool2_stage1', [2, 2, 0]),
                              ('conv3_1', [128, 256, 3, 1, 1]),
                              ('conv3_2', [256, 256, 3, 1, 1]),
                              ('conv3_3', [256, 256, 3, 1, 1]),
                              ('conv3_4', [256, 256, 3, 1, 1]),
                              ('pool3_stage1', [2, 2, 0]),
                              ('conv4_1', [256, 512, 3, 1, 1]),
                              ('conv4_2', [512, 512, 3, 1, 1]),
                              ('conv4_3_CPM', [512, 256, 3, 1, 1]),
                              ('conv4_4_CPM', [256, 128, 3, 1, 1])])
        # Stage 1: L1 branch predicts PAFs (38ch), L2 heatmaps (19ch)
        block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
                                ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
                                ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
                                ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
                                ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])])
        block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
                                ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
                                ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
                                ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
                                ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])])
        blocks['block1_1'] = block1_1
        blocks['block1_2'] = block1_2
        self.model0 = make_layers(block0, no_relu_layers)
        # Stages 2 - 6: each consumes features + both previous predictions
        # (128 + 38 + 19 = 185 input channels)
        for i in range(2, 7):
            blocks['block%d_1' % i] = OrderedDict([
                ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
            ])
            blocks['block%d_2' % i] = OrderedDict([
                ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
            ])
        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)
        self.model1_1 = blocks['block1_1']
        self.model2_1 = blocks['block2_1']
        self.model3_1 = blocks['block3_1']
        self.model4_1 = blocks['block4_1']
        self.model5_1 = blocks['block5_1']
        self.model6_1 = blocks['block6_1']
        self.model1_2 = blocks['block1_2']
        self.model2_2 = blocks['block2_2']
        self.model3_2 = blocks['block3_2']
        self.model4_2 = blocks['block4_2']
        self.model5_2 = blocks['block5_2']
        self.model6_2 = blocks['block6_2']

    def forward(self, x):
        """Run the backbone and all six refinement stages.

        Returns (PAFs, heatmaps) from stage 6, spatial size = input / 8.
        """
        out1 = self.model0(x)
        out1_1 = self.model1_1(out1)
        out1_2 = self.model1_2(out1)
        # each stage re-concatenates both predictions with the backbone features
        out2 = torch.cat([out1_1, out1_2, out1], 1)
        out2_1 = self.model2_1(out2)
        out2_2 = self.model2_2(out2)
        out3 = torch.cat([out2_1, out2_2, out1], 1)
        out3_1 = self.model3_1(out3)
        out3_2 = self.model3_2(out3)
        out4 = torch.cat([out3_1, out3_2, out1], 1)
        out4_1 = self.model4_1(out4)
        out4_2 = self.model4_2(out4)
        out5 = torch.cat([out4_1, out4_2, out1], 1)
        out5_1 = self.model5_1(out5)
        out5_2 = self.model5_2(out5)
        out6 = torch.cat([out5_1, out5_2, out1], 1)
        out6_1 = self.model6_1(out6)
        out6_2 = self.model6_2(out6)
        return out6_1, out6_2
| @@ -0,0 +1,33 @@ | |||||
| # The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. | |||||
| import numpy as np | |||||
def pad_rightdown_corner(img, stride, padValue):
    """Pad *img* at the bottom/right so H and W are multiples of *stride*.

    Args:
        img: (H, W, C) array.
        stride: target divisibility of both spatial dims.
        padValue: constant fill value for the padded rows/columns.

    Returns:
        (img_padded, pad) where pad = [up, left, down, right]
        (up and left are always 0).
    """
    h = img.shape[0]
    w = img.shape[1]
    pad = [
        0,  # up
        0,  # left
        (stride - h % stride) % stride,  # down
        (stride - w % stride) % stride,  # right
    ]
    padded = img
    # up/left pads are zero-sized tiles, so these two are no-ops kept for
    # symmetry with the original layout
    padded = np.concatenate(
        (np.tile(padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1)), padded),
        axis=0)
    padded = np.concatenate(
        (np.tile(padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1)), padded),
        axis=1)
    padded = np.concatenate(
        (padded, np.tile(padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1))),
        axis=0)
    padded = np.concatenate(
        (padded, np.tile(padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1))),
        axis=1)
    return padded, pad
def transfer(model, model_weights):
    """Remap raw weights onto *model*'s state-dict key names.

    For every key in ``model.state_dict()``, looks up the source tensor
    whose name is the same key with its first dotted component removed
    (e.g. ``'model0.conv1_1.weight'`` -> ``'conv1_1.weight'``).
    """
    return {
        name: model_weights['.'.join(name.split('.')[1:])]
        for name in model.state_dict().keys()
    }
| @@ -0,0 +1,507 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import math | |||||
| import os | |||||
| import random | |||||
| import cv2 | |||||
| import numba | |||||
| import numpy as np | |||||
| import torch | |||||
def resize_on_long_side(img, long_side=800):
    """Resize *img* so its longer side equals *long_side*, keeping aspect.

    Returns (resized_img, scale) where scale is the applied ratio.
    """
    src_h = img.shape[0]
    src_w = img.shape[1]
    if src_h > src_w:
        scale = long_side * 1.0 / src_h
        target = (int(src_w * scale), long_side)
    else:
        scale = long_side * 1.0 / src_w
        target = (long_side, int(src_h * scale))
    resized = cv2.resize(img, target, interpolation=cv2.INTER_LINEAR)
    return resized, scale
def point_in_box(pt, box):
    """Return True if pt=(x, y) lies inside box=[x, y, w, h], borders included."""
    x, y = pt[0], pt[1]
    inside_x = box[0] <= x <= box[0] + box[2]
    inside_y = box[1] <= y <= box[1] + box[3]
    return bool(inside_x and inside_y)
def enlarge_box_tblr(roi_bbox, mask, ratio=0.4, use_long_side=True):
    """Grow a [top, bottom, left, right] box by *ratio*, clamped to *mask*.

    The margin is ``ratio`` times the box's long side (or the matching
    side when *use_long_side* is False).  Results are clamped to 1 on the
    top/left and to ``mask.shape - 2`` on the bottom/right, mirroring the
    original clamping quirks.  Returns ``[None] * 4`` for a missing box.
    """
    if roi_bbox is None or None in roi_bbox:
        return [None, None, None, None]
    top, bottom, left, right = roi_bbox
    height = bottom - top
    width = right - left
    long_side = max(width, height)
    h_basis = long_side if use_long_side else height
    w_basis = long_side if use_long_side else width
    new_top = top - int(h_basis * ratio)
    if new_top < 0:
        new_top = 1
    new_left = left - int(w_basis * ratio)
    if new_left < 0:
        new_left = 1
    new_bottom = bottom + int(h_basis * ratio)
    if new_bottom > mask.shape[0]:
        new_bottom = mask.shape[0] - 2
    new_right = right + int(w_basis * ratio)
    if new_right > mask.shape[1]:
        new_right = mask.shape[1] - 2
    return [new_top, new_bottom, new_left, new_right]
def gen_PAF(image, joints):
    """Render part-affinity fields (unit limb-direction vectors) for a pose.

    Args:
        image: (H, W, C) image; only its size is used to shape the canvas.
        joints: (18, 3) array of (x, y, confidence) rows.  BUGFIX: the
            caller's array is no longer modified — scaling now happens on
            a private copy (previously the x/y columns were overwritten
            in place with small-canvas coordinates).

    Returns:
        PAF_all: (H, W, 2) combined field, resized back to the input size.
        map_list: per-limb blurred fields (at the small working size),
            with the combined full-size field appended last.
        mask_list: per-limb blurred coverage masks.
    """
    assert joints.shape[0] == 18
    assert joints.shape[1] == 3
    org_h = image.shape[0]
    org_w = image.shape[1]
    small_image, resize_scale = resize_on_long_side(image, 120)
    # work on a copy so the caller's joint coordinates stay intact
    joints = joints.copy()
    joints[:, :2] = joints[:, :2] * resize_scale
    joint_left = int(np.min(joints, axis=0)[0])
    joint_right = int(np.max(joints, axis=0)[0])
    joint_top = int(np.min(joints, axis=0)[1])
    joint_bottom = int(np.max(joints, axis=0)[1])
    # limb thickness ~ 1/6 of the joint bbox short side, forced odd so it
    # is a valid Gaussian kernel size
    limb_width = min(
        abs(joint_right - joint_left), abs(joint_bottom - joint_top)) // 6
    if limb_width % 2 == 0:
        limb_width += 1
    kernel_size = limb_width
    part_orders = [(5, 11), (2, 8), (5, 6), (6, 7), (2, 3), (3, 4), (11, 12),
                   (12, 13), (8, 9), (9, 10)]
    map_list = []
    mask_list = []
    PAF_all = np.zeros(
        shape=(small_image.shape[0], small_image.shape[1], 2),
        dtype=np.float32)
    for c, pair in enumerate(part_orders):
        idx_a_name = pair[0]
        idx_b_name = pair[1]
        jointa = joints[idx_a_name]
        jointb = joints[idx_b_name]
        confidence_threshold = 0.05
        if jointa[2] > confidence_threshold and jointb[
                2] > confidence_threshold:
            # soft coverage mask of the limb segment
            canvas = np.zeros(
                shape=(small_image.shape[0], small_image.shape[1]),
                dtype=np.uint8)
            canvas = cv2.line(canvas, (int(jointa[0]), int(jointa[1])),
                              (int(jointb[0]), int(jointb[1])),
                              (255, 255, 255), 5)
            kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,
                                               (kernel_size, kernel_size))
            canvas = cv2.dilate(canvas, kernel, 1)
            canvas = cv2.GaussianBlur(canvas, (kernel_size, kernel_size), 0)
            canvas = canvas.astype(np.float32) / 255
            # constant a->b unit vector over the whole canvas, masked below
            PAF = np.zeros(
                shape=(small_image.shape[0], small_image.shape[1], 2),
                dtype=np.float32)
            PAF[..., 0] = jointb[0] - jointa[0]
            PAF[..., 1] = jointb[1] - jointa[1]
            mag, ang = cv2.cartToPolar(PAF[..., 0], PAF[..., 1])
            PAF /= (np.dstack((mag, mag)) + 1e-5)  # normalize, avoid /0
            single_PAF = PAF * np.dstack((canvas, canvas))
            map_list.append(
                cv2.GaussianBlur(single_PAF,
                                 (kernel_size * 3, kernel_size * 3), 0))
            mask_list.append(
                cv2.GaussianBlur(canvas.copy(),
                                 (kernel_size * 3, kernel_size * 3), 0))
            # newer limbs overwrite (not add to) earlier ones where they overlap
            PAF_all = PAF_all * (1.0 - np.dstack(
                (canvas, canvas))) + single_PAF
    PAF_all = cv2.GaussianBlur(PAF_all, (kernel_size * 3, kernel_size * 3), 0)
    PAF_all = cv2.resize(
        PAF_all, (org_w, org_h), interpolation=cv2.INTER_LINEAR)
    map_list.append(PAF_all)
    return PAF_all, map_list, mask_list
def gen_skeleton_map(joints, stack_mode='column', input_roi_box=None):
    """Rasterize the joint skeleton as per-limb line maps inside a ROI.

    Args:
        joints: (18, 3) array (or list) of (x, y, confidence) keypoints in
            full-image coordinates.
        stack_mode: 'depth' returns an (H, W, 13) cube — one channel per
            limb plus a final combined channel; 'column' returns those
            channels horizontally concatenated into a single 2D map.
        input_roi_box: [top, bottom, left, right] region the map is drawn
            in.  Must not be None — it is dereferenced unconditionally.

    Returns:
        (map, roi_box): drawn limb lines have value 2.0 on a 0.0
        background; roi_box is *input_roi_box*, returned unchanged.
    """
    if type(joints) == list:
        joints = np.array(joints)
    assert stack_mode == 'column' or stack_mode == 'depth'
    # limb endpoints as index pairs into the 18-joint layout
    part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3),
                   (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)]

    def link(img, a, b, color, line_width, scale=1.0, x_offset=0, y_offset=0):
        # Draw the a-b limb segment in ROI-local coordinates: shift by the
        # ROI origin, then scale to the (possibly downsized) canvas.
        jointa = joints[a]
        jointb = joints[b]
        temp1 = int((jointa[0] - x_offset) * scale)
        temp2 = int((jointa[1] - y_offset) * scale)
        temp3 = int((jointb[0] - x_offset) * scale)
        temp4 = int((jointb[1] - y_offset) * scale)
        cv2.line(img, (temp1, temp2), (temp3, temp4), color, line_width)

    roi_box = input_roi_box
    roi_box_width = roi_box[3] - roi_box[2]
    roi_box_height = roi_box[1] - roi_box[0]
    short_side_length = min(roi_box_width, roi_box_height)
    # line width scales with the ROI, never thinner than 2 px
    line_width = short_side_length // 30
    line_width = max(line_width, 2)
    map_cube = np.zeros(
        shape=(roi_box_height, roi_box_width, len(part_orders) + 1),
        dtype=np.float32)
    # cap the drawn width at 5 px; fx shrinks the canvas so a 5 px line
    # keeps the intended relative thickness
    use_line_width = min(5, line_width)
    fx = use_line_width * 1.0 / line_width  # fx is at most 1
    if fx < 0.99:
        map_cube = cv2.resize(map_cube, (0, 0), fx=fx, fy=fx)
    for c, pair in enumerate(part_orders):
        # draw on a contiguous per-channel copy, then write it back
        tmp = map_cube[..., c].copy()
        link(
            tmp,
            pair[0],
            pair[1], (2.0, 2.0, 2.0),
            use_line_width,
            scale=fx,
            x_offset=roi_box[2],
            y_offset=roi_box[0])
        map_cube[..., c] = tmp
        # the last channel accumulates every limb
        tmp = map_cube[..., -1].copy()
        link(
            tmp,
            pair[0],
            pair[1], (2.0, 2.0, 2.0),
            use_line_width,
            scale=fx,
            x_offset=roi_box[2],
            y_offset=roi_box[0])
        map_cube[..., -1] = tmp
    # restore the full ROI resolution (no-op when fx == 1)
    map_cube = cv2.resize(map_cube, (roi_box_width, roi_box_height))
    if stack_mode == 'depth':
        return map_cube, roi_box
    elif stack_mode == 'column':
        joint_maps = []
        for c in range(len(part_orders) + 1):
            joint_maps.append(map_cube[..., c])
        joint_map = np.column_stack(joint_maps)
        return joint_map, roi_box
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """Draw one bounding box (and an optional label) on ``img`` in place.

    Args:
        x: Box as (x1, y1, x2, y2), any numeric type.
        img: Target image (modified in place).
        color: BGR color; a random one is picked when falsy.
        label: Optional text drawn in a filled tag above the box corner.
        line_thickness: Explicit thickness; derived from image size if None.
    """
    # Thickness scales with image size unless given explicitly.
    tl = line_thickness or round(
        0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    color = color or [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(
        img, top_left, bottom_right, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        # Filled background tag sized to the text, above the top-left corner.
        tag_corner = (top_left[0] + t_size[0], top_left[1] - t_size[1] - 3)
        cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)
        cv2.putText(
            img,
            label, (top_left[0], top_left[1] - 2),
            0,
            tl / 3, [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA)
def draw_line(im, points, color, stroke_size=2, closed=False):
    """Draw a polyline through ``points`` on ``im`` in place.

    Args:
        im: Target image (modified in place).
        points: Array-like of (x, y) vertices; cast to int32 before drawing.
        color: Line color.
        stroke_size: Line thickness in pixels.
        closed: When True, also connect the last point back to the first.
    """
    points = points.astype(np.int32)
    # Connect each consecutive pair of vertices.
    for start, end in zip(points[:-1], points[1:]):
        cv2.line(im, tuple(start), tuple(end), color, stroke_size)
    if closed:
        cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size)
def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2):
    """Expand a bounding box by ``enlarge_ratio`` per side, clipped to the image.

    Args:
        bbox: [left, top, right, bottom] pixel coordinates.
        img_width: Right/width clamp for the expanded box.
        img_height: Bottom/height clamp for the expanded box.
        enlarge_ratio: Fraction of the box width/height added on each side.

    Returns:
        [left, top, right, bottom] as ints, clipped to the image bounds.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    roi_width = right - left
    roi_height = bottom - top
    # Clamp each expanded edge into [0, image size] with max/min instead of
    # the original per-edge conditional chains.
    new_left = max(0, left - int(roi_width * enlarge_ratio))
    new_top = max(0, top - int(roi_height * enlarge_ratio))
    new_right = min(img_width, right + int(roi_width * enlarge_ratio))
    new_bottom = min(img_height, bottom + int(roi_height * enlarge_ratio))
    return [int(v) for v in (new_left, new_top, new_right, new_bottom)]
def get_map_fusion_map_cuda(map_list, threshold=1, device=torch.device('cpu')):
    """Fuse a list of 2D numpy maps into fusion/accumulation maps on ``device``.

    Each map is binarized by ``|value| >= threshold`` and the binary maps are
    summed per pixel.

    Returns:
        Tuple of float32 numpy arrays ``(fusion_map, acc_map)``:
        fusion_map is the per-pixel count with empty pixels set to 1.5;
        acc_map is 2.0 wherever at least one map fired, else 0.0.
    """
    stacked = torch.stack(
        [torch.from_numpy(m).to(device) for m in map_list], dim=-1)
    stacked = torch.abs(stacked)
    # Binarize: zero out sub-threshold values, everything left becomes 1.
    stacked[stacked < threshold] = 0
    stacked[stacked > 1e-5] = 1.0
    coverage = torch.sum(stacked, dim=2)
    ones = torch.ones_like(coverage)
    acc_map = torch.where(coverage > 0, ones * 2.0,
                          torch.zeros_like(coverage))
    fusion_map = torch.where(coverage < 0.5, ones * 1.5, coverage)
    fusion_map = fusion_map.float().cpu().numpy().astype(np.float32)
    acc_map = acc_map.float().cpu().numpy().astype(np.float32)
    return fusion_map, acc_map
def gen_border_shade(height, width, height_band, width_band):
    """Build a float mask fading from 1 in the center to 0 at the borders.

    The band widths control how wide the fade region is on each axis.
    Work happens on a fixed 256x256 canvas which is then resized to
    (width, height).
    """
    # Scale the band sizes to the 256x256 working canvas.
    band_h = int(256 * (height_band * 1.0 / height))
    band_w = int(256 * (width_band * 1.0 / width))
    canvas = np.zeros((256, 256), dtype=np.float32)
    # Inner rectangle stays 1; blurring feathers it towards the borders.
    canvas[band_h // 2:-band_h // 2, band_w // 2:-band_w // 2] = 1.0
    canvas = cv2.blur(canvas, (band_h, band_w))
    return cv2.resize(canvas, (width, height))
def get_mask_bbox(mask, threshold=127):
    """Compute the tight bounding box of the foreground of a mask.

    Args:
        mask: Single-channel image; values above ``threshold`` are foreground.
        threshold: Binarization threshold.

    Returns:
        [top, bottom, left, right] indices (inclusive), or [None] * 4 when
        the thresholded mask is empty.
    """
    ret, mask = cv2.threshold(mask, threshold, 1, 0)
    if cv2.countNonZero(mask) == 0:
        return [None, None, None, None]
    # Project onto each axis and take the first/last occupied index.
    # Replaces the original O(n) Python scans (which also built a needless
    # reversed copy via row_acc[::-1]) with vectorized numpy.
    cols = np.flatnonzero(np.sum(mask, 0))
    rows = np.flatnonzero(np.sum(mask, 1))
    left, right = int(cols[0]), int(cols[-1])
    top, bottom = int(rows[0]), int(rows[-1])
    return [top, bottom, left, right]
def visualize_flow(flow):
    """Convert a 2-channel optical-flow field to a float32 BGR image in [0, 1].

    Direction is encoded as hue and magnitude as saturation of an HSV image
    that is then converted to BGR.
    """
    h, w = flow.shape[:2]
    magnitude, angle = cv2.cartToPolar(flow[..., 0], flow[..., 1])
    hsv = np.zeros((h, w, 3), np.uint8)
    hsv[..., 0] = angle * 180 / np.pi / 2  # hue: flow direction
    hsv[..., 1] = cv2.normalize(magnitude, None, 0, 255,
                                cv2.NORM_MINMAX)  # saturation: flow speed
    hsv[..., 2] = 255
    bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
    return (bgr * 1.0 / 255).astype(np.float32)
def vis_joints(image, joints, color, show_text=True, confidence_threshold=0.1):
    """Draw joints and limb links on ``image`` (in place) and return it.

    Joints whose last component (confidence) is below
    ``confidence_threshold`` are skipped, as are the indices in
    ``abandon_idxs`` (head/face points of an 18-joint layout — presumably;
    confirm against the pose model used).
    """
    part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3),
                   (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)]
    abandon_idxs = [0, 1, 14, 15, 16, 17]
    # Joint dots, optionally annotated with "index[confidence]".
    for idx, joint in enumerate(joints):
        if idx in abandon_idxs:
            continue
        if joint[-1] > confidence_threshold:
            pt = (int(joint[0]), int(joint[1]))
            cv2.circle(image, pt, 1, color, 2)
            if show_text:
                cv2.putText(image,
                            str(idx) + '[{:.2f}]'.format(joint[-1]), pt,
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    # Limb links, drawn only when both endpoints are confident enough.
    for a, b in part_orders:
        if (joints[a][-1] > confidence_threshold
                and joints[b][-1] > confidence_threshold):
            cv2.line(image, (int(joints[a][0]), int(joints[a][1])),
                     (int(joints[b][0]), int(joints[b][1])), color, 2)
    return image
def get_heatmap_cv(img, magn, max_flow_mag):
    """Alpha-blend a JET heatmap of ``magn`` over ``img``.

    Args:
        img: Background image; floats in [0, 1] are scaled to uint8.
        magn: 2D magnitude array driving color and per-pixel alpha.
        max_flow_mag: Magnitude mapped to the top of the color scale.

    Returns:
        uint8 image with the heatmap blended over ``img``.
    """
    min_flow_mag = .5
    cv_magn = np.clip(
        255 * (magn - min_flow_mag) / (max_flow_mag - min_flow_mag + 1e-7),
        a_min=0,
        a_max=255).astype(np.uint8)
    if img.dtype != np.uint8:
        img = (255 * img).astype(np.uint8)
    heatmap_img = cv2.applyColorMap(cv_magn, cv2.COLORMAP_JET)
    heatmap_img = heatmap_img[..., ::-1]  # BGR -> RGB channel flip
    h, w = magn.shape
    img_alpha = np.ones((h, w), dtype=np.double)[:, :, None]
    heatmap_alpha = np.clip(
        magn / (max_flow_mag + 1e-7), a_min=1e-7, a_max=1)[:, :, None]**.7
    # Bug fix: the original computed this power but discarded the result
    # (bare expression). Apply it in place so weak magnitudes get a softer
    # alpha falloff, as evidently intended.
    heatmap_alpha[heatmap_alpha < .2] **= .5
    pm_hm = heatmap_img * heatmap_alpha
    pm_img = img * img_alpha
    cv_out = pm_hm + pm_img * (1 - heatmap_alpha)
    return np.clip(cv_out, a_min=0, a_max=255).astype(np.uint8)
def save_heatmap_cv(img, flow, supression=2):
    """Render a flow-magnitude heatmap over ``img``, suppressing small motion.

    Magnitudes up to ``supression`` are zeroed so tiny flow does not clutter
    the visualization; the color scale tops out at 1.3x the max magnitude.
    """
    magnitude = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2)
    magnitude -= supression
    magnitude[magnitude <= 0] = 0
    return get_heatmap_cv(img, magnitude, np.max(magnitude) * 1.3)
@numba.jit(nopython=True, parallel=False)
def bilinear_interp(x, y, v11, v12, v21, v22):
    """Bilinearly interpolate four corner values at fractional offsets (x, y).

    (v11, v12) and (v21, v22) are the two rows of corner values; x and y are
    the fractional positions along each axis in [0, 1].
    """
    row_low = v11 * (1 - y) + v12 * y
    row_high = v21 * (1 - y) + v22 * y
    return row_low * (1 - x) + row_high * x
@numba.jit(nopython=True, parallel=False)
def image_warp_grid1(rDx, rDy, oriImg, transRatio, width_expand,
                     height_expand):
    """Backward-warp ``oriImg`` by the displacement grids ``rDx``/``rDy``.

    For every output pixel the source location is offset by
    (rDx, rDy) * transRatio and sampled bilinearly; sample coordinates are
    clamped to the image bounds only inside the expanded border bands.
    """
    src_w = oriImg.shape[1]
    src_h = oriImg.shape[0]
    warped = oriImg.copy()
    for row in range(src_h):
        for col in range(src_w):
            # Source sample position for this output pixel.
            nx = col + rDx[row, col] * transRatio
            ny = row + rDy[row, col] * transRatio
            # Clamp out-of-range samples, but only when they fall inside
            # the border expansion bands (same nested conditions, merged).
            if nx >= src_w - width_expand - 1 and nx > src_w - 1:
                nx = src_w - 1
            if ny >= src_h - height_expand - 1 and ny > src_h - 1:
                ny = src_h - 1
            if nx < width_expand and nx < 0:
                nx = 0
            if ny < height_expand and ny < 0:
                ny = 0
            x_low = int(math.floor(nx))
            y_low = int(math.floor(ny))
            x_high = int(math.ceil(nx))
            y_high = int(math.ceil(ny))
            for ch in range(3):
                warped[row, col, ch] = bilinear_interp(
                    ny - y_low, nx - x_low, oriImg[y_low, x_low, ch],
                    oriImg[y_low, x_high, ch], oriImg[y_high, x_low, ch],
                    oriImg[y_high, x_high, ch])
    return warped
| @@ -1,3 +1,6 @@ | |||||
| # The implementation is adopted from Jingwen He, | |||||
| # made publicly available at https://github.com/hejingwenhejingwen/CSRNet | |||||
| import functools | import functools | ||||
| import math | import math | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os.path as osp | import os.path as osp | ||||
| from copy import deepcopy | from copy import deepcopy | ||||
| from typing import Dict, Union | from typing import Dict, Union | ||||
| @@ -1,3 +1,8 @@ | |||||
| # ------------------------------------------------------------------------ | |||||
| # Modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/models/archs/NAFNet_arch.py | |||||
| # Copyright (c) 2022 megvii-model. All Rights Reserved. | |||||
| # ------------------------------------------------------------------------ | |||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,3 +1,8 @@ | |||||
| # ------------------------------------------------------------------------ | |||||
| # Modified from BasicSR (https://github.com/xinntao/BasicSR) | |||||
| # Copyright 2018-2020 BasicSR Authors | |||||
| # ------------------------------------------------------------------------ | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,8 +1,8 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | import os | ||||
| from copy import deepcopy | from copy import deepcopy | ||||
| from typing import Any, Dict, Union | from typing import Any, Dict, Union | ||||
| import numpy as np | |||||
| import torch.cuda | import torch.cuda | ||||
| from torch.nn.parallel import DataParallel, DistributedDataParallel | from torch.nn.parallel import DataParallel, DistributedDataParallel | ||||
| @@ -77,13 +77,8 @@ class NAFNetForImageDenoise(TorchModel): | |||||
| def _evaluate_postprocess(self, input: Tensor, | def _evaluate_postprocess(self, input: Tensor, | ||||
| target: Tensor) -> Dict[str, list]: | target: Tensor) -> Dict[str, list]: | ||||
| preds = self.model(input) | preds = self.model(input) | ||||
| preds = list(torch.split(preds, 1, 0)) | |||||
| targets = list(torch.split(target, 1, 0)) | |||||
| preds = [(pred.data * 255.).squeeze(0).permute( | |||||
| 1, 2, 0).cpu().numpy().astype(np.uint8) for pred in preds] | |||||
| targets = [(target.data * 255.).squeeze(0).permute( | |||||
| 1, 2, 0).cpu().numpy().astype(np.uint8) for target in targets] | |||||
| preds = list(torch.split(preds.clamp(0, 1), 1, 0)) | |||||
| targets = list(torch.split(target.clamp(0, 1), 1, 0)) | |||||
| return {'pred': preds, 'target': targets} | return {'pred': preds, 'target': targets} | ||||
| @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | from modelscope.utils.import_utils import LazyImportModule | ||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||
| from .image_denoise_dataset import PairedImageDataset | |||||
| from .model import FFTInpainting | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'image_denoise_dataset': ['PairedImageDataset'], | |||||
| 'model': ['FFTInpainting'], | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -0,0 +1,75 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| from typing import Dict, Tuple | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from modelscope.utils.logger import get_logger | |||||
| from .modules.adversarial import NonSaturatingWithR1 | |||||
| from .modules.ffc import FFCResNetGenerator | |||||
| from .modules.perceptual import ResNetPL | |||||
| from .modules.pix2pixhd import NLayerDiscriminator | |||||
| LOGGER = get_logger() | |||||
class BaseInpaintingTrainingModule(nn.Module):
    """Shared scaffolding for inpainting training.

    Builds the FFC generator always, and — unless ``predict_only`` — the
    discriminator, adversarial loss, generator-EMA bookkeeping and the
    L1/perceptual losses. Subclasses implement ``forward`` plus the two
    loss methods.
    """

    def __init__(self,
                 model_dir='',
                 use_ddp=True,
                 predict_only=False,
                 visualize_each_iters=100,
                 average_generator=False,
                 generator_avg_beta=0.999,
                 average_generator_start_step=30000,
                 average_generator_period=10,
                 store_discr_outputs_for_vis=False,
                 **kwargs):
        super().__init__()
        LOGGER.info(
            f'BaseInpaintingTrainingModule init called, predict_only is {predict_only}'
        )
        self.generator = FFCResNetGenerator()
        self.use_ddp = use_ddp
        if not predict_only:
            # Training-only components; skipped for pure inference so the
            # perceptual-loss backbone is never loaded.
            self.discriminator = NLayerDiscriminator()
            self.adversarial_loss = NonSaturatingWithR1(
                weight=10,
                gp_coef=0.001,
                mask_as_fake_target=True,
                allow_scale_mask=True)
            # Generator weight-averaging (EMA) configuration.
            self.average_generator = average_generator
            self.generator_avg_beta = generator_avg_beta
            self.average_generator_start_step = average_generator_start_step
            self.average_generator_period = average_generator_period
            self.generator_average = None
            self.last_generator_averaging_step = -1
            self.store_discr_outputs_for_vis = store_discr_outputs_for_vis
            self.loss_l1 = nn.L1Loss(reduction='none')
            self.loss_resnet_pl = ResNetPL(weight=30, weights_path=model_dir)
        self.visualize_each_iters = visualize_each_iters
        LOGGER.info('BaseInpaintingTrainingModule init done')

    def forward(self, batch: Dict[str,
                                  torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys"""
        raise NotImplementedError()

    def generator_loss(self,
                       batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        # Subclasses return (total generator loss, metrics dict).
        raise NotImplementedError()

    def discriminator_loss(
            self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        # Subclasses return (total discriminator loss, metrics dict).
        raise NotImplementedError()
| @@ -0,0 +1,210 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| import bisect | |||||
| import torch | |||||
| import torch.nn.functional as F | |||||
| from modelscope.utils.logger import get_logger | |||||
| from .base import BaseInpaintingTrainingModule | |||||
| from .modules.feature_matching import feature_matching_loss, masked_l1_loss | |||||
| LOGGER = get_logger() | |||||
def set_requires_grad(module, value):
    """Set ``requires_grad`` on every parameter of ``module`` to ``value``."""
    for parameter in module.parameters():
        parameter.requires_grad = value
def add_prefix_to_keys(dct, prefix):
    """Return a copy of ``dct`` with ``prefix`` prepended to every key."""
    prefixed = {}
    for key, value in dct.items():
        prefixed[prefix + key] = value
    return prefixed
class LinearRamp:
    """Linearly interpolate from ``start_value`` to ``end_value`` over iterations.

    Values are clamped outside the [start_iter, end_iter) window.
    """

    def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0):
        self.start_value = start_value
        self.end_value = end_value
        self.start_iter = start_iter
        self.end_iter = end_iter

    def __call__(self, i):
        """Return the ramp value at iteration ``i``."""
        # Clamp before/after the ramp window.
        if i < self.start_iter:
            return self.start_value
        if i >= self.end_iter:
            return self.end_value
        frac = (i - self.start_iter) / (self.end_iter - self.start_iter)
        return self.start_value * (1 - frac) + self.end_value * frac
class LadderRamp:
    """Piecewise-constant schedule over iterations.

    ``values`` has one more entry than ``start_iters``; ``values[k]`` applies
    from ``start_iters[k - 1]`` (inclusive) up to ``start_iters[k]``.
    """

    def __init__(self, start_iters, values):
        self.start_iters = start_iters
        self.values = values
        assert len(values) == len(start_iters) + 1, (len(values),
                                                     len(start_iters))

    def __call__(self, i):
        """Return the schedule value in effect at iteration ``i``."""
        segment = bisect.bisect_right(self.start_iters, i)
        return self.values[segment]
def get_ramp(kind='ladder', **kwargs):
    """Factory for ramp schedules: 'linear' or 'ladder'.

    Raises:
        ValueError: If ``kind`` is not a known ramp type.
    """
    ramp_cls = {'linear': LinearRamp, 'ladder': LadderRamp}.get(kind)
    if ramp_cls is None:
        raise ValueError(f'Unexpected ramp kind: {kind}')
    return ramp_cls(**kwargs)
class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule):
    """LaMa-style inpainting trainer.

    The generator objective combines a masked L1 loss, an adversarial loss,
    a feature-matching loss on discriminator features and a ResNet
    perceptual loss (built in the base class).
    """

    def __init__(self,
                 model_dir='',
                 predict_only=False,
                 concat_mask=True,
                 rescale_scheduler_kwargs=None,
                 image_to_discriminator='predicted_image',
                 add_noise_kwargs=None,
                 noise_fill_hole=False,
                 const_area_crop_kwargs=None,
                 distance_weighter_kwargs=None,
                 distance_weighted_mask_for_discr=False,
                 fake_fakes_proba=0,
                 fake_fakes_generator_kwargs=None,
                 **kwargs):
        super().__init__(model_dir=model_dir, predict_only=predict_only)
        self.concat_mask = concat_mask
        # Optional input-rescaling schedule.
        if rescale_scheduler_kwargs is not None:
            self.rescale_size_getter = get_ramp(**rescale_scheduler_kwargs)
        else:
            self.rescale_size_getter = None
        self.image_to_discriminator = image_to_discriminator
        self.add_noise_kwargs = add_noise_kwargs
        self.noise_fill_hole = noise_fill_hole
        self.const_area_crop_kwargs = const_area_crop_kwargs
        self.refine_mask_for_losses = None
        self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr
        # Fixed loss weights (see generator_loss).
        self.feature_matching_weight = 100
        self.losses_l1_weight_known = 10
        self.losses_l1_weight_missing = 0
        self.fake_fakes_proba = fake_fakes_proba

    def forward(self, batch):
        """Inpaint ``batch['image']`` under ``batch['mask']`` and extend the batch dict."""
        image = batch['image']
        mask = batch['mask']
        masked_image = image * (1 - mask)
        if self.concat_mask:
            # Provide the mask to the generator as an extra input channel.
            masked_image = torch.cat([masked_image, mask], dim=1)
        batch['predicted_image'] = self.generator(masked_image)
        # Composite: keep known pixels, fill holes from the prediction.
        batch['inpainted'] = (
            mask * batch['predicted_image'] + (1 - mask) * batch['image'])
        batch['mask_for_losses'] = mask
        return batch

    def generator_loss(self, batch):
        """Return (total generator loss, metrics dict) for ``batch``."""
        image = batch['image']
        predicted_image = batch[self.image_to_discriminator]
        original_mask = batch['mask']
        supervised_mask = batch['mask_for_losses']

        # Masked L1 reconstruction term.
        l1_value = masked_l1_loss(predicted_image, image, supervised_mask,
                                  self.losses_l1_weight_known,
                                  self.losses_l1_weight_missing)
        total_loss = l1_value
        metrics = dict(gen_l1=l1_value)

        # Adversarial term; the loss object may call backward by itself.
        if self.distance_weighted_mask_for_discr:
            mask_for_discr = supervised_mask
        else:
            mask_for_discr = original_mask
        self.adversarial_loss.pre_generator_step(
            real_batch=image,
            fake_batch=predicted_image,
            generator=self.generator,
            discriminator=self.discriminator)
        discr_real_pred, discr_real_features = self.discriminator(image)
        discr_fake_pred, discr_fake_features = self.discriminator(
            predicted_image)
        adv_gen_loss, adv_metrics = self.adversarial_loss.generator_loss(
            real_batch=image,
            fake_batch=predicted_image,
            discr_real_pred=discr_real_pred,
            discr_fake_pred=discr_fake_pred,
            mask=mask_for_discr)
        total_loss = total_loss + adv_gen_loss
        metrics['gen_adv'] = adv_gen_loss
        metrics.update(add_prefix_to_keys(adv_metrics, 'adv_'))

        # Feature matching on discriminator feature maps.
        if self.feature_matching_weight > 0:
            need_mask_in_fm = False
            mask_for_fm = supervised_mask if need_mask_in_fm else None
            fm_value = feature_matching_loss(
                discr_fake_features, discr_real_features,
                mask=mask_for_fm) * self.feature_matching_weight
            total_loss = total_loss + fm_value
            metrics['gen_fm'] = fm_value

        # ResNet perceptual loss (may be absent in predict-only builds).
        if self.loss_resnet_pl is not None:
            resnet_pl_value = self.loss_resnet_pl(predicted_image, image)
            total_loss = total_loss + resnet_pl_value
            metrics['gen_resnet_pl'] = resnet_pl_value
        return total_loss, metrics

    def discriminator_loss(self, batch):
        """Return (total discriminator loss, metrics dict) for ``batch``."""
        total_loss = 0
        metrics = {}
        # Detach so no gradient flows into the generator here.
        predicted_image = batch[self.image_to_discriminator].detach()
        self.adversarial_loss.pre_discriminator_step(
            real_batch=batch['image'],
            fake_batch=predicted_image,
            generator=self.generator,
            discriminator=self.discriminator)
        discr_real_pred, discr_real_features = self.discriminator(
            batch['image'])
        discr_fake_pred, discr_fake_features = self.discriminator(
            predicted_image)
        adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss(
            real_batch=batch['image'],
            fake_batch=predicted_image,
            discr_real_pred=discr_real_pred,
            discr_fake_pred=discr_fake_pred,
            mask=batch['mask'])
        # NOTE(review): the 0.1 factor downscales the whole discriminator
        # loss — kept as-is from the original.
        total_loss = (total_loss + adv_discr_loss) * 0.1
        metrics['discr_adv'] = adv_discr_loss
        metrics.update(add_prefix_to_keys(adv_metrics, 'adv_'))
        return total_loss, metrics

    def _do_step(self, batch, optimizer_idx=None):
        """Run one step for the generator (idx 0) or discriminator (idx 1)."""
        if optimizer_idx == 0:  # step for generator
            set_requires_grad(self.generator, True)
            set_requires_grad(self.discriminator, False)
        elif optimizer_idx == 1:  # step for discriminator
            set_requires_grad(self.generator, False)
            set_requires_grad(self.discriminator, True)
        batch = self(batch)
        total_loss = 0
        # NOTE(review): with optimizer_idx None only the generator branch
        # runs (the elif is unreachable for None) — kept as-is.
        if optimizer_idx is None or optimizer_idx == 0:
            total_loss, metrics = self.generator_loss(batch)
        elif optimizer_idx is None or optimizer_idx == 1:
            total_loss, metrics = self.discriminator_loss(batch)
        return dict(loss=total_loss)
| @@ -0,0 +1,36 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | |||||
| from typing import Any, Dict, Optional, Union | |||||
| import torch | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base.base_torch_model import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| LOGGER = get_logger() | |||||
@MODELS.register_module(
    Tasks.image_inpainting, module_name=Models.image_inpainting)
class FFTInpainting(TorchModel):
    """Wrapper exposing the LaMa/FFC inpainting module as a ModelScope model."""

    def __init__(self, model_dir: str, **kwargs):
        """Build the inpainting module and optionally load a checkpoint.

        Args:
            model_dir: Directory containing the torch checkpoint file.
            **kwargs: Supports ``pretrained`` (bool, default True) and
                ``predict_only`` (bool, default False).
        """
        super().__init__(model_dir, **kwargs)
        from .default import DefaultInpaintingTrainingModule
        inpainting_net = DefaultInpaintingTrainingModule(
            model_dir=model_dir,
            predict_only=kwargs.get('predict_only', False))
        if kwargs.get('pretrained', True):
            path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
            LOGGER.info(f'loading pretrained model from {path}')
            checkpoint = torch.load(path, map_location='cpu')
            # strict=False: predict-only builds omit training submodules.
            inpainting_net.load_state_dict(checkpoint, strict=False)
        self.model = inpainting_net

    def forward(self, inputs):
        """Delegate to the wrapped inpainting module."""
        return self.model(inputs)
| @@ -0,0 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .base import ModelBuilder | |||||
| @@ -0,0 +1,380 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| import os | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from torch.nn.modules import BatchNorm2d | |||||
| from . import resnet | |||||
| NUM_CLASS = 150 | |||||
| # Model Builder | |||||
class ModelBuilder:
    """Factories for the ADE20k segmentation encoder/decoder pair.

    Used by the perceptual loss; encoders/decoders are loaded from
    per-architecture checkpoint files under ``weights_path``.
    """

    @staticmethod
    def weights_init(m):
        """Kaiming-init Conv layers; BatchNorm gets weight 1 and a tiny bias."""
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            nn.init.kaiming_normal_(m.weight.data)
        elif classname.find('BatchNorm') != -1:
            m.weight.data.fill_(1.)
            m.bias.data.fill_(1e-4)

    @staticmethod
    def build_encoder(arch='resnet50dilated',
                      fc_dim=512,
                      weights='',
                      model_dir=''):
        """Create a ResNet-50 encoder (plain or dilated) and load weights if given."""
        # No explicit weights file -> use the pretrained backbone instead.
        pretrained = len(weights) == 0
        arch = arch.lower()
        if arch == 'resnet50dilated':
            backbone = resnet.__dict__['resnet50'](
                pretrained=pretrained, model_dir=model_dir)
            net_encoder = ResnetDilated(backbone, dilate_scale=8)
        elif arch == 'resnet50':
            backbone = resnet.__dict__['resnet50'](
                pretrained=pretrained, model_dir=model_dir)
            net_encoder = Resnet(backbone)
        else:
            raise Exception('Architecture undefined!')
        # Encoders are usually pretrained, so weights_init is not applied.
        if len(weights) > 0:
            print('Loading weights for net_encoder')
            net_encoder.load_state_dict(
                torch.load(weights, map_location=lambda storage, loc: storage),
                strict=False)
        return net_encoder

    @staticmethod
    def build_decoder(arch='ppm_deepsup',
                      fc_dim=512,
                      num_class=NUM_CLASS,
                      weights='',
                      use_softmax=False,
                      drop_last_conv=False):
        """Create a segmentation decoder head and load weights if given."""
        arch = arch.lower()
        if arch == 'ppm_deepsup':
            net_decoder = PPMDeepsup(
                num_class=num_class,
                fc_dim=fc_dim,
                use_softmax=use_softmax,
                drop_last_conv=drop_last_conv)
        elif arch == 'c1_deepsup':
            net_decoder = C1DeepSup(
                num_class=num_class,
                fc_dim=fc_dim,
                use_softmax=use_softmax,
                drop_last_conv=drop_last_conv)
        else:
            raise Exception('Architecture undefined!')
        net_decoder.apply(ModelBuilder.weights_init)
        if len(weights) > 0:
            print('Loading weights for net_decoder')
            net_decoder.load_state_dict(
                torch.load(weights, map_location=lambda storage, loc: storage),
                strict=False)
        return net_decoder

    @staticmethod
    def get_decoder(weights_path, arch_encoder, arch_decoder, fc_dim,
                    drop_last_conv, *arts, **kwargs):
        """Build the decoder from the ADE20k checkpoint matching the arch pair."""
        path = os.path.join(
            weights_path, 'ade20k',
            f'ade20k-{arch_encoder}-{arch_decoder}/decoder_epoch_20.pth')
        return ModelBuilder.build_decoder(
            arch=arch_decoder,
            fc_dim=fc_dim,
            weights=path,
            use_softmax=True,
            drop_last_conv=drop_last_conv)

    @staticmethod
    def get_encoder(weights_path, arch_encoder, arch_decoder, fc_dim,
                    segmentation, *arts, **kwargs):
        """Build the encoder; segmentation mode loads the ADE20k checkpoint."""
        if segmentation:
            path = os.path.join(
                weights_path, 'ade20k',
                f'ade20k-{arch_encoder}-{arch_decoder}/encoder_epoch_20.pth')
        else:
            path = ''
        return ModelBuilder.build_encoder(
            arch=arch_encoder,
            fc_dim=fc_dim,
            weights=path,
            model_dir=weights_path)
def conv3x3_bn_relu(in_planes, out_planes, stride=1):
    """3x3 conv (padding 1, no bias) followed by BatchNorm and ReLU."""
    conv = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False)
    return nn.Sequential(conv, BatchNorm2d(out_planes), nn.ReLU(inplace=True))
| # pyramid pooling, deep supervision | |||||
class PPMDeepsup(nn.Module):
    """Pyramid Pooling Module decoder with a deep-supervision head (ADE20k).

    With ``drop_last_conv`` the fused pyramid features are returned raw
    (perceptual-loss use); with ``use_softmax`` the class scores are
    upsampled and softmaxed (inference); otherwise the training pair of
    log-softmax outputs (main, deep-supervision) is returned.
    """

    def __init__(self,
                 num_class=NUM_CLASS,
                 fc_dim=4096,
                 use_softmax=False,
                 pool_scales=(1, 2, 3, 6),
                 drop_last_conv=False):
        super().__init__()
        self.use_softmax = use_softmax
        self.drop_last_conv = drop_last_conv
        # One branch per scale: adaptive pool -> 1x1 conv -> BN -> ReLU.
        # (Attribute names must stay fixed: they are state-dict keys.)
        branches = []
        for scale in pool_scales:
            branches.append(
                nn.Sequential(
                    nn.AdaptiveAvgPool2d(scale),
                    nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False),
                    BatchNorm2d(512), nn.ReLU(inplace=True)))
        self.ppm = nn.ModuleList(branches)
        self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1)
        self.conv_last = nn.Sequential(
            nn.Conv2d(
                fc_dim + len(pool_scales) * 512,
                512,
                kernel_size=3,
                padding=1,
                bias=False), BatchNorm2d(512), nn.ReLU(inplace=True),
            nn.Dropout2d(0.1), nn.Conv2d(512, num_class, kernel_size=1))
        self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
        self.dropout_deepsup = nn.Dropout2d(0.1)

    def forward(self, conv_out, segSize=None):
        """Decode the encoder feature pyramid (see class docstring)."""
        conv5 = conv_out[-1]
        input_size = conv5.size()
        # Upsample each pooled branch back to the feature-map resolution.
        pooled = [conv5]
        for branch in self.ppm:
            pooled.append(
                nn.functional.interpolate(
                    branch(conv5), (input_size[2], input_size[3]),
                    mode='bilinear',
                    align_corners=False))
        fused = torch.cat(pooled, 1)
        if self.drop_last_conv:
            # Raw fused features, no classification head.
            return fused
        x = self.conv_last(fused)
        if self.use_softmax:  # is True during inference
            x = nn.functional.interpolate(
                x, size=segSize, mode='bilinear', align_corners=False)
            return nn.functional.softmax(x, dim=1)
        # Training path: deep-supervision head on the penultimate stage.
        deepsup = self.cbr_deepsup(conv_out[-2])
        deepsup = self.dropout_deepsup(deepsup)
        deepsup = self.conv_last_deepsup(deepsup)
        x = nn.functional.log_softmax(x, dim=1)
        deepsup = nn.functional.log_softmax(deepsup, dim=1)
        return (x, deepsup)
class Resnet(nn.Module):
    """Wraps a pretrained ResNet, exposing per-stage feature maps.

    AvgPool and the FC classifier of the original network are dropped.
    (Attribute names must stay fixed: they are state-dict keys.)
    """

    def __init__(self, orig_resnet):
        super(Resnet, self).__init__()
        # Reuse the pretrained stem...
        self.conv1 = orig_resnet.conv1
        self.bn1 = orig_resnet.bn1
        self.relu1 = orig_resnet.relu1
        self.conv2 = orig_resnet.conv2
        self.bn2 = orig_resnet.bn2
        self.relu2 = orig_resnet.relu2
        self.conv3 = orig_resnet.conv3
        self.bn3 = orig_resnet.bn3
        self.relu3 = orig_resnet.relu3
        self.maxpool = orig_resnet.maxpool
        # ...and the four residual stages.
        self.layer1 = orig_resnet.layer1
        self.layer2 = orig_resnet.layer2
        self.layer3 = orig_resnet.layer3
        self.layer4 = orig_resnet.layer4

    def forward(self, x, return_feature_maps=False):
        """Run the backbone; return all stage outputs or just the last one."""
        feature_maps = []
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
            feature_maps.append(x)
        if return_feature_maps:
            return feature_maps
        return [x]
| # Resnet Dilated | |||||
class ResnetDilated(nn.Module):
    """ResNet backbone whose deepest stage(s) use dilation instead of stride."""

    def __init__(self, orig_resnet, dilate_scale=8):
        super().__init__()
        from functools import partial
        # Convert strided convs in the deeper stages to dilated stride-1
        # convs so the output stride becomes `dilate_scale` instead of 32.
        if dilate_scale == 8:
            orig_resnet.layer3.apply(partial(self._nostride_dilate, dilate=2))
            orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=4))
        elif dilate_scale == 16:
            orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=2))
        # take pretrained resnet, except AvgPool and FC
        for name in ('conv1', 'bn1', 'relu1', 'conv2', 'bn2', 'relu2',
                     'conv3', 'bn3', 'relu3', 'maxpool', 'layer1', 'layer2',
                     'layer3', 'layer4'):
            setattr(self, name, getattr(orig_resnet, name))

    def _nostride_dilate(self, m, dilate):
        """Turn a strided conv into a stride-1 (possibly dilated) conv."""
        if 'Conv' not in m.__class__.__name__:
            return
        if m.stride == (2, 2):
            # the convolution with stride: drop it, halve the dilation.
            m.stride = (1, 1)
            if m.kernel_size == (3, 3):
                m.dilation = (dilate // 2, dilate // 2)
                m.padding = (dilate // 2, dilate // 2)
        elif m.kernel_size == (3, 3):
            # other convolutions get the full dilation.
            m.dilation = (dilate, dilate)
            m.padding = (dilate, dilate)

    def forward(self, x, return_feature_maps=False):
        feats = []
        out = self.relu1(self.bn1(self.conv1(x)))
        out = self.relu2(self.bn2(self.conv2(out)))
        out = self.relu3(self.bn3(self.conv3(out)))
        out = self.maxpool(out)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
            feats.append(out)
        if return_feature_maps:
            return feats
        return [out]
| # last conv, deep supervision | |||||
class C1DeepSup(nn.Module):
    """Single-conv decoder head with an auxiliary deep-supervision branch."""

    def __init__(self,
                 num_class=150,
                 fc_dim=2048,
                 use_softmax=False,
                 drop_last_conv=False):
        super(C1DeepSup, self).__init__()
        self.use_softmax = use_softmax
        self.drop_last_conv = drop_last_conv
        self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1)
        self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1)
        # last conv
        self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
        self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)

    def forward(self, conv_out, segSize=None):
        """Predict from the last feature map; add deep supervision in training."""
        feat = self.cbr(conv_out[-1])
        if self.drop_last_conv:
            # Caller wants the feature map, not class scores.
            return feat
        scores = self.conv_last(feat)
        if self.use_softmax:  # is True during inference
            scores = nn.functional.interpolate(
                scores, size=segSize, mode='bilinear', align_corners=False)
            return nn.functional.softmax(scores, dim=1)
        # deep sup: auxiliary prediction from the penultimate feature map.
        aux = self.cbr_deepsup(conv_out[-2])
        aux = self.conv_last_deepsup(aux)
        scores = nn.functional.log_softmax(scores, dim=1)
        aux = nn.functional.log_softmax(aux, dim=1)
        return (scores, aux)
| # last conv | |||||
class C1(nn.Module):
    """Minimal decoder head: conv3x3-bn-relu followed by a 1x1 classifier."""

    def __init__(self, num_class=150, fc_dim=2048, use_softmax=False):
        super(C1, self).__init__()
        self.use_softmax = use_softmax
        self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1)
        # last conv
        self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)

    def forward(self, conv_out, segSize=None):
        scores = self.conv_last(self.cbr(conv_out[-1]))
        if self.use_softmax:  # is True during inference
            scores = nn.functional.interpolate(
                scores, size=segSize, mode='bilinear', align_corners=False)
            return nn.functional.softmax(scores, dim=1)
        return nn.functional.log_softmax(scores, dim=1)
| @@ -0,0 +1,183 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| import math | |||||
| import os | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from torch.nn import BatchNorm2d | |||||
| __all__ = ['ResNet', 'resnet50'] | |||||
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 convolution with padding 1."""
    return nn.Conv2d(
        in_planes, out_planes, kernel_size=3, stride=stride, padding=1,
        bias=False)
class BasicBlock(nn.Module):
    """Two-conv residual block (ResNet-18/34 style)."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Project the identity path when shapes differ.
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual bottleneck block (ResNet-50+ style)."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1,
            bias=False)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Project the identity path when shapes differ.
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += shortcut
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet with a deep three-conv stem (ResNet-C style, as used by PSPNet)."""

    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 128
        super(ResNet, self).__init__()
        # Deep stem: three 3x3 convs instead of a single 7x7.
        self.conv1 = conv3x3(3, 64, stride=2)
        self.bn1 = BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(64, 64)
        self.bn2 = BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = conv3x3(64, 128)
        self.bn3 = BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        # He-style init for convs, unit-gamma / zero-beta for batch norms.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(m, BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack `blocks` residual blocks; the first may downsample."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False),
                BatchNorm2d(planes * block.expansion),
            )
        stages = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        stages.extend(block(self.inplanes, planes) for _ in range(blocks - 1))
        return nn.Sequential(*stages)

    def forward(self, x):
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)
def resnet50(pretrained=False, model_dir='', **kwargs):
    """Construct a ResNet-50 model.

    Args:
        pretrained (bool): If True, load ImageNet-pretrained weights from
            ``model_dir``/resnet50-imagenet.pth.
        model_dir (str): directory containing the pretrained checkpoint.
        **kwargs: forwarded to :class:`ResNet` (e.g. ``num_classes``).
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return model
    checkpoint = os.path.join(model_dir, 'resnet50-imagenet.pth')
    state = torch.load(checkpoint, map_location='cpu')
    model.load_state_dict(state, strict=False)
    return model
| @@ -0,0 +1,167 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| from typing import Dict, Optional, Tuple | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
class BaseAdversarialLoss:
    """Interface for adversarial (GAN) losses used during training."""

    def pre_generator_step(self, real_batch: torch.Tensor,
                           fake_batch: torch.Tensor, generator: nn.Module,
                           discriminator: nn.Module):
        """Hook called before each generator step; no-op by default.

        :param real_batch: Tensor, a batch of real samples
        :param fake_batch: Tensor, a batch of samples produced by generator
        :param generator:
        :param discriminator:
        :return: None
        """

    def pre_discriminator_step(self, real_batch: torch.Tensor,
                               fake_batch: torch.Tensor, generator: nn.Module,
                               discriminator: nn.Module):
        """Hook called before each discriminator step; no-op by default.

        :param real_batch: Tensor, a batch of real samples
        :param fake_batch: Tensor, a batch of samples produced by generator
        :param generator:
        :param discriminator:
        :return: None
        """

    def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
                       discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
                       mask: Optional[torch.Tensor] = None) \
            -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """Calculate the generator loss.

        :param real_batch: Tensor, a batch of real samples
        :param fake_batch: Tensor, a batch of samples produced by generator
        :param discr_real_pred: Tensor, discriminator output for real_batch
        :param discr_fake_pred: Tensor, discriminator output for fake_batch
        :param mask: Tensor, actual mask, which was at input of generator when making fake_batch
        :return: total generator loss along with some values that might be interesting to log
        """
        raise NotImplementedError

    def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
                           discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
                           mask: Optional[torch.Tensor] = None) \
            -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """Calculate the discriminator loss.

        :param real_batch: Tensor, a batch of real samples
        :param fake_batch: Tensor, a batch of samples produced by generator
        :param discr_real_pred: Tensor, discriminator output for real_batch
        :param discr_fake_pred: Tensor, discriminator output for fake_batch
        :param mask: Tensor, actual mask, which was at input of generator when making fake_batch
        :return: total discriminator loss along with some values that might be interesting to log
        """
        raise NotImplementedError

    def interpolate_mask(self, mask, shape):
        """Resize `mask` to `shape` when scaling is allowed, else require a match.

        Relies on ``self.allow_scale_mask`` and ``self.mask_scale_mode``
        being set by the concrete subclass.
        """
        assert mask is not None
        assert self.allow_scale_mask or shape == mask.shape[-2:]
        if shape == mask.shape[-2:] or not self.allow_scale_mask:
            return mask
        if self.mask_scale_mode == 'maxpool':
            return F.adaptive_max_pool2d(mask, shape)
        return F.interpolate(mask, size=shape, mode=self.mask_scale_mode)
def make_r1_gp(discr_real_pred, real_batch):
    """R1 gradient penalty: mean squared grad-norm of D's output w.r.t. reals.

    Returns 0 when grad mode is off (e.g. under ``torch.no_grad``). In both
    cases clears ``requires_grad`` on ``real_batch`` afterwards (it is set
    in ``pre_discriminator_step``).
    """
    if not torch.is_grad_enabled():
        real_batch.requires_grad = False
        return 0
    grads = torch.autograd.grad(
        outputs=discr_real_pred.sum(), inputs=real_batch,
        create_graph=True)[0]
    penalty = grads.view(grads.shape[0], -1).norm(2, dim=1).pow(2).mean()
    real_batch.requires_grad = False
    return penalty
class NonSaturatingWithR1(BaseAdversarialLoss):
    """Non-saturating GAN loss with an R1 gradient penalty on real samples."""

    def __init__(self,
                 gp_coef=5,
                 weight=1,
                 mask_as_fake_target=False,
                 allow_scale_mask=False,
                 mask_scale_mode='nearest',
                 extra_mask_weight_for_gen=0,
                 use_unmasked_for_gen=True,
                 use_unmasked_for_discr=True):
        self.gp_coef = gp_coef
        self.weight = weight
        # use for discr => use for gen;
        # otherwise we teach only the discr to pay attention to very small difference
        assert use_unmasked_for_gen or (not use_unmasked_for_discr)
        # mask as target => use unmasked for discr:
        # if we don't care about unmasked regions at all
        # then it doesn't matter if the value of mask_as_fake_target is true or false
        assert use_unmasked_for_discr or (not mask_as_fake_target)
        self.use_unmasked_for_gen = use_unmasked_for_gen
        self.use_unmasked_for_discr = use_unmasked_for_discr
        self.mask_as_fake_target = mask_as_fake_target
        self.allow_scale_mask = allow_scale_mask
        self.mask_scale_mode = mask_scale_mode
        self.extra_mask_weight_for_gen = extra_mask_weight_for_gen

    def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
                       discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
                       mask=None) \
            -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """Non-saturating generator loss: softplus(-D(fake)), mask-weighted."""
        per_pixel = F.softplus(-discr_fake_pred)
        treat_mask_differently = (
            (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0)
            or not self.use_unmasked_for_gen)
        if treat_mask_differently:
            mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:])
            if not self.use_unmasked_for_gen:
                # Only the masked (inpainted) region contributes.
                per_pixel = per_pixel * mask
            else:
                # Up-weight the masked region relative to the rest.
                per_pixel = per_pixel * (
                    1 + mask * self.extra_mask_weight_for_gen)
        return per_pixel.mean() * self.weight, dict()

    def pre_discriminator_step(self, real_batch: torch.Tensor,
                               fake_batch: torch.Tensor, generator: nn.Module,
                               discriminator: nn.Module):
        # Enable grads on reals so the R1 penalty can differentiate D(real).
        real_batch.requires_grad = True

    def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
                           discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
                           mask=None) \
            -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """softplus(-D(real)) + softplus(D(fake)) + R1 penalty, mask-aware."""
        real_loss = F.softplus(-discr_real_pred)
        grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef
        fake_loss = F.softplus(discr_fake_pred)
        if not self.use_unmasked_for_discr or self.mask_as_fake_target:
            # == if masked region should be treated differently
            mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:])
            # use_unmasked_for_discr=False only makes sense for fakes;
            # for reals there is no difference between the two regions
            fake_loss = fake_loss * mask
            if self.mask_as_fake_target:
                # Outside the mask the fake should be classified as real.
                fake_loss = fake_loss + (1
                                         - mask) * F.softplus(-discr_fake_pred)
        total = real_loss + grad_penalty + fake_loss
        metrics = dict(
            discr_real_out=discr_real_pred.mean(),
            discr_fake_out=discr_fake_pred.mean(),
            discr_real_gp=grad_penalty)
        return total.mean(), metrics
| @@ -0,0 +1,45 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| from typing import List | |||||
| import torch | |||||
| import torch.nn.functional as F | |||||
def masked_l2_loss(pred, target, mask, weight_known, weight_missing):
    """Pixel-weighted MSE: `weight_missing` inside the mask, `weight_known` outside."""
    weights = mask * weight_missing + (1 - mask) * weight_known
    return (weights * F.mse_loss(pred, target, reduction='none')).mean()
def masked_l1_loss(pred, target, mask, weight_known, weight_missing):
    """Pixel-weighted L1: `weight_missing` inside the mask, `weight_known` outside."""
    weights = mask * weight_missing + (1 - mask) * weight_known
    return (weights * F.l1_loss(pred, target, reduction='none')).mean()
def feature_matching_loss(fake_features: List[torch.Tensor],
                          target_features: List[torch.Tensor],
                          mask=None):
    """Mean per-layer MSE between feature pyramids.

    When `mask` is given, it is resized to each layer and the masked region
    is excluded (weight ``1 - mask``) from the error.
    """
    if mask is None:
        layer_losses = [
            F.mse_loss(fake, target)
            for fake, target in zip(fake_features, target_features)
        ]
        return torch.stack(layer_losses).mean()
    total = 0
    count = 0
    for fake, target in zip(fake_features, target_features):
        layer_mask = F.interpolate(
            mask,
            size=fake.shape[-2:],
            mode='bilinear',
            align_corners=False)
        # Only the unmasked (known) region contributes to the error.
        total = total + ((fake - target).pow(2) * (1 - layer_mask)).mean()
        count += 1
    return total / count
| @@ -0,0 +1,588 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from kornia.geometry.transform import rotate | |||||
def get_activation(kind='tanh'):
    """Return the activation module for `kind`.

    `kind` may be 'tanh', 'sigmoid', or the literal ``False`` (identity).
    Raises ValueError for anything else.
    """
    if kind is False:
        return nn.Identity()
    factories = {'tanh': nn.Tanh, 'sigmoid': nn.Sigmoid}
    try:
        return factories[kind]()
    except (KeyError, TypeError):
        raise ValueError(f'Unknown activation kind {kind}')
class SELayer(nn.Module):
    """Squeeze-and-Excitation: re-weight channels via a global-pool bottleneck."""

    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid())

    def forward(self, x):
        batch, channels = x.size()[:2]
        # Squeeze: global average over the spatial dimensions.
        squeezed = self.avg_pool(x).view(batch, channels)
        # Excite: per-channel gate in (0, 1).
        gates = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * gates.expand_as(x)
class FourierUnit(nn.Module):
    """Spectral convolution: rFFT -> 1x1 conv over stacked real/imag channels -> inverse rFFT."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 groups=1,
                 spatial_scale_factor=None,
                 spatial_scale_mode='bilinear',
                 spectral_pos_encoding=False,
                 use_se=False,
                 se_kwargs=None,
                 ffc3d=False,
                 fft_norm='ortho'):
        # bn_layer not used
        super(FourierUnit, self).__init__()
        self.groups = groups
        # Input width doubles for (real, imag); two more channels if spectral
        # positional encoding is appended in forward().
        self.conv_layer = torch.nn.Conv2d(
            in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0),
            out_channels=out_channels * 2,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=self.groups,
            bias=False)
        self.bn = torch.nn.BatchNorm2d(out_channels * 2)
        self.relu = torch.nn.ReLU(inplace=True)
        # squeeze and excitation block
        self.use_se = use_se
        if use_se:
            if se_kwargs is None:
                se_kwargs = {}
            self.se = SELayer(self.conv_layer.in_channels, **se_kwargs)
        self.spatial_scale_factor = spatial_scale_factor
        self.spatial_scale_mode = spatial_scale_mode
        self.spectral_pos_encoding = spectral_pos_encoding
        self.ffc3d = ffc3d
        self.fft_norm = fft_norm

    def forward(self, x):
        batch = x.shape[0]
        if self.spatial_scale_factor is not None:
            # Optionally run the spectral conv at a rescaled resolution.
            orig_size = x.shape[-2:]
            x = F.interpolate(
                x,
                scale_factor=self.spatial_scale_factor,
                mode=self.spatial_scale_mode,
                align_corners=False)
        # (batch, c, h, w/2+1, 2)
        fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1)
        ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm)
        ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
        ffted = ffted.permute(0, 1, 4, 2,
                              3).contiguous()  # (batch, c, 2, h, w/2+1)
        # Fold the (real, imag) axis into the channel axis for the 1x1 conv.
        ffted = ffted.view((
            batch,
            -1,
        ) + ffted.size()[3:])
        if self.spectral_pos_encoding:
            # Append normalized row/column coordinate channels.
            height, width = ffted.shape[-2:]
            coords_vert = torch.linspace(0, 1,
                                         height)[None, None, :, None].expand(
                                             batch, 1, height, width).to(ffted)
            coords_hor = torch.linspace(0, 1,
                                        width)[None, None, None, :].expand(
                                            batch, 1, height, width).to(ffted)
            ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1)
        if self.use_se:
            ffted = self.se(ffted)
        ffted = self.conv_layer(ffted)  # (batch, c*2, h, w/2+1)
        ffted = self.relu(self.bn(ffted))
        # Unfold channels back into (real, imag) pairs and rebuild a complex
        # tensor for the inverse transform.
        ffted = ffted.view((
            batch,
            -1,
            2,
        ) + ffted.size()[2:]).permute(
            0, 1, 3, 4, 2).contiguous()  # (batch,c, t, h, w/2+1, 2)
        ffted = torch.complex(ffted[..., 0], ffted[..., 1])
        ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:]
        output = torch.fft.irfftn(
            ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm)
        if self.spatial_scale_factor is not None:
            # Restore the original spatial resolution.
            output = F.interpolate(
                output,
                size=orig_size,
                mode=self.spatial_scale_mode,
                align_corners=False)
        return output
class SpectralTransform(nn.Module):
    """Global branch of FFC: channel-halving conv, Fourier unit(s), then a
    pointwise conv over a residual combination."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride=1,
                 groups=1,
                 enable_lfu=True,
                 **fu_kwargs):
        # bn_layer not used
        # enable_lfu: additionally apply a "local" Fourier unit to a 2x2
        # grid of spatial patches (see forward()).
        super(SpectralTransform, self).__init__()
        self.enable_lfu = enable_lfu
        if stride == 2:
            self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2)
        else:
            self.downsample = nn.Identity()
        self.stride = stride
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels,
                out_channels // 2,
                kernel_size=1,
                groups=groups,
                bias=False), nn.BatchNorm2d(out_channels // 2),
            nn.ReLU(inplace=True))
        self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups,
                              **fu_kwargs)
        if self.enable_lfu:
            self.lfu = FourierUnit(out_channels // 2, out_channels // 2,
                                   groups)
        self.conv2 = torch.nn.Conv2d(
            out_channels // 2,
            out_channels,
            kernel_size=1,
            groups=groups,
            bias=False)

    def forward(self, x):
        x = self.downsample(x)
        x = self.conv1(x)
        output = self.fu(x)
        if self.enable_lfu:
            # Local Fourier unit: take the first quarter of the channels,
            # fold a 2x2 grid of spatial patches into the channel axis, run
            # the FFT on the patches, then tile the result back to full size.
            # NOTE(review): the same split size (h // 2) is used on both
            # spatial dims — assumes square feature maps; confirm upstream.
            n, c, h, w = x.shape
            split_no = 2
            split_s = h // split_no
            xs = torch.cat(
                torch.split(x[:, :c // 4], split_s, dim=-2),
                dim=1).contiguous()
            xs = torch.cat(
                torch.split(xs, split_s, dim=-1), dim=1).contiguous()
            xs = self.lfu(xs)
            xs = xs.repeat(1, 1, split_no, split_no).contiguous()
        else:
            xs = 0
        # Residual mix of the input, the global FU output, and the LFU output.
        output = self.conv2(x + output + xs)
        return output
class LearnableSpatialTransformWrapper(nn.Module):
    """Run `impl` on a padded, rotated view of the input, then undo both.

    The rotation angle is (optionally) a learnable parameter, letting the
    wrapped module choose the orientation it operates in.
    """

    def __init__(self,
                 impl,
                 pad_coef=0.5,
                 angle_init_range=80,
                 train_angle=True):
        super().__init__()
        self.impl = impl
        self.angle = torch.rand(1) * angle_init_range
        if train_angle:
            self.angle = nn.Parameter(self.angle, requires_grad=True)
        self.pad_coef = pad_coef

    def forward(self, x):
        if torch.is_tensor(x):
            return self.inverse_transform(self.impl(self.transform(x)), x)
        if isinstance(x, tuple):
            transformed = tuple(self.transform(item) for item in x)
            outputs = self.impl(transformed)
            return tuple(
                self.inverse_transform(out, ref)
                for out, ref in zip(outputs, x))
        raise ValueError(f'Unexpected input type {type(x)}')

    def transform(self, x):
        """Reflect-pad by `pad_coef` of each side, then rotate by `angle`."""
        height, width = x.shape[2:]
        pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef)
        padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect')
        return rotate(padded, angle=self.angle.to(padded))

    def inverse_transform(self, y_padded_rotated, orig_x):
        """Rotate back by `-angle` and crop off the padding added in transform()."""
        height, width = orig_x.shape[2:]
        pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef)
        unrotated = rotate(
            y_padded_rotated, angle=-self.angle.to(y_padded_rotated))
        full_h, full_w = unrotated.shape[2:]
        return unrotated[:, :, pad_h:full_h - pad_h, pad_w:full_w - pad_w]
class FFC(nn.Module):
    """Fast Fourier Convolution: parallel local (spatial conv) and global
    (spectral) branches with cross connections between them.

    `ratio_gin` / `ratio_gout` set the fraction of input/output channels
    routed through the global branch.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 ratio_gin,
                 ratio_gout,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=False,
                 enable_lfu=True,
                 padding_type='reflect',
                 gated=False,
                 **spectral_kwargs):
        super(FFC, self).__init__()
        assert stride == 1 or stride == 2, 'Stride should be 1 or 2.'
        self.stride = stride
        # Split channels between the local (l) and global (g) branches.
        in_cg = int(in_channels * ratio_gin)
        in_cl = in_channels - in_cg
        out_cg = int(out_channels * ratio_gout)
        out_cl = out_channels - out_cg
        self.ratio_gin = ratio_gin
        self.ratio_gout = ratio_gout
        self.global_in_num = in_cg

        def _branch(src, dst):
            # A branch with zero channels on either side degenerates to a
            # no-op (nn.Identity silently ignores the constructor args).
            layer = nn.Identity if src == 0 or dst == 0 else nn.Conv2d
            return layer(
                src, dst, kernel_size, stride, padding, dilation, groups,
                bias, padding_mode=padding_type)

        self.convl2l = _branch(in_cl, out_cl)
        self.convl2g = _branch(in_cl, out_cg)
        self.convg2l = _branch(in_cg, out_cl)
        spectral = (nn.Identity
                    if in_cg == 0 or out_cg == 0 else SpectralTransform)
        self.convg2g = spectral(in_cg, out_cg, stride,
                                1 if groups == 1 else groups // 2, enable_lfu,
                                **spectral_kwargs)
        self.gated = gated
        gate_cls = (nn.Conv2d
                    if self.gated and in_cg > 0 and out_cl > 0 else nn.Identity)
        self.gate = gate_cls(in_channels, 2, 1)

    def forward(self, x):
        x_l, x_g = x if type(x) is tuple else (x, 0)
        out_xl, out_xg = 0, 0
        if self.gated:
            # Predict per-pixel gates for the two cross connections.
            parts = [x_l]
            if torch.is_tensor(x_g):
                parts.append(x_g)
            gates = torch.sigmoid(self.gate(torch.cat(parts, dim=1)))
            g2l_gate, l2g_gate = gates.chunk(2, dim=1)
        else:
            g2l_gate, l2g_gate = 1, 1
        if self.ratio_gout != 1:
            out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate
        if self.ratio_gout != 0:
            out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g)
        return out_xl, out_xg
class FFC_BN_ACT(nn.Module):
    """FFC followed by per-branch normalization and activation."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 ratio_gin,
                 ratio_gout,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=False,
                 norm_layer=nn.BatchNorm2d,
                 activation_layer=nn.Identity,
                 padding_type='reflect',
                 enable_lfu=True,
                 **kwargs):
        super(FFC_BN_ACT, self).__init__()
        self.ffc = FFC(
            in_channels, out_channels, kernel_size, ratio_gin, ratio_gout,
            stride, padding, dilation, groups, bias, enable_lfu,
            padding_type=padding_type, **kwargs)
        # A branch that receives no channels gets identity norm/activation.
        norm_l = nn.Identity if ratio_gout == 1 else norm_layer
        norm_g = nn.Identity if ratio_gout == 0 else norm_layer
        global_channels = int(out_channels * ratio_gout)
        self.bn_l = norm_l(out_channels - global_channels)
        self.bn_g = norm_g(global_channels)
        act_l = nn.Identity if ratio_gout == 1 else activation_layer
        act_g = nn.Identity if ratio_gout == 0 else activation_layer
        self.act_l = act_l(inplace=True)
        self.act_g = act_g(inplace=True)

    def forward(self, x):
        local_out, global_out = self.ffc(x)
        local_out = self.act_l(self.bn_l(local_out))
        global_out = self.act_g(self.bn_g(global_out))
        return local_out, global_out
class FFCResnetBlock(nn.Module):
    """Residual block built from two FFC_BN_ACT convolutions.

    When ``inline`` is True the block takes and returns a single tensor
    whose trailing ``global_in_num`` channels form the global branch;
    otherwise it operates on (local, global) pairs.
    """

    def __init__(self,
                 dim,
                 padding_type,
                 norm_layer,
                 activation_layer=nn.ReLU,
                 dilation=1,
                 spatial_transform_kwargs=None,
                 inline=False,
                 **conv_kwargs):
        super().__init__()

        def build_conv():
            # Both convs share the exact same configuration.
            return FFC_BN_ACT(
                dim,
                dim,
                kernel_size=3,
                padding=dilation,
                dilation=dilation,
                norm_layer=norm_layer,
                activation_layer=activation_layer,
                padding_type=padding_type,
                **conv_kwargs)

        self.conv1 = build_conv()
        self.conv2 = build_conv()
        if spatial_transform_kwargs is not None:
            # Optionally wrap each conv with a learnable spatial transform.
            self.conv1 = LearnableSpatialTransformWrapper(
                self.conv1, **spatial_transform_kwargs)
            self.conv2 = LearnableSpatialTransformWrapper(
                self.conv2, **spatial_transform_kwargs)
        self.inline = inline

    def forward(self, x):
        if self.inline:
            split = self.conv1.ffc.global_in_num
            x_l, x_g = x[:, :-split], x[:, -split:]
        else:
            x_l, x_g = x if type(x) is tuple else (x, 0)
        res_l, res_g = x_l, x_g
        x_l, x_g = self.conv1((x_l, x_g))
        x_l, x_g = self.conv2((x_l, x_g))
        out = res_l + x_l, res_g + x_g
        if self.inline:
            out = torch.cat(out, dim=1)
        return out
class ConcatTupleLayer(nn.Module):
    """Collapse a (local, global) pair into a single tensor.

    If the global part is not a tensor (e.g. the scalar 0), the local
    part is returned unchanged; otherwise both parts are concatenated
    along the channel dimension.
    """

    def forward(self, x):
        assert isinstance(x, tuple)
        local_part, global_part = x
        assert torch.is_tensor(local_part) or torch.is_tensor(global_part)
        if torch.is_tensor(global_part):
            return torch.cat(x, dim=1)
        return local_part
class FFCResNetGenerator(nn.Module):
    """LaMa generator: FFC downsampling stem, FFC residual bottleneck,
    transposed-conv upsampling decoder.

    Maps an ``input_nc``-channel tensor (image concatenated with mask) to an
    ``output_nc``-channel image.  The dict-typed keyword arguments previously
    used mutable defaults; they now default to ``None`` and are materialized
    per instance with the same values, which is backward-compatible and avoids
    the shared mutable-default-argument pitfall.
    """

    def __init__(self,
                 input_nc=4,
                 output_nc=3,
                 ngf=64,
                 n_downsampling=3,
                 n_blocks=18,
                 norm_layer=nn.BatchNorm2d,
                 padding_type='reflect',
                 activation_layer=nn.ReLU,
                 up_norm_layer=nn.BatchNorm2d,
                 up_activation=nn.ReLU(True),
                 init_conv_kwargs=None,
                 downsample_conv_kwargs=None,
                 resnet_conv_kwargs=None,
                 spatial_transform_layers=None,
                 spatial_transform_kwargs=None,
                 add_out_act='sigmoid',
                 max_features=1024,
                 out_ffc=False,
                 out_ffc_kwargs=None):
        assert (n_blocks >= 0)
        super().__init__()
        # Resolve mutable defaults per instance so that no dict is shared
        # across constructor calls (classic mutable-default pitfall).
        if init_conv_kwargs is None:
            init_conv_kwargs = {
                'ratio_gin': 0,
                'ratio_gout': 0,
                'enable_lfu': False
            }
        if downsample_conv_kwargs is None:
            downsample_conv_kwargs = {
                'ratio_gin': 0,
                'ratio_gout': 0,
                'enable_lfu': False
            }
        if resnet_conv_kwargs is None:
            resnet_conv_kwargs = {
                'ratio_gin': 0.75,
                'ratio_gout': 0.75,
                'enable_lfu': False
            }
        if spatial_transform_kwargs is None:
            spatial_transform_kwargs = {}
        if out_ffc_kwargs is None:
            out_ffc_kwargs = {}
        # Stem: large-kernel FFC over the (image + mask) input.
        model = [
            nn.ReflectionPad2d(3),
            FFC_BN_ACT(
                input_nc,
                ngf,
                kernel_size=7,
                padding=0,
                norm_layer=norm_layer,
                activation_layer=activation_layer,
                **init_conv_kwargs)
        ]
        # downsample
        for i in range(n_downsampling):
            mult = 2**i
            if i == n_downsampling - 1:
                # The last downsampling layer must emit the local/global
                # channel split expected by the resnet bottleneck.
                cur_conv_kwargs = dict(downsample_conv_kwargs)
                cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get(
                    'ratio_gin', 0)
            else:
                cur_conv_kwargs = downsample_conv_kwargs
            model += [
                FFC_BN_ACT(
                    min(max_features, ngf * mult),
                    min(max_features, ngf * mult * 2),
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    norm_layer=norm_layer,
                    activation_layer=activation_layer,
                    **cur_conv_kwargs)
            ]
        mult = 2**n_downsampling
        feats_num_bottleneck = min(max_features, ngf * mult)
        # resnet blocks
        for i in range(n_blocks):
            cur_resblock = FFCResnetBlock(
                feats_num_bottleneck,
                padding_type=padding_type,
                activation_layer=activation_layer,
                norm_layer=norm_layer,
                **resnet_conv_kwargs)
            if spatial_transform_layers is not None and i in spatial_transform_layers:
                cur_resblock = LearnableSpatialTransformWrapper(
                    cur_resblock, **spatial_transform_kwargs)
            model += [cur_resblock]
        # Merge the (local, global) streams back into a single tensor.
        model += [ConcatTupleLayer()]
        # upsample
        for i in range(n_downsampling):
            mult = 2**(n_downsampling - i)
            model += [
                nn.ConvTranspose2d(
                    min(max_features, ngf * mult),
                    min(max_features, int(ngf * mult / 2)),
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    output_padding=1),
                up_norm_layer(min(max_features, int(ngf * mult / 2))),
                up_activation
            ]
        if out_ffc:
            # Optional extra FFC block operating on the merged tensor.
            model += [
                FFCResnetBlock(
                    ngf,
                    padding_type=padding_type,
                    activation_layer=activation_layer,
                    norm_layer=norm_layer,
                    inline=True,
                    **out_ffc_kwargs)
            ]
        model += [
            nn.ReflectionPad2d(3),
            nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)
        ]
        if add_out_act:
            model.append(
                get_activation('tanh' if add_out_act is True else add_out_act))
        self.model = nn.Sequential(*model)

    def forward(self, input):
        """Run the generator on ``input`` (image concatenated with mask)."""
        return self.model(input)
| @@ -0,0 +1,324 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from torchvision import models | |||||
| from modelscope.utils.logger import get_logger | |||||
| try: | |||||
| from torchvision.models.utils import load_state_dict_from_url | |||||
| except ImportError: | |||||
| from torch.utils.model_zoo import load_url as load_state_dict_from_url | |||||
| # Inception weights ported to Pytorch from | |||||
| # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz | |||||
| FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/' \ | |||||
| 'fid_weights/pt_inception-2015-12-05-6726825d.pth' | |||||
| LOGGER = get_logger() | |||||
class InceptionV3(nn.Module):
    """Pretrained InceptionV3 network returning feature maps"""

    # Index of default block of inception to return,
    # corresponds to output of final average pooling
    DEFAULT_BLOCK_INDEX = 3

    # Maps feature dimensionality to their output blocks indices
    BLOCK_INDEX_BY_DIM = {
        64: 0,  # First max pooling features
        192: 1,  # Second max pooling features
        768: 2,  # Pre-aux classifier features
        2048: 3  # Final average pooling features
    }

    def __init__(self,
                 output_blocks=(DEFAULT_BLOCK_INDEX, ),
                 resize_input=True,
                 normalize_input=True,
                 requires_grad=False,
                 use_fid_inception=True):
        """Build pretrained InceptionV3
        Parameters
        ----------
        output_blocks : sequence of int
            Indices of blocks to return features of. Possible values are:
                - 0: corresponds to output of first max pooling
                - 1: corresponds to output of second max pooling
                - 2: corresponds to output which is fed to aux classifier
                - 3: corresponds to output of final average pooling
            (The default is an immutable tuple rather than a list to avoid
            the mutable-default-argument pitfall; any sequence works.)
        resize_input : bool
            If true, bilinearly resizes input to width and height 299 before
            feeding input to model. As the network without fully connected
            layers is fully convolutional, it should be able to handle inputs
            of arbitrary size, so resizing might not be strictly needed
        normalize_input : bool
            If true, scales the input from range (0, 1) to the range the
            pretrained Inception network expects, namely (-1, 1)
        requires_grad : bool
            If true, parameters of the model require gradients. Possibly useful
            for finetuning the network
        use_fid_inception : bool
            If true, uses the pretrained Inception model used in Tensorflow's
            FID implementation. If false, uses the pretrained Inception model
            available in torchvision. The FID Inception model has different
            weights and a slightly different structure from torchvision's
            Inception model. If you want to compute FID scores, you are
            strongly advised to set this parameter to true to get comparable
            results.
        """
        super(InceptionV3, self).__init__()
        self.resize_input = resize_input
        self.normalize_input = normalize_input
        self.output_blocks = sorted(output_blocks)
        self.last_needed_block = max(output_blocks)
        assert self.last_needed_block <= 3, \
            'Last possible output block index is 3'
        self.blocks = nn.ModuleList()
        if use_fid_inception:
            inception = fid_inception_v3()
        else:
            inception = models.inception_v3(pretrained=True)
        # Block 0: input to maxpool1
        block0 = [
            inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3,
            inception.Conv2d_2b_3x3,
            nn.MaxPool2d(kernel_size=3, stride=2)
        ]
        self.blocks.append(nn.Sequential(*block0))
        # Block 1: maxpool1 to maxpool2
        if self.last_needed_block >= 1:
            block1 = [
                inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3,
                nn.MaxPool2d(kernel_size=3, stride=2)
            ]
            self.blocks.append(nn.Sequential(*block1))
        # Block 2: maxpool2 to aux classifier
        if self.last_needed_block >= 2:
            block2 = [
                inception.Mixed_5b,
                inception.Mixed_5c,
                inception.Mixed_5d,
                inception.Mixed_6a,
                inception.Mixed_6b,
                inception.Mixed_6c,
                inception.Mixed_6d,
                inception.Mixed_6e,
            ]
            self.blocks.append(nn.Sequential(*block2))
        # Block 3: aux classifier to final avgpool
        if self.last_needed_block >= 3:
            block3 = [
                inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c,
                nn.AdaptiveAvgPool2d(output_size=(1, 1))
            ]
            self.blocks.append(nn.Sequential(*block3))
        for param in self.parameters():
            param.requires_grad = requires_grad

    def forward(self, inp):
        """Get Inception feature maps
        Parameters
        ----------
        inp : torch.autograd.Variable
            Input tensor of shape Bx3xHxW. Values are expected to be in
            range (0, 1)
        Returns
        -------
        List of torch.autograd.Variable, corresponding to the selected output
        block, sorted ascending by index
        """
        outp = []
        x = inp
        if self.resize_input:
            x = F.interpolate(
                x, size=(299, 299), mode='bilinear', align_corners=False)
        if self.normalize_input:
            x = 2 * x - 1  # Scale from range (0, 1) to range (-1, 1)
        for idx, block in enumerate(self.blocks):
            x = block(x)
            if idx in self.output_blocks:
                outp.append(x)
            if idx == self.last_needed_block:
                # No need to run the remaining blocks.
                break
        return outp
def fid_inception_v3():
    """Build pretrained Inception model for FID computation
    The Inception model for FID computation uses a different set of weights
    and has a slightly different structure than torchvision's Inception.
    This method first constructs torchvision's Inception and then patches the
    necessary parts that are different in the FID Inception model.
    """
    LOGGER.info('fid_inception_v3 called')
    inception = models.inception_v3(
        num_classes=1008, aux_logits=False, pretrained=False)
    LOGGER.info('models.inception_v3 done')
    # Swap the stock mixed blocks for their FID-specific variants.
    replacements = {
        'Mixed_5b': FIDInceptionA(192, pool_features=32),
        'Mixed_5c': FIDInceptionA(256, pool_features=64),
        'Mixed_5d': FIDInceptionA(288, pool_features=64),
        'Mixed_6b': FIDInceptionC(768, channels_7x7=128),
        'Mixed_6c': FIDInceptionC(768, channels_7x7=160),
        'Mixed_6d': FIDInceptionC(768, channels_7x7=160),
        'Mixed_6e': FIDInceptionC(768, channels_7x7=192),
        'Mixed_7b': FIDInceptionE_1(1280),
        'Mixed_7c': FIDInceptionE_2(2048),
    }
    for attr_name, block in replacements.items():
        setattr(inception, attr_name, block)
    LOGGER.info('fid_inception_v3 patching done')
    state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
    LOGGER.info('fid_inception_v3 weights downloaded')
    inception.load_state_dict(state_dict)
    LOGGER.info('fid_inception_v3 weights loaded into model')
    return inception
class FIDInceptionA(models.inception.InceptionA):
    """InceptionA block patched for FID computation"""

    def __init__(self, in_channels, pool_features):
        super(FIDInceptionA, self).__init__(in_channels, pool_features)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)
        out_5x5 = self.branch5x5_2(self.branch5x5_1(x))
        out_3x3dbl = self.branch3x3dbl_3(
            self.branch3x3dbl_2(self.branch3x3dbl_1(x)))
        # TF-style average pool: padded zeros are excluded from the mean.
        pooled = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        pooled = self.branch_pool(pooled)
        return torch.cat([out_1x1, out_5x5, out_3x3dbl, pooled], 1)
class FIDInceptionC(models.inception.InceptionC):
    """InceptionC block patched for FID computation"""

    def __init__(self, in_channels, channels_7x7):
        super(FIDInceptionC, self).__init__(in_channels, channels_7x7)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)
        out_7x7 = self.branch7x7_3(self.branch7x7_2(self.branch7x7_1(x)))
        out_7x7dbl = x
        for layer in (self.branch7x7dbl_1, self.branch7x7dbl_2,
                      self.branch7x7dbl_3, self.branch7x7dbl_4,
                      self.branch7x7dbl_5):
            out_7x7dbl = layer(out_7x7dbl)
        # TF-style average pool: padded zeros are excluded from the mean.
        pooled = self.branch_pool(
            F.avg_pool2d(
                x, kernel_size=3, stride=1, padding=1,
                count_include_pad=False))
        return torch.cat([out_1x1, out_7x7, out_7x7dbl, pooled], 1)
class FIDInceptionE_1(models.inception.InceptionE):
    """First InceptionE block patched for FID computation"""

    def __init__(self, in_channels):
        super(FIDInceptionE_1, self).__init__(in_channels)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)
        stem_3x3 = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(stem_3x3),
             self.branch3x3_2b(stem_3x3)], 1)
        stem_dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        out_3x3dbl = torch.cat(
            [self.branch3x3dbl_3a(stem_dbl),
             self.branch3x3dbl_3b(stem_dbl)], 1)
        # TF-style average pool: padded zeros are excluded from the mean.
        pooled = self.branch_pool(
            F.avg_pool2d(
                x, kernel_size=3, stride=1, padding=1,
                count_include_pad=False))
        return torch.cat([out_1x1, out_3x3, out_3x3dbl, pooled], 1)
class FIDInceptionE_2(models.inception.InceptionE):
    """Second InceptionE block patched for FID computation"""

    def __init__(self, in_channels):
        super(FIDInceptionE_2, self).__init__(in_channels)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)
        stem_3x3 = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(stem_3x3),
             self.branch3x3_2b(stem_3x3)], 1)
        stem_dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        out_3x3dbl = torch.cat(
            [self.branch3x3dbl_3a(stem_dbl),
             self.branch3x3dbl_3b(stem_dbl)], 1)
        # The FID Inception model uses max pooling here instead of average
        # pooling -- likely a quirk of that specific implementation, kept
        # for compatibility with the released FID weights.
        pooled = self.branch_pool(
            F.max_pool2d(x, kernel_size=3, stride=1, padding=1))
        return torch.cat([out_1x1, out_3x3, out_3x3dbl, pooled], 1)
| @@ -0,0 +1,47 @@ | |||||
| """ | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| """ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| import torchvision | |||||
| from .ade20k import ModelBuilder | |||||
| IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None] | |||||
| IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None] | |||||
class ResNetPL(nn.Module):
    """Perceptual loss computed with a frozen ADE20k segmentation encoder."""

    def __init__(self,
                 weight=1,
                 weights_path=None,
                 arch_encoder='resnet50dilated',
                 segmentation=True):
        super().__init__()
        self.impl = ModelBuilder.get_encoder(
            weights_path=weights_path,
            arch_encoder=arch_encoder,
            arch_decoder='ppm_deepsup',
            fc_dim=2048,
            segmentation=segmentation)
        # The encoder is a fixed feature extractor: eval mode, no gradients.
        self.impl.eval()
        for param in self.impl.parameters():
            param.requires_grad_(False)
        self.weight = weight

    def forward(self, pred, target):
        # Normalize both inputs with ImageNet statistics before encoding.
        pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred)
        target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target)
        pred_feats = self.impl(pred, return_feature_maps=True)
        target_feats = self.impl(target, return_feature_maps=True)
        # Sum of per-layer MSE distances, scaled by the loss weight.
        layer_losses = [
            F.mse_loss(p, t) for p, t in zip(pred_feats, target_feats)
        ]
        return torch.stack(layer_losses).sum() * self.weight
| @@ -0,0 +1,75 @@ | |||||
| """ | |||||
| The implementation is adopted from | |||||
| https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py | |||||
| """ | |||||
| import collections | |||||
| import functools | |||||
| import logging | |||||
| from collections import defaultdict | |||||
| from functools import partial | |||||
| import numpy as np | |||||
| import torch.nn as nn | |||||
| # Defines the PatchGAN discriminator with the specified arguments. | |||||
class NLayerDiscriminator(nn.Module):
    """PatchGAN discriminator that exposes per-stage activations.

    Stages are registered as attributes ``model0`` .. ``model{n_layers+1}``
    so that intermediate feature maps can be collected, e.g. for
    feature-matching losses.
    """

    def __init__(
        self,
        input_nc=3,
        ndf=64,
        n_layers=4,
        norm_layer=nn.BatchNorm2d,
    ):
        super().__init__()
        self.n_layers = n_layers
        kw = 4
        padw = int(np.ceil((kw - 1.0) / 2))
        # First stage: strided conv + leaky ReLU, no normalization.
        stages = [[
            nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
            nn.LeakyReLU(0.2, True)
        ]]
        nf = ndf
        # Strided stages double the width (capped at 512) while halving
        # the spatial resolution.
        for _ in range(1, n_layers):
            nf_prev, nf = nf, min(nf * 2, 512)
            stages.append([
                nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2,
                          padding=padw),
                norm_layer(nf),
                nn.LeakyReLU(0.2, True)
            ])
        # One extra stride-1 stage before the 1-channel prediction head.
        nf_prev, nf = nf, min(nf * 2, 512)
        stages.append([
            nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw),
            norm_layer(nf),
            nn.LeakyReLU(0.2, True)
        ])
        stages.append(
            [nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)])
        for idx, layers in enumerate(stages):
            setattr(self, 'model' + str(idx), nn.Sequential(*layers))

    def get_all_activations(self, x):
        """Return the output of every stage, in order."""
        feats = []
        out = x
        for idx in range(self.n_layers + 2):
            out = getattr(self, 'model' + str(idx))(out)
            feats.append(out)
        return feats

    def forward(self, x):
        feats = self.get_all_activations(x)
        return feats[-1], feats[:-1]
| @@ -0,0 +1,393 @@ | |||||
| ''' | |||||
| Part of the implementation is borrowed and modified from LaMa, publicly available at | |||||
| https://github.com/saic-mdal/lama | |||||
| ''' | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from kornia.filters import gaussian_blur2d | |||||
| from kornia.geometry.transform import resize | |||||
| from kornia.morphology import erosion | |||||
| from torch.nn import functional as F | |||||
| from torch.optim import SGD, Adam | |||||
| from tqdm import tqdm | |||||
| from .modules.ffc import FFCResnetBlock | |||||
def move_to_device(obj, device):
    """Recursively move tensors/modules contained in ``obj`` to ``device``.

    Tuples are converted to lists (mirroring the original behavior); dicts
    are rebuilt with the same keys.  Raises ``ValueError`` for any other
    type.
    """
    if isinstance(obj, nn.Module) or torch.is_tensor(obj):
        return obj.to(device)
    if isinstance(obj, dict):
        return {key: move_to_device(value, device)
                for key, value in obj.items()}
    if isinstance(obj, (tuple, list)):
        return [move_to_device(item, device) for item in obj]
    raise ValueError(f'Unexpected type {type(obj)}')
def ceil_modulo(x, mod):
    """Return the smallest multiple of ``mod`` that is >= ``x``."""
    # Ceil division via negated floor division, scaled back up.
    return -(-x // mod) * mod
def pad_tensor_to_modulo(img, mod):
    """Reflect-pad ``img`` on the bottom/right so that its spatial
    dimensions become multiples of ``mod``.

    Works for any tensor whose last two dimensions are (H, W); the
    original implementation unpacked a fixed 4-D shape and left two
    unused locals, which are removed here.
    """
    height, width = img.shape[-2:]
    # Amount to add so each side reaches the next multiple of mod
    # (zero when the side is already aligned).
    pad_h = (-height) % mod
    pad_w = (-width) % mod
    return F.pad(img, pad=(0, pad_w, 0, pad_h), mode='reflect')
def _pyrdown(im: torch.Tensor, downsize: tuple = None):
    """Blur then bilinearly downscale an RGB image tensor.

    When ``downsize`` is None the spatial dimensions are halved.
    """
    assert im.shape[
        1] == 3, 'Expected shape for the input to be (n,3,height,width)'
    if downsize is None:
        downsize = (im.shape[2] // 2, im.shape[3] // 2)
    # Gaussian pre-filtering avoids aliasing in the downscaled result.
    blurred = gaussian_blur2d(im, kernel_size=(5, 5), sigma=(1.0, 1.0))
    return F.interpolate(
        blurred, size=downsize, mode='bilinear', align_corners=False)
| def _pyrdown_mask(mask: torch.Tensor, | |||||
| downsize: tuple = None, | |||||
| eps: float = 1e-8, | |||||
| blur_mask: bool = True, | |||||
| round_up: bool = True): | |||||
| """downscale the mask tensor | |||||
| Parameters | |||||
| ---------- | |||||
| mask : torch.Tensor | |||||
| mask of size (B, 1, H, W) | |||||
| downsize : tuple, optional | |||||
| size to downscale to. If None, image is downscaled to half, by default None | |||||
| eps : float, optional | |||||
| threshold value for binarizing the mask, by default 1e-8 | |||||
| blur_mask : bool, optional | |||||
| if True, apply gaussian filter before downscaling, by default True | |||||
| round_up : bool, optional | |||||
| if True, values above eps are marked 1, else, values below 1-eps are marked 0, by default True | |||||
| Returns | |||||
| ------- | |||||
| torch.Tensor | |||||
| downscaled mask | |||||
| """ | |||||
| if downsize is None: | |||||
| downsize = (mask.shape[2] // 2, mask.shape[3] // 2) | |||||
| assert mask.shape[ | |||||
| 1] == 1, 'Expected shape for the input to be (n,1,height,width)' | |||||
| if blur_mask is True: | |||||
| mask = gaussian_blur2d(mask, kernel_size=(5, 5), sigma=(1.0, 1.0)) | |||||
| mask = F.interpolate( | |||||
| mask, size=downsize, mode='bilinear', align_corners=False) | |||||
| else: | |||||
| mask = F.interpolate( | |||||
| mask, size=downsize, mode='bilinear', align_corners=False) | |||||
| if round_up: | |||||
| mask[mask >= eps] = 1 | |||||
| mask[mask < eps] = 0 | |||||
| else: | |||||
| mask[mask >= 1.0 - eps] = 1 | |||||
| mask[mask < 1.0 - eps] = 0 | |||||
| return mask | |||||
| def _erode_mask(mask: torch.Tensor, | |||||
| ekernel: torch.Tensor = None, | |||||
| eps: float = 1e-8): | |||||
| """erode the mask, and set gray pixels to 0""" | |||||
| if ekernel is not None: | |||||
| mask = erosion(mask, ekernel) | |||||
| mask[mask >= 1.0 - eps] = 1 | |||||
| mask[mask < 1.0 - eps] = 0 | |||||
| return mask | |||||
| def _l1_loss(pred: torch.Tensor, | |||||
| pred_downscaled: torch.Tensor, | |||||
| ref: torch.Tensor, | |||||
| mask: torch.Tensor, | |||||
| mask_downscaled: torch.Tensor, | |||||
| image: torch.Tensor, | |||||
| on_pred: bool = True): | |||||
| """l1 loss on src pixels, and downscaled predictions if on_pred=True""" | |||||
| loss = torch.mean(torch.abs(pred[mask < 1e-8] - image[mask < 1e-8])) | |||||
| if on_pred: | |||||
| loss += torch.mean( | |||||
| torch.abs(pred_downscaled[mask_downscaled >= 1e-8] | |||||
| - ref[mask_downscaled >= 1e-8])) | |||||
| return loss | |||||
def _infer(image: torch.Tensor,
           mask: torch.Tensor,
           forward_front: nn.Module,
           forward_rears: nn.Module,
           ref_lower_res: torch.Tensor,
           orig_shape: tuple,
           devices: list,
           scale_ind: int,
           n_iters: int = 15,
           lr: float = 0.002):
    """Performs inference with refinement at a given scale.
    Parameters
    ----------
    image : torch.Tensor
        input image to be inpainted, of size (1,3,H,W)
    mask : torch.Tensor
        input inpainting mask, of size (1,1,H,W)
    forward_front : nn.Module
        the front part of the inpainting network
    forward_rears : nn.Module
        the rear part of the inpainting network
        (NOTE(review): this is iterated below, so it is presumably a list
        of modules, one per device -- the annotation looks inaccurate)
    ref_lower_res : torch.Tensor
        the inpainting at previous scale, used as reference image
    orig_shape : tuple
        shape of the original input image before padding
    devices : list
        list of available devices
    scale_ind : int
        the scale index
    n_iters : int, optional
        number of iterations of refinement, by default 15
    lr : float, optional
        learning rate, by default 0.002
    Returns
    -------
    torch.Tensor
        inpainted image
    """
    # Network input: masked image with the mask appended as an extra channel.
    masked_image = image * (1 - mask)
    masked_image = torch.cat([masked_image, mask], dim=1)
    # Broadcast the 1-channel mask to 3 channels for pixel-wise compositing.
    mask = mask.repeat(1, 3, 1, 1)
    if ref_lower_res is not None:
        ref_lower_res = ref_lower_res.detach()
    with torch.no_grad():
        # The front part is run once; its output features become the
        # variables that the refinement optimizes.
        z1, z2 = forward_front(masked_image)
    # Inference
    mask = mask.to(devices[-1])
    # Elliptical structuring element used to erode the downscaled mask so
    # that gray boundary pixels do not contribute to the loss.
    ekernel = torch.from_numpy(
        cv2.getStructuringElement(cv2.MORPH_ELLIPSE,
                                  (15, 15)).astype(bool)).float()
    ekernel = ekernel.to(devices[-1])
    image = image.to(devices[-1])
    z1, z2 = z1.detach().to(devices[0]), z2.detach().to(devices[0])
    z1.requires_grad, z2.requires_grad = True, True
    # Only the intermediate features (not the network weights) are optimized.
    optimizer = Adam([z1, z2], lr=lr)
    pbar = tqdm(range(n_iters), leave=False)
    for idi in pbar:
        optimizer.zero_grad()
        input_feat = (z1, z2)
        # The rear part is split across devices; features are handed from
        # one device to the next, the final stage producing the prediction.
        for idd, forward_rear in enumerate(forward_rears):
            output_feat = forward_rear(input_feat)
            if idd < len(devices) - 1:
                midz1, midz2 = output_feat
                midz1, midz2 = midz1.to(devices[idd + 1]), midz2.to(
                    devices[idd + 1])
                input_feat = (midz1, midz2)
            else:
                pred = output_feat
        if ref_lower_res is None:
            # Coarsest scale: no reference to refine against, single pass.
            break
        losses = {}
        # scaled loss with downsampler
        pred_downscaled = _pyrdown(pred[:, :, :orig_shape[0], :orig_shape[1]])
        mask_downscaled = _pyrdown_mask(
            mask[:, :1, :orig_shape[0], :orig_shape[1]],
            blur_mask=False,
            round_up=False)
        mask_downscaled = _erode_mask(mask_downscaled, ekernel=ekernel)
        mask_downscaled = mask_downscaled.repeat(1, 3, 1, 1)
        # Multi-scale L1: keep known pixels close to the input image and the
        # downscaled prediction close to the previous-scale result.
        losses['ms_l1'] = _l1_loss(
            pred,
            pred_downscaled,
            ref_lower_res,
            mask,
            mask_downscaled,
            image,
            on_pred=True)
        loss = sum(losses.values())
        pbar.set_description(
            'Refining scale {} using scale {} ...current loss: {:.4f}'.format(
                scale_ind + 1, scale_ind, loss.item()))
        if idi < n_iters - 1:
            # Skip the update on the last iteration so that "pred" matches
            # the z1/z2 that were just evaluated.
            loss.backward()
            optimizer.step()
            del pred_downscaled
            del loss
            del pred
    # "pred" is the prediction after Plug-n-Play module
    inpainted = mask * pred + (1 - mask) * image
    inpainted = inpainted.detach().cpu()
    return inpainted
| def _get_image_mask_pyramid(batch: dict, min_side: int, max_scales: int, | |||||
| px_budget: int): | |||||
| """Build the image mask pyramid | |||||
| Parameters | |||||
| ---------- | |||||
| batch : dict | |||||
| batch containing image, mask, etc | |||||
| min_side : int | |||||
| minimum side length to limit the number of scales of the pyramid | |||||
| max_scales : int | |||||
| maximum number of scales allowed | |||||
| px_budget : int | |||||
| the product H*W cannot exceed this budget, because of resource constraints | |||||
| Returns | |||||
| ------- | |||||
| tuple | |||||
| image-mask pyramid in the form of list of images and list of masks | |||||
| """ | |||||
| assert batch['image'].shape[ | |||||
| 0] == 1, 'refiner works on only batches of size 1!' | |||||
| h, w = batch['unpad_to_size'] | |||||
| h, w = h[0].item(), w[0].item() | |||||
| image = batch['image'][..., :h, :w] | |||||
| mask = batch['mask'][..., :h, :w] | |||||
| if h * w > px_budget: | |||||
| # resize | |||||
| ratio = np.sqrt(px_budget / float(h * w)) | |||||
| h_orig, w_orig = h, w | |||||
| h, w = int(h * ratio), int(w * ratio) | |||||
| print( | |||||
| f'Original image too large for refinement! Resizing {(h_orig,w_orig)} to {(h,w)}...' | |||||
| ) | |||||
| image = resize( | |||||
| image, (h, w), interpolation='bilinear', align_corners=False) | |||||
| mask = resize( | |||||
| mask, (h, w), interpolation='bilinear', align_corners=False) | |||||
| mask[mask > 1e-8] = 1 | |||||
| breadth = min(h, w) | |||||
| n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))), | |||||
| max_scales) | |||||
| ls_images = [] | |||||
| ls_masks = [] | |||||
| ls_images.append(image) | |||||
| ls_masks.append(mask) | |||||
| for _ in range(n_scales - 1): | |||||
| image_p = _pyrdown(ls_images[-1]) | |||||
| mask_p = _pyrdown_mask(ls_masks[-1]) | |||||
| ls_images.append(image_p) | |||||
| ls_masks.append(mask_p) | |||||
| # reverse the lists because we want the lowest resolution image as index 0 | |||||
| return ls_images[::-1], ls_masks[::-1] | |||||
def refine_predict(batch: dict, inpainter: nn.Module, gpu_ids: str,
                   modulo: int, n_iters: int, lr: float, min_side: int,
                   max_scales: int, px_budget: int):
    """Refines the inpainting of the network
    Parameters
    ----------
    batch : dict
        image-mask batch, currently we assume the batchsize to be 1
    inpainter : nn.Module
        the inpainting neural network
    gpu_ids : str
        the GPU ids of the machine to use. If only single GPU, use: "0,"
    modulo : int
        pad the image to ensure dimension % modulo == 0
    n_iters : int
        number of iterations of refinement for each scale
    lr : float
        learning rate
    min_side : int
        all sides of image on all scales should be >= min_side / sqrt(2)
    max_scales : int
        max number of downscaling scales for the image-mask pyramid
    px_budget : int
        pixels budget. Any image will be resized to satisfy height*width <= px_budget
    Returns
    -------
    torch.Tensor
        inpainted image of size (1,3,H,W)
    """
    # unwrap the actual generator module; refinement requires eval mode,
    # no noise injection, and mask concatenation as model input
    inpainter = inpainter.model
    assert not inpainter.training
    assert not inpainter.add_noise_kwargs
    assert inpainter.concat_mask
    # normalize a "0,1" style spec into explicit cuda device strings,
    # silently dropping empty/non-numeric entries (e.g. trailing commas)
    gpu_ids = [
        f'cuda:{gpuid}' for gpuid in gpu_ids.replace(' ', '').split(',')
        if gpuid.isdigit()
    ]
    # locate the contiguous run of FFC resnet blocks inside the generator:
    # count them and remember the index of the first one
    n_resnet_blocks = 0
    first_resblock_ind = 0
    found_first_resblock = False
    for idl in range(len(inpainter.generator.model)):
        if isinstance(inpainter.generator.model[idl], FFCResnetBlock):
            n_resnet_blocks += 1
            found_first_resblock = True
        elif not found_first_resblock:
            first_resblock_ind += 1
    # distribute the resnet blocks evenly across the available GPUs
    resblocks_per_gpu = n_resnet_blocks // len(gpu_ids)
    devices = [torch.device(gpu_id) for gpu_id in gpu_ids]
    # split the model into front, and rear parts
    forward_front = inpainter.generator.model[0:first_resblock_ind]
    forward_front.to(devices[0])
    forward_rears = []
    for idd in range(len(gpu_ids)):
        if idd < len(gpu_ids) - 1:
            # each GPU (except the last) gets an equal slice of resnet blocks
            forward_rears.append(
                inpainter.generator.model[first_resblock_ind
                                          + resblocks_per_gpu
                                          * (idd):first_resblock_ind
                                          + resblocks_per_gpu * (idd + 1)])
        else:
            # the last GPU takes the remaining blocks plus the decoder tail
            forward_rears.append(
                inpainter.generator.model[first_resblock_ind
                                          + resblocks_per_gpu * (idd):])
        forward_rears[idd].to(devices[idd])
    ls_images, ls_masks = _get_image_mask_pyramid(batch, min_side, max_scales,
                                                  px_budget)
    image_inpainted = None
    # refine from the coarsest scale up: each scale's result guides the next
    for ids, (image, mask) in enumerate(zip(ls_images, ls_masks)):
        orig_shape = image.shape[2:]
        image = pad_tensor_to_modulo(image, modulo)
        mask = pad_tensor_to_modulo(mask, modulo)
        # binarize the mask after padding/resampling
        mask[mask >= 1e-8] = 1.0
        mask[mask < 1e-8] = 0.0
        image, mask = move_to_device(image, devices[0]), move_to_device(
            mask, devices[0])
        if image_inpainted is not None:
            # previous-scale result is consumed on the last device by _infer
            image_inpainted = move_to_device(image_inpainted, devices[-1])
        image_inpainted = _infer(image, mask, forward_front, forward_rears,
                                 image_inpainted, orig_shape, devices, ids,
                                 n_iters, lr)
        # drop the modulo padding again
        image_inpainted = image_inpainted[:, :, :orig_shape[0], :orig_shape[1]]
        # detach everything to save resources
        image = image.detach().cpu()
        mask = mask.detach().cpu()
    return image_inpainted
| @@ -10,7 +10,7 @@ if TYPE_CHECKING: | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'mmdet_model': ['DetectionModel'], | 'mmdet_model': ['DetectionModel'], | ||||
| 'yolox_pai': ['YOLOX'] | |||||
| 'yolox_pai': ['YOLOX'], | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -9,6 +9,9 @@ from modelscope.utils.constant import Tasks | |||||
| @MODELS.register_module( | @MODELS.register_module( | ||||
| group_key=Tasks.image_object_detection, module_name=Models.yolox) | group_key=Tasks.image_object_detection, module_name=Models.yolox) | ||||
| @MODELS.register_module( | |||||
| group_key=Tasks.image_object_detection, | |||||
| module_name=Models.image_object_detection_auto) | |||||
| class YOLOX(EasyCVBaseModel, _YOLOX): | class YOLOX(EasyCVBaseModel, _YOLOX): | ||||
| def __init__(self, model_dir=None, *args, **kwargs): | def __init__(self, model_dir=None, *args, **kwargs): | ||||
| @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule | |||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||
| from .realtime_detector import RealtimeDetector | from .realtime_detector import RealtimeDetector | ||||
| from .realtime_video_detector import RealtimeVideoDetector | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'realtime_detector': ['RealtimeDetector'], | 'realtime_detector': ['RealtimeDetector'], | ||||
| 'realtime_video_detector': ['RealtimeVideoDetector'], | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -0,0 +1,117 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import argparse | |||||
| import logging as logger | |||||
| import os | |||||
| import os.path as osp | |||||
| import time | |||||
| import cv2 | |||||
| import json | |||||
| import torch | |||||
| from tqdm import tqdm | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base.base_torch_model import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.preprocessors import LoadImage | |||||
| from modelscope.utils.config import Config | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from .yolox.data.data_augment import ValTransform | |||||
| from .yolox.exp import get_exp_by_name | |||||
| from .yolox.utils import postprocess | |||||
@MODELS.register_module(
    group_key=Tasks.video_object_detection,
    module_name=Models.realtime_video_object_detection)
class RealtimeVideoDetector(TorchModel):
    """StreamYOLO-style realtime video object detector.

    Loads a YOLOX-family model described by the model-dir configuration and
    runs frame-by-frame streaming inference over a video file, threading a
    recurrent feature buffer between consecutive frames ('on_pipe' mode).
    """

    def __init__(self, model_dir: str, *args, **kwargs):
        """Initialize the detector from a model directory.

        Args:
            model_dir: directory containing the configuration file and the
                torch checkpoint (ModelFile.TORCH_MODEL_BIN_FILE).
        """
        super().__init__(model_dir, *args, **kwargs)
        self.config = Config.from_file(
            os.path.join(self.model_dir, ModelFile.CONFIGURATION))
        # model type
        self.exp = get_exp_by_name(self.config.model_type)
        # build model
        self.model = self.exp.get_model()
        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
        ckpt = torch.load(model_path, map_location='cpu')
        # load the model state dict
        self.model.load_state_dict(ckpt['model'])
        self.model.eval()
        # params setting
        self.exp.num_classes = self.config.num_classes
        self.confthre = self.config.conf_thr
        self.num_classes = self.exp.num_classes
        self.nmsthre = self.exp.nmsthre
        self.test_size = self.exp.test_size
        self.preproc = ValTransform(legacy=False)
        # recurrent feature buffer carried between frames in streaming mode
        self.current_buffer = None
        self.label_mapping = self.config['labels']

    def inference(self, img):
        """Run one streaming forward pass; updates the frame buffer."""
        with torch.no_grad():
            outputs, self.current_buffer = self.model(
                img, buffer=self.current_buffer, mode='on_pipe')
        return outputs

    def forward(self, inputs):
        """Detect objects in the video at path ``inputs``."""
        return self.inference_video(inputs)

    def preprocess(self, img):
        """Convert an input frame to a normalized, resized model tensor."""
        img = LoadImage.convert_to_ndarray(img)
        height, width = img.shape[:2]
        # remember the resize ratio so boxes can be mapped back in postprocess
        self.ratio = min(self.test_size[0] / img.shape[0],
                         self.test_size[1] / img.shape[1])
        img, _ = self.preproc(img, None, self.test_size)
        img = torch.from_numpy(img).unsqueeze(0)
        img = img.float()
        # Video decoding and preprocessing automatically are not supported by Pipeline/Model
        # Sending preprocessed video frame tensor to GPU buffer self-adaptively
        if next(self.model.parameters()).is_cuda:
            img = img.to(next(self.model.parameters()).device)
        return img

    def postprocess(self, input):
        """NMS-filter raw predictions and map them back to image coordinates.

        Returns:
            (bboxes, scores, label_names) for the single frame, or None when
            the frame produced no valid detections.
        """
        outputs = postprocess(
            input,
            self.num_classes,
            self.confthre,
            self.nmsthre,
            class_agnostic=True)
        # batch size is always 1 here; YOLOX-style postprocess returns [None]
        # for frames with no detections, so guard before indexing columns.
        if len(outputs) == 1 and outputs[0] is not None:
            # undo the preprocess resize to recover original-image coordinates
            bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
            scores = outputs[0][:, 5].cpu().numpy()
            labels = outputs[0][:, 6].cpu().int().numpy()
            pred_label_names = [self.label_mapping[lab] for lab in labels]
            return bboxes, scores, pred_label_names
        return None

    def inference_video(self, v_path):
        """Run detection over every frame of the video; returns per-frame results."""
        outputs = []
        desc = 'Detecting video: {}'.format(v_path)
        for _frame, result in tqdm(
                self.inference_video_iter(v_path), desc=desc):
            outputs.append(result)
        return outputs

    def inference_video_iter(self, v_path):
        """Yield (frame, detection_result) pairs for each decodable frame."""
        capture = cv2.VideoCapture(v_path)
        try:
            while capture.isOpened():
                ret, frame = capture.read()
                if not ret:
                    break
                output = self.preprocess(frame)
                output = self.inference(output)
                output = self.postprocess(output)
                yield frame, output
        finally:
            # release the decoder even if the consumer abandons the generator
            capture.release()
| @@ -13,6 +13,8 @@ def get_exp_by_name(exp_name): | |||||
| from .default import YoloXNanoExp as YoloXExp | from .default import YoloXNanoExp as YoloXExp | ||||
| elif exp == 'yolox_tiny': | elif exp == 'yolox_tiny': | ||||
| from .default import YoloXTinyExp as YoloXExp | from .default import YoloXTinyExp as YoloXExp | ||||
| elif exp == 'streamyolo': | |||||
| from .default import StreamYoloExp as YoloXExp | |||||
| else: | else: | ||||
| pass | pass | ||||
| return YoloXExp() | return YoloXExp() | ||||
| @@ -1,5 +1,5 @@ | |||||
| # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | ||||
| from .streamyolo import StreamYoloExp | |||||
| from .yolox_nano import YoloXNanoExp | from .yolox_nano import YoloXNanoExp | ||||
| from .yolox_s import YoloXSExp | from .yolox_s import YoloXSExp | ||||
| from .yolox_tiny import YoloXTinyExp | from .yolox_tiny import YoloXTinyExp | ||||
| @@ -0,0 +1,43 @@ | |||||
| # The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO | |||||
| import os | |||||
| import sys | |||||
| import torch | |||||
| from ..yolox_base import Exp as YoloXExp | |||||
class StreamYoloExp(YoloXExp):
    """Experiment configuration for the StreamYOLO realtime detector."""

    def __init__(self):
        # Call the direct parent initializer so YoloXExp defaults are applied.
        # The previous `super(YoloXExp, self).__init__()` skipped
        # YoloXExp.__init__ entirely and jumped to its base class.
        super().__init__()
        self.depth = 1.0
        self.width = 1.0
        self.num_classes = 8
        # (height, width) of the network input
        self.test_size = (600, 960)
        self.test_conf = 0.3
        self.nmsthre = 0.65

    def get_model(self):
        # local import: `nn` was previously an undefined name in init_yolo
        from torch import nn
        from ...models import StreamYOLO, DFPPAFPN, TALHead

        def init_yolo(M):
            # BatchNorm settings used by upstream StreamYOLO/YOLOX
            for m in M.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eps = 1e-3
                    m.momentum = 0.03

        if getattr(self, 'model', None) is None:
            in_channels = [256, 512, 1024]
            backbone = DFPPAFPN(
                self.depth, self.width, in_channels=in_channels)
            head = TALHead(
                self.num_classes,
                self.width,
                in_channels=in_channels,
                gamma=1.0,
                ignore_thr=0.5,
                ignore_value=1.6)
            self.model = StreamYOLO(backbone, head)
            # NOTE(review): upstream StreamYOLO applies init_yolo here; it was
            # previously dead code, leaving BN eps/momentum at torch defaults.
            # Confirm against the released checkpoint's expected BN settings.
            self.model.apply(init_yolo)
        return self.model
| @@ -1,5 +1,4 @@ | |||||
| # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | ||||
| import os | import os | ||||
| import random | import random | ||||