| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:772b19f76c98044e39330853928624f10e085106a4292b4dd19f865531080747 | |||||
| size 959 | |||||
| @@ -263,6 +263,7 @@ class Pipelines(object): | |||||
| text_to_image_synthesis = 'text-to-image-synthesis' | text_to_image_synthesis = 'text-to-image-synthesis' | ||||
| video_multi_modal_embedding = 'video-multi-modal-embedding' | video_multi_modal_embedding = 'video-multi-modal-embedding' | ||||
| image_text_retrieval = 'image-text-retrieval' | image_text_retrieval = 'image-text-retrieval' | ||||
| ofa_ocr_recognition = 'ofa-ocr-recognition' | |||||
| class Trainers(object): | class Trainers(object): | ||||
| @@ -462,3 +463,4 @@ class Datasets(object): | |||||
| SegDataset = 'SegDataset' | SegDataset = 'SegDataset' | ||||
| DetDataset = 'DetDataset' | DetDataset = 'DetDataset' | ||||
| DetImagesMixDataset = 'DetImagesMixDataset' | DetImagesMixDataset = 'DetImagesMixDataset' | ||||
| PairedDataset = 'PairedDataset' | |||||
| @@ -2,6 +2,7 @@ | |||||
| # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py | # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py | ||||
| from typing import Dict | from typing import Dict | ||||
| import cv2 | |||||
| import numpy as np | import numpy as np | ||||
| from modelscope.metainfo import Metrics | from modelscope.metainfo import Metrics | ||||
| @@ -37,6 +38,7 @@ class ImagePortraitEnhancementMetric(Metric): | |||||
| def add(self, outputs: Dict, inputs: Dict): | def add(self, outputs: Dict, inputs: Dict): | ||||
| ground_truths = outputs['target'] | ground_truths = outputs['target'] | ||||
| eval_results = outputs['pred'] | eval_results = outputs['pred'] | ||||
| self.preds.extend(eval_results) | self.preds.extend(eval_results) | ||||
| self.targets.extend(ground_truths) | self.targets.extend(ground_truths) | ||||
| @@ -35,7 +35,7 @@ class ImagePortraitEnhancement(TorchModel): | |||||
| """ | """ | ||||
| super().__init__(model_dir, *args, **kwargs) | super().__init__(model_dir, *args, **kwargs) | ||||
| self.size = 512 | |||||
| self.size = 256 | |||||
| self.style_dim = 512 | self.style_dim = 512 | ||||
| self.n_mlp = 8 | self.n_mlp = 8 | ||||
| self.mean_path_length = 0 | self.mean_path_length = 0 | ||||
| @@ -131,9 +131,9 @@ class ImagePortraitEnhancement(TorchModel): | |||||
| return path_penalty, path_mean.detach(), path_lengths | return path_penalty, path_mean.detach(), path_lengths | ||||
| @torch.no_grad() | @torch.no_grad() | ||||
| def _evaluate_postprocess(self, src: Tensor, | |||||
| def _evaluate_postprocess(self, input: Tensor, | |||||
| target: Tensor) -> Dict[str, list]: | target: Tensor) -> Dict[str, list]: | ||||
| preds, _ = self.generator(src) | |||||
| preds, _ = self.generator(input) | |||||
| preds = list(torch.split(preds, 1, 0)) | preds = list(torch.split(preds, 1, 0)) | ||||
| targets = list(torch.split(target, 1, 0)) | targets = list(torch.split(target, 1, 0)) | ||||
| @@ -144,11 +144,11 @@ class ImagePortraitEnhancement(TorchModel): | |||||
| return {'pred': preds, 'target': targets} | return {'pred': preds, 'target': targets} | ||||
| def _train_forward_d(self, src: Tensor, target: Tensor) -> Tensor: | |||||
| def _train_forward_d(self, input: Tensor, target: Tensor) -> Tensor: | |||||
| self.requires_grad(self.generator, False) | self.requires_grad(self.generator, False) | ||||
| self.requires_grad(self.discriminator, True) | self.requires_grad(self.discriminator, True) | ||||
| preds, _ = self.generator(src) | |||||
| preds, _ = self.generator(input) | |||||
| fake_pred = self.discriminator(preds) | fake_pred = self.discriminator(preds) | ||||
| real_pred = self.discriminator(target) | real_pred = self.discriminator(target) | ||||
| @@ -156,27 +156,27 @@ class ImagePortraitEnhancement(TorchModel): | |||||
| return d_loss | return d_loss | ||||
| def _train_forward_d_r1(self, src: Tensor, target: Tensor) -> Tensor: | |||||
| src.requires_grad = True | |||||
| def _train_forward_d_r1(self, input: Tensor, target: Tensor) -> Tensor: | |||||
| input.requires_grad = True | |||||
| target.requires_grad = True | target.requires_grad = True | ||||
| real_pred = self.discriminator(target) | real_pred = self.discriminator(target) | ||||
| r1_loss = self.d_r1_loss(real_pred, target) | r1_loss = self.d_r1_loss(real_pred, target) | ||||
| return r1_loss | return r1_loss | ||||
| def _train_forward_g(self, src: Tensor, target: Tensor) -> Tensor: | |||||
| def _train_forward_g(self, input: Tensor, target: Tensor) -> Tensor: | |||||
| self.requires_grad(self.generator, True) | self.requires_grad(self.generator, True) | ||||
| self.requires_grad(self.discriminator, False) | self.requires_grad(self.discriminator, False) | ||||
| preds, _ = self.generator(src) | |||||
| preds, _ = self.generator(input) | |||||
| fake_pred = self.discriminator(preds) | fake_pred = self.discriminator(preds) | ||||
| g_loss = self.g_nonsaturating_loss(fake_pred, preds, target, src) | |||||
| g_loss = self.g_nonsaturating_loss(fake_pred, preds, target, input) | |||||
| return g_loss | return g_loss | ||||
| def _train_forward_g_path(self, src: Tensor, target: Tensor) -> Tensor: | |||||
| fake_img, latents = self.generator(src, return_latents=True) | |||||
| def _train_forward_g_path(self, input: Tensor, target: Tensor) -> Tensor: | |||||
| fake_img, latents = self.generator(input, return_latents=True) | |||||
| path_loss, self.mean_path_length, path_lengths = self.g_path_regularize( | path_loss, self.mean_path_length, path_lengths = self.g_path_regularize( | ||||
| fake_img, latents, self.mean_path_length) | fake_img, latents, self.mean_path_length) | ||||
| @@ -184,8 +184,8 @@ class ImagePortraitEnhancement(TorchModel): | |||||
| return path_loss | return path_loss | ||||
| @torch.no_grad() | @torch.no_grad() | ||||
| def _inference_forward(self, src: Tensor) -> Dict[str, Tensor]: | |||||
| return {'outputs': (self.generator(src)[0] * 0.5 + 0.5).clamp(0, 1)} | |||||
| def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]: | |||||
| return {'outputs': (self.generator(input)[0] * 0.5 + 0.5).clamp(0, 1)} | |||||
| def forward(self, input: Dict[str, | def forward(self, input: Dict[str, | ||||
| Tensor]) -> Dict[str, Union[list, Tensor]]: | Tensor]) -> Dict[str, Union[list, Tensor]]: | ||||
| @@ -1,2 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from . import data, models, ops | from . import data, models, ops | ||||
| @@ -1,4 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from typing import TYPE_CHECKING | from typing import TYPE_CHECKING | ||||
| from modelscope.utils.import_utils import LazyImportModule | from modelscope.utils.import_utils import LazyImportModule | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import random | import random | ||||
| @@ -1,4 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from typing import TYPE_CHECKING | from typing import TYPE_CHECKING | ||||
| from modelscope.utils.import_utils import LazyImportModule | from modelscope.utils.import_utils import LazyImportModule | ||||
| @@ -1,4 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from typing import TYPE_CHECKING | from typing import TYPE_CHECKING | ||||
| from modelscope.utils.import_utils import LazyImportModule | from modelscope.utils.import_utils import LazyImportModule | ||||
| @@ -0,0 +1,24 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
if TYPE_CHECKING:
    from .model_translation import UNet
else:
    # Map of submodule name -> public names it provides, handed to
    # LazyImportModule below.
    # NOTE(review): the key previously read
    # 'image_to_image_translation_model', which disagrees with the
    # `.model_translation` module imported in the TYPE_CHECKING branch above.
    # The two must name the same submodule; confirm the file on disk is
    # `model_translation.py`.
    _import_structure = {
        'model_translation': ['UNet'],
    }

    import sys

    # Replace this package's module object with a lazy proxy so that the
    # submodule is only imported when `UNet` is first accessed.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
| @@ -1 +1,2 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from .transforms import * # noqa F403 | from .transforms import * # noqa F403 | ||||
| @@ -1,2 +1,3 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from .autoencoder import * # noqa F403 | from .autoencoder import * # noqa F403 | ||||
| from .clip import * # noqa F403 | from .clip import * # noqa F403 | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from .degradation import * # noqa F403 | from .degradation import * # noqa F403 | ||||
| from .diffusion import * # noqa F403 | from .diffusion import * # noqa F403 | ||||
| from .losses import * # noqa F403 | from .losses import * # noqa F403 | ||||
| @@ -1,4 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from typing import TYPE_CHECKING | from typing import TYPE_CHECKING | ||||
| from modelscope.utils.import_utils import LazyImportModule | from modelscope.utils.import_utils import LazyImportModule | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import cv2 | import cv2 | ||||
| import numpy as np | import numpy as np | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import cv2 | import cv2 | ||||
| import numpy as np | import numpy as np | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import os.path as osp | import os.path as osp | ||||
| from typing import Any, Dict | from typing import Any, Dict | ||||
| @@ -161,7 +161,7 @@ def summary_format(summary, fps): | |||||
| is_summary_frame = False | is_summary_frame = False | ||||
| if is_summary_frame and summary[-1] == 1: | if is_summary_frame and summary[-1] == 1: | ||||
| end_frame = len(frame_idxes) - 1 | |||||
| end_frame = len(summary) - 1 | |||||
| frames_list.append([start_frame, end_frame]) | frames_list.append([start_frame, end_frame]) | ||||
| output = [] | output = [] | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from .model import DiffusionForTextToImageSynthesis | from .model import DiffusionForTextToImageSynthesis | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from .gemm_model import GEMMForMultiModalEmbedding | from .gemm_model import GEMMForMultiModalEmbedding | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from .model import MultiStageDiffusionForTextToImageSynthesis | from .model import MultiStageDiffusionForTextToImageSynthesis | ||||
| @@ -3,6 +3,7 @@ from modelscope.outputs import OutputKeys | |||||
| from modelscope.utils.constant import Tasks | from modelscope.utils.constant import Tasks | ||||
| OFA_TASK_KEY_MAPPING = { | OFA_TASK_KEY_MAPPING = { | ||||
| Tasks.ofa_ocr_recognition: OutputKeys.TEXT, | |||||
| Tasks.image_captioning: OutputKeys.CAPTION, | Tasks.image_captioning: OutputKeys.CAPTION, | ||||
| Tasks.summarization: OutputKeys.TEXT, | Tasks.summarization: OutputKeys.TEXT, | ||||
| Tasks.visual_question_answering: OutputKeys.TEXT, | Tasks.visual_question_answering: OutputKeys.TEXT, | ||||
| @@ -28,6 +28,7 @@ __all__ = ['OfaForAllTasks'] | |||||
| @MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa) | @MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa) | ||||
| @MODELS.register_module(Tasks.ofa_ocr_recognition, module_name=Models.ofa) | |||||
| @MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa) | @MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa) | ||||
| @MODELS.register_module( | @MODELS.register_module( | ||||
| Tasks.visual_question_answering, module_name=Models.ofa) | Tasks.visual_question_answering, module_name=Models.ofa) | ||||
| @@ -97,6 +98,7 @@ class OfaForAllTasks(TorchModel): | |||||
| 'traverse': self._traverse_inference, | 'traverse': self._traverse_inference, | ||||
| } | } | ||||
| self.task_inference_mapping = { | self.task_inference_mapping = { | ||||
| Tasks.ofa_ocr_recognition: self._text_gen_inference, | |||||
| Tasks.image_captioning: self._text_gen_inference, | Tasks.image_captioning: self._text_gen_inference, | ||||
| Tasks.summarization: self._text_gen_inference, | Tasks.summarization: self._text_gen_inference, | ||||
| Tasks.visual_grounding: self._visual_grounding_inference, | Tasks.visual_grounding: self._visual_grounding_inference, | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from .team_model import TEAMForMultiModalSimilarity | from .team_model import TEAMForMultiModalSimilarity | ||||
| @@ -27,6 +27,8 @@ else: | |||||
| 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], | 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], | ||||
| 'image_inpainting': ['ImageInpaintingDataset'], | 'image_inpainting': ['ImageInpaintingDataset'], | ||||
| 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], | 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], | ||||
| 'image_portrait_enhancement_dataset': | |||||
| ['ImagePortraitEnhancementDataset'], | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -0,0 +1,23 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
if not TYPE_CHECKING:
    import sys

    # Submodule name -> names exported from it; consumed by LazyImportModule.
    _import_structure = {
        'image_portrait_enhancement_dataset':
        ['ImagePortraitEnhancementDataset'],
    }

    # Swap this package's entry in sys.modules for a LazyImportModule built
    # from the structure above.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        __file__,
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
else:
    # Eager import, seen by static type checkers only.
    from .image_portrait_enhancement_dataset import ImagePortraitEnhancementDataset
| @@ -0,0 +1,32 @@ | |||||
| # ------------------------------------------------------------------------ | |||||
| # Modified from BasicSR (https://github.com/xinntao/BasicSR) | |||||
| # Copyright 2018-2020 BasicSR Authors | |||||
| # ------------------------------------------------------------------------ | |||||
| import cv2 | |||||
| import torch | |||||
def img2tensor(imgs, bgr2rgb=True, float32=True):
    """Convert HWC numpy image(s) to CHW torch tensor(s).

    Args:
        imgs (list[ndarray] | ndarray): Input image, or list of images, each
            indexed as (height, width, channel).
        bgr2rgb (bool): Whether to reverse the channel order of 3-channel
            images (BGR -> RGB).
        float32 (bool): Whether to cast the resulting tensor to float32.

    Returns:
        list[tensor] | tensor: A list of tensors when ``imgs`` is a list
            (even a one-element list), otherwise a single tensor.
    """

    def _totensor(img):
        if img.shape[2] == 3 and bgr2rgb:
            # Reverse the channel axis with plain slicing instead of
            # cv2.cvtColor: cvtColor only accepts 8U/16U/32F inputs and fails
            # on float64 arrays, whereas slicing works for every dtype.
            # copy() materialises a C-contiguous array, which is required
            # because torch.from_numpy cannot wrap negatively-strided views.
            img = img[:, :, ::-1].copy()
        tensor = torch.from_numpy(img.transpose(2, 0, 1))
        return tensor.float() if float32 else tensor

    if isinstance(imgs, list):
        return [_totensor(img) for img in imgs]
    return _totensor(imgs)
| @@ -0,0 +1,51 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import cv2 | |||||
| import numpy as np | |||||
| from modelscope.metainfo import Datasets, Models | |||||
| from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS | |||||
| from modelscope.msdatasets.task_datasets.torch_base_dataset import \ | |||||
| TorchTaskDataset | |||||
| from modelscope.utils.constant import Tasks | |||||
| from .data_utils import img2tensor | |||||
def default_loader(path):
    """Read an image file as a float32 array scaled to [0, 1].

    Args:
        path (str): Path of the image file to read.

    Returns:
        ndarray: The image decoded with ``cv2.IMREAD_COLOR``, converted to
            float32 and divided by 255.

    Raises:
        OSError: If OpenCV could not read or decode the file.
    """
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of an AttributeError on
        # the .astype call below.
        raise OSError(f'Failed to read image file: {path}')
    return img.astype(np.float32) / 255.0
@TASK_DATASETS.register_module(
    Tasks.image_portrait_enhancement, module_name=Datasets.PairedDataset)
class ImagePortraitEnhancementDataset(TorchTaskDataset):
    """Paired image dataset for image portrait enhancement.

    Each sample is a low-quality / high-quality image pair, both resized to
    the same square size and normalised to [-1, 1].

    Args:
        dataset: Indexable, sized collection whose items provide the image
            paths under the keys ``'hq:FILE'`` and ``'lq:FILE'``.
        is_train (bool): Stored on the instance; not otherwise used by this
            class.
        gt_size (int): Side length, in pixels, that both images are resized
            to. Defaults to 256 (the previously hard-coded value).
    """

    def __init__(self, dataset, is_train, gt_size=256):
        self.dataset = dataset
        self.gt_size = gt_size
        self.is_train = is_train

    def __len__(self):
        """Return the number of image pairs."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Load, resize and normalise the pair at ``index``.

        Returns:
            dict: ``{'input': lq_tensor, 'target': gt_tensor}``, each a CHW
                float32 tensor with values mapped from [0, 1] to [-1, 1].
        """
        # Load gt and lq images. Dimension order: HWC; channel order: BGR;
        # image range: [0, 1], float32.
        item_dict = self.dataset[index]
        img_gt = default_loader(item_dict['hq:FILE'])
        img_lq = default_loader(item_dict['lq:FILE'])

        target_size = (self.gt_size, self.gt_size)
        img_gt = cv2.resize(img_gt, target_size)
        img_lq = cv2.resize(img_lq, target_size)

        # BGR to RGB, HWC to CHW, numpy to tensor
        img_gt, img_lq = img2tensor([img_gt, img_lq],
                                    bgr2rgb=True,
                                    float32=True)

        return {'input': (img_lq - 0.5) / 0.5, 'target': (img_gt - 0.5) / 0.5}
| @@ -661,6 +661,7 @@ TASK_OUTPUTS = { | |||||
| # "caption": "this is an image caption text." | # "caption": "this is an image caption text." | ||||
| # } | # } | ||||
| Tasks.image_captioning: [OutputKeys.CAPTION], | Tasks.image_captioning: [OutputKeys.CAPTION], | ||||
| Tasks.ofa_ocr_recognition: [OutputKeys.TEXT], | |||||
| # visual grounding result for single sample | # visual grounding result for single sample | ||||
| # { | # { | ||||
| @@ -53,6 +53,7 @@ class ImageReidPersonPipeline(Pipeline): | |||||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | ||||
| img = input['img'] | img = input['img'] | ||||
| img_embedding = self.model(img) | img_embedding = self.model(img) | ||||
| img_embedding = img_embedding.detach().cpu().numpy() | |||||
| return {OutputKeys.IMG_EMBEDDING: img_embedding} | return {OutputKeys.IMG_EMBEDDING: img_embedding} | ||||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | ||||
| @@ -0,0 +1,52 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import Any, Dict, Optional, Union | |||||
| import torch | |||||
| from modelscope.metainfo import Pipelines | |||||
| from modelscope.models.multi_modal import OfaForAllTasks | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Model, Pipeline | |||||
| from modelscope.pipelines.builder import PIPELINES | |||||
| from modelscope.preprocessors import OfaPreprocessor, Preprocessor | |||||
| from modelscope.utils.constant import Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
@PIPELINES.register_module(
    Tasks.ofa_ocr_recognition, module_name=Pipelines.ofa_ocr_recognition)
class OcrRecognitionPipeline(Pipeline):
    """Pipeline for the OFA OCR recognition task."""

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """Create an OCR recognition pipeline from `model` and `preprocessor`.

        Args:
            model: A model id on the modelscope hub (str), or an already
                instantiated ``Model``.
            preprocessor: Optional preprocessor. When omitted and the model
                is an ``OfaForAllTasks``, an ``OfaPreprocessor`` is built
                from the model directory.
            **kwargs: Forwarded to ``Pipeline.__init__``.

        Raises:
            TypeError: If ``model`` is neither a str nor a ``Model``.
        """
        # Resolve the model exactly once. The base class is initialised a
        # single time below with the resolved instance; initialising it
        # up-front with the raw `model` as well would make a string id be
        # loaded twice.
        if isinstance(model, str):
            pipe_model = Model.from_pretrained(model)
        elif isinstance(model, Model):
            pipe_model = model
        else:
            raise TypeError('model must be a single str or OfaForAllTasks')
        pipe_model.model.eval()
        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
            preprocessor = OfaPreprocessor(pipe_model.model_dir)
        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        """Run the base-class forward pass with gradients disabled."""
        with torch.no_grad():
            return super().forward(inputs, **forward_params)

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Return the forward output unchanged."""
        return inputs
| @@ -40,6 +40,7 @@ class OfaPreprocessor(Preprocessor): | |||||
| """ | """ | ||||
| super().__init__(*args, **kwargs) | super().__init__(*args, **kwargs) | ||||
| preprocess_mapping = { | preprocess_mapping = { | ||||
| Tasks.ofa_ocr_recognition: OfaOcrRecognitionPreprocessor, | |||||
| Tasks.image_captioning: OfaImageCaptioningPreprocessor, | Tasks.image_captioning: OfaImageCaptioningPreprocessor, | ||||
| Tasks.visual_grounding: OfaVisualGroundingPreprocessor, | Tasks.visual_grounding: OfaVisualGroundingPreprocessor, | ||||
| Tasks.visual_question_answering: | Tasks.visual_question_answering: | ||||
| @@ -51,6 +52,7 @@ class OfaPreprocessor(Preprocessor): | |||||
| Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor | Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor | ||||
| } | } | ||||
| input_key_mapping = { | input_key_mapping = { | ||||
| Tasks.ofa_ocr_recognition: ['image'], | |||||
| Tasks.image_captioning: ['image'], | Tasks.image_captioning: ['image'], | ||||
| Tasks.image_classification: ['image'], | Tasks.image_classification: ['image'], | ||||
| Tasks.summarization: ['text'], | Tasks.summarization: ['text'], | ||||
| @@ -1,6 +1,7 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | # Copyright (c) Alibaba, Inc. and its affiliates. | ||||
| from .image_captioning import OfaImageCaptioningPreprocessor | from .image_captioning import OfaImageCaptioningPreprocessor | ||||
| from .image_classification import OfaImageClassificationPreprocessor | from .image_classification import OfaImageClassificationPreprocessor | ||||
| from .ocr_recognition import OfaOcrRecognitionPreprocessor | |||||
| from .summarization import OfaSummarizationPreprocessor | from .summarization import OfaSummarizationPreprocessor | ||||
| from .text_classification import OfaTextClassificationPreprocessor | from .text_classification import OfaTextClassificationPreprocessor | ||||
| from .text_to_image_synthesis import OfaTextToImageSynthesisPreprocessor | from .text_to_image_synthesis import OfaTextToImageSynthesisPreprocessor | ||||
| @@ -0,0 +1,99 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import random | |||||
| import unicodedata | |||||
| from typing import Any, Dict, Union | |||||
| import torch | |||||
| from PIL import Image | |||||
| from torchvision import transforms | |||||
| from torchvision.transforms import InterpolationMode | |||||
| from torchvision.transforms import functional as F | |||||
| from modelscope.preprocessors.image import load_image | |||||
| from .base import OfaBasePreprocessor | |||||
| IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) | |||||
| IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) | |||||
def ocr_resize(img, patch_image_size, is_document=False):
    """Resize an image for OCR and edge-pad it to a square patch.

    Scene images (``is_document`` false) are resized so that the longer side
    equals ``patch_image_size`` (each side is at least 64), then padded on
    the shorter axis. Document images are resized to 64 x 1920 (height x
    width), cut into four chunks along the width which are stacked along the
    height, then padded vertically.

    Args:
        img (PIL.Image.Image): Input image; converted to RGB first.
        patch_image_size (int): Side length of the square output.
        is_document (bool): Selects the document layout described above.

    Returns:
        PIL.Image.Image: Image of size
            ``(patch_image_size, patch_image_size)``; an assertion fails
            otherwise.
    """
    rgb = img.convert('RGB')
    src_w, src_h = rgb.size
    landscape = src_w >= src_h

    # Size handed to the resize step.
    if is_document:
        out_h, out_w = 64, 1920
    elif landscape:
        out_w = max(64, patch_image_size)
        out_h = max(64, int(patch_image_size * (src_h / src_w)))
    else:
        out_h = max(64, patch_image_size)
        out_w = max(64, int(patch_image_size * (src_w / src_h)))

    resized = F.resize(
        rgb,
        (out_h, out_w),
        interpolation=InterpolationMode.BICUBIC,
    )

    if is_document:
        # Fold the wide strip: split into 4 chunks along the last (width)
        # axis and concatenate them along the height axis.
        strips = transforms.ToTensor()(resized).chunk(4, dim=-1)
        resized = transforms.ToPILImage()(torch.cat(strips, dim=-2))
        out_w, out_h = resized.size

    # Only one axis is padded; the remainder of an odd gap goes to the
    # bottom / right side.
    left = top = right = bottom = 0
    if is_document or landscape:
        top = (patch_image_size - out_h) // 2
        bottom = patch_image_size - out_h - top
    else:
        left = (patch_image_size - out_w) // 2
        right = patch_image_size - out_w - left

    padded = F.pad(
        resized, padding=[left, top, right, bottom], padding_mode='edge')
    assert padded.size == (patch_image_size, patch_image_size)
    return padded
class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):
    """Preprocessor that turns an image into OFA OCR recognition inputs."""

    def __init__(self, cfg, model_dir):
        """preprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path
        """
        super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir)
        # Initialize transform
        if self.cfg.model.imagenet_default_mean_and_std:
            mean = IMAGENET_DEFAULT_MEAN
            std = IMAGENET_DEFAULT_STD
        else:
            mean = [0.5, 0.5, 0.5]
            std = [0.5, 0.5, 0.5]

        self.patch_resize_transform = transforms.Compose([
            # `is_document` is read with a default, like `prompt` in
            # __call__, so configs that omit the key use the scene-text
            # layout (the default of ocr_resize).
            lambda image: ocr_resize(
                image,
                self.cfg.model.patch_image_size,
                is_document=self.cfg.model.get('is_document', False)),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std),
        ])

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Build the model inputs for one sample.

        Args:
            data: Dict with an ``'image'`` entry that is either a
                ``PIL.Image.Image`` or a value accepted by ``load_image``.

        Returns:
            Dict with ``'source'`` (result of ``get_inputs`` on the prompt),
            ``'patch_image'`` (the resized, normalised image tensor) and
            ``'patch_mask'`` (``torch.tensor([True])``).
        """
        image = data['image'] if isinstance(
            data['image'], Image.Image) else load_image(data['image'])
        patch_image = self.patch_resize_transform(image)
        # The default prompt is Chinese for "What is the text in the image?"
        prompt = self.cfg.model.get('prompt', '图片上的文字是什么?')
        inputs = self.get_inputs(prompt)

        sample = {
            'source': inputs,
            'patch_image': patch_image,
            'patch_mask': torch.tensor([True])
        }
        return sample
| @@ -151,6 +151,7 @@ class MultiModalTasks(object): | |||||
| visual_entailment = 'visual-entailment' | visual_entailment = 'visual-entailment' | ||||
| video_multi_modal_embedding = 'video-multi-modal-embedding' | video_multi_modal_embedding = 'video-multi-modal-embedding' | ||||
| image_text_retrieval = 'image-text-retrieval' | image_text_retrieval = 'image-text-retrieval' | ||||
| ofa_ocr_recognition = 'ofa-ocr-recognition' | |||||
| class TasksIODescriptions(object): | class TasksIODescriptions(object): | ||||
| @@ -45,6 +45,14 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): | |||||
| result = img_captioning('data/test/images/image_captioning.png') | result = img_captioning('data/test/images/image_captioning.png') | ||||
| print(result[OutputKeys.CAPTION]) | print(result[OutputKeys.CAPTION]) | ||||
| @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') | |||||
| def test_run_with_ocr_recognize_with_name(self): | |||||
| ocr_recognize = pipeline( | |||||
| Tasks.ofa_ocr_recognition, | |||||
| model='damo/ofa_ocr-recognition_scene_base_zh') | |||||
| result = ocr_recognize('data/test/images/image_ocr_recognition.jpg') | |||||
| print(result[OutputKeys.TEXT]) | |||||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | ||||
| def test_run_with_image_classification_with_model(self): | def test_run_with_image_classification_with_model(self): | ||||
| model = Model.from_pretrained( | model = Model.from_pretrained( | ||||
| @@ -14,52 +14,14 @@ from modelscope.hub.snapshot_download import snapshot_download | |||||
| from modelscope.metainfo import Trainers | from modelscope.metainfo import Trainers | ||||
| from modelscope.models.cv.image_portrait_enhancement import \ | from modelscope.models.cv.image_portrait_enhancement import \ | ||||
| ImagePortraitEnhancement | ImagePortraitEnhancement | ||||
| from modelscope.msdatasets import MsDataset | |||||
| from modelscope.msdatasets.task_datasets.image_portrait_enhancement import \ | |||||
| ImagePortraitEnhancementDataset | |||||
| from modelscope.trainers import build_trainer | from modelscope.trainers import build_trainer | ||||
| from modelscope.utils.constant import ModelFile | |||||
| from modelscope.utils.constant import DownloadMode, ModelFile | |||||
| from modelscope.utils.test_utils import test_level | from modelscope.utils.test_utils import test_level | ||||
| class PairedImageDataset(data.Dataset): | |||||
| def __init__(self, root, size=512): | |||||
| super(PairedImageDataset, self).__init__() | |||||
| self.size = size | |||||
| gt_dir = osp.join(root, 'gt') | |||||
| lq_dir = osp.join(root, 'lq') | |||||
| self.gt_filelist = os.listdir(gt_dir) | |||||
| self.gt_filelist = sorted(self.gt_filelist, key=lambda x: int(x[:-4])) | |||||
| self.gt_filelist = [osp.join(gt_dir, f) for f in self.gt_filelist] | |||||
| self.lq_filelist = os.listdir(lq_dir) | |||||
| self.lq_filelist = sorted(self.lq_filelist, key=lambda x: int(x[:-4])) | |||||
| self.lq_filelist = [osp.join(lq_dir, f) for f in self.lq_filelist] | |||||
| def _img_to_tensor(self, img): | |||||
| img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1).type( | |||||
| torch.float32) / 255. | |||||
| return (img - 0.5) / 0.5 | |||||
| def __getitem__(self, index): | |||||
| lq = cv2.imread(self.lq_filelist[index]) | |||||
| gt = cv2.imread(self.gt_filelist[index]) | |||||
| lq = cv2.resize( | |||||
| lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC) | |||||
| gt = cv2.resize( | |||||
| gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC) | |||||
| return \ | |||||
| {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)} | |||||
| def __len__(self): | |||||
| return len(self.gt_filelist) | |||||
| def to_torch_dataset(self, | |||||
| columns: Union[str, List[str]] = None, | |||||
| preprocessors: Union[Callable, List[Callable]] = None, | |||||
| **format_kwargs): | |||||
| # self.preprocessor = preprocessors | |||||
| return self | |||||
| class TestImagePortraitEnhancementTrainer(unittest.TestCase): | class TestImagePortraitEnhancementTrainer(unittest.TestCase): | ||||
| def setUp(self): | def setUp(self): | ||||
| @@ -70,8 +32,23 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase): | |||||
| self.model_id = 'damo/cv_gpen_image-portrait-enhancement' | self.model_id = 'damo/cv_gpen_image-portrait-enhancement' | ||||
| self.dataset = PairedImageDataset( | |||||
| './data/test/images/face_enhancement/') | |||||
| dataset_train = MsDataset.load( | |||||
| 'image-portrait-enhancement-dataset', | |||||
| namespace='modelscope', | |||||
| subset_name='default', | |||||
| split='test', | |||||
| download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds | |||||
| dataset_val = MsDataset.load( | |||||
| 'image-portrait-enhancement-dataset', | |||||
| namespace='modelscope', | |||||
| subset_name='default', | |||||
| split='test', | |||||
| download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds | |||||
| self.dataset_train = ImagePortraitEnhancementDataset( | |||||
| dataset_train, is_train=True) | |||||
| self.dataset_val = ImagePortraitEnhancementDataset( | |||||
| dataset_val, is_train=False) | |||||
| def tearDown(self): | def tearDown(self): | ||||
| shutil.rmtree(self.tmp_dir, ignore_errors=True) | shutil.rmtree(self.tmp_dir, ignore_errors=True) | ||||
| @@ -81,8 +58,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase): | |||||
| def test_trainer(self): | def test_trainer(self): | ||||
| kwargs = dict( | kwargs = dict( | ||||
| model=self.model_id, | model=self.model_id, | ||||
| train_dataset=self.dataset, | |||||
| eval_dataset=self.dataset, | |||||
| train_dataset=self.dataset_train, | |||||
| eval_dataset=self.dataset_val, | |||||
| device='gpu', | device='gpu', | ||||
| work_dir=self.tmp_dir) | work_dir=self.tmp_dir) | ||||
| @@ -101,8 +78,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase): | |||||
| kwargs = dict( | kwargs = dict( | ||||
| cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), | cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), | ||||
| model=model, | model=model, | ||||
| train_dataset=self.dataset, | |||||
| eval_dataset=self.dataset, | |||||
| train_dataset=self.dataset_train, | |||||
| eval_dataset=self.dataset_val, | |||||
| device='gpu', | device='gpu', | ||||
| max_epochs=2, | max_epochs=2, | ||||
| work_dir=self.tmp_dir) | work_dir=self.tmp_dir) | ||||