diff --git a/data/test/images/image_ocr_recognition.jpg b/data/test/images/image_ocr_recognition.jpg
new file mode 100644
index 00000000..b41287cd
--- /dev/null
+++ b/data/test/images/image_ocr_recognition.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:772b19f76c98044e39330853928624f10e085106a4292b4dd19f865531080747
+size 959
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 03f1f92a..e55c32d1 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -263,6 +263,7 @@ class Pipelines(object):
     text_to_image_synthesis = 'text-to-image-synthesis'
     video_multi_modal_embedding = 'video-multi-modal-embedding'
     image_text_retrieval = 'image-text-retrieval'
+    ofa_ocr_recognition = 'ofa-ocr-recognition'
 
 
 class Trainers(object):
@@ -462,3 +463,4 @@ class Datasets(object):
     SegDataset = 'SegDataset'
     DetDataset = 'DetDataset'
     DetImagesMixDataset = 'DetImagesMixDataset'
+    PairedDataset = 'PairedDataset'
diff --git a/modelscope/metrics/image_portrait_enhancement_metric.py b/modelscope/metrics/image_portrait_enhancement_metric.py
index 5a81e956..7d94aade 100644
--- a/modelscope/metrics/image_portrait_enhancement_metric.py
+++ b/modelscope/metrics/image_portrait_enhancement_metric.py
@@ -2,6 +2,7 @@
 # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py
 from typing import Dict
 
+import cv2
 import numpy as np
 
 from modelscope.metainfo import Metrics
@@ -37,6 +38,7 @@ class ImagePortraitEnhancementMetric(Metric):
 
     def add(self, outputs: Dict, inputs: Dict):
         ground_truths = outputs['target']
        eval_results = outputs['pred']
+        self.preds.extend(eval_results)
         self.targets.extend(ground_truths)
 
diff --git a/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py b/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py
index 3650ac7b..26e9e532 100644
--- a/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py
+++ b/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py
@@ -35,7 +35,7 @@ class ImagePortraitEnhancement(TorchModel):
         """
         super().__init__(model_dir, *args, **kwargs)
 
-        self.size = 512
+        self.size = 256
         self.style_dim = 512
         self.n_mlp = 8
         self.mean_path_length = 0
@@ -131,9 +131,9 @@ class ImagePortraitEnhancement(TorchModel):
         return path_penalty, path_mean.detach(), path_lengths
 
     @torch.no_grad()
-    def _evaluate_postprocess(self, src: Tensor,
+    def _evaluate_postprocess(self, input: Tensor,
                               target: Tensor) -> Dict[str, list]:
-        preds, _ = self.generator(src)
+        preds, _ = self.generator(input)
         preds = list(torch.split(preds, 1, 0))
         targets = list(torch.split(target, 1, 0))
 
@@ -144,11 +144,11 @@ class ImagePortraitEnhancement(TorchModel):
 
         return {'pred': preds, 'target': targets}
 
-    def _train_forward_d(self, src: Tensor, target: Tensor) -> Tensor:
+    def _train_forward_d(self, input: Tensor, target: Tensor) -> Tensor:
         self.requires_grad(self.generator, False)
         self.requires_grad(self.discriminator, True)
 
-        preds, _ = self.generator(src)
+        preds, _ = self.generator(input)
         fake_pred = self.discriminator(preds)
         real_pred = self.discriminator(target)
 
@@ -156,27 +156,27 @@ class ImagePortraitEnhancement(TorchModel):
 
         return d_loss
 
-    def _train_forward_d_r1(self, src: Tensor, target: Tensor) -> Tensor:
-        src.requires_grad = True
+    def _train_forward_d_r1(self, input: Tensor, target: Tensor) -> Tensor:
+        input.requires_grad = True
         target.requires_grad = True
         real_pred = self.discriminator(target)
         r1_loss = self.d_r1_loss(real_pred, target)
 
         return r1_loss
 
-    def _train_forward_g(self, src: Tensor, target: Tensor) -> Tensor:
+    def _train_forward_g(self, input: Tensor, target: Tensor) -> Tensor:
         self.requires_grad(self.generator, True)
         self.requires_grad(self.discriminator, False)
 
-        preds, _ = self.generator(src)
+        preds, _ = self.generator(input)
         fake_pred = self.discriminator(preds)
 
-        g_loss = self.g_nonsaturating_loss(fake_pred, preds, target, src)
+        g_loss = self.g_nonsaturating_loss(fake_pred, preds, target, input)
 
         return g_loss
 
-    def _train_forward_g_path(self, src: Tensor, target: Tensor) -> Tensor:
-        fake_img, latents = self.generator(src, return_latents=True)
+    def _train_forward_g_path(self, input: Tensor, target: Tensor) -> Tensor:
+        fake_img, latents = self.generator(input, return_latents=True)
 
         path_loss, self.mean_path_length, path_lengths = self.g_path_regularize(
             fake_img, latents, self.mean_path_length)
@@ -184,8 +184,8 @@ class ImagePortraitEnhancement(TorchModel):
         return path_loss
 
     @torch.no_grad()
-    def _inference_forward(self, src: Tensor) -> Dict[str, Tensor]:
-        return {'outputs': (self.generator(src)[0] * 0.5 + 0.5).clamp(0, 1)}
+    def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]:
+        return {'outputs': (self.generator(input)[0] * 0.5 + 0.5).clamp(0, 1)}
 
     def forward(self, input: Dict[str,
                                   Tensor]) -> Dict[str, Union[list, Tensor]]:
diff --git a/modelscope/models/cv/image_to_image_generation/__init__.py b/modelscope/models/cv/image_to_image_generation/__init__.py
index fb408086..1af3e55f 100644
--- a/modelscope/models/cv/image_to_image_generation/__init__.py
+++ b/modelscope/models/cv/image_to_image_generation/__init__.py
@@ -1,2 +1,2 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from . import data, models, ops
diff --git a/modelscope/models/cv/image_to_image_generation/data/__init__.py b/modelscope/models/cv/image_to_image_generation/data/__init__.py
index 33c8cf44..22b9d22c 100644
--- a/modelscope/models/cv/image_to_image_generation/data/__init__.py
+++ b/modelscope/models/cv/image_to_image_generation/data/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
diff --git a/modelscope/models/cv/image_to_image_generation/data/transforms.py b/modelscope/models/cv/image_to_image_generation/data/transforms.py
index 5376d813..29a25b4b 100644
--- a/modelscope/models/cv/image_to_image_generation/data/transforms.py
+++ b/modelscope/models/cv/image_to_image_generation/data/transforms.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import math
 import random
 
diff --git a/modelscope/models/cv/image_to_image_generation/models/__init__.py b/modelscope/models/cv/image_to_image_generation/models/__init__.py
index ec6a46fd..e98421f2 100644
--- a/modelscope/models/cv/image_to_image_generation/models/__init__.py
+++ b/modelscope/models/cv/image_to_image_generation/models/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
diff --git a/modelscope/models/cv/image_to_image_generation/ops/__init__.py b/modelscope/models/cv/image_to_image_generation/ops/__init__.py
index 49674b49..e3dac584 100644
--- a/modelscope/models/cv/image_to_image_generation/ops/__init__.py
+++ b/modelscope/models/cv/image_to_image_generation/ops/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
diff --git a/modelscope/models/cv/image_to_image_translation/__init__.py b/modelscope/models/cv/image_to_image_translation/__init__.py
index e69de29b..35aab6be 100644
--- a/modelscope/models/cv/image_to_image_translation/__init__.py
+++ b/modelscope/models/cv/image_to_image_translation/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+
+    from .model_translation import UNet
+
+else:
+    _import_structure = {
+        'image_to_image_translation_model': ['UNet'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/models/cv/image_to_image_translation/data/__init__.py b/modelscope/models/cv/image_to_image_translation/data/__init__.py
index 72450016..724bca04 100644
--- a/modelscope/models/cv/image_to_image_translation/data/__init__.py
+++ b/modelscope/models/cv/image_to_image_translation/data/__init__.py
@@ -1 +1,2 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from .transforms import *  # noqa F403
diff --git a/modelscope/models/cv/image_to_image_translation/models/__init__.py b/modelscope/models/cv/image_to_image_translation/models/__init__.py
index 322d78f2..7fdd8189 100644
--- a/modelscope/models/cv/image_to_image_translation/models/__init__.py
+++ b/modelscope/models/cv/image_to_image_translation/models/__init__.py
@@ -1,2 +1,3 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from .autoencoder import *  # noqa F403
 from .clip import *  # noqa F403
diff --git a/modelscope/models/cv/image_to_image_translation/ops/__init__.py b/modelscope/models/cv/image_to_image_translation/ops/__init__.py
index 59082d72..474c811b 100644
--- a/modelscope/models/cv/image_to_image_translation/ops/__init__.py
+++ b/modelscope/models/cv/image_to_image_translation/ops/__init__.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from .degradation import *  # noqa F403
 from .diffusion import *  # noqa F403
 from .losses import *  # noqa F403
diff --git a/modelscope/models/cv/product_retrieval_embedding/__init__.py b/modelscope/models/cv/product_retrieval_embedding/__init__.py
index 7a02a60f..2cbc9099 100644
--- a/modelscope/models/cv/product_retrieval_embedding/__init__.py
+++ b/modelscope/models/cv/product_retrieval_embedding/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
diff --git a/modelscope/models/cv/product_retrieval_embedding/item_detection.py b/modelscope/models/cv/product_retrieval_embedding/item_detection.py
index d5589969..2002c6cb 100644
--- a/modelscope/models/cv/product_retrieval_embedding/item_detection.py
+++ b/modelscope/models/cv/product_retrieval_embedding/item_detection.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import cv2
 import numpy as np
 
diff --git a/modelscope/models/cv/product_retrieval_embedding/item_embedding.py b/modelscope/models/cv/product_retrieval_embedding/item_embedding.py
index 0444596c..ea9ec846 100644
--- a/modelscope/models/cv/product_retrieval_embedding/item_embedding.py
+++ b/modelscope/models/cv/product_retrieval_embedding/item_embedding.py
@@ -1,3 +1,4 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 import cv2
 import numpy as np
 import torch.nn as nn
diff --git a/modelscope/models/cv/product_retrieval_embedding/item_model.py b/modelscope/models/cv/product_retrieval_embedding/item_model.py
index 85a636c0..3964efbe 100644
--- a/modelscope/models/cv/product_retrieval_embedding/item_model.py
+++ b/modelscope/models/cv/product_retrieval_embedding/item_model.py
@@ -1,3 +1,5 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
 import os.path as osp
 from typing import Any, Dict
 
diff --git a/modelscope/models/cv/video_summarization/summarizer.py b/modelscope/models/cv/video_summarization/summarizer.py
index 75251989..c9987670 100644
--- a/modelscope/models/cv/video_summarization/summarizer.py
+++ b/modelscope/models/cv/video_summarization/summarizer.py
@@ -161,7 +161,7 @@ def summary_format(summary, fps):
                 is_summary_frame = False
 
     if is_summary_frame and summary[-1] == 1:
-        end_frame = len(frame_idxes) - 1
+        end_frame = len(summary) - 1
         frames_list.append([start_frame, end_frame])
 
     output = []
diff --git a/modelscope/models/multi_modal/diffusion/__init__.py b/modelscope/models/multi_modal/diffusion/__init__.py
index 28813cc9..e7e374b6 100644
--- a/modelscope/models/multi_modal/diffusion/__init__.py
+++ b/modelscope/models/multi_modal/diffusion/__init__.py
@@ -1 +1,2 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from .model import DiffusionForTextToImageSynthesis
diff --git a/modelscope/models/multi_modal/gemm/__init__.py b/modelscope/models/multi_modal/gemm/__init__.py
index b920628e..fe5df1fe 100644
--- a/modelscope/models/multi_modal/gemm/__init__.py
+++ b/modelscope/models/multi_modal/gemm/__init__.py
@@ -1 +1,2 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from .gemm_model import GEMMForMultiModalEmbedding
diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py
index accbb56e..1b3f445b 100644
--- a/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py
+++ b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py
@@ -1 +1,2 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from .model import MultiStageDiffusionForTextToImageSynthesis
diff --git a/modelscope/models/multi_modal/ofa/utils/constant.py b/modelscope/models/multi_modal/ofa/utils/constant.py
index 124afefa..99b7b85c 100644
--- a/modelscope/models/multi_modal/ofa/utils/constant.py
+++ b/modelscope/models/multi_modal/ofa/utils/constant.py
@@ -3,6 +3,7 @@ from modelscope.outputs import OutputKeys
 from modelscope.utils.constant import Tasks
 
 OFA_TASK_KEY_MAPPING = {
+    Tasks.ofa_ocr_recognition: OutputKeys.TEXT,
     Tasks.image_captioning: OutputKeys.CAPTION,
     Tasks.summarization: OutputKeys.TEXT,
     Tasks.visual_question_answering: OutputKeys.TEXT,
diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py
index 41ca1f0b..5bc38567 100644
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -28,6 +28,7 @@ __all__ = ['OfaForAllTasks']
 
 
 @MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa)
+@MODELS.register_module(Tasks.ofa_ocr_recognition, module_name=Models.ofa)
 @MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa)
 @MODELS.register_module(
     Tasks.visual_question_answering, module_name=Models.ofa)
@@ -97,6 +98,7 @@ class OfaForAllTasks(TorchModel):
             'traverse': self._traverse_inference,
         }
         self.task_inference_mapping = {
+            Tasks.ofa_ocr_recognition: self._text_gen_inference,
             Tasks.image_captioning: self._text_gen_inference,
             Tasks.summarization: self._text_gen_inference,
             Tasks.visual_grounding: self._visual_grounding_inference,
diff --git a/modelscope/models/multi_modal/team/__init__.py b/modelscope/models/multi_modal/team/__init__.py
index 0597040c..58bbdca5 100644
--- a/modelscope/models/multi_modal/team/__init__.py
+++ b/modelscope/models/multi_modal/team/__init__.py
@@ -1 +1,2 @@
+# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
 from .team_model import TEAMForMultiModalSimilarity
diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py
index 7c31969a..914c41bf 100644
--- a/modelscope/msdatasets/task_datasets/__init__.py
+++ b/modelscope/msdatasets/task_datasets/__init__.py
@@ -27,6 +27,8 @@ else:
         'movie_scene_segmentation': ['MovieSceneSegmentationDataset'],
         'image_inpainting': ['ImageInpaintingDataset'],
         'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'],
+        'image_portrait_enhancement_dataset':
+        ['ImagePortraitEnhancementDataset'],
     }
 
     import sys
diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py
new file mode 100644
index 00000000..4df24fae
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .image_portrait_enhancement_dataset import ImagePortraitEnhancementDataset
+
+else:
+    _import_structure = {
+        'image_portrait_enhancement_dataset':
+        ['ImagePortraitEnhancementDataset'],
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py
new file mode 100644
index 00000000..1133d3c2
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py
@@ -0,0 +1,32 @@
+# ------------------------------------------------------------------------
+# Modified from BasicSR (https://github.com/xinntao/BasicSR)
+# Copyright 2018-2020 BasicSR Authors
+# ------------------------------------------------------------------------
+
+import cv2
+import torch
+
+
+def img2tensor(imgs, bgr2rgb=True, float32=True):
+    """Numpy array to tensor.
+    Args:
+        imgs (list[ndarray] | ndarray): Input images.
+        bgr2rgb (bool): Whether to change bgr to rgb.
+        float32 (bool): Whether to change to float32.
+    Returns:
+        list[tensor] | tensor: Tensor images. If returned results only have
+            one element, just return tensor.
+    """
+
+    def _totensor(img, bgr2rgb, float32):
+        if img.shape[2] == 3 and bgr2rgb:
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        img = torch.from_numpy(img.transpose(2, 0, 1))
+        if float32:
+            img = img.float()
+        return img
+
+    if isinstance(imgs, list):
+        return [_totensor(img, bgr2rgb, float32) for img in imgs]
+    else:
+        return _totensor(imgs, bgr2rgb, float32)
diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py
new file mode 100644
index 00000000..58d40778
--- /dev/null
+++ b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py
@@ -0,0 +1,51 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import cv2
+import numpy as np
+
+from modelscope.metainfo import Datasets, Models
+from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
+from modelscope.utils.constant import Tasks
+from .data_utils import img2tensor
+
+
+def default_loader(path):
+    return cv2.imread(path, cv2.IMREAD_COLOR).astype(np.float32) / 255.0
+
+
+@TASK_DATASETS.register_module(
+    Tasks.image_portrait_enhancement, module_name=Datasets.PairedDataset)
+class ImagePortraitEnhancementDataset(TorchTaskDataset):
+    """Paired image dataset for image portrait enhancement.
+    """
+
+    def __init__(self, dataset, is_train):
+        self.dataset = dataset
+        self.gt_size = 256
+        self.is_train = is_train
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, index):
+
+        # Load gt and lq images. Dimension order: HWC; channel order: BGR;
+        # image range: [0, 1], float32.
+        item_dict = self.dataset[index]
+        gt_path = item_dict['hq:FILE']
+        img_gt = default_loader(gt_path)
+        lq_path = item_dict['lq:FILE']
+        img_lq = default_loader(lq_path)
+
+        gt_size = self.gt_size
+        img_gt = cv2.resize(img_gt, (gt_size, gt_size))
+        img_lq = cv2.resize(img_lq, (gt_size, gt_size))
+
+        # BGR to RGB, HWC to CHW, numpy to tensor
+        img_gt, img_lq = img2tensor([img_gt, img_lq],
+                                    bgr2rgb=True,
+                                    float32=True)
+
+        return {'input': (img_lq - 0.5) / 0.5, 'target': (img_gt - 0.5) / 0.5}
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index fbe15646..365e2bf9 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -661,6 +661,7 @@ TASK_OUTPUTS = {
     # "caption": "this is an image caption text."
     # }
     Tasks.image_captioning: [OutputKeys.CAPTION],
+    Tasks.ofa_ocr_recognition: [OutputKeys.TEXT],
 
     # visual grounding result for single sample
     # {
diff --git a/modelscope/pipelines/cv/image_reid_person_pipeline.py b/modelscope/pipelines/cv/image_reid_person_pipeline.py
index 64674a65..9f60142a 100644
--- a/modelscope/pipelines/cv/image_reid_person_pipeline.py
+++ b/modelscope/pipelines/cv/image_reid_person_pipeline.py
@@ -53,6 +53,7 @@ class ImageReidPersonPipeline(Pipeline):
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
         img = input['img']
         img_embedding = self.model(img)
+        img_embedding = img_embedding.detach().cpu().numpy()
         return {OutputKeys.IMG_EMBEDDING: img_embedding}
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py
new file mode 100644
index 00000000..9cd63b6c
--- /dev/null
+++ b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py
@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import OfaPreprocessor, Preprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.ofa_ocr_recognition, module_name=Pipelines.ofa_ocr_recognition)
+class OcrRecognitionPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[Model, str],
+                 preprocessor: Optional[Preprocessor] = None,
+                 **kwargs):
+        """
+        use `model` and `preprocessor` to create a ocr recognition pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model)
+        assert isinstance(model, str) or isinstance(model, Model), \
+            'model must be a single str or OfaForAllTasks'
+        if isinstance(model, str):
+            pipe_model = Model.from_pretrained(model)
+        elif isinstance(model, Model):
+            pipe_model = model
+        else:
+            raise NotImplementedError
+        pipe_model.model.eval()
+        if preprocessor is None:
+            if isinstance(pipe_model, OfaForAllTasks):
+                preprocessor = OfaPreprocessor(pipe_model.model_dir)
+        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return super().forward(inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 73742c47..92b4fe1c 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -40,6 +40,7 @@ class OfaPreprocessor(Preprocessor):
         """
         super().__init__(*args, **kwargs)
         preprocess_mapping = {
+            Tasks.ofa_ocr_recognition: OfaOcrRecognitionPreprocessor,
             Tasks.image_captioning: OfaImageCaptioningPreprocessor,
             Tasks.visual_grounding: OfaVisualGroundingPreprocessor,
             Tasks.visual_question_answering:
@@ -51,6 +52,7 @@ class OfaPreprocessor(Preprocessor):
             Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor
         }
         input_key_mapping = {
+            Tasks.ofa_ocr_recognition: ['image'],
             Tasks.image_captioning: ['image'],
             Tasks.image_classification: ['image'],
             Tasks.summarization: ['text'],
diff --git a/modelscope/preprocessors/ofa/__init__.py b/modelscope/preprocessors/ofa/__init__.py
index 95d72fe1..59b94b2b 100644
--- a/modelscope/preprocessors/ofa/__init__.py
+++ b/modelscope/preprocessors/ofa/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from .image_captioning import OfaImageCaptioningPreprocessor
 from .image_classification import OfaImageClassificationPreprocessor
+from .ocr_recognition import OfaOcrRecognitionPreprocessor
 from .summarization import OfaSummarizationPreprocessor
 from .text_classification import OfaTextClassificationPreprocessor
 from .text_to_image_synthesis import OfaTextToImageSynthesisPreprocessor
diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py
new file mode 100644
index 00000000..1d30e572
--- /dev/null
+++ b/modelscope/preprocessors/ofa/ocr_recognition.py
@@ -0,0 +1,99 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import random
+import unicodedata
+from typing import Any, Dict, Union
+
+import torch
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms import InterpolationMode
+from torchvision.transforms import functional as F
+
+from modelscope.preprocessors.image import load_image
+from .base import OfaBasePreprocessor
+
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+
+def ocr_resize(img, patch_image_size, is_document=False):
+    img = img.convert('RGB')
+    width, height = img.size
+
+    if is_document:
+        new_height, new_width = 64, 1920
+    else:
+        if width >= height:
+            new_width = max(64, patch_image_size)
+            new_height = max(64, int(patch_image_size * (height / width)))
+            top = (patch_image_size - new_height) // 2
+            bottom = patch_image_size - new_height - top
+            left, right = 0, 0
+        else:
+            new_height = max(64, patch_image_size)
+            new_width = max(64, int(patch_image_size * (width / height)))
+            left = (patch_image_size - new_width) // 2
+            right = patch_image_size - new_width - left
+            top, bottom = 0, 0
+
+    img_new = F.resize(
+        img,
+        (new_height, new_width),
+        interpolation=InterpolationMode.BICUBIC,
+    )
+
+    if is_document:
+        img_split = transforms.ToTensor()(img_new).chunk(4, dim=-1)
+        img_new = transforms.ToPILImage()(torch.cat(img_split, dim=-2))
+        new_width, new_height = img_new.size
+        top = (patch_image_size - new_height) // 2
+        bottom = patch_image_size - new_height - top
+        left, right = 0, 0
+
+    img_new = F.pad(
+        img_new, padding=[left, top, right, bottom], padding_mode='edge')
+    assert img_new.size == (patch_image_size, patch_image_size)
+
+    return img_new
+
+
+class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):
+
+    def __init__(self, cfg, model_dir):
+        """preprocess the data
+
+        Args:
+            cfg(modelscope.utils.config.ConfigDict) : model config
+            model_dir (str): model path
+        """
+        super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir)
+        # Initialize transform
+        if self.cfg.model.imagenet_default_mean_and_std:
+            mean = IMAGENET_DEFAULT_MEAN
+            std = IMAGENET_DEFAULT_STD
+        else:
+            mean = [0.5, 0.5, 0.5]
+            std = [0.5, 0.5, 0.5]
+
+        self.patch_resize_transform = transforms.Compose([
+            lambda image: ocr_resize(
+                image,
+                self.cfg.model.patch_image_size,
+                is_document=self.cfg.model.is_document),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=mean, std=std),
+        ])
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image = data['image'] if isinstance(
+            data['image'], Image.Image) else load_image(data['image'])
+        patch_image = self.patch_resize_transform(image)
+        prompt = self.cfg.model.get('prompt', '图片上的文字是什么?')
+        inputs = self.get_inputs(prompt)
+
+        sample = {
+            'source': inputs,
+            'patch_image': patch_image,
+            'patch_mask': torch.tensor([True])
+        }
+        return sample
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 6c0f3e98..865e1d4f 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -151,6 +151,7 @@ class MultiModalTasks(object):
     visual_entailment = 'visual-entailment'
     video_multi_modal_embedding = 'video-multi-modal-embedding'
     image_text_retrieval = 'image-text-retrieval'
+    ofa_ocr_recognition = 'ofa-ocr-recognition'
 
 
 class TasksIODescriptions(object):
diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py
index f8366508..05ecc719 100644
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -45,6 +45,14 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
         result = img_captioning('data/test/images/image_captioning.png')
         print(result[OutputKeys.CAPTION])
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_ocr_recognize_with_name(self):
+        ocr_recognize = pipeline(
+            Tasks.ofa_ocr_recognition,
+            model='damo/ofa_ocr-recognition_scene_base_zh')
+        result = ocr_recognize('data/test/images/image_ocr_recognition.jpg')
+        print(result[OutputKeys.TEXT])
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_image_classification_with_model(self):
         model = Model.from_pretrained(
diff --git a/tests/trainers/test_image_portrait_enhancement_trainer.py b/tests/trainers/test_image_portrait_enhancement_trainer.py
index 049adf7e..5c47a59b 100644
--- a/tests/trainers/test_image_portrait_enhancement_trainer.py
+++ b/tests/trainers/test_image_portrait_enhancement_trainer.py
@@ -14,52 +14,14 @@ from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Trainers
 from modelscope.models.cv.image_portrait_enhancement import \
     ImagePortraitEnhancement
+from modelscope.msdatasets import MsDataset
+from modelscope.msdatasets.task_datasets.image_portrait_enhancement import \
+    ImagePortraitEnhancementDataset
 from modelscope.trainers import build_trainer
-from modelscope.utils.constant import ModelFile
+from modelscope.utils.constant import DownloadMode, ModelFile
 from modelscope.utils.test_utils import test_level
 
 
-class PairedImageDataset(data.Dataset):
-
-    def __init__(self, root, size=512):
-        super(PairedImageDataset, self).__init__()
-        self.size = size
-        gt_dir = osp.join(root, 'gt')
-        lq_dir = osp.join(root, 'lq')
-        self.gt_filelist = os.listdir(gt_dir)
-        self.gt_filelist = sorted(self.gt_filelist, key=lambda x: int(x[:-4]))
-        self.gt_filelist = [osp.join(gt_dir, f) for f in self.gt_filelist]
-        self.lq_filelist = os.listdir(lq_dir)
-        self.lq_filelist = sorted(self.lq_filelist, key=lambda x: int(x[:-4]))
-        self.lq_filelist = [osp.join(lq_dir, f) for f in self.lq_filelist]
-
-    def _img_to_tensor(self, img):
-        img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1).type(
-            torch.float32) / 255.
-        return (img - 0.5) / 0.5
-
-    def __getitem__(self, index):
-        lq = cv2.imread(self.lq_filelist[index])
-        gt = cv2.imread(self.gt_filelist[index])
-        lq = cv2.resize(
-            lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-        gt = cv2.resize(
-            gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-
-        return \
-            {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
-
-    def __len__(self):
-        return len(self.gt_filelist)
-
-    def to_torch_dataset(self,
-                         columns: Union[str, List[str]] = None,
-                         preprocessors: Union[Callable, List[Callable]] = None,
-                         **format_kwargs):
-        # self.preprocessor = preprocessors
-        return self
-
-
 class TestImagePortraitEnhancementTrainer(unittest.TestCase):
 
     def setUp(self):
@@ -70,8 +32,23 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
 
         self.model_id = 'damo/cv_gpen_image-portrait-enhancement'
 
-        self.dataset = PairedImageDataset(
-            './data/test/images/face_enhancement/')
+        dataset_train = MsDataset.load(
+            'image-portrait-enhancement-dataset',
+            namespace='modelscope',
+            subset_name='default',
+            split='test',
+            download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds
+        dataset_val = MsDataset.load(
+            'image-portrait-enhancement-dataset',
+            namespace='modelscope',
+            subset_name='default',
+            split='test',
+            download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds
+
+        self.dataset_train = ImagePortraitEnhancementDataset(
+            dataset_train, is_train=True)
+        self.dataset_val = ImagePortraitEnhancementDataset(
+            dataset_val, is_train=False)
 
     def tearDown(self):
         shutil.rmtree(self.tmp_dir, ignore_errors=True)
@@ -81,8 +58,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
     def test_trainer(self):
         kwargs = dict(
             model=self.model_id,
-            train_dataset=self.dataset,
-            eval_dataset=self.dataset,
+            train_dataset=self.dataset_train,
+            eval_dataset=self.dataset_val,
             device='gpu',
             work_dir=self.tmp_dir)
 
@@ -101,8 +78,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
             model=model,
-            train_dataset=self.dataset,
-            eval_dataset=self.dataset,
+            train_dataset=self.dataset_train,
+            eval_dataset=self.dataset_val,
             device='gpu',
             max_epochs=2,
             work_dir=self.tmp_dir)
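
Usage note (not part of the patch): the short sketch below mirrors the test added in tests/pipelines/test_ofa_tasks.py and shows how the pieces introduced by this diff fit together: the ofa-ocr-recognition task constant from modelscope/utils/constant.py, the pipeline registered in ocr_recognition_pipeline.py, and the OutputKeys.TEXT key declared for the task in modelscope/outputs.py. The model id and sample image path are the ones used in the test.

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # Build the OFA OCR pipeline registered by this patch.
    ocr_recognize = pipeline(
        Tasks.ofa_ocr_recognition,
        model='damo/ofa_ocr-recognition_scene_base_zh')

    # OfaOcrRecognitionPreprocessor resizes/pads the image and prepends the
    # OCR prompt; the recognized text is returned under OutputKeys.TEXT.
    result = ocr_recognize('data/test/images/image_ocr_recognition.jpg')
    print(result[OutputKeys.TEXT])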
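A second sketch, also outside the patch, follows tests/trainers/test_image_portrait_enhancement_trainer.py: it builds the new ImagePortraitEnhancementDataset (registered under Datasets.PairedDataset) from an MsDataset split and hands it to the trainer. The dataset name, namespace, split, download mode and model id come from this diff; the build_trainer call and the work_dir value are assumptions based on the surrounding, unchanged test code.

    from modelscope.msdatasets import MsDataset
    from modelscope.msdatasets.task_datasets.image_portrait_enhancement import \
        ImagePortraitEnhancementDataset
    from modelscope.trainers import build_trainer
    from modelscope.utils.constant import DownloadMode

    # Load the paired hq/lq split from the ModelScope hub, as in the test.
    hf_ds = MsDataset.load(
        'image-portrait-enhancement-dataset',
        namespace='modelscope',
        subset_name='default',
        split='test',
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds

    # Wrap it in the task dataset added by this patch; __getitem__ yields
    # {'input': lq, 'target': gt} tensors normalized to [-1, 1].
    train_ds = ImagePortraitEnhancementDataset(hf_ds, is_train=True)
    eval_ds = ImagePortraitEnhancementDataset(hf_ds, is_train=False)

    # Hypothetical trainer wiring; work_dir is a placeholder.
    trainer = build_trainer(
        default_args=dict(
            model='damo/cv_gpen_image-portrait-enhancement',
            train_dataset=train_ds,
            eval_dataset=eval_ds,
            work_dir='./work_dir'))
    trainer.train()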