
Merge remote-tracking branch 'origin/master' into ofa/finetune

master
行嗔 committed 3 years ago
commit 330a8ac68f
37 changed files with 364 additions and 68 deletions
  1. data/test/images/image_ocr_recognition.jpg (+3 -0)
  2. modelscope/metainfo.py (+2 -0)
  3. modelscope/metrics/image_portrait_enhancement_metric.py (+2 -0)
  4. modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py (+14 -14)
  5. modelscope/models/cv/image_to_image_generation/__init__.py (+1 -1)
  6. modelscope/models/cv/image_to_image_generation/data/__init__.py (+1 -1)
  7. modelscope/models/cv/image_to_image_generation/data/transforms.py (+1 -0)
  8. modelscope/models/cv/image_to_image_generation/models/__init__.py (+1 -1)
  9. modelscope/models/cv/image_to_image_generation/ops/__init__.py (+1 -1)
  10. modelscope/models/cv/image_to_image_translation/__init__.py (+24 -0)
  11. modelscope/models/cv/image_to_image_translation/data/__init__.py (+1 -0)
  12. modelscope/models/cv/image_to_image_translation/models/__init__.py (+1 -0)
  13. modelscope/models/cv/image_to_image_translation/ops/__init__.py (+1 -0)
  14. modelscope/models/cv/product_retrieval_embedding/__init__.py (+1 -1)
  15. modelscope/models/cv/product_retrieval_embedding/item_detection.py (+1 -0)
  16. modelscope/models/cv/product_retrieval_embedding/item_embedding.py (+1 -0)
  17. modelscope/models/cv/product_retrieval_embedding/item_model.py (+2 -0)
  18. modelscope/models/cv/video_summarization/summarizer.py (+1 -1)
  19. modelscope/models/multi_modal/diffusion/__init__.py (+1 -0)
  20. modelscope/models/multi_modal/gemm/__init__.py (+1 -0)
  21. modelscope/models/multi_modal/multi_stage_diffusion/__init__.py (+1 -0)
  22. modelscope/models/multi_modal/ofa/utils/constant.py (+1 -0)
  23. modelscope/models/multi_modal/ofa_for_all_tasks.py (+2 -0)
  24. modelscope/models/multi_modal/team/__init__.py (+1 -0)
  25. modelscope/msdatasets/task_datasets/__init__.py (+2 -0)
  26. modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py (+23 -0)
  27. modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py (+32 -0)
  28. modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py (+51 -0)
  29. modelscope/outputs.py (+1 -0)
  30. modelscope/pipelines/cv/image_reid_person_pipeline.py (+1 -0)
  31. modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py (+52 -0)
  32. modelscope/preprocessors/multi_modal.py (+2 -0)
  33. modelscope/preprocessors/ofa/__init__.py (+1 -0)
  34. modelscope/preprocessors/ofa/ocr_recognition.py (+99 -0)
  35. modelscope/utils/constant.py (+1 -0)
  36. tests/pipelines/test_ofa_tasks.py (+8 -0)
  37. tests/trainers/test_image_portrait_enhancement_trainer.py (+25 -48)

data/test/images/image_ocr_recognition.jpg (+3 -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:772b19f76c98044e39330853928624f10e085106a4292b4dd19f865531080747
size 959

modelscope/metainfo.py (+2 -0)

@@ -263,6 +263,7 @@ class Pipelines(object):
text_to_image_synthesis = 'text-to-image-synthesis'
video_multi_modal_embedding = 'video-multi-modal-embedding'
image_text_retrieval = 'image-text-retrieval'
+ ofa_ocr_recognition = 'ofa-ocr-recognition'


class Trainers(object):
@@ -462,3 +463,4 @@ class Datasets(object):
SegDataset = 'SegDataset'
DetDataset = 'DetDataset'
DetImagesMixDataset = 'DetImagesMixDataset'
+ PairedDataset = 'PairedDataset'

modelscope/metrics/image_portrait_enhancement_metric.py (+2 -0)

@@ -2,6 +2,7 @@
# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py
from typing import Dict

+ import cv2
import numpy as np

from modelscope.metainfo import Metrics
@@ -37,6 +38,7 @@ class ImagePortraitEnhancementMetric(Metric):
def add(self, outputs: Dict, inputs: Dict):
ground_truths = outputs['target']
eval_results = outputs['pred']
+
self.preds.extend(eval_results)
self.targets.extend(ground_truths)



modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py (+14 -14)

@@ -35,7 +35,7 @@ class ImagePortraitEnhancement(TorchModel):
"""
super().__init__(model_dir, *args, **kwargs)

- self.size = 512
+ self.size = 256
self.style_dim = 512
self.n_mlp = 8
self.mean_path_length = 0
@@ -131,9 +131,9 @@ class ImagePortraitEnhancement(TorchModel):
return path_penalty, path_mean.detach(), path_lengths

@torch.no_grad()
- def _evaluate_postprocess(self, src: Tensor,
+ def _evaluate_postprocess(self, input: Tensor,
target: Tensor) -> Dict[str, list]:
- preds, _ = self.generator(src)
+ preds, _ = self.generator(input)
preds = list(torch.split(preds, 1, 0))
targets = list(torch.split(target, 1, 0))

@@ -144,11 +144,11 @@ class ImagePortraitEnhancement(TorchModel):

return {'pred': preds, 'target': targets}

- def _train_forward_d(self, src: Tensor, target: Tensor) -> Tensor:
+ def _train_forward_d(self, input: Tensor, target: Tensor) -> Tensor:
self.requires_grad(self.generator, False)
self.requires_grad(self.discriminator, True)

- preds, _ = self.generator(src)
+ preds, _ = self.generator(input)
fake_pred = self.discriminator(preds)
real_pred = self.discriminator(target)

@@ -156,27 +156,27 @@ class ImagePortraitEnhancement(TorchModel):

return d_loss

- def _train_forward_d_r1(self, src: Tensor, target: Tensor) -> Tensor:
- src.requires_grad = True
+ def _train_forward_d_r1(self, input: Tensor, target: Tensor) -> Tensor:
+ input.requires_grad = True
target.requires_grad = True
real_pred = self.discriminator(target)
r1_loss = self.d_r1_loss(real_pred, target)

return r1_loss

- def _train_forward_g(self, src: Tensor, target: Tensor) -> Tensor:
+ def _train_forward_g(self, input: Tensor, target: Tensor) -> Tensor:
self.requires_grad(self.generator, True)
self.requires_grad(self.discriminator, False)

- preds, _ = self.generator(src)
+ preds, _ = self.generator(input)
fake_pred = self.discriminator(preds)

- g_loss = self.g_nonsaturating_loss(fake_pred, preds, target, src)
+ g_loss = self.g_nonsaturating_loss(fake_pred, preds, target, input)

return g_loss

- def _train_forward_g_path(self, src: Tensor, target: Tensor) -> Tensor:
- fake_img, latents = self.generator(src, return_latents=True)
+ def _train_forward_g_path(self, input: Tensor, target: Tensor) -> Tensor:
+ fake_img, latents = self.generator(input, return_latents=True)

path_loss, self.mean_path_length, path_lengths = self.g_path_regularize(
fake_img, latents, self.mean_path_length)
@@ -184,8 +184,8 @@ class ImagePortraitEnhancement(TorchModel):
return path_loss

@torch.no_grad()
- def _inference_forward(self, src: Tensor) -> Dict[str, Tensor]:
- return {'outputs': (self.generator(src)[0] * 0.5 + 0.5).clamp(0, 1)}
+ def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]:
+ return {'outputs': (self.generator(input)[0] * 0.5 + 0.5).clamp(0, 1)}

def forward(self, input: Dict[str,
Tensor]) -> Dict[str, Union[list, Tensor]]:


modelscope/models/cv/image_to_image_generation/__init__.py (+1 -1)

@@ -1,2 +1,2 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from . import data, models, ops

modelscope/models/cv/image_to_image_generation/data/__init__.py (+1 -1)

@@ -1,4 +1,4 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule


modelscope/models/cv/image_to_image_generation/data/transforms.py (+1 -0)

@@ -1,3 +1,4 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math
import random



modelscope/models/cv/image_to_image_generation/models/__init__.py (+1 -1)

@@ -1,4 +1,4 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule


modelscope/models/cv/image_to_image_generation/ops/__init__.py (+1 -1)

@@ -1,4 +1,4 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule


modelscope/models/cv/image_to_image_translation/__init__.py (+24 -0)

@@ -0,0 +1,24 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.

from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:

    from .model_translation import UNet

else:
    _import_structure = {
        'image_to_image_translation_model': ['UNet'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
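Note: the LazyImportModule boilerplate above defers the real submodule import until an attribute is first accessed. A minimal sketch of the intended effect (assuming the `_import_structure` key maps to a resolvable submodule):

```python
# Importing the package is cheap: the module object is a LazyImportModule proxy.
from modelscope.models.cv import image_to_image_translation

# The first attribute access triggers the actual submodule import.
unet_cls = image_to_image_translation.UNet
```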

modelscope/models/cv/image_to_image_translation/data/__init__.py (+1 -0)

@@ -1 +1,2 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from .transforms import * # noqa F403

modelscope/models/cv/image_to_image_translation/models/__init__.py (+1 -0)

@@ -1,2 +1,3 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from .autoencoder import * # noqa F403
from .clip import * # noqa F403

modelscope/models/cv/image_to_image_translation/ops/__init__.py (+1 -0)

@@ -1,3 +1,4 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from .degradation import * # noqa F403
from .diffusion import * # noqa F403
from .losses import * # noqa F403


modelscope/models/cv/product_retrieval_embedding/__init__.py (+1 -1)

@@ -1,4 +1,4 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule


modelscope/models/cv/product_retrieval_embedding/item_detection.py (+1 -0)

@@ -1,3 +1,4 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import cv2
import numpy as np



modelscope/models/cv/product_retrieval_embedding/item_embedding.py (+1 -0)

@@ -1,3 +1,4 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import cv2
import numpy as np
import torch.nn as nn


modelscope/models/cv/product_retrieval_embedding/item_model.py (+2 -0)

@@ -1,3 +1,5 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
+
import os.path as osp
from typing import Any, Dict



modelscope/models/cv/video_summarization/summarizer.py (+1 -1)

@@ -161,7 +161,7 @@ def summary_format(summary, fps):
is_summary_frame = False

if is_summary_frame and summary[-1] == 1:
- end_frame = len(frame_idxes) - 1
+ end_frame = len(summary) - 1
frames_list.append([start_frame, end_frame])

output = []


modelscope/models/multi_modal/diffusion/__init__.py (+1 -0)

@@ -1 +1,2 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from .model import DiffusionForTextToImageSynthesis

modelscope/models/multi_modal/gemm/__init__.py (+1 -0)

@@ -1 +1,2 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from .gemm_model import GEMMForMultiModalEmbedding

modelscope/models/multi_modal/multi_stage_diffusion/__init__.py (+1 -0)

@@ -1 +1,2 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from .model import MultiStageDiffusionForTextToImageSynthesis

modelscope/models/multi_modal/ofa/utils/constant.py (+1 -0)

@@ -3,6 +3,7 @@ from modelscope.outputs import OutputKeys
from modelscope.utils.constant import Tasks

OFA_TASK_KEY_MAPPING = {
+ Tasks.ofa_ocr_recognition: OutputKeys.TEXT,
Tasks.image_captioning: OutputKeys.CAPTION,
Tasks.summarization: OutputKeys.TEXT,
Tasks.visual_question_answering: OutputKeys.TEXT,


modelscope/models/multi_modal/ofa_for_all_tasks.py (+2 -0)

@@ -28,6 +28,7 @@ __all__ = ['OfaForAllTasks']


@MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa)
+ @MODELS.register_module(Tasks.ofa_ocr_recognition, module_name=Models.ofa)
@MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa)
@MODELS.register_module(
Tasks.visual_question_answering, module_name=Models.ofa)
@@ -97,6 +98,7 @@ class OfaForAllTasks(TorchModel):
'traverse': self._traverse_inference,
}
self.task_inference_mapping = {
+ Tasks.ofa_ocr_recognition: self._text_gen_inference,
Tasks.image_captioning: self._text_gen_inference,
Tasks.summarization: self._text_gen_inference,
Tasks.visual_grounding: self._visual_grounding_inference,


modelscope/models/multi_modal/team/__init__.py (+1 -0)

@@ -1 +1,2 @@
+ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from .team_model import TEAMForMultiModalSimilarity

modelscope/msdatasets/task_datasets/__init__.py (+2 -0)

@@ -27,6 +27,8 @@ else:
'movie_scene_segmentation': ['MovieSceneSegmentationDataset'],
'image_inpainting': ['ImageInpaintingDataset'],
'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'],
+ 'image_portrait_enhancement_dataset':
+     ['ImagePortraitEnhancementDataset'],
}
import sys



modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py (+23 -0)

@@ -0,0 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .image_portrait_enhancement_dataset import ImagePortraitEnhancementDataset

else:
    _import_structure = {
        'image_portrait_enhancement_dataset':
        ['ImagePortraitEnhancementDataset'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py (+32 -0)

@@ -0,0 +1,32 @@
# ------------------------------------------------------------------------
# Modified from BasicSR (https://github.com/xinntao/BasicSR)
# Copyright 2018-2020 BasicSR Authors
# ------------------------------------------------------------------------

import cv2
import torch


def img2tensor(imgs, bgr2rgb=True, float32=True):
    """Numpy array to tensor.
    Args:
        imgs (list[ndarray] | ndarray): Input images.
        bgr2rgb (bool): Whether to change bgr to rgb.
        float32 (bool): Whether to change to float32.
    Returns:
        list[tensor] | tensor: Tensor images. If returned results only have
        one element, just return tensor.
    """

    def _totensor(img, bgr2rgb, float32):
        if img.shape[2] == 3 and bgr2rgb:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = torch.from_numpy(img.transpose(2, 0, 1))
        if float32:
            img = img.float()
        return img

    if isinstance(imgs, list):
        return [_totensor(img, bgr2rgb, float32) for img in imgs]
    else:
        return _totensor(imgs, bgr2rgb, float32)
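For reference, a minimal usage sketch of the new img2tensor helper; the random array below is an illustrative stand-in for a decoded BGR image:

```python
import numpy as np

from modelscope.msdatasets.task_datasets.image_portrait_enhancement.data_utils import \
    img2tensor

# Hypothetical 8x8 BGR float32 image in [0, 1], as default_loader would produce.
img = np.random.rand(8, 8, 3).astype(np.float32)

t = img2tensor(img)          # single array in -> single CHW tensor out
ts = img2tensor([img, img])  # list in -> list of tensors out
print(t.shape)               # torch.Size([3, 8, 8])
```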

modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py (+51 -0)

@@ -0,0 +1,51 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import cv2
import numpy as np

from modelscope.metainfo import Datasets, Models
from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
from modelscope.msdatasets.task_datasets.torch_base_dataset import \
    TorchTaskDataset
from modelscope.utils.constant import Tasks
from .data_utils import img2tensor


def default_loader(path):
    return cv2.imread(path, cv2.IMREAD_COLOR).astype(np.float32) / 255.0


@TASK_DATASETS.register_module(
    Tasks.image_portrait_enhancement, module_name=Datasets.PairedDataset)
class ImagePortraitEnhancementDataset(TorchTaskDataset):
    """Paired image dataset for image portrait enhancement.
    """

    def __init__(self, dataset, is_train):
        self.dataset = dataset
        self.gt_size = 256
        self.is_train = is_train

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):

        # Load gt and lq images. Dimension order: HWC; channel order: BGR;
        # image range: [0, 1], float32.
        item_dict = self.dataset[index]
        gt_path = item_dict['hq:FILE']
        img_gt = default_loader(gt_path)
        lq_path = item_dict['lq:FILE']
        img_lq = default_loader(lq_path)

        gt_size = self.gt_size
        img_gt = cv2.resize(img_gt, (gt_size, gt_size))
        img_lq = cv2.resize(img_lq, (gt_size, gt_size))

        # BGR to RGB, HWC to CHW, numpy to tensor
        img_gt, img_lq = img2tensor([img_gt, img_lq],
                                    bgr2rgb=True,
                                    float32=True)

        return {'input': (img_lq - 0.5) / 0.5, 'target': (img_gt - 0.5) / 0.5}
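Since the class only indexes `dataset` and reads its 'hq:FILE' and 'lq:FILE' columns, any indexable collection of such records can drive it. A sketch with hypothetical paths (the image files must exist on disk):

```python
from modelscope.msdatasets.task_datasets.image_portrait_enhancement import \
    ImagePortraitEnhancementDataset

records = [{'hq:FILE': '/tmp/pairs/hq/0.png', 'lq:FILE': '/tmp/pairs/lq/0.png'}]
ds = ImagePortraitEnhancementDataset(records, is_train=True)

sample = ds[0]
# Both tensors are 3x256x256 and normalized to [-1, 1].
print(sample['input'].shape, sample['target'].shape)
```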

modelscope/outputs.py (+1 -0)

@@ -661,6 +661,7 @@ TASK_OUTPUTS = {
# "caption": "this is an image caption text."
# }
Tasks.image_captioning: [OutputKeys.CAPTION],
+ Tasks.ofa_ocr_recognition: [OutputKeys.TEXT],

# visual grounding result for single sample
# {


modelscope/pipelines/cv/image_reid_person_pipeline.py (+1 -0)

@@ -53,6 +53,7 @@ class ImageReidPersonPipeline(Pipeline):
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
img = input['img']
img_embedding = self.model(img)
+ img_embedding = img_embedding.detach().cpu().numpy()
return {OutputKeys.IMG_EMBEDDING: img_embedding}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:


modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py (+52 -0)

@@ -0,0 +1,52 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, Optional, Union

import torch

from modelscope.metainfo import Pipelines
from modelscope.models.multi_modal import OfaForAllTasks
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import OfaPreprocessor, Preprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
    Tasks.ofa_ocr_recognition, module_name=Pipelines.ofa_ocr_recognition)
class OcrRecognitionPipeline(Pipeline):

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        """
        use `model` and `preprocessor` to create a ocr recognition pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model)
        assert isinstance(model, str) or isinstance(model, Model), \
            'model must be a single str or OfaForAllTasks'
        if isinstance(model, str):
            pipe_model = Model.from_pretrained(model)
        elif isinstance(model, Model):
            pipe_model = model
        else:
            raise NotImplementedError
        pipe_model.model.eval()
        if preprocessor is None:
            if isinstance(pipe_model, OfaForAllTasks):
                preprocessor = OfaPreprocessor(pipe_model.model_dir)
        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)

    def forward(self, inputs: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        with torch.no_grad():
            return super().forward(inputs, **forward_params)

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        return inputs
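With this registration, the pipeline can be built by task name. A minimal usage sketch, mirroring the test added to tests/pipelines/test_ofa_tasks.py below:

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

ocr = pipeline(
    Tasks.ofa_ocr_recognition,
    model='damo/ofa_ocr-recognition_scene_base_zh')
result = ocr('data/test/images/image_ocr_recognition.jpg')
print(result[OutputKeys.TEXT])
```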

modelscope/preprocessors/multi_modal.py (+2 -0)

@@ -40,6 +40,7 @@ class OfaPreprocessor(Preprocessor):
"""
super().__init__(*args, **kwargs)
preprocess_mapping = {
+ Tasks.ofa_ocr_recognition: OfaOcrRecognitionPreprocessor,
Tasks.image_captioning: OfaImageCaptioningPreprocessor,
Tasks.visual_grounding: OfaVisualGroundingPreprocessor,
Tasks.visual_question_answering:
@@ -51,6 +52,7 @@ class OfaPreprocessor(Preprocessor):
Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor
}
input_key_mapping = {
+ Tasks.ofa_ocr_recognition: ['image'],
Tasks.image_captioning: ['image'],
Tasks.image_classification: ['image'],
Tasks.summarization: ['text'],


modelscope/preprocessors/ofa/__init__.py (+1 -0)

@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .image_captioning import OfaImageCaptioningPreprocessor
from .image_classification import OfaImageClassificationPreprocessor
+ from .ocr_recognition import OfaOcrRecognitionPreprocessor
from .summarization import OfaSummarizationPreprocessor
from .text_classification import OfaTextClassificationPreprocessor
from .text_to_image_synthesis import OfaTextToImageSynthesisPreprocessor


modelscope/preprocessors/ofa/ocr_recognition.py (+99 -0)

@@ -0,0 +1,99 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import random
import unicodedata
from typing import Any, Dict, Union

import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from torchvision.transforms import functional as F

from modelscope.preprocessors.image import load_image
from .base import OfaBasePreprocessor

IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)


def ocr_resize(img, patch_image_size, is_document=False):
    img = img.convert('RGB')
    width, height = img.size

    if is_document:
        new_height, new_width = 64, 1920
    else:
        if width >= height:
            new_width = max(64, patch_image_size)
            new_height = max(64, int(patch_image_size * (height / width)))
            top = (patch_image_size - new_height) // 2
            bottom = patch_image_size - new_height - top
            left, right = 0, 0
        else:
            new_height = max(64, patch_image_size)
            new_width = max(64, int(patch_image_size * (width / height)))
            left = (patch_image_size - new_width) // 2
            right = patch_image_size - new_width - left
            top, bottom = 0, 0

    img_new = F.resize(
        img,
        (new_height, new_width),
        interpolation=InterpolationMode.BICUBIC,
    )

    if is_document:
        img_split = transforms.ToTensor()(img_new).chunk(4, dim=-1)
        img_new = transforms.ToPILImage()(torch.cat(img_split, dim=-2))
        new_width, new_height = img_new.size
        top = (patch_image_size - new_height) // 2
        bottom = patch_image_size - new_height - top
        left, right = 0, 0

    img_new = F.pad(
        img_new, padding=[left, top, right, bottom], padding_mode='edge')
    assert img_new.size == (patch_image_size, patch_image_size)

    return img_new
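ocr_resize letterboxes a text crop into a square patch: the long side is scaled to patch_image_size, the aspect ratio is preserved, and edge padding fills the remainder. A quick sketch of that invariant (crop and patch sizes here are assumptions):

```python
from PIL import Image

from modelscope.preprocessors.ofa.ocr_recognition import ocr_resize

crop = Image.new('RGB', (300, 100))           # hypothetical wide text crop
out = ocr_resize(crop, patch_image_size=480)  # scaled to 480x160, then padded
print(out.size)                               # (480, 480)
```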


class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):

    def __init__(self, cfg, model_dir):
        """preprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path
        """
        super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir)
        # Initialize transform
        if self.cfg.model.imagenet_default_mean_and_std:
            mean = IMAGENET_DEFAULT_MEAN
            std = IMAGENET_DEFAULT_STD
        else:
            mean = [0.5, 0.5, 0.5]
            std = [0.5, 0.5, 0.5]

        self.patch_resize_transform = transforms.Compose([
            lambda image: ocr_resize(
                image,
                self.cfg.model.patch_image_size,
                is_document=self.cfg.model.is_document),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std),
        ])

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        image = data['image'] if isinstance(
            data['image'], Image.Image) else load_image(data['image'])
        patch_image = self.patch_resize_transform(image)
        prompt = self.cfg.model.get('prompt', '图片上的文字是什么?')
        inputs = self.get_inputs(prompt)

        sample = {
            'source': inputs,
            'patch_image': patch_image,
            'patch_mask': torch.tensor([True])
        }
        return sample

modelscope/utils/constant.py (+1 -0)

@@ -151,6 +151,7 @@ class MultiModalTasks(object):
visual_entailment = 'visual-entailment'
video_multi_modal_embedding = 'video-multi-modal-embedding'
image_text_retrieval = 'image-text-retrieval'
+ ofa_ocr_recognition = 'ofa-ocr-recognition'


class TasksIODescriptions(object):


tests/pipelines/test_ofa_tasks.py (+8 -0)

@@ -45,6 +45,14 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck):
result = img_captioning('data/test/images/image_captioning.png')
print(result[OutputKeys.CAPTION])

+ @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+ def test_run_with_ocr_recognize_with_name(self):
+     ocr_recognize = pipeline(
+         Tasks.ofa_ocr_recognition,
+         model='damo/ofa_ocr-recognition_scene_base_zh')
+     result = ocr_recognize('data/test/images/image_ocr_recognition.jpg')
+     print(result[OutputKeys.TEXT])

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_image_classification_with_model(self):
model = Model.from_pretrained(


tests/trainers/test_image_portrait_enhancement_trainer.py (+25 -48)

@@ -14,52 +14,14 @@ from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.models.cv.image_portrait_enhancement import \
ImagePortraitEnhancement
+ from modelscope.msdatasets import MsDataset
+ from modelscope.msdatasets.task_datasets.image_portrait_enhancement import \
+     ImagePortraitEnhancementDataset
from modelscope.trainers import build_trainer
- from modelscope.utils.constant import ModelFile
+ from modelscope.utils.constant import DownloadMode, ModelFile
from modelscope.utils.test_utils import test_level


- class PairedImageDataset(data.Dataset):
-
-     def __init__(self, root, size=512):
-         super(PairedImageDataset, self).__init__()
-         self.size = size
-         gt_dir = osp.join(root, 'gt')
-         lq_dir = osp.join(root, 'lq')
-         self.gt_filelist = os.listdir(gt_dir)
-         self.gt_filelist = sorted(self.gt_filelist, key=lambda x: int(x[:-4]))
-         self.gt_filelist = [osp.join(gt_dir, f) for f in self.gt_filelist]
-         self.lq_filelist = os.listdir(lq_dir)
-         self.lq_filelist = sorted(self.lq_filelist, key=lambda x: int(x[:-4]))
-         self.lq_filelist = [osp.join(lq_dir, f) for f in self.lq_filelist]
-
-     def _img_to_tensor(self, img):
-         img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1).type(
-             torch.float32) / 255.
-         return (img - 0.5) / 0.5
-
-     def __getitem__(self, index):
-         lq = cv2.imread(self.lq_filelist[index])
-         gt = cv2.imread(self.gt_filelist[index])
-         lq = cv2.resize(
-             lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-         gt = cv2.resize(
-             gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
-
-         return \
-             {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)}
-
-     def __len__(self):
-         return len(self.gt_filelist)
-
-     def to_torch_dataset(self,
-                          columns: Union[str, List[str]] = None,
-                          preprocessors: Union[Callable, List[Callable]] = None,
-                          **format_kwargs):
-         # self.preprocessor = preprocessors
-         return self


class TestImagePortraitEnhancementTrainer(unittest.TestCase):

def setUp(self):
@@ -70,8 +32,23 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):

self.model_id = 'damo/cv_gpen_image-portrait-enhancement'

-         self.dataset = PairedImageDataset(
-             './data/test/images/face_enhancement/')
+         dataset_train = MsDataset.load(
+             'image-portrait-enhancement-dataset',
+             namespace='modelscope',
+             subset_name='default',
+             split='test',
+             download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds
+         dataset_val = MsDataset.load(
+             'image-portrait-enhancement-dataset',
+             namespace='modelscope',
+             subset_name='default',
+             split='test',
+             download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds
+
+         self.dataset_train = ImagePortraitEnhancementDataset(
+             dataset_train, is_train=True)
+         self.dataset_val = ImagePortraitEnhancementDataset(
+             dataset_val, is_train=False)

def tearDown(self):
shutil.rmtree(self.tmp_dir, ignore_errors=True)
@@ -81,8 +58,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
def test_trainer(self):
kwargs = dict(
model=self.model_id,
-             train_dataset=self.dataset,
-             eval_dataset=self.dataset,
+             train_dataset=self.dataset_train,
+             eval_dataset=self.dataset_val,
device='gpu',
work_dir=self.tmp_dir)

@@ -101,8 +78,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase):
kwargs = dict(
cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
model=model,
-             train_dataset=self.dataset,
-             eval_dataset=self.dataset,
+             train_dataset=self.dataset_train,
+             eval_dataset=self.dataset_val,
device='gpu',
max_epochs=2,
work_dir=self.tmp_dir)

