From a4930fc3200253d303ccfc0cb9285750ca72530e Mon Sep 17 00:00:00 2001
From: "xuangen.hlh"
Date: Wed, 6 Jul 2022 07:22:50 +0800
Subject: [PATCH] [to #42322933] Update text-to-image-synthesis pipeline to maas lib

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9268865
---
 modelscope/models/base.py                      |  2 +-
 .../models/multi_modal/imagen/imagen_model.py  | 18 ++++++++++--------
 .../text_to_image_synthesis_pipeline.py        |  9 ++++++---
 .../pipelines/test_text_to_image_synthesis.py  | 14 ++++++++++----
 4 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/modelscope/models/base.py b/modelscope/models/base.py
index 40929a21..bb8cd1cd 100644
--- a/modelscope/models/base.py
+++ b/modelscope/models/base.py
@@ -69,7 +69,7 @@ class Model(ABC):
         model_cfg.model_dir = local_model_dir
 
         for k, v in kwargs.items():
-            model_cfg.k = v
+            model_cfg[k] = v
 
         model = build_model(model_cfg, task_name)
         # dynamically add pipeline info to model for pipeline inference
diff --git a/modelscope/models/multi_modal/imagen/imagen_model.py b/modelscope/models/multi_modal/imagen/imagen_model.py
index e394ccf2..dd00ca07 100644
--- a/modelscope/models/multi_modal/imagen/imagen_model.py
+++ b/modelscope/models/multi_modal/imagen/imagen_model.py
@@ -215,8 +215,9 @@ class ImagenForTextToImageSynthesis(Model):
             eta=input.get('generator_ddim_eta', 0.0))
 
         # upsampling (64->256)
-        img = F.interpolate(
-            img, scale_factor=4.0, mode='bilinear', align_corners=False)
+        if not input.get('debug', False):
+            img = F.interpolate(
+                img, scale_factor=4.0, mode='bilinear', align_corners=False)
         img = self.diffusion_imagen_upsampler_256.ddim_sample_loop(
             noise=torch.randn_like(img),
             model=self.unet_imagen_upsampler_256,
@@ -233,14 +234,15 @@
                 'context': torch.zeros_like(context),
                 'mask': torch.zeros_like(attention_mask)
             }],
-            percentile=input.get('generator_percentile', 0.995),
-            guide_scale=input.get('generator_guide_scale', 5.0),
-            ddim_timesteps=input.get('generator_ddim_timesteps', 50),
-            eta=input.get('generator_ddim_eta', 0.0))
+            percentile=input.get('upsampler_256_percentile', 0.995),
+            guide_scale=input.get('upsampler_256_guide_scale', 5.0),
+            ddim_timesteps=input.get('upsampler_256_ddim_timesteps', 50),
+            eta=input.get('upsampler_256_ddim_eta', 0.0))
 
         # upsampling (256->1024)
-        img = F.interpolate(
-            img, scale_factor=4.0, mode='bilinear', align_corners=False)
+        if not input.get('debug', False):
+            img = F.interpolate(
+                img, scale_factor=4.0, mode='bilinear', align_corners=False)
         img = self.diffusion_upsampler_1024.ddim_sample_loop(
             noise=torch.randn_like(img),
             model=self.unet_upsampler_1024,
diff --git a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
index 02a34428..603a86fd 100644
--- a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
+++ b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py
@@ -1,5 +1,7 @@
 from typing import Any, Dict
 
+import torch
+
 from modelscope.metainfo import Pipelines
 from modelscope.pipelines.base import Input
 from modelscope.utils.constant import Tasks
@@ -16,16 +18,17 @@ logger = get_logger()
     module_name=Pipelines.text_to_image_synthesis)
 class TextToImageSynthesisPipeline(Pipeline):
 
-    def __init__(self, model: str, device_id: int = -1):
+    def __init__(self, model: str, **kwargs):
+        device_id = 0 if torch.cuda.is_available() else -1
         if isinstance(model, str):
-            pipe_model = Model.from_pretrained(model)
+            pipe_model = Model.from_pretrained(model, device_id=device_id)
         elif isinstance(model, Model):
             pipe_model = model
         else:
             raise NotImplementedError(
                 f'expecting a Model instance or str, but get {type(model)}.')
-        super().__init__(model=pipe_model)
+        super().__init__(model=pipe_model, **kwargs)
 
     def preprocess(self, input: Input) -> Dict[str, Any]:
         return input
diff --git a/tests/pipelines/test_text_to_image_synthesis.py b/tests/pipelines/test_text_to_image_synthesis.py
index 568b4832..1e12548a 100644
--- a/tests/pipelines/test_text_to_image_synthesis.py
+++ b/tests/pipelines/test_text_to_image_synthesis.py
@@ -13,9 +13,15 @@ from modelscope.utils.test_utils import test_level
 
 class TextToImageSynthesisTest(unittest.TestCase):
     model_id = 'damo/cv_imagen_text-to-image-synthesis_tiny'
-    test_text = {'text': '宇航员'}
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    test_text = {
+        'text': '宇航员',
+        'generator_ddim_timesteps': 2,
+        'upsampler_256_ddim_timesteps': 2,
+        'upsampler_1024_ddim_timesteps': 2,
+        'debug': True
+    }
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         pipe_line_text_to_image_synthesis = pipeline(
@@ -24,7 +30,7 @@ class TextToImageSynthesisTest(unittest.TestCase):
             self.test_text)[OutputKeys.OUTPUT_IMG]
         print(np.sum(np.abs(img)))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipe_line_text_to_image_synthesis = pipeline(
             task=Tasks.text_to_image_synthesis, model=self.model_id)
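
Reviewer note (not part of the patch): a minimal sketch of how the new keys
travel end to end. Constructor kwargs now flow through
super().__init__(model=pipe_model, **kwargs), the device is picked via
Model.from_pretrained(model, device_id=device_id) and stored by the fixed
model_cfg[k] = v, and per-call dict keys are read back with input.get(...)
inside imagen_model.py. The model id and extra keys below are copied from the
updated test; the import paths are assumed for this revision of the repo.

    # Sketch only -- mirrors tests/pipelines/test_text_to_image_synthesis.py.
    import numpy as np

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # Builds TextToImageSynthesisPipeline; with this patch the model lands on
    # GPU 0 when torch.cuda.is_available(), otherwise on CPU (device_id=-1).
    pipe = pipeline(
        task=Tasks.text_to_image_synthesis,
        model='damo/cv_imagen_text-to-image-synthesis_tiny')

    # Every key besides 'text' is consumed by input.get(...) in
    # imagen_model.py: 2 DDIM steps per stage keep the run fast, and
    # debug=True skips the 4x F.interpolate upsampling between stages.
    img = pipe({
        'text': '宇航员',  # "astronaut"
        'generator_ddim_timesteps': 2,
        'upsampler_256_ddim_timesteps': 2,
        'upsampler_1024_ddim_timesteps': 2,
        'debug': True
    })[OutputKeys.OUTPUT_IMG]
    print(np.sum(np.abs(img)))

Routing the sampler controls through the input dict rather than the
constructor keeps a single pipeline instance reusable for both quick smoke
runs and full-resolution generation.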