From a3c460b13903cbdc10cf88d8f54cb67cc7edebe5 Mon Sep 17 00:00:00 2001
From: "xuangen.hlh"
Date: Tue, 2 Aug 2022 22:15:45 +0800
Subject: [PATCH] [to #42322933] rename imagen to diffusion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Due to potential legal risk, rename the reproduced Google "Imagen"
algorithm to "diffusion".

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9608897
---
 modelscope/metainfo.py                             |  2 +-
 modelscope/models/multi_modal/__init__.py          |  4 +-
 .../models/multi_modal/diffusion/__init__.py       |  1 +
 .../{imagen => diffusion}/diffusion.py             |  0
 .../imagen_model.py => diffusion/model.py}         | 81 +++++++++----------
 .../{imagen => diffusion}/structbert.py            |  0
 .../{imagen => diffusion}/tokenizer.py             |  0
 .../{imagen => diffusion}/unet_generator.py        |  6 +-
 .../unet_upsampler_1024.py                         |  6 +-
 .../unet_upsampler_256.py}                         |  0
 modelscope/models/multi_modal/imagen/__init__.py   |  1 -
 modelscope/pipelines/builder.py                    |  2 +-
 .../pipelines/test_text_to_image_synthesis.py      |  4 +-
 13 files changed, 53 insertions(+), 54 deletions(-)
 create mode 100644 modelscope/models/multi_modal/diffusion/__init__.py
 rename modelscope/models/multi_modal/{imagen => diffusion}/diffusion.py (100%)
 rename modelscope/models/multi_modal/{imagen/imagen_model.py => diffusion/model.py} (76%)
 rename modelscope/models/multi_modal/{imagen => diffusion}/structbert.py (100%)
 rename modelscope/models/multi_modal/{imagen => diffusion}/tokenizer.py (100%)
 rename modelscope/models/multi_modal/{imagen => diffusion}/unet_generator.py (98%)
 rename modelscope/models/multi_modal/{imagen => diffusion}/unet_upsampler_1024.py (98%)
 rename modelscope/models/multi_modal/{imagen/unet_imagen_upsampler_256.py => diffusion/unet_upsampler_256.py} (100%)
 delete mode 100644 modelscope/models/multi_modal/imagen/__init__.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 2a7a9c0a..451c0bec 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -41,7 +41,7 @@ class Models(object):
     clip = 'clip-multi-modal-embedding'
     gemm = 'gemm-generative-multi-modal'
     mplug = 'mplug'
-    imagen = 'imagen-text-to-image-synthesis'
+    diffusion = 'diffusion-text-to-image-synthesis'
     video_clip = 'video-clip-multi-modal-embedding'


diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 8e6e2a39..cd368739 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -7,7 +7,7 @@
 if TYPE_CHECKING:
     from .clip import CLIPForMultiModalEmbedding
     from .gemm import GEMMForMultiModalEmbedding
-    from .imagen import ImagenForTextToImageSynthesis
+    from .diffusion import DiffusionForTextToImageSynthesis
     from .mmr import VideoCLIPForMultiModalEmbedding
     from .mplug_for_visual_question_answering import \
         MPlugForVisualQuestionAnswering
@@ -15,7 +15,7 @@
 else:
     _import_structure = {
         'clip': ['CLIPForMultiModalEmbedding'],
-        'imagen': ['ImagenForTextToImageSynthesis'],
+        'diffusion': ['DiffusionForTextToImageSynthesis'],
         'gemm': ['GEMMForMultiModalEmbedding'],
         'mmr': ['VideoCLIPForMultiModalEmbedding'],
         'mplug_for_visual_question_answering':
diff --git a/modelscope/models/multi_modal/diffusion/__init__.py b/modelscope/models/multi_modal/diffusion/__init__.py
new file mode 100644
index 00000000..28813cc9
--- /dev/null
+++ b/modelscope/models/multi_modal/diffusion/__init__.py
@@ -0,0 +1 @@
+from .model import DiffusionForTextToImageSynthesis
diff --git a/modelscope/models/multi_modal/imagen/diffusion.py b/modelscope/models/multi_modal/diffusion/diffusion.py
similarity index 100%
rename from modelscope/models/multi_modal/imagen/diffusion.py
rename to modelscope/models/multi_modal/diffusion/diffusion.py
diff --git a/modelscope/models/multi_modal/imagen/imagen_model.py b/modelscope/models/multi_modal/diffusion/model.py
similarity index 76%
rename from modelscope/models/multi_modal/imagen/imagen_model.py
rename to modelscope/models/multi_modal/diffusion/model.py
index 37dacb71..4d61e2d1 100644
--- a/modelscope/models/multi_modal/imagen/imagen_model.py
+++ b/modelscope/models/multi_modal/diffusion/model.py
@@ -10,22 +10,23 @@ import torch.nn.functional as F
 from modelscope.metainfo import Models
 from modelscope.models import Model
 from modelscope.models.builder import MODELS
-from modelscope.models.multi_modal.imagen.diffusion import (GaussianDiffusion,
-                                                            beta_schedule)
-from modelscope.models.multi_modal.imagen.structbert import (BertConfig,
-                                                             BertModel)
-from modelscope.models.multi_modal.imagen.tokenizer import FullTokenizer
-from modelscope.models.multi_modal.imagen.unet_generator import ImagenGenerator
-from modelscope.models.multi_modal.imagen.unet_imagen_upsampler_256 import \
+from modelscope.models.multi_modal.diffusion.diffusion import (
+    GaussianDiffusion, beta_schedule)
+from modelscope.models.multi_modal.diffusion.structbert import (BertConfig,
+                                                                BertModel)
+from modelscope.models.multi_modal.diffusion.tokenizer import FullTokenizer
+from modelscope.models.multi_modal.diffusion.unet_generator import \
+    DiffusionGenerator
+from modelscope.models.multi_modal.diffusion.unet_upsampler_256 import \
     SuperResUNet256
-from modelscope.models.multi_modal.imagen.unet_upsampler_1024 import \
-    ImagenUpsampler1024
+from modelscope.models.multi_modal.diffusion.unet_upsampler_1024 import \
+    SuperResUNet1024
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger

 logger = get_logger()

-__all__ = ['ImagenForTextToImageSynthesis']
+__all__ = ['DiffusionForTextToImageSynthesis']


 def make_diffusion(schedule,
@@ -68,13 +69,13 @@ class Tokenizer(object):
         return input_ids, segment_ids, input_mask


-class ImagenModel(nn.Module):
+class DiffusionModel(nn.Module):

     def __init__(self, model_dir):
-        super(ImagenModel, self).__init__()
+        super(DiffusionModel, self).__init__()
         # including text and generator config
         model_config = json.load(
-            open('{}/imagen_config.json'.format(model_dir)))
+            open('{}/model_config.json'.format(model_dir)))

         # text encoder
         text_config = model_config['text_config']
@@ -82,17 +83,15 @@ class ImagenModel(nn.Module):

         # generator (64x64)
         generator_config = model_config['generator_config']
-        self.unet_generator = ImagenGenerator(**generator_config)
+        self.unet_generator = DiffusionGenerator(**generator_config)

-        # imagen upsampler (256x256)
-        imagen_upsampler_256_config = model_config[
-            'imagen_upsampler_256_config']
-        self.unet_imagen_upsampler_256 = SuperResUNet256(
-            **imagen_upsampler_256_config)
+        # upsampler (256x256)
+        upsampler_256_config = model_config['upsampler_256_config']
+        self.unet_upsampler_256 = SuperResUNet256(**upsampler_256_config)

-        # dalle2 upsampler (1024x1024)
+        # upsampler (1024x1024)
         upsampler_1024_config = model_config['upsampler_1024_config']
-        self.unet_upsampler_1024 = ImagenUpsampler1024(**upsampler_1024_config)
+        self.unet_upsampler_1024 = SuperResUNet1024(**upsampler_1024_config)

     def forward(self, noise, timesteps, input_ids, token_type_ids,
                 attention_mask):
@@ -102,39 +101,39 @@ class ImagenModel(nn.Module):
             attention_mask=attention_mask)
         context = context[-1]
         x = self.unet_generator(noise, timesteps, y, context, attention_mask)
-        x = self.unet_imagen_upsampler_256(noise, timesteps, x,
-                                           torch.zeros_like(timesteps), y,
-                                           context, attention_mask)
+        x = self.unet_upsampler_256(noise, timesteps, x,
+                                    torch.zeros_like(timesteps), y, context,
+                                    attention_mask)
         x = self.unet_upsampler_1024(x, t, x)
         return x


 @MODELS.register_module(
-    Tasks.text_to_image_synthesis, module_name=Models.imagen)
-class ImagenForTextToImageSynthesis(Model):
+    Tasks.text_to_image_synthesis, module_name=Models.diffusion)
+class DiffusionForTextToImageSynthesis(Model):

     def __init__(self, model_dir, device_id=-1):
         super().__init__(model_dir=model_dir, device_id=device_id)
-        imagen_model = ImagenModel(model_dir=model_dir)
+        diffusion_model = DiffusionModel(model_dir=model_dir)
         pretrained_params = torch.load(
             osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), 'cpu')
-        imagen_model.load_state_dict(pretrained_params)
-        imagen_model.eval()
+        diffusion_model.load_state_dict(pretrained_params)
+        diffusion_model.eval()

         self.device_id = device_id
         if self.device_id >= 0:
             self.device = torch.device(f'cuda:{self.device_id}')
-            imagen_model.to('cuda:{}'.format(self.device_id))
+            diffusion_model.to('cuda:{}'.format(self.device_id))
             logger.info('Use GPU: {}'.format(self.device_id))
         else:
             self.device = torch.device('cpu')
             logger.info('Use CPU for inference')

         # modules
-        self.text_encoder = imagen_model.text_encoder
-        self.unet_generator = imagen_model.unet_generator
-        self.unet_imagen_upsampler_256 = imagen_model.unet_imagen_upsampler_256
-        self.unet_upsampler_1024 = imagen_model.unet_upsampler_1024
+        self.text_encoder = diffusion_model.text_encoder
+        self.unet_generator = diffusion_model.unet_generator
+        self.unet_upsampler_256 = diffusion_model.unet_upsampler_256
+        self.unet_upsampler_1024 = diffusion_model.unet_upsampler_1024

         # text tokenizer
         vocab_path = '{}/vocab.txt'.format(model_dir)
@@ -145,8 +144,8 @@ class ImagenForTextToImageSynthesis(Model):
             open('{}/diffusion_config.json'.format(model_dir)))
         self.diffusion_generator = make_diffusion(
             **diffusion_params['generator_config'])
-        self.diffusion_imagen_upsampler_256 = make_diffusion(
-            **diffusion_params['imagen_upsampler_256_config'])
+        self.diffusion_upsampler_256 = make_diffusion(
+            **diffusion_params['upsampler_256_config'])
         self.diffusion_upsampler_1024 = make_diffusion(
             **diffusion_params['upsampler_1024_config'])

@@ -166,9 +165,9 @@ class ImagenForTextToImageSynthesis(Model):
             attention_mask=attention_mask)
         context = context[-1]
         x = self.unet_generator(noise, timesteps, y, context, attention_mask)
-        x = self.unet_imagen_upsampler_256(noise, timesteps, x,
-                                           torch.zeros_like(timesteps), y,
-                                           context, attention_mask)
+        x = self.unet_upsampler_256(noise, timesteps, x,
+                                    torch.zeros_like(timesteps), y, context,
+                                    attention_mask)
         x = self.unet_upsampler_1024(x, t, x)
         img = x.clamp(-1, 1).add(1).mul(127.5)
         img = img.squeeze(0).permute(1, 2, 0).cpu().numpy().astype(np.uint8)
@@ -217,9 +216,9 @@ class ImagenForTextToImageSynthesis(Model):
         if not input.get('debug', False):
             img = F.interpolate(
                 img, scale_factor=4.0, mode='bilinear', align_corners=False)
-            img = self.diffusion_imagen_upsampler_256.ddim_sample_loop(
+            img = self.diffusion_upsampler_256.ddim_sample_loop(
                 noise=torch.randn_like(img),
-                model=self.unet_imagen_upsampler_256,
+                model=self.unet_upsampler_256,
                 model_kwargs=[{
                     'lx': img,
                     'lt': torch.zeros(1).to(self.device),
diff --git a/modelscope/models/multi_modal/imagen/structbert.py b/modelscope/models/multi_modal/diffusion/structbert.py
similarity index 100%
rename from modelscope/models/multi_modal/imagen/structbert.py
rename to modelscope/models/multi_modal/diffusion/structbert.py
diff --git a/modelscope/models/multi_modal/imagen/tokenizer.py b/modelscope/models/multi_modal/diffusion/tokenizer.py
similarity index 100%
rename from modelscope/models/multi_modal/imagen/tokenizer.py
rename to modelscope/models/multi_modal/diffusion/tokenizer.py
diff --git a/modelscope/models/multi_modal/imagen/unet_generator.py b/modelscope/models/multi_modal/diffusion/unet_generator.py
similarity index 98%
rename from modelscope/models/multi_modal/imagen/unet_generator.py
rename to modelscope/models/multi_modal/diffusion/unet_generator.py
index 2b780a36..9b507223 100644
--- a/modelscope/models/multi_modal/imagen/unet_generator.py
+++ b/modelscope/models/multi_modal/diffusion/unet_generator.py
@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-__all__ = ['ImagenGenerator']
+__all__ = ['DiffusionGenerator']


 def sinusoidal_embedding(timesteps, dim):
@@ -156,7 +156,7 @@ class AttentionBlock(nn.Module):
         return x + identity


-class ImagenGenerator(nn.Module):
+class DiffusionGenerator(nn.Module):

     def __init__(self,
                  in_dim=3,
@@ -173,7 +173,7 @@ class ImagenGenerator(nn.Module):
                  use_scale_shift_norm=True,
                  dropout=0.0):
         embed_dim = dim * 4
-        super(ImagenGenerator, self).__init__()
+        super(DiffusionGenerator, self).__init__()
         self.in_dim = in_dim
         self.dim = dim
         self.text_dim = text_dim
diff --git a/modelscope/models/multi_modal/imagen/unet_upsampler_1024.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
similarity index 98%
rename from modelscope/models/multi_modal/imagen/unet_upsampler_1024.py
rename to modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
index 07d3648c..1c66b2fe 100644
--- a/modelscope/models/multi_modal/imagen/unet_upsampler_1024.py
+++ b/modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-__all__ = ['ImagenUpsampler1024']
+__all__ = ['SuperResUNet1024']


 def sinusoidal_embedding(timesteps, dim):
@@ -99,7 +99,7 @@ class ResidualBlock(nn.Module):
         return x


-class ImagenUpsampler1024(nn.Module):
+class SuperResUNet1024(nn.Module):

     def __init__(self,
                  in_dim=6,
@@ -111,7 +111,7 @@ class ImagenUpsampler1024(nn.Module):
                  use_scale_shift_norm=True,
                  dropout=0.0):
         embed_dim = dim * 4
-        super(ImagenUpsampler1024, self).__init__()
+        super(SuperResUNet1024, self).__init__()
         self.in_dim = in_dim
         self.dim = dim
         self.out_dim = out_dim
diff --git a/modelscope/models/multi_modal/imagen/unet_imagen_upsampler_256.py b/modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
similarity index 100%
rename from modelscope/models/multi_modal/imagen/unet_imagen_upsampler_256.py
rename to modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
diff --git a/modelscope/models/multi_modal/imagen/__init__.py b/modelscope/models/multi_modal/imagen/__init__.py
deleted file mode 100644
index 0f5cd0ed..00000000
--- a/modelscope/models/multi_modal/imagen/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .imagen_model import ImagenForTextToImageSynthesis
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 14a3de1e..0acc6d49 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -86,7 +86,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                              'damo/cv_r2p1d_video_embedding'),
     Tasks.text_to_image_synthesis:
     (Pipelines.text_to_image_synthesis,
-     'damo/cv_imagen_text-to-image-synthesis_tiny'),
+     'damo/cv_diffusion_text-to-image-synthesis_tiny'),
     Tasks.face_detection: (Pipelines.face_detection,
                            'damo/cv_resnet_facedetection_scrfd10gkps'),
     Tasks.face_recognition: (Pipelines.face_recognition,
diff --git a/tests/pipelines/test_text_to_image_synthesis.py b/tests/pipelines/test_text_to_image_synthesis.py
index 6a2edb57..32778ffb 100644
--- a/tests/pipelines/test_text_to_image_synthesis.py
+++ b/tests/pipelines/test_text_to_image_synthesis.py
@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level


 class TextToImageSynthesisTest(unittest.TestCase):
-    model_id = 'damo/cv_imagen_text-to-image-synthesis_tiny'
+    model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny'
     test_text = {
         'text': '宇航员',
         'generator_ddim_timesteps': 2,
@@ -30,7 +30,7 @@ class TextToImageSynthesisTest(unittest.TestCase):
             self.test_text)[OutputKeys.OUTPUT_IMG]
         print(np.sum(np.abs(img)))

-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipe_line_text_to_image_synthesis = pipeline(
             task=Tasks.text_to_image_synthesis, model=self.model_id)
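
Note for downstream users: the pipeline entry point is unchanged by this patch, but the default model id and the registered module name are new. Below is a minimal usage sketch mirroring the updated tests/pipelines/test_text_to_image_synthesis.py; the import paths and OutputKeys constant are the ones the repository's tests already use, and the small ddim-timesteps value is the test's smoke-test setting, not a quality setting.

import numpy as np

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# New default model id introduced by this patch (replaces
# 'damo/cv_imagen_text-to-image-synthesis_tiny').
model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny'

# Inputs copied from the updated unit test: 'text' is the prompt
# ('astronaut'); 'generator_ddim_timesteps' cuts the 64x64 base stage
# down to 2 DDIM steps so the run finishes quickly. The test passes
# further sampling kwargs that are truncated in this diff and omitted here.
inputs = {'text': '宇航员', 'generator_ddim_timesteps': 2}

# Resolves to DiffusionForTextToImageSynthesis via the renamed
# Models.diffusion registration in modelscope/metainfo.py.
pipe = pipeline(task=Tasks.text_to_image_synthesis, model=model_id)

img = pipe(inputs)[OutputKeys.OUTPUT_IMG]  # HxWx3 uint8 numpy array
print(np.sum(np.abs(img)))

Any code that still imports ImagenForTextToImageSynthesis or references modelscope.models.multi_modal.imagen must switch to the diffusion paths introduced above.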