
[to #42322933] rename imagen to diffusion

Due to potential legal risk, the reproduced Google "Imagen" algorithm is renamed to "diffusion".
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9608897
master
xuangen.hlh yingda.chen 3 years ago
parent
commit a3c460b139
13 changed files with 53 additions and 54 deletions
  1. +1 -1    modelscope/metainfo.py
  2. +2 -2    modelscope/models/multi_modal/__init__.py
  3. +1 -0    modelscope/models/multi_modal/diffusion/__init__.py
  4. +0 -0    modelscope/models/multi_modal/diffusion/diffusion.py
  5. +40 -41  modelscope/models/multi_modal/diffusion/model.py
  6. +0 -0    modelscope/models/multi_modal/diffusion/structbert.py
  7. +0 -0    modelscope/models/multi_modal/diffusion/tokenizer.py
  8. +3 -3    modelscope/models/multi_modal/diffusion/unet_generator.py
  9. +3 -3    modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py
  10. +0 -0   modelscope/models/multi_modal/diffusion/unet_upsampler_256.py
  11. +0 -1   modelscope/models/multi_modal/imagen/__init__.py
  12. +1 -1   modelscope/pipelines/builder.py
  13. +2 -2   tests/pipelines/test_text_to_image_synthesis.py

+1 -1  modelscope/metainfo.py

@@ -41,7 +41,7 @@ class Models(object):
     clip = 'clip-multi-modal-embedding'
     gemm = 'gemm-generative-multi-modal'
     mplug = 'mplug'
-    imagen = 'imagen-text-to-image-synthesis'
+    diffusion = 'diffusion-text-to-image-synthesis'
     video_clip = 'video-clip-multi-modal-embedding'






+2 -2  modelscope/models/multi_modal/__init__.py

@@ -7,7 +7,7 @@ if TYPE_CHECKING:

     from .clip import CLIPForMultiModalEmbedding
     from .gemm import GEMMForMultiModalEmbedding
-    from .imagen import ImagenForTextToImageSynthesis
+    from .diffusion import DiffusionForTextToImageSynthesis
     from .mmr import VideoCLIPForMultiModalEmbedding
     from .mplug_for_visual_question_answering import \
         MPlugForVisualQuestionAnswering
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
 else:
     _import_structure = {
         'clip': ['CLIPForMultiModalEmbedding'],
-        'imagen': ['ImagenForTextToImageSynthesis'],
+        'diffusion': ['DiffusionForTextToImageSynthesis'],
         'gemm': ['GEMMForMultiModalEmbedding'],
         'mmr': ['VideoCLIPForMultiModalEmbedding'],
         'mplug_for_visual_question_answering':


+1 -0  modelscope/models/multi_modal/diffusion/__init__.py

@@ -0,0 +1 @@
+from .model import DiffusionForTextToImageSynthesis

modelscope/models/multi_modal/imagen/diffusion.py → modelscope/models/multi_modal/diffusion/diffusion.py


modelscope/models/multi_modal/imagen/imagen_model.py → modelscope/models/multi_modal/diffusion/model.py

@@ -10,22 +10,23 @@ import torch.nn.functional as F
 from modelscope.metainfo import Models
 from modelscope.models import Model
 from modelscope.models.builder import MODELS
-from modelscope.models.multi_modal.imagen.diffusion import (GaussianDiffusion,
-                                                            beta_schedule)
-from modelscope.models.multi_modal.imagen.structbert import (BertConfig,
-                                                              BertModel)
-from modelscope.models.multi_modal.imagen.tokenizer import FullTokenizer
-from modelscope.models.multi_modal.imagen.unet_generator import ImagenGenerator
-from modelscope.models.multi_modal.imagen.unet_imagen_upsampler_256 import \
-    SuperResUNet256
-from modelscope.models.multi_modal.imagen.unet_upsampler_1024 import \
-    ImagenUpsampler1024
+from modelscope.models.multi_modal.diffusion.diffusion import (
+    GaussianDiffusion, beta_schedule)
+from modelscope.models.multi_modal.diffusion.structbert import (BertConfig,
+                                                                BertModel)
+from modelscope.models.multi_modal.diffusion.tokenizer import FullTokenizer
+from modelscope.models.multi_modal.diffusion.unet_generator import \
+    DiffusionGenerator
+from modelscope.models.multi_modal.diffusion.unet_upsampler_256 import \
+    SuperResUNet256
+from modelscope.models.multi_modal.diffusion.unet_upsampler_1024 import \
+    SuperResUNet1024
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger

 logger = get_logger()

-__all__ = ['ImagenForTextToImageSynthesis']
+__all__ = ['DiffusionForTextToImageSynthesis']


 def make_diffusion(schedule,
@@ -68,13 +69,13 @@ class Tokenizer(object):
         return input_ids, segment_ids, input_mask


-class ImagenModel(nn.Module):
+class DiffusionModel(nn.Module):

     def __init__(self, model_dir):
-        super(ImagenModel, self).__init__()
+        super(DiffusionModel, self).__init__()
         # including text and generator config
         model_config = json.load(
-            open('{}/imagen_config.json'.format(model_dir)))
+            open('{}/model_config.json'.format(model_dir)))

         # text encoder
         text_config = model_config['text_config']
@@ -82,17 +83,15 @@ class ImagenModel(nn.Module):

         # generator (64x64)
         generator_config = model_config['generator_config']
-        self.unet_generator = ImagenGenerator(**generator_config)
+        self.unet_generator = DiffusionGenerator(**generator_config)

-        # imagen upsampler (256x256)
-        imagen_upsampler_256_config = model_config[
-            'imagen_upsampler_256_config']
-        self.unet_imagen_upsampler_256 = SuperResUNet256(
-            **imagen_upsampler_256_config)
+        # upsampler (256x256)
+        upsampler_256_config = model_config['upsampler_256_config']
+        self.unet_upsampler_256 = SuperResUNet256(**upsampler_256_config)

-        # dalle2 upsampler (1024x1024)
+        # upsampler (1024x1024)
         upsampler_1024_config = model_config['upsampler_1024_config']
-        self.unet_upsampler_1024 = ImagenUpsampler1024(**upsampler_1024_config)
+        self.unet_upsampler_1024 = SuperResUNet1024(**upsampler_1024_config)

     def forward(self, noise, timesteps, input_ids, token_type_ids,
                 attention_mask):
@@ -102,39 +101,39 @@ class ImagenModel(nn.Module):
             attention_mask=attention_mask)
         context = context[-1]
         x = self.unet_generator(noise, timesteps, y, context, attention_mask)
-        x = self.unet_imagen_upsampler_256(noise, timesteps, x,
-                                           torch.zeros_like(timesteps), y,
-                                           context, attention_mask)
+        x = self.unet_upsampler_256(noise, timesteps, x,
+                                    torch.zeros_like(timesteps), y, context,
+                                    attention_mask)
         x = self.unet_upsampler_1024(x, t, x)
         return x


 @MODELS.register_module(
-    Tasks.text_to_image_synthesis, module_name=Models.imagen)
-class ImagenForTextToImageSynthesis(Model):
+    Tasks.text_to_image_synthesis, module_name=Models.diffusion)
+class DiffusionForTextToImageSynthesis(Model):

     def __init__(self, model_dir, device_id=-1):
         super().__init__(model_dir=model_dir, device_id=device_id)
-        imagen_model = ImagenModel(model_dir=model_dir)
+        diffusion_model = DiffusionModel(model_dir=model_dir)
         pretrained_params = torch.load(
             osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), 'cpu')
-        imagen_model.load_state_dict(pretrained_params)
-        imagen_model.eval()
+        diffusion_model.load_state_dict(pretrained_params)
+        diffusion_model.eval()

         self.device_id = device_id
         if self.device_id >= 0:
             self.device = torch.device(f'cuda:{self.device_id}')
-            imagen_model.to('cuda:{}'.format(self.device_id))
+            diffusion_model.to('cuda:{}'.format(self.device_id))
             logger.info('Use GPU: {}'.format(self.device_id))
         else:
             self.device = torch.device('cpu')
             logger.info('Use CPU for inference')

         # modules
-        self.text_encoder = imagen_model.text_encoder
-        self.unet_generator = imagen_model.unet_generator
-        self.unet_imagen_upsampler_256 = imagen_model.unet_imagen_upsampler_256
-        self.unet_upsampler_1024 = imagen_model.unet_upsampler_1024
+        self.text_encoder = diffusion_model.text_encoder
+        self.unet_generator = diffusion_model.unet_generator
+        self.unet_upsampler_256 = diffusion_model.unet_upsampler_256
+        self.unet_upsampler_1024 = diffusion_model.unet_upsampler_1024

         # text tokenizer
         vocab_path = '{}/vocab.txt'.format(model_dir)
@@ -145,8 +144,8 @@ class ImagenForTextToImageSynthesis(Model):
             open('{}/diffusion_config.json'.format(model_dir)))
         self.diffusion_generator = make_diffusion(
             **diffusion_params['generator_config'])
-        self.diffusion_imagen_upsampler_256 = make_diffusion(
-            **diffusion_params['imagen_upsampler_256_config'])
+        self.diffusion_upsampler_256 = make_diffusion(
+            **diffusion_params['upsampler_256_config'])
         self.diffusion_upsampler_1024 = make_diffusion(
             **diffusion_params['upsampler_1024_config'])

@@ -166,9 +165,9 @@ class ImagenForTextToImageSynthesis(Model):
             attention_mask=attention_mask)
         context = context[-1]
         x = self.unet_generator(noise, timesteps, y, context, attention_mask)
-        x = self.unet_imagen_upsampler_256(noise, timesteps, x,
-                                           torch.zeros_like(timesteps), y,
-                                           context, attention_mask)
+        x = self.unet_upsampler_256(noise, timesteps, x,
+                                    torch.zeros_like(timesteps), y, context,
+                                    attention_mask)
         x = self.unet_upsampler_1024(x, t, x)
         img = x.clamp(-1, 1).add(1).mul(127.5)
         img = img.squeeze(0).permute(1, 2, 0).cpu().numpy().astype(np.uint8)
@@ -217,9 +216,9 @@ class ImagenForTextToImageSynthesis(Model):
         if not input.get('debug', False):
             img = F.interpolate(
                 img, scale_factor=4.0, mode='bilinear', align_corners=False)
-            img = self.diffusion_imagen_upsampler_256.ddim_sample_loop(
+            img = self.diffusion_upsampler_256.ddim_sample_loop(
                 noise=torch.randn_like(img),
-                model=self.unet_imagen_upsampler_256,
+                model=self.unet_upsampler_256,
                 model_kwargs=[{
                     'lx': img,
                     'lt': torch.zeros(1).to(self.device),

modelscope/models/multi_modal/imagen/structbert.py → modelscope/models/multi_modal/diffusion/structbert.py


modelscope/models/multi_modal/imagen/tokenizer.py → modelscope/models/multi_modal/diffusion/tokenizer.py


modelscope/models/multi_modal/imagen/unet_generator.py → modelscope/models/multi_modal/diffusion/unet_generator.py

@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-__all__ = ['ImagenGenerator']
+__all__ = ['DiffusionGenerator']


 def sinusoidal_embedding(timesteps, dim):
@@ -156,7 +156,7 @@ class AttentionBlock(nn.Module):
         return x + identity


-class ImagenGenerator(nn.Module):
+class DiffusionGenerator(nn.Module):

     def __init__(self,
                  in_dim=3,
@@ -173,7 +173,7 @@ class ImagenGenerator(nn.Module):
                  use_scale_shift_norm=True,
                  dropout=0.0):
         embed_dim = dim * 4
-        super(ImagenGenerator, self).__init__()
+        super(DiffusionGenerator, self).__init__()
         self.in_dim = in_dim
         self.dim = dim
         self.text_dim = text_dim

modelscope/models/multi_modal/imagen/unet_upsampler_1024.py → modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py

@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-__all__ = ['ImagenUpsampler1024']
+__all__ = ['SuperResUNet1024']


 def sinusoidal_embedding(timesteps, dim):
@@ -99,7 +99,7 @@ class ResidualBlock(nn.Module):
         return x


-class ImagenUpsampler1024(nn.Module):
+class SuperResUNet1024(nn.Module):

     def __init__(self,
                  in_dim=6,
@@ -111,7 +111,7 @@ class ImagenUpsampler1024(nn.Module):
                  use_scale_shift_norm=True,
                  dropout=0.0):
         embed_dim = dim * 4
-        super(ImagenUpsampler1024, self).__init__()
+        super(SuperResUNet1024, self).__init__()
         self.in_dim = in_dim
         self.dim = dim
         self.out_dim = out_dim

modelscope/models/multi_modal/imagen/unet_imagen_upsampler_256.py → modelscope/models/multi_modal/diffusion/unet_upsampler_256.py


+0 -1  modelscope/models/multi_modal/imagen/__init__.py

@@ -1 +0,0 @@
-from .imagen_model import ImagenForTextToImageSynthesis

+1 -1  modelscope/pipelines/builder.py

@@ -86,7 +86,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                              'damo/cv_r2p1d_video_embedding'),
     Tasks.text_to_image_synthesis:
     (Pipelines.text_to_image_synthesis,
-     'damo/cv_imagen_text-to-image-synthesis_tiny'),
+     'damo/cv_diffusion_text-to-image-synthesis_tiny'),
     Tasks.face_detection: (Pipelines.face_detection,
                            'damo/cv_resnet_facedetection_scrfd10gkps'),
     Tasks.face_recognition: (Pipelines.face_recognition,


+2 -2  tests/pipelines/test_text_to_image_synthesis.py

@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level


 class TextToImageSynthesisTest(unittest.TestCase):
-    model_id = 'damo/cv_imagen_text-to-image-synthesis_tiny'
+    model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny'
     test_text = {
         'text': '宇航员',
         'generator_ddim_timesteps': 2,
@@ -30,7 +30,7 @@ class TextToImageSynthesisTest(unittest.TestCase):
             self.test_text)[OutputKeys.OUTPUT_IMG]
         print(np.sum(np.abs(img)))

-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipe_line_text_to_image_synthesis = pipeline(
             task=Tasks.text_to_image_synthesis, model=self.model_id)
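For reference, a minimal usage sketch of the pipeline after the rename, mirroring the test above. The task, model id, and input keys are taken from this diff; the import locations of pipeline, Tasks, and OutputKeys are assumed from the surrounding library layout, not shown in this commit.

# Hypothetical usage sketch after the rename (not part of this commit).
# Task, model id and input keys come from the diff above; the import
# paths for pipeline/Tasks/OutputKeys are assumed.
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipe = pipeline(
    task=Tasks.text_to_image_synthesis,
    model='damo/cv_diffusion_text-to-image-synthesis_tiny')
# Sampling keys such as 'generator_ddim_timesteps' follow the test dict above.
result = pipe({'text': '宇航员', 'generator_ddim_timesteps': 2})
img = result[OutputKeys.OUTPUT_IMG]  # uint8 image array, shape (H, W, 3)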

