
[to #42322933] rename imagen to diffusion

Due to potential legal risk, the reproduced Google "Imagen" algorithm is renamed to "diffusion".
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9608897

Branch: master
xuangen.hlh (authored), yingda.chen (committed), 3 years ago
commit a3c460b139
13 changed files with 53 additions and 54 deletions

 1. modelscope/metainfo.py (+1 -1)
 2. modelscope/models/multi_modal/__init__.py (+2 -2)
 3. modelscope/models/multi_modal/diffusion/__init__.py (+1 -0)
 4. modelscope/models/multi_modal/diffusion/diffusion.py (+0 -0)
 5. modelscope/models/multi_modal/diffusion/model.py (+40 -41)
 6. modelscope/models/multi_modal/diffusion/structbert.py (+0 -0)
 7. modelscope/models/multi_modal/diffusion/tokenizer.py (+0 -0)
 8. modelscope/models/multi_modal/diffusion/unet_generator.py (+3 -3)
 9. modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py (+3 -3)
10. modelscope/models/multi_modal/diffusion/unet_upsampler_256.py (+0 -0)
11. modelscope/models/multi_modal/imagen/__init__.py (+0 -1)
12. modelscope/pipelines/builder.py (+1 -1)
13. tests/pipelines/test_text_to_image_synthesis.py (+2 -2)

modelscope/metainfo.py (+1 -1)

@@ -41,7 +41,7 @@ class Models(object):
     clip = 'clip-multi-modal-embedding'
     gemm = 'gemm-generative-multi-modal'
     mplug = 'mplug'
-    imagen = 'imagen-text-to-image-synthesis'
+    diffusion = 'diffusion-text-to-image-synthesis'
     video_clip = 'video-clip-multi-modal-embedding'
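
Aside: `Models.diffusion` is the registry key that `model.py` passes to `@MODELS.register_module(...)` further down, which is why the constant and the registration must change together. A minimal sketch of that string-keyed registry pattern, using hypothetical stand-ins rather than ModelScope's real `Registry` code:

    MODELS = {}

    def register_module(task, module_name):
        # Hypothetical simplification: map (task, key) to the decorated class
        # so the pipeline builder can look it up later by string.
        def decorator(cls):
            MODELS[(task, module_name)] = cls
            return cls
        return decorator

    @register_module('text-to-image-synthesis',
                     'diffusion-text-to-image-synthesis')
    class DiffusionForTextToImageSynthesis:
        pass

    # After the rename, lookups must use the new key string as well.
    assert ('text-to-image-synthesis',
            'diffusion-text-to-image-synthesis') in MODELS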




modelscope/models/multi_modal/__init__.py (+2 -2)

@@ -7,7 +7,7 @@ if TYPE_CHECKING:

     from .clip import CLIPForMultiModalEmbedding
     from .gemm import GEMMForMultiModalEmbedding
-    from .imagen import ImagenForTextToImageSynthesis
+    from .diffusion import DiffusionForTextToImageSynthesis
     from .mmr import VideoCLIPForMultiModalEmbedding
     from .mplug_for_visual_question_answering import \
         MPlugForVisualQuestionAnswering
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
 else:
     _import_structure = {
         'clip': ['CLIPForMultiModalEmbedding'],
-        'imagen': ['ImagenForTextToImageSynthesis'],
+        'diffusion': ['DiffusionForTextToImageSynthesis'],
         'gemm': ['GEMMForMultiModalEmbedding'],
         'mmr': ['VideoCLIPForMultiModalEmbedding'],
         'mplug_for_visual_question_answering':
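
Aside: the `TYPE_CHECKING` / `_import_structure` split defers the heavy submodule imports until an attribute is first accessed. ModelScope's own lazy-module machinery is not part of this diff, so the following is only an illustrative sketch of the idea using module-level `__getattr__` (PEP 562), as it might appear in a package `__init__.py`:

    import importlib
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Static type checkers see the real imports.
        from .diffusion import DiffusionForTextToImageSynthesis
    else:
        _import_structure = {'diffusion': ['DiffusionForTextToImageSynthesis']}

        def __getattr__(name):
            # Import the owning submodule on first attribute access.
            for submodule, symbols in _import_structure.items():
                if name in symbols:
                    module = importlib.import_module(f'.{submodule}', __name__)
                    return getattr(module, name)
            raise AttributeError(name)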


modelscope/models/multi_modal/diffusion/__init__.py (+1 -0)

@@ -0,0 +1 @@
+from .model import DiffusionForTextToImageSynthesis

modelscope/models/multi_modal/imagen/diffusion.py → modelscope/models/multi_modal/diffusion/diffusion.py


modelscope/models/multi_modal/imagen/imagen_model.py → modelscope/models/multi_modal/diffusion/model.py

@@ -10,22 +10,23 @@ import torch.nn.functional as F
 from modelscope.metainfo import Models
 from modelscope.models import Model
 from modelscope.models.builder import MODELS
-from modelscope.models.multi_modal.imagen.diffusion import (GaussianDiffusion,
-                                                            beta_schedule)
-from modelscope.models.multi_modal.imagen.structbert import (BertConfig,
-                                                             BertModel)
-from modelscope.models.multi_modal.imagen.tokenizer import FullTokenizer
-from modelscope.models.multi_modal.imagen.unet_generator import ImagenGenerator
-from modelscope.models.multi_modal.imagen.unet_imagen_upsampler_256 import \
+from modelscope.models.multi_modal.diffusion.diffusion import (
+    GaussianDiffusion, beta_schedule)
+from modelscope.models.multi_modal.diffusion.structbert import (BertConfig,
+                                                                BertModel)
+from modelscope.models.multi_modal.diffusion.tokenizer import FullTokenizer
+from modelscope.models.multi_modal.diffusion.unet_generator import \
+    DiffusionGenerator
+from modelscope.models.multi_modal.diffusion.unet_upsampler_256 import \
     SuperResUNet256
-from modelscope.models.multi_modal.imagen.unet_upsampler_1024 import \
-    ImagenUpsampler1024
+from modelscope.models.multi_modal.diffusion.unet_upsampler_1024 import \
+    SuperResUNet1024
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger

 logger = get_logger()

-__all__ = ['ImagenForTextToImageSynthesis']
+__all__ = ['DiffusionForTextToImageSynthesis']


 def make_diffusion(schedule,
@@ -68,13 +69,13 @@ class Tokenizer(object):
         return input_ids, segment_ids, input_mask


-class ImagenModel(nn.Module):
+class DiffusionModel(nn.Module):

     def __init__(self, model_dir):
-        super(ImagenModel, self).__init__()
+        super(DiffusionModel, self).__init__()
         # including text and generator config
         model_config = json.load(
-            open('{}/imagen_config.json'.format(model_dir)))
+            open('{}/model_config.json'.format(model_dir)))

         # text encoder
         text_config = model_config['text_config']
@@ -82,17 +83,15 @@ class ImagenModel(nn.Module):

         # generator (64x64)
         generator_config = model_config['generator_config']
-        self.unet_generator = ImagenGenerator(**generator_config)
+        self.unet_generator = DiffusionGenerator(**generator_config)

-        # imagen upsampler (256x256)
-        imagen_upsampler_256_config = model_config[
-            'imagen_upsampler_256_config']
-        self.unet_imagen_upsampler_256 = SuperResUNet256(
-            **imagen_upsampler_256_config)
+        # upsampler (256x256)
+        upsampler_256_config = model_config['upsampler_256_config']
+        self.unet_upsampler_256 = SuperResUNet256(**upsampler_256_config)

-        # dalle2 upsampler (1024x1024)
+        # upsampler (1024x1024)
         upsampler_1024_config = model_config['upsampler_1024_config']
-        self.unet_upsampler_1024 = ImagenUpsampler1024(**upsampler_1024_config)
+        self.unet_upsampler_1024 = SuperResUNet1024(**upsampler_1024_config)

     def forward(self, noise, timesteps, input_ids, token_type_ids,
                 attention_mask):
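
Aside: judging only from the keys accessed in `__init__` above, the renamed `model_config.json` presumably has a shape like the sketch below; the top-level keys are confirmed by this diff, the inner values are illustrative placeholders only:

    model_config = {
        'text_config': {},            # BertConfig kwargs for the text encoder
        'generator_config': {},       # DiffusionGenerator kwargs (64x64 base)
        'upsampler_256_config': {},   # SuperResUNet256 kwargs (64 -> 256)
        'upsampler_1024_config': {},  # SuperResUNet1024 kwargs (256 -> 1024)
    }
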
@@ -102,39 +101,39 @@ class ImagenModel(nn.Module):
                                 attention_mask=attention_mask)
         context = context[-1]
         x = self.unet_generator(noise, timesteps, y, context, attention_mask)
-        x = self.unet_imagen_upsampler_256(noise, timesteps, x,
-                                           torch.zeros_like(timesteps), y,
-                                           context, attention_mask)
+        x = self.unet_upsampler_256(noise, timesteps, x,
+                                    torch.zeros_like(timesteps), y, context,
+                                    attention_mask)
         x = self.unet_upsampler_1024(x, t, x)
         return x


 @MODELS.register_module(
-    Tasks.text_to_image_synthesis, module_name=Models.imagen)
-class ImagenForTextToImageSynthesis(Model):
+    Tasks.text_to_image_synthesis, module_name=Models.diffusion)
+class DiffusionForTextToImageSynthesis(Model):

     def __init__(self, model_dir, device_id=-1):
         super().__init__(model_dir=model_dir, device_id=device_id)
-        imagen_model = ImagenModel(model_dir=model_dir)
+        diffusion_model = DiffusionModel(model_dir=model_dir)
         pretrained_params = torch.load(
             osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), 'cpu')
-        imagen_model.load_state_dict(pretrained_params)
-        imagen_model.eval()
+        diffusion_model.load_state_dict(pretrained_params)
+        diffusion_model.eval()

         self.device_id = device_id
         if self.device_id >= 0:
             self.device = torch.device(f'cuda:{self.device_id}')
-            imagen_model.to('cuda:{}'.format(self.device_id))
+            diffusion_model.to('cuda:{}'.format(self.device_id))
             logger.info('Use GPU: {}'.format(self.device_id))
         else:
             self.device = torch.device('cpu')
             logger.info('Use CPU for inference')

         # modules
-        self.text_encoder = imagen_model.text_encoder
-        self.unet_generator = imagen_model.unet_generator
-        self.unet_imagen_upsampler_256 = imagen_model.unet_imagen_upsampler_256
-        self.unet_upsampler_1024 = imagen_model.unet_upsampler_1024
+        self.text_encoder = diffusion_model.text_encoder
+        self.unet_generator = diffusion_model.unet_generator
+        self.unet_upsampler_256 = diffusion_model.unet_upsampler_256
+        self.unet_upsampler_1024 = diffusion_model.unet_upsampler_1024

         # text tokenizer
         vocab_path = '{}/vocab.txt'.format(model_dir)
@@ -145,8 +144,8 @@ class ImagenForTextToImageSynthesis(Model):
             open('{}/diffusion_config.json'.format(model_dir)))
         self.diffusion_generator = make_diffusion(
             **diffusion_params['generator_config'])
-        self.diffusion_imagen_upsampler_256 = make_diffusion(
-            **diffusion_params['imagen_upsampler_256_config'])
+        self.diffusion_upsampler_256 = make_diffusion(
+            **diffusion_params['upsampler_256_config'])
         self.diffusion_upsampler_1024 = make_diffusion(
             **diffusion_params['upsampler_1024_config'])

@@ -166,9 +165,9 @@ class ImagenForTextToImageSynthesis(Model):
                                 attention_mask=attention_mask)
         context = context[-1]
         x = self.unet_generator(noise, timesteps, y, context, attention_mask)
-        x = self.unet_imagen_upsampler_256(noise, timesteps, x,
-                                           torch.zeros_like(timesteps), y,
-                                           context, attention_mask)
+        x = self.unet_upsampler_256(noise, timesteps, x,
+                                    torch.zeros_like(timesteps), y, context,
+                                    attention_mask)
         x = self.unet_upsampler_1024(x, t, x)
         img = x.clamp(-1, 1).add(1).mul(127.5)
         img = img.squeeze(0).permute(1, 2, 0).cpu().numpy().astype(np.uint8)
@@ -217,9 +216,9 @@ class ImagenForTextToImageSynthesis(Model):
         if not input.get('debug', False):
             img = F.interpolate(
                 img, scale_factor=4.0, mode='bilinear', align_corners=False)
-            img = self.diffusion_imagen_upsampler_256.ddim_sample_loop(
+            img = self.diffusion_upsampler_256.ddim_sample_loop(
                 noise=torch.randn_like(img),
-                model=self.unet_imagen_upsampler_256,
+                model=self.unet_upsampler_256,
                 model_kwargs=[{
                     'lx': img,
                     'lt': torch.zeros(1).to(self.device),
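
Aside: the sampling path above is a three-stage cascade: the base generator produces a 64x64 sample from the text conditioning, the 256x256 upsampler refines a bilinearly upsampled copy (hence the `F.interpolate(..., scale_factor=4.0)` before `ddim_sample_loop`), and the 1024x1024 UNet produces the final image. A schematic sketch of that control flow, using hypothetical callables rather than the module's real signatures:

    import torch
    import torch.nn.functional as F

    def cascade_sample(sample_base, sample_sr_256, sr_1024, text_emb):
        # Stage 1: text-conditional base sample (hypothetical helper).
        img = sample_base(text_emb)                   # (1, 3, 64, 64)
        # Stage 2: 4x bilinear upsample, then a diffusion refinement pass,
        # mirroring the ddim_sample_loop call in the diff above.
        img = F.interpolate(
            img, scale_factor=4.0, mode='bilinear',
            align_corners=False)                      # (1, 3, 256, 256)
        img = sample_sr_256(noise=torch.randn_like(img), low_res=img)
        # Stage 3: final super-resolution to 1024x1024.
        return sr_1024(img)                           # (1, 3, 1024, 1024)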

modelscope/models/multi_modal/imagen/structbert.py → modelscope/models/multi_modal/diffusion/structbert.py


modelscope/models/multi_modal/imagen/tokenizer.py → modelscope/models/multi_modal/diffusion/tokenizer.py


modelscope/models/multi_modal/imagen/unet_generator.py → modelscope/models/multi_modal/diffusion/unet_generator.py

@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-__all__ = ['ImagenGenerator']
+__all__ = ['DiffusionGenerator']


 def sinusoidal_embedding(timesteps, dim):
@@ -156,7 +156,7 @@ class AttentionBlock(nn.Module):
         return x + identity


-class ImagenGenerator(nn.Module):
+class DiffusionGenerator(nn.Module):

     def __init__(self,
                  in_dim=3,
@@ -173,7 +173,7 @@ class ImagenGenerator(nn.Module):
                  use_scale_shift_norm=True,
                  dropout=0.0):
         embed_dim = dim * 4
-        super(ImagenGenerator, self).__init__()
+        super(DiffusionGenerator, self).__init__()
         self.in_dim = in_dim
         self.dim = dim
         self.text_dim = text_dim
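
Aside: `sinusoidal_embedding(timesteps, dim)` appears as unchanged context in both UNet files, but its body is not part of this diff. The conventional transformer-style timestep embedding it presumably computes looks like this (a sketch, not the file's verified implementation):

    import math

    import torch

    def sinusoidal_embedding(timesteps, dim):
        # Half the channels carry cosine terms and half sine terms, at
        # log-spaced frequencies; the standard diffusion timestep embedding.
        half = dim // 2
        freqs = torch.exp(
            -math.log(10000) * torch.arange(half, dtype=torch.float32) / half)
        args = timesteps.float()[:, None] * freqs[None, :]
        return torch.cat([torch.cos(args), torch.sin(args)], dim=1)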

modelscope/models/multi_modal/imagen/unet_upsampler_1024.py → modelscope/models/multi_modal/diffusion/unet_upsampler_1024.py

@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-__all__ = ['ImagenUpsampler1024']
+__all__ = ['SuperResUNet1024']


 def sinusoidal_embedding(timesteps, dim):
@@ -99,7 +99,7 @@ class ResidualBlock(nn.Module):
         return x


-class ImagenUpsampler1024(nn.Module):
+class SuperResUNet1024(nn.Module):

     def __init__(self,
                  in_dim=6,
@@ -111,7 +111,7 @@ class ImagenUpsampler1024(nn.Module):
                  use_scale_shift_norm=True,
                  dropout=0.0):
         embed_dim = dim * 4
-        super(ImagenUpsampler1024, self).__init__()
+        super(SuperResUNet1024, self).__init__()
         self.in_dim = in_dim
         self.dim = dim
         self.out_dim = out_dim

modelscope/models/multi_modal/imagen/unet_imagen_upsampler_256.py → modelscope/models/multi_modal/diffusion/unet_upsampler_256.py


modelscope/models/multi_modal/imagen/__init__.py (+0 -1)

@@ -1 +0,0 @@
from .imagen_model import ImagenForTextToImageSynthesis

modelscope/pipelines/builder.py (+1 -1)

@@ -86,7 +86,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                             'damo/cv_r2p1d_video_embedding'),
     Tasks.text_to_image_synthesis:
     (Pipelines.text_to_image_synthesis,
-     'damo/cv_imagen_text-to-image-synthesis_tiny'),
+     'damo/cv_diffusion_text-to-image-synthesis_tiny'),
     Tasks.face_detection: (Pipelines.face_detection,
                            'damo/cv_resnet_facedetection_scrfd10gkps'),
     Tasks.face_recognition: (Pipelines.face_recognition,
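
Aside: with `DEFAULT_MODEL_FOR_PIPELINE` updated, building the pipeline by task alone should now resolve to the renamed checkpoint. A usage sketch (the input dict mirrors the test file below and is illustrative):

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # No explicit model id: the builder falls back to the default mapping,
    # which now points at 'damo/cv_diffusion_text-to-image-synthesis_tiny'.
    t2i = pipeline(task=Tasks.text_to_image_synthesis)
    result = t2i({'text': '宇航员'})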


tests/pipelines/test_text_to_image_synthesis.py (+2 -2)

@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level


 class TextToImageSynthesisTest(unittest.TestCase):
-    model_id = 'damo/cv_imagen_text-to-image-synthesis_tiny'
+    model_id = 'damo/cv_diffusion_text-to-image-synthesis_tiny'
     test_text = {
         'text': '宇航员',
         'generator_ddim_timesteps': 2,
@@ -30,7 +30,7 @@ class TextToImageSynthesisTest(unittest.TestCase):
                 self.test_text)[OutputKeys.OUTPUT_IMG]
         print(np.sum(np.abs(img)))

-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipe_line_text_to_image_synthesis = pipeline(
             task=Tasks.text_to_image_synthesis, model=self.model_id)
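
Aside: raising the gate from `test_level() >= 1` to `>= 2` moves `test_run_with_model_name` into the most expensive test tier. The exact body of `test_level` is not shown here; a plausible sketch, assuming it is driven by a `TEST_LEVEL` environment variable (an assumption, not the library's verified implementation):

    import os

    def test_level():
        # Assumed contract: higher TEST_LEVEL enables more (slower) tests;
        # the default value here is illustrative only.
        return int(os.environ.get('TEST_LEVEL', '0'))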

