tianxi.tl yingda.chen 3 years ago
parent · commit fc89cf8f3e
18 changed files with 2290 additions and 2 deletions
  1. +3 -0     data/test/images/img2img_style.jpg
  2. +1 -0     modelscope/metainfo.py
  3. +3 -2     modelscope/models/cv/__init__.py
  4. +2 -0     modelscope/models/cv/image_to_image_generation/__init__.py
  5. +24 -0    modelscope/models/cv/image_to_image_generation/data/__init__.py
  6. +121 -0   modelscope/models/cv/image_to_image_generation/data/transforms.py
  7. +322 -0   modelscope/models/cv/image_to_image_generation/model.py
  8. +24 -0    modelscope/models/cv/image_to_image_generation/models/__init__.py
  9. +412 -0   modelscope/models/cv/image_to_image_generation/models/autoencoder.py
  10. +418 -0  modelscope/models/cv/image_to_image_generation/models/clip.py
  11. +22 -0   modelscope/models/cv/image_to_image_generation/ops/__init__.py
  12. +598 -0  modelscope/models/cv/image_to_image_generation/ops/diffusion.py
  13. +35 -0   modelscope/models/cv/image_to_image_generation/ops/losses.py
  14. +3 -0    modelscope/pipelines/builder.py
  15. +3 -0    modelscope/pipelines/cv/__init__.py
  16. +250 -0  modelscope/pipelines/cv/image_to_image_generate_pipeline.py
  17. +1 -0    modelscope/utils/constant.py
  18. +48 -0   tests/pipelines/test_image2image_generation.py

+3 -0   data/test/images/img2img_style.jpg   View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef06465535002fd565f3e50d16772bdcb8e47f474fb7d7c318510fff49ab1090
size 212790

+1 -0   modelscope/metainfo.py   View File

@@ -91,6 +91,7 @@ class Pipelines(object):
image2image_translation = 'image-to-image-translation'
live_category = 'live-category'
video_category = 'video-category'
image_to_image_generation = 'image-to-image-generation'

# nlp tasks
sentence_similarity = 'sentence-similarity'


+3 -2   modelscope/models/cv/__init__.py   View File

@@ -3,5 +3,6 @@ from . import (action_recognition, animal_recognition, cartoon,
cmdssl_video_embedding, face_detection, face_generation,
image_classification, image_color_enhance, image_colorization,
image_denoise, image_instance_segmentation,
image_to_image_translation, object_detection,
product_retrieval_embedding, super_resolution, virual_tryon)
image_to_image_generation, image_to_image_translation,
object_detection, product_retrieval_embedding, super_resolution,
virual_tryon)

+2 -0   modelscope/models/cv/image_to_image_generation/__init__.py   View File

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from . import data, models, ops

+24 -0   modelscope/models/cv/image_to_image_generation/data/__init__.py   View File

@@ -0,0 +1,24 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .transforms import PadToSquare

else:
_import_structure = {
'transforms': ['PadToSquare'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

# from .transforms import * # noqa F403

+121 -0   modelscope/models/cv/image_to_image_generation/data/transforms.py   View File

@@ -0,0 +1,121 @@
import math
import random

import torchvision.transforms.functional as TF
from PIL import Image, ImageFilter

__all__ = [
'Identity', 'PadToSquare', 'RandomScale', 'RandomRotate',
'RandomGaussianBlur', 'RandomCrop'
]


class Identity(object):

def __call__(self, *args):
if len(args) == 0:
return None
elif len(args) == 1:
return args[0]
else:
return args


class PadToSquare(object):

def __init__(self, fill=(255, 255, 255)):
self.fill = fill

def __call__(self, img):
w, h = img.size
if w != h:
if w > h:
t = (w - h) // 2
b = w - h - t
padding = (0, t, 0, b)
else:
left = (h - w) // 2
right = h - w - left
padding = (left, 0, right, 0)
img = TF.pad(img, padding, fill=self.fill)
return img


class RandomScale(object):

def __init__(self,
min_scale=0.5,
max_scale=2.0,
min_ratio=0.8,
max_ratio=1.25):
self.min_scale = min_scale
self.max_scale = max_scale
self.min_ratio = min_ratio
self.max_ratio = max_ratio

def __call__(self, img):
w, h = img.size
scale = 2**random.uniform(
math.log2(self.min_scale), math.log2(self.max_scale))
ratio = 2**random.uniform(
math.log2(self.min_ratio), math.log2(self.max_ratio))
ow = int(w * scale * math.sqrt(ratio))
oh = int(h * scale / math.sqrt(ratio))
img = img.resize((ow, oh), Image.BILINEAR)
return img


class RandomRotate(object):

def __init__(self,
min_angle=-10.0,
max_angle=10.0,
padding=(255, 255, 255),
p=0.5):
self.min_angle = min_angle
self.max_angle = max_angle
self.padding = padding
self.p = p

def __call__(self, img):
if random.random() < self.p:
angle = random.uniform(self.min_angle, self.max_angle)
img = img.rotate(angle, Image.BILINEAR, fillcolor=self.padding)
return img


class RandomGaussianBlur(object):

def __init__(self, radius=5, p=0.5):
self.radius = radius
self.p = p

def __call__(self, img):
if random.random() < self.p:
img = img.filter(ImageFilter.GaussianBlur(radius=self.radius))
return img


class RandomCrop(object):

def __init__(self, size, padding=(255, 255, 255)):
self.size = size
self.padding = padding

def __call__(self, img):
# pad
w, h = img.size
pad_w = max(0, self.size - w)
pad_h = max(0, self.size - h)
if pad_w > 0 or pad_h > 0:
half_w = pad_w // 2
half_h = pad_h // 2
pad = (half_w, half_h, pad_w - half_w, pad_h - half_h)
img = TF.pad(img, pad, fill=self.padding)

# crop
w, h = img.size
x1 = random.randint(0, w - self.size)
y1 = random.randint(0, h - self.size)
img = img.crop((x1, y1, x1 + self.size, y1 + self.size))
return img

+322 -0   modelscope/models/cv/image_to_image_generation/model.py   View File

@@ -0,0 +1,322 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

__all__ = ['UNet']


def sinusoidal_embedding(timesteps, dim):
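# Standard transformer/DDPM-style sinusoidal timestep embedding: for each
# frequency index i in [0, dim/2), concatenate cos and sin of
# t * 10000^(-i / (dim/2)) along the channel dimension.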
# check input
half = dim // 2
timesteps = timesteps.float()

# compute sinusoidal embedding
sinusoid = torch.outer(
timesteps, torch.pow(10000,
-torch.arange(half).to(timesteps).div(half)))
x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
if dim % 2 != 0:
x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1)
return x


class Resample(nn.Module):

def __init__(self, scale_factor=1.0):
assert scale_factor in [0.5, 1.0, 2.0]
super(Resample, self).__init__()
self.scale_factor = scale_factor

def forward(self, x):
if self.scale_factor == 2.0:
x = F.interpolate(x, scale_factor=2, mode='nearest')
elif self.scale_factor == 0.5:
x = F.avg_pool2d(x, kernel_size=2, stride=2)
return x


class ResidualBlock(nn.Module):

def __init__(self, in_dim, embed_dim, out_dim, dropout=0.0):
super(ResidualBlock, self).__init__()
self.in_dim = in_dim
self.embed_dim = embed_dim
self.out_dim = out_dim

# layers
self.layer1 = nn.Sequential(
nn.GroupNorm(32, in_dim), nn.SiLU(),
nn.Conv2d(in_dim, out_dim, 3, padding=1))
self.embedding = nn.Sequential(nn.SiLU(),
nn.Linear(embed_dim, out_dim))
self.layer2 = nn.Sequential(
nn.GroupNorm(32, out_dim), nn.SiLU(), nn.Dropout(dropout),
nn.Conv2d(out_dim, out_dim, 3, padding=1))
self.shortcut = nn.Identity() if in_dim == out_dim else nn.Conv2d(
in_dim, out_dim, 1)

# zero out the last layer params
nn.init.zeros_(self.layer2[-1].weight)

def forward(self, x, y):
identity = x
x = self.layer1(x)
x = x + self.embedding(y).unsqueeze(-1).unsqueeze(-1)
x = self.layer2(x)
x = x + self.shortcut(identity)
return x


class MultiHeadAttention(nn.Module):

def __init__(self, dim, context_dim=None, num_heads=8, dropout=0.0):
assert dim % num_heads == 0
assert context_dim is None or context_dim % num_heads == 0
context_dim = context_dim or dim
super(MultiHeadAttention, self).__init__()
self.dim = dim
self.context_dim = context_dim
self.num_heads = num_heads
self.head_dim = dim // num_heads
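# the scale is split between q and k (each multiplied by head_dim^-0.25),
# so their inner product carries the usual 1/sqrt(head_dim) attention scaling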
self.scale = math.pow(self.head_dim, -0.25)

# layers
self.q = nn.Linear(dim, dim, bias=False)
self.k = nn.Linear(context_dim, dim, bias=False)
self.v = nn.Linear(context_dim, dim, bias=False)
self.o = nn.Linear(dim, dim)
self.dropout = nn.Dropout(dropout)

def forward(self, x, context=None):
# check inputs
context = x if context is None else context
b, n, c = x.size(0), self.num_heads, self.head_dim

# compute query, key, value
q = self.q(x).view(b, -1, n, c)
k = self.k(context).view(b, -1, n, c)
v = self.v(context).view(b, -1, n, c)

# compute attention
attn = torch.einsum('binc,bjnc->bnij', q * self.scale, k * self.scale)
attn = F.softmax(attn, dim=-1)
attn = self.dropout(attn)

# gather context
x = torch.einsum('bnij,bjnc->binc', attn, v)
x = x.reshape(b, -1, n * c)

# output
x = self.o(x)
x = self.dropout(x)
return x


class GLU(nn.Module):

def __init__(self, in_dim, out_dim):
super(GLU, self).__init__()
self.in_dim = in_dim
self.out_dim = out_dim
self.proj = nn.Linear(in_dim, out_dim * 2)

def forward(self, x):
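# GEGLU-style gating: project to 2 * out_dim, then gate one half with the
# GELU of the other half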
x, gate = self.proj(x).chunk(2, dim=-1)
return x * F.gelu(gate)


class TransformerBlock(nn.Module):

def __init__(self, dim, context_dim, num_heads, dropout=0.0):
super(TransformerBlock, self).__init__()
self.dim = dim
self.context_dim = context_dim
self.num_heads = num_heads
self.head_dim = dim // num_heads

# input
self.norm1 = nn.GroupNorm(32, dim, eps=1e-6, affine=True)
self.conv1 = nn.Conv2d(dim, dim, 1)

# self attention
self.norm2 = nn.LayerNorm(dim)
self.self_attn = MultiHeadAttention(dim, None, num_heads, dropout)

# cross attention
self.norm3 = nn.LayerNorm(dim)
self.cross_attn = MultiHeadAttention(dim, context_dim, num_heads,
dropout)

# ffn
self.norm4 = nn.LayerNorm(dim)
self.ffn = nn.Sequential(
GLU(dim, dim * 4), nn.Dropout(dropout), nn.Linear(dim * 4, dim))

# output
self.conv2 = nn.Conv2d(dim, dim, 1)

# zero out the last layer params
nn.init.zeros_(self.conv2.weight)

def forward(self, x, context):
b, c, h, w = x.size()
identity = x

# input
x = self.norm1(x)
x = self.conv1(x).view(b, c, -1).transpose(1, 2)

# attention
x = x + self.self_attn(self.norm2(x))
x = x + self.cross_attn(self.norm3(x), context)
x = x + self.ffn(self.norm4(x))

# output
x = x.transpose(1, 2).view(b, c, h, w)
x = self.conv2(x)
return x + identity


class UNet(nn.Module):

def __init__(self,
resolution=64,
in_dim=3,
dim=192,
label_dim=512,
context_dim=512,
out_dim=3,
dim_mult=[1, 2, 3, 5],
num_heads=1,
head_dim=None,
num_res_blocks=2,
attn_scales=[1 / 2, 1 / 4, 1 / 8],
dropout=0.0):
embed_dim = dim * 4
super(UNet, self).__init__()
self.resolution = resolution
self.in_dim = in_dim
self.dim = dim
self.context_dim = context_dim
self.out_dim = out_dim
self.dim_mult = dim_mult
self.num_heads = num_heads
self.head_dim = head_dim
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales

# params
enc_dims = [dim * u for u in [1] + dim_mult]
dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
shortcut_dims = []
scale = 1.0
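# 'scale' tracks the current feature resolution relative to the input; a
# TransformerBlock is inserted whenever it matches one of attn_scales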

# embeddings
self.time_embedding = nn.Sequential(
nn.Linear(dim, embed_dim), nn.SiLU(),
nn.Linear(embed_dim, embed_dim))
self.clip_embedding = nn.Sequential(
nn.Linear(label_dim, context_dim), nn.SiLU(),
nn.Linear(context_dim, context_dim))

# encoder
self.encoder = nn.ModuleList(
[nn.Conv2d(self.in_dim, dim, 3, padding=1)])
shortcut_dims.append(dim)
for i, (in_dim,
out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
for j in range(num_res_blocks):
# residual (+attention) blocks
block = nn.ModuleList(
[ResidualBlock(in_dim, embed_dim, out_dim, dropout)])
if scale in attn_scales:
block.append(
TransformerBlock(out_dim, context_dim, num_heads))
in_dim = out_dim
self.encoder.append(block)
shortcut_dims.append(out_dim)

# downsample
if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
self.encoder.append(
nn.Conv2d(out_dim, out_dim, 3, stride=2, padding=1))
shortcut_dims.append(out_dim)
scale /= 2.0

# middle
self.middle = nn.ModuleList([
ResidualBlock(out_dim, embed_dim, out_dim, dropout),
TransformerBlock(out_dim, context_dim, num_heads),
ResidualBlock(out_dim, embed_dim, out_dim, dropout)
])

# decoder
self.decoder = nn.ModuleList()
for i, (in_dim,
out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
for j in range(num_res_blocks + 1):
# residual (+attention) blocks
block = nn.ModuleList([
ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim,
out_dim, dropout)
])
if scale in attn_scales:
block.append(
TransformerBlock(out_dim, context_dim, num_heads,
dropout))
in_dim = out_dim

# upsample
if i != len(dim_mult) - 1 and j == num_res_blocks:
block.append(
nn.Sequential(
Resample(scale_factor=2.0),
nn.Conv2d(out_dim, out_dim, 3, padding=1)))
scale *= 2.0
self.decoder.append(block)

# head
self.head = nn.Sequential(
nn.GroupNorm(32, out_dim), nn.SiLU(),
nn.Conv2d(out_dim, self.out_dim, 3, padding=1))

# zero out the last layer params
nn.init.zeros_(self.head[-1].weight)

def forward(self, x, t, y):
# embeddings
t = self.time_embedding(sinusoidal_embedding(t, self.dim))
y = self.clip_embedding(y)

# encoder
xs = []
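# cache encoder activations as skip connections consumed by the decoder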
for block in self.encoder:
x = self._forward_single(block, x, t, y)
xs.append(x)

# middle
for block in self.middle:
x = self._forward_single(block, x, t, y)

# decoder
for block in self.decoder:
x = torch.cat([x, xs.pop()], dim=1)
x = self._forward_single(block, x, t, y)

# head
x = self.head(x)
return x

def _forward_single(self, module, x, t, y):
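# dispatch: ResidualBlocks consume the time embedding t, TransformerBlocks
# consume the CLIP context y, nested ModuleLists are unrolled recursively,
# and everything else (e.g. plain convolutions) takes only x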
if isinstance(module, ResidualBlock):
x = module(x, t)
elif isinstance(module, TransformerBlock):
x = module(x, y)
elif isinstance(module, nn.ModuleList):
for block in module:
x = self._forward_single(block, x, t, y)
else:
x = module(x)
return x

+24 -0   modelscope/models/cv/image_to_image_generation/models/__init__.py   View File

@@ -0,0 +1,24 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .autoencoder import VQAutoencoder
from .clip import VisionTransformer

else:
_import_structure = {
'autoencoder': ['VQAutoencoder'],
'clip': ['VisionTransformer']
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+412 -0   modelscope/models/cv/image_to_image_generation/models/autoencoder.py   View File

@@ -0,0 +1,412 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

__all__ = ['VQAutoencoder', 'KLAutoencoder', 'PatchDiscriminator']


def group_norm(dim):
return nn.GroupNorm(32, dim, eps=1e-6, affine=True)


class Resample(nn.Module):

def __init__(self, dim, scale_factor):
super(Resample, self).__init__()
self.dim = dim
self.scale_factor = scale_factor

# layers
if scale_factor == 2.0:
self.resample = nn.Sequential(
nn.Upsample(scale_factor=scale_factor, mode='nearest'),
nn.Conv2d(dim, dim, 3, padding=1))
elif scale_factor == 0.5:
self.resample = nn.Sequential(
nn.ZeroPad2d((0, 1, 0, 1)),
nn.Conv2d(dim, dim, 3, stride=2, padding=0))
else:
self.resample = nn.Identity()

def forward(self, x):
return self.resample(x)


class ResidualBlock(nn.Module):

def __init__(self, in_dim, out_dim, dropout=0.0):
super(ResidualBlock, self).__init__()
self.in_dim = in_dim
self.out_dim = out_dim

# layers
self.residual = nn.Sequential(
group_norm(in_dim), nn.SiLU(),
nn.Conv2d(in_dim, out_dim, 3, padding=1), group_norm(out_dim),
nn.SiLU(), nn.Dropout(dropout),
nn.Conv2d(out_dim, out_dim, 3, padding=1))
self.shortcut = nn.Conv2d(in_dim, out_dim,
1) if in_dim != out_dim else nn.Identity()

# zero out the last layer params
nn.init.zeros_(self.residual[-1].weight)

def forward(self, x):
return self.residual(x) + self.shortcut(x)


class AttentionBlock(nn.Module):

def __init__(self, dim):
super(AttentionBlock, self).__init__()
self.dim = dim
self.scale = math.pow(dim, -0.25)

# layers
self.norm = group_norm(dim)
self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
self.proj = nn.Conv2d(dim, dim, 1)

# zero out the last layer params
nn.init.zeros_(self.proj.weight)

def forward(self, x):
identity = x
b, c, h, w = x.size()

# compute query, key, value
x = self.norm(x)
q, k, v = self.to_qkv(x).view(b, c * 3, -1).chunk(3, dim=1)

# compute attention
attn = torch.einsum('bci,bcj->bij', q * self.scale, k * self.scale)
attn = F.softmax(attn, dim=-1)

# gather context
x = torch.einsum('bij,bcj->bci', attn, v)
x = x.reshape(b, c, h, w)

# output
x = self.proj(x)
return x + identity


class Encoder(nn.Module):

def __init__(self,
dim=128,
z_dim=3,
dim_mult=[1, 2, 4],
num_res_blocks=2,
attn_scales=[],
dropout=0.0):
super(Encoder, self).__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales

# params
dims = [dim * u for u in [1] + dim_mult]
scale = 1.0

# init block
self.conv1 = nn.Conv2d(3, dims[0], 3, padding=1)

# downsample blocks
downsamples = []
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
# residual (+attention) blocks
for _ in range(num_res_blocks):
downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
if scale in attn_scales:
downsamples.append(AttentionBlock(out_dim))
in_dim = out_dim

# downsample block
if i != len(dim_mult) - 1:
downsamples.append(Resample(out_dim, scale_factor=0.5))
scale /= 2.0
self.downsamples = nn.Sequential(*downsamples)

# middle blocks
self.middle = nn.Sequential(
ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim),
ResidualBlock(out_dim, out_dim, dropout))

# output blocks
self.head = nn.Sequential(
group_norm(out_dim), nn.SiLU(),
nn.Conv2d(out_dim, z_dim, 3, padding=1))

def forward(self, x):
x = self.conv1(x)
x = self.downsamples(x)
x = self.middle(x)
x = self.head(x)
return x


class Decoder(nn.Module):

def __init__(self,
dim=128,
z_dim=3,
dim_mult=[1, 2, 4],
num_res_blocks=2,
attn_scales=[],
dropout=0.0):
super(Decoder, self).__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales

# params
dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
scale = 1.0 / 2**(len(dim_mult) - 2)

# init block
self.conv1 = nn.Conv2d(z_dim, dims[0], 3, padding=1)

# middle blocks
self.middle = nn.Sequential(
ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
ResidualBlock(dims[0], dims[0], dropout))

# upsample blocks
upsamples = []
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
# residual (+attention) blocks
for _ in range(num_res_blocks + 1):
upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
if scale in attn_scales:
upsamples.append(AttentionBlock(out_dim))
in_dim = out_dim

# upsample block
if i != len(dim_mult) - 1:
upsamples.append(Resample(out_dim, scale_factor=2.0))
scale *= 2.0
self.upsamples = nn.Sequential(*upsamples)

# output blocks
self.head = nn.Sequential(
group_norm(out_dim), nn.SiLU(),
nn.Conv2d(out_dim, 3, 3, padding=1))

def forward(self, x):
x = self.conv1(x)
x = self.middle(x)
x = self.upsamples(x)
x = self.head(x)
return x


class VectorQuantizer(nn.Module):

def __init__(self, codebook_size=8192, z_dim=3, beta=0.25):
super(VectorQuantizer, self).__init__()
self.codebook_size = codebook_size
self.z_dim = z_dim
self.beta = beta

# init codebook
eps = math.sqrt(1.0 / codebook_size)
self.codebook = nn.Parameter(
torch.empty(codebook_size, z_dim).uniform_(-eps, eps))

def forward(self, z):
# preprocess
b, c, h, w = z.size()
flatten = z.permute(0, 2, 3, 1).reshape(-1, c)

# quantization
with torch.no_grad():
tokens = torch.cdist(flatten, self.codebook).argmin(dim=1)
quantized = F.embedding(tokens,
self.codebook).view(b, h, w,
c).permute(0, 3, 1, 2)

# compute loss
codebook_loss = F.mse_loss(quantized, z.detach())
commitment_loss = F.mse_loss(quantized.detach(), z)
loss = codebook_loss + self.beta * commitment_loss

# perplexity
counts = F.one_hot(tokens, self.codebook_size).sum(dim=0).to(z.dtype)
# dist.all_reduce(counts)
p = counts / counts.sum()
perplexity = torch.exp(-torch.sum(p * torch.log(p + 1e-10)))

# postprocess
tokens = tokens.view(b, h, w)
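# straight-through estimator: the quantization step acts as identity in the
# backward pass, so gradients flow from the decoder to the encoder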
quantized = z + (quantized - z).detach()
return quantized, tokens, loss, perplexity


class VQAutoencoder(nn.Module):

def __init__(self,
dim=128,
z_dim=3,
dim_mult=[1, 2, 4],
num_res_blocks=2,
attn_scales=[],
dropout=0.0,
codebook_size=8192,
beta=0.25):
super(VQAutoencoder, self).__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales
self.codebook_size = codebook_size
self.beta = beta

# blocks
self.encoder = Encoder(dim, z_dim, dim_mult, num_res_blocks,
attn_scales, dropout)
self.conv1 = nn.Conv2d(z_dim, z_dim, 1)
self.quantizer = VectorQuantizer(codebook_size, z_dim, beta)
self.conv2 = nn.Conv2d(z_dim, z_dim, 1)
self.decoder = Decoder(dim, z_dim, dim_mult, num_res_blocks,
attn_scales, dropout)

def forward(self, x):
z = self.encoder(x)
z = self.conv1(z)
z, tokens, loss, perplexity = self.quantizer(z)
z = self.conv2(z)
x = self.decoder(z)
return x, tokens, loss, perplexity

def encode(self, imgs):
z = self.encoder(imgs)
z = self.conv1(z)
return z

def decode(self, z):
r"""Absort the quantizer in the decoder.
"""
z = self.quantizer(z)[0]
z = self.conv2(z)
imgs = self.decoder(z)
return imgs

@torch.no_grad()
def encode_to_tokens(self, imgs):
# preprocess
z = self.encoder(imgs)
z = self.conv1(z)

# quantization
b, c, h, w = z.size()
flatten = z.permute(0, 2, 3, 1).reshape(-1, c)
tokens = torch.cdist(flatten, self.quantizer.codebook).argmin(dim=1)
return tokens.view(b, -1)

@torch.no_grad()
def decode_from_tokens(self, tokens):
# dequantization
z = F.embedding(tokens, self.quantizer.codebook)

# postprocess
b, l, c = z.size()
h = w = int(math.sqrt(l))
z = z.view(b, h, w, c).permute(0, 3, 1, 2)
z = self.conv2(z)
imgs = self.decoder(z)
return imgs


class KLAutoencoder(nn.Module):

def __init__(self,
dim=128,
z_dim=4,
dim_mult=[1, 2, 4, 4],
num_res_blocks=2,
attn_scales=[],
dropout=0.0):
super(KLAutoencoder, self).__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales

# blocks
self.encoder = Encoder(dim, z_dim * 2, dim_mult, num_res_blocks,
attn_scales, dropout)
self.conv1 = nn.Conv2d(z_dim * 2, z_dim * 2, 1)
self.conv2 = nn.Conv2d(z_dim, z_dim, 1)
self.decoder = Decoder(dim, z_dim, dim_mult, num_res_blocks,
attn_scales, dropout)

def forward(self, x):
mu, log_var = self.encode(x)
z = self.reparameterize(mu, log_var)
x = self.decode(z)
return x, mu, log_var

def encode(self, x):
x = self.encoder(x)
mu, log_var = self.conv1(x).chunk(2, dim=1)
return mu, log_var

def decode(self, z):
x = self.conv2(z)
x = self.decoder(x)
return x

def reparameterize(self, mu, log_var):
std = torch.exp(0.5 * log_var)
eps = torch.randn_like(std)
return eps * std + mu


class PatchDiscriminator(nn.Module):

def __init__(self, in_dim=3, dim=64, num_layers=3):
super(PatchDiscriminator, self).__init__()
self.in_dim = in_dim
self.dim = dim
self.num_layers = num_layers

# params
dims = [dim * min(8, 2**u) for u in range(num_layers + 1)]

# layers
layers = [
nn.Conv2d(in_dim, dim, 4, stride=2, padding=1),
nn.LeakyReLU(0.2)
]
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
stride = 1 if i == num_layers - 1 else 2
layers += [
nn.Conv2d(
in_dim, out_dim, 4, stride=stride, padding=1, bias=False),
nn.BatchNorm2d(out_dim),
nn.LeakyReLU(0.2)
]
layers += [nn.Conv2d(out_dim, 1, 4, stride=1, padding=1)]
self.layers = nn.Sequential(*layers)

# initialize weights
self.apply(self.init_weights)

def forward(self, x):
return self.layers(x)

def init_weights(self, m):
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, 0.0, 0.02)
elif isinstance(m, nn.BatchNorm2d):
nn.init.normal_(m.weight, 1.0, 0.02)
nn.init.zeros_(m.bias)

+418 -0   modelscope/models/cv/image_to_image_generation/models/clip.py   View File

@@ -0,0 +1,418 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import modelscope.models.cv.image_to_image_translation.ops as ops # for using differentiable all_gather

__all__ = [
'CLIP', 'clip_vit_b_32', 'clip_vit_b_16', 'clip_vit_l_14',
'clip_vit_l_14_336px', 'clip_vit_h_16'
]


def to_fp16(m):
if isinstance(m, (nn.Linear, nn.Conv2d)):
m.weight.data = m.weight.data.half()
if m.bias is not None:
m.bias.data = m.bias.data.half()
elif hasattr(m, 'head'):
p = getattr(m, 'head')
p.data = p.data.half()


class QuickGELU(nn.Module):

def forward(self, x):
return x * torch.sigmoid(1.702 * x)


class LayerNorm(nn.LayerNorm):
r"""Subclass of nn.LayerNorm to handle fp16.
"""

def forward(self, x):
return super(LayerNorm, self).forward(x.float()).type_as(x)


class SelfAttention(nn.Module):

def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0):
assert dim % num_heads == 0
super(SelfAttention, self).__init__()
self.dim = dim
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = 1.0 / math.sqrt(self.head_dim)

# layers
self.to_qkv = nn.Linear(dim, dim * 3)
self.attn_dropout = nn.Dropout(attn_dropout)
self.proj = nn.Linear(dim, dim)
self.proj_dropout = nn.Dropout(proj_dropout)

def forward(self, x, mask=None):
r"""x: [B, L, C].
mask: [*, L, L].
"""
b, l, _, n = *x.size(), self.num_heads

# compute query, key, and value
q, k, v = self.to_qkv(x.transpose(0, 1)).chunk(3, dim=-1)
q = q.reshape(l, b * n, -1).transpose(0, 1)
k = k.reshape(l, b * n, -1).transpose(0, 1)
v = v.reshape(l, b * n, -1).transpose(0, 1)

# compute attention
attn = self.scale * torch.bmm(q, k.transpose(1, 2))
if mask is not None:
attn = attn.masked_fill(mask[:, :l, :l] == 0, float('-inf'))
attn = F.softmax(attn.float(), dim=-1).type_as(attn)
attn = self.attn_dropout(attn)

# gather context
x = torch.bmm(attn, v)
x = x.view(b, n, l, -1).transpose(1, 2).reshape(b, l, -1)

# output
x = self.proj(x)
x = self.proj_dropout(x)
return x


class AttentionBlock(nn.Module):

def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0):
super(AttentionBlock, self).__init__()
self.dim = dim
self.num_heads = num_heads

# layers
self.norm1 = LayerNorm(dim)
self.attn = SelfAttention(dim, num_heads, attn_dropout, proj_dropout)
self.norm2 = LayerNorm(dim)
self.mlp = nn.Sequential(
nn.Linear(dim, dim * 4), QuickGELU(), nn.Linear(dim * 4, dim),
nn.Dropout(proj_dropout))

def forward(self, x, mask=None):
x = x + self.attn(self.norm1(x), mask)
x = x + self.mlp(self.norm2(x))
return x


class VisionTransformer(nn.Module):

def __init__(self,
image_size=224,
patch_size=16,
dim=768,
out_dim=512,
num_heads=12,
num_layers=12,
attn_dropout=0.0,
proj_dropout=0.0,
embedding_dropout=0.0):
assert image_size % patch_size == 0
super(VisionTransformer, self).__init__()
self.image_size = image_size
self.patch_size = patch_size
self.dim = dim
self.out_dim = out_dim
self.num_heads = num_heads
self.num_layers = num_layers
self.num_patches = (image_size // patch_size)**2

# embeddings
gain = 1.0 / math.sqrt(dim)
self.patch_embedding = nn.Conv2d(
3, dim, kernel_size=patch_size, stride=patch_size, bias=False)
self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
self.pos_embedding = nn.Parameter(
gain * torch.randn(1, self.num_patches + 1, dim))
self.dropout = nn.Dropout(embedding_dropout)

# transformer
self.pre_norm = LayerNorm(dim)
self.transformer = nn.Sequential(*[
AttentionBlock(dim, num_heads, attn_dropout, proj_dropout)
for _ in range(num_layers)
])
self.post_norm = LayerNorm(dim)

# head
self.head = nn.Parameter(gain * torch.randn(dim, out_dim))

def forward(self, x):
b, dtype = x.size(0), self.head.dtype
x = x.type(dtype)

# patch-embedding
x = self.patch_embedding(x).flatten(2).permute(0, 2, 1) # [b, n, c]
x = torch.cat([self.cls_embedding.repeat(b, 1, 1).type(dtype), x],
dim=1)
x = self.dropout(x + self.pos_embedding.type(dtype))
x = self.pre_norm(x)

# transformer
x = self.transformer(x)

# head
x = self.post_norm(x)
x = torch.mm(x[:, 0, :], self.head)
return x

def fp16(self):
return self.apply(to_fp16)


class TextTransformer(nn.Module):

def __init__(self,
vocab_size,
text_len,
dim=512,
out_dim=512,
num_heads=8,
num_layers=12,
attn_dropout=0.0,
proj_dropout=0.0,
embedding_dropout=0.0):
super(TextTransformer, self).__init__()
self.vocab_size = vocab_size
self.text_len = text_len
self.dim = dim
self.out_dim = out_dim
self.num_heads = num_heads
self.num_layers = num_layers

# embeddings
self.token_embedding = nn.Embedding(vocab_size, dim)
self.pos_embedding = nn.Parameter(0.01 * torch.randn(1, text_len, dim))
self.dropout = nn.Dropout(embedding_dropout)

# transformer
self.transformer = nn.ModuleList([
AttentionBlock(dim, num_heads, attn_dropout, proj_dropout)
for _ in range(num_layers)
])
self.norm = LayerNorm(dim)

# head
gain = 1.0 / math.sqrt(dim)
self.head = nn.Parameter(gain * torch.randn(dim, out_dim))

# causal attention mask
self.register_buffer('attn_mask',
torch.tril(torch.ones(1, text_len, text_len)))

def forward(self, x):
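# the end-of-text token has the largest id in CLIP's BPE vocabulary, so an
# argmax over token ids locates it; its final hidden state is the text feature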
eot, dtype = x.argmax(dim=-1), self.head.dtype

# embeddings
x = self.dropout(
self.token_embedding(x).type(dtype)
+ self.pos_embedding.type(dtype))

# transformer
for block in self.transformer:
x = block(x, self.attn_mask)

# head
x = self.norm(x)
x = torch.mm(x[torch.arange(x.size(0)), eot], self.head)
return x

def fp16(self):
return self.apply(to_fp16)


class CLIP(nn.Module):

def __init__(self,
embed_dim=512,
image_size=224,
patch_size=16,
vision_dim=768,
vision_heads=12,
vision_layers=12,
vocab_size=49408,
text_len=77,
text_dim=512,
text_heads=8,
text_layers=12,
attn_dropout=0.0,
proj_dropout=0.0,
embedding_dropout=0.0):
super(CLIP, self).__init__()
self.embed_dim = embed_dim
self.image_size = image_size
self.patch_size = patch_size
self.vision_dim = vision_dim
self.vision_heads = vision_heads
self.vision_layers = vision_layers
self.vocab_size = vocab_size
self.text_len = text_len
self.text_dim = text_dim
self.text_heads = text_heads
self.text_layers = text_layers

# models
self.visual = VisionTransformer(
image_size=image_size,
patch_size=patch_size,
dim=vision_dim,
out_dim=embed_dim,
num_heads=vision_heads,
num_layers=vision_layers,
attn_dropout=attn_dropout,
proj_dropout=proj_dropout,
embedding_dropout=embedding_dropout)
self.textual = TextTransformer(
vocab_size=vocab_size,
text_len=text_len,
dim=text_dim,
out_dim=embed_dim,
num_heads=text_heads,
num_layers=text_layers,
attn_dropout=attn_dropout,
proj_dropout=proj_dropout,
embedding_dropout=embedding_dropout)
self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))

def forward(self, imgs, txt_tokens):
r"""imgs: [B, C, H, W] of torch.float32.
txt_tokens: [B, T] of torch.long.
"""
xi = self.visual(imgs)
xt = self.textual(txt_tokens)

# normalize features
xi = F.normalize(xi, p=2, dim=1)
xt = F.normalize(xt, p=2, dim=1)

# gather features from all ranks
full_xi = ops.diff_all_gather(xi)
full_xt = ops.diff_all_gather(xt)

# logits
scale = self.log_scale.exp()
logits_i2t = scale * torch.mm(xi, full_xt.t())
logits_t2i = scale * torch.mm(xt, full_xi.t())

# labels
labels = torch.arange(
len(xi) * ops.get_rank(),
len(xi) * (ops.get_rank() + 1),
dtype=torch.long,
device=xi.device)
return logits_i2t, logits_t2i, labels

def init_weights(self):
# embeddings
nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)

# attentions
for modality in ['visual', 'textual']:
dim = self.vision_dim if modality == 'visual' else self.text_dim
num_layers = (self.vision_layers
if modality == 'visual' else self.text_layers)
transformer = getattr(self, modality).transformer
proj_gain = (1.0 / math.sqrt(dim)) * (
1.0 / math.sqrt(2 * num_layers))
attn_gain = 1.0 / math.sqrt(dim)
mlp_gain = 1.0 / math.sqrt(2.0 * dim)
for block in transformer:
nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
nn.init.normal_(block.attn.proj.weight, std=proj_gain)
nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
nn.init.normal_(block.mlp[2].weight, std=proj_gain)

def param_groups(self):
groups = [{
'params': [
p for n, p in self.named_parameters()
if 'norm' in n or n.endswith('bias')
],
'weight_decay':
0.0
}, {
'params': [
p for n, p in self.named_parameters()
if not ('norm' in n or n.endswith('bias'))
]
}]
return groups

def fp16(self):
return self.apply(to_fp16)


def clip_vit_b_32(**kwargs):
return CLIP(
embed_dim=512,
image_size=224,
patch_size=32,
vision_dim=768,
vision_heads=12,
vision_layers=12,
text_dim=512,
text_heads=8,
text_layers=12,
**kwargs)


def clip_vit_b_16(**kwargs):
return CLIP(
embed_dim=512,
image_size=224,
patch_size=16,
vision_dim=768,
vision_heads=12,
vision_layers=12,
text_dim=512,
text_heads=8,
text_layers=12,
**kwargs)


def clip_vit_l_14(**kwargs):
return CLIP(
embed_dim=768,
image_size=224,
patch_size=14,
vision_dim=1024,
vision_heads=16,
vision_layers=24,
text_dim=768,
text_heads=12,
text_layers=12,
**kwargs)


def clip_vit_l_14_336px(**kwargs):
return CLIP(
embed_dim=768,
image_size=336,
patch_size=14,
vision_dim=1024,
vision_heads=16,
vision_layers=24,
text_dim=768,
text_heads=12,
text_layers=12,
**kwargs)


def clip_vit_h_16(**kwargs):
return CLIP(
embed_dim=1024,
image_size=256,
patch_size=16,
vision_dim=1280,
vision_heads=16,
vision_layers=32,
text_dim=1024,
text_heads=16,
text_layers=24,
**kwargs)

+22 -0   modelscope/models/cv/image_to_image_generation/ops/__init__.py   View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .diffusion import GaussianDiffusion, beta_schedule

else:
_import_structure = {
'diffusion': ['GaussianDiffusion', 'beta_schedule'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+598 -0   modelscope/models/cv/image_to_image_generation/ops/diffusion.py   View File

@@ -0,0 +1,598 @@
import math

import torch

from .losses import discretized_gaussian_log_likelihood, kl_divergence

__all__ = ['GaussianDiffusion', 'beta_schedule']


def _i(tensor, t, x):
r"""Index tensor using t and format the output according to x.
"""
shape = (x.size(0), ) + (1, ) * (x.ndim - 1)
return tensor[t].view(shape).to(x)


def beta_schedule(schedule,
num_timesteps=1000,
init_beta=None,
last_beta=None):
if schedule == 'linear':
scale = 1000.0 / num_timesteps
init_beta = init_beta or scale * 0.0001
last_beta = last_beta or scale * 0.02
return torch.linspace(
init_beta, last_beta, num_timesteps, dtype=torch.float64)
elif schedule == 'quadratic':
init_beta = init_beta or 0.0015
last_beta = last_beta or 0.0195
return torch.linspace(
init_beta**0.5, last_beta**0.5, num_timesteps,
dtype=torch.float64)**2
elif schedule == 'cosine':
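# cosine schedule (Nichol & Dhariwal, 2021): betas are chosen so the
# cumulative product of alphas follows a squared-cosine curve, with each
# beta capped at 0.999 for numerical stability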
betas = []
for step in range(num_timesteps):
t1 = step / num_timesteps
t2 = (step + 1) / num_timesteps

# fn = lambda u: math.cos((u + 0.008) / 1.008 * math.pi / 2)**2
def fn(u):
return math.cos((u + 0.008) / 1.008 * math.pi / 2)**2

betas.append(min(1.0 - fn(t2) / fn(t1), 0.999))
return torch.tensor(betas, dtype=torch.float64)
else:
raise ValueError(f'Unsupported schedule: {schedule}')


class GaussianDiffusion(object):

def __init__(self,
betas,
mean_type='eps',
var_type='learned_range',
loss_type='mse',
rescale_timesteps=False):
# check input
if not isinstance(betas, torch.DoubleTensor):
betas = torch.tensor(betas, dtype=torch.float64)
assert min(betas) > 0 and max(betas) <= 1
assert mean_type in ['x0', 'x_{t-1}', 'eps']
assert var_type in [
'learned', 'learned_range', 'fixed_large', 'fixed_small'
]
assert loss_type in [
'mse', 'rescaled_mse', 'kl', 'rescaled_kl', 'l1', 'rescaled_l1'
]
self.betas = betas
self.num_timesteps = len(betas)
self.mean_type = mean_type
self.var_type = var_type
self.loss_type = loss_type
self.rescale_timesteps = rescale_timesteps

# alphas
alphas = 1 - self.betas
self.alphas_cumprod = torch.cumprod(alphas, dim=0)
self.alphas_cumprod_prev = torch.cat(
[alphas.new_ones([1]), self.alphas_cumprod[:-1]])
self.alphas_cumprod_next = torch.cat(
[self.alphas_cumprod[1:],
alphas.new_zeros([1])])

# q(x_t | x_{t-1})
self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0
- self.alphas_cumprod)
self.log_one_minus_alphas_cumprod = torch.log(1.0
- self.alphas_cumprod)
self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod)
self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod
- 1)

# q(x_{t-1} | x_t, x_0)
self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (
1.0 - self.alphas_cumprod)
self.posterior_log_variance_clipped = torch.log(
self.posterior_variance.clamp(1e-20))
self.posterior_mean_coef1 = betas * torch.sqrt(
self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
self.posterior_mean_coef2 = (
1.0 - self.alphas_cumprod_prev) * torch.sqrt(alphas) / (
1.0 - self.alphas_cumprod)
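# together these coefficients give the closed-form posterior mean of
# q(x_{t-1} | x_t, x_0): mu = posterior_mean_coef1 * x0 + posterior_mean_coef2 * x_t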

def q_sample(self, x0, t, noise=None):
r"""Sample from q(x_t | x_0).
"""
noise = torch.randn_like(x0) if noise is None else noise
return _i(self.sqrt_alphas_cumprod, t, x0) * x0 + _i(
self.sqrt_one_minus_alphas_cumprod, t, x0) * noise

def q_mean_variance(self, x0, t):
r"""Distribution of q(x_t | x_0).
"""
mu = _i(self.sqrt_alphas_cumprod, t, x0) * x0
var = _i(1.0 - self.alphas_cumprod, t, x0)
log_var = _i(self.log_one_minus_alphas_cumprod, t, x0)
return mu, var, log_var

def q_posterior_mean_variance(self, x0, xt, t):
r"""Distribution of q(x_{t-1} | x_t, x_0).
"""
mu = _i(self.posterior_mean_coef1, t, xt) * x0 + _i(
self.posterior_mean_coef2, t, xt) * xt
var = _i(self.posterior_variance, t, xt)
log_var = _i(self.posterior_log_variance_clipped, t, xt)
return mu, var, log_var

@torch.no_grad()
def p_sample(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None):
r"""Sample from p(x_{t-1} | x_t).
- condition_fn: for classifier-based guidance (guided-diffusion).
- guide_scale: for classifier-free guidance (glide/dalle-2).
"""
# predict distribution of p(x_{t-1} | x_t)
mu, var, log_var, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
clamp, percentile,
guide_scale)

# random sample (with optional conditional function)
noise = torch.randn_like(xt)
# no noise when t == 0
mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
if condition_fn is not None:
grad = condition_fn(xt, self._scale_timesteps(t), **model_kwargs)
mu = mu.float() + var * grad.float()
xt_1 = mu + mask * torch.exp(0.5 * log_var) * noise
return xt_1, x0

@torch.no_grad()
def p_sample_loop(self,
noise,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None):
r"""Sample from p(x_{t-1} | x_t) p(x_{t-2} | x_{t-1}) ... p(x_0 | x_1).
"""
# prepare input
b, c, h, w = noise.size()
xt = noise

# diffusion process
for step in torch.arange(self.num_timesteps).flip(0):
t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
xt, _ = self.p_sample(xt, t, model, model_kwargs, clamp,
percentile, condition_fn, guide_scale)
return xt

def p_mean_variance(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
guide_scale=None):
r"""Distribution of p(x_{t-1} | x_t).
"""
# predict distribution
if guide_scale is None:
out = model(xt, self._scale_timesteps(t), **model_kwargs)
else:
# classifier-free guidance
# (model_kwargs[0]: conditional kwargs; model_kwargs[1]: non-conditional kwargs)
assert isinstance(model_kwargs, list) and len(model_kwargs) == 2
assert self.mean_type == 'eps'
y_out = model(xt, self._scale_timesteps(t), **model_kwargs[0])
u_out = model(xt, self._scale_timesteps(t), **model_kwargs[1])
out = torch.cat(
[
u_out[:, :3] + guide_scale * # noqa W504
(y_out[:, :3] - u_out[:, :3]),
y_out[:, 3:]
],
dim=1)

# compute variance
if self.var_type == 'learned':
out, log_var = out.chunk(2, dim=1)
var = torch.exp(log_var)
elif self.var_type == 'learned_range':
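# the model predicts a fraction in [-1, 1] that interpolates between the
# clipped posterior log-variance (lower bound) and log(beta_t) (upper
# bound), as in Improved DDPM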
out, fraction = out.chunk(2, dim=1)
min_log_var = _i(self.posterior_log_variance_clipped, t, xt)
max_log_var = _i(torch.log(self.betas), t, xt)
fraction = (fraction + 1) / 2.0
log_var = fraction * max_log_var + (1 - fraction) * min_log_var
var = torch.exp(log_var)
elif self.var_type == 'fixed_large':
var = _i(
torch.cat([self.posterior_variance[1:2], self.betas[1:]]), t,
xt)
log_var = torch.log(var)
elif self.var_type == 'fixed_small':
var = _i(self.posterior_variance, t, xt)
log_var = _i(self.posterior_log_variance_clipped, t, xt)

# compute mean and x0
if self.mean_type == 'x_{t-1}':
mu = out # x_{t-1}
x0 = _i(1.0 / self.posterior_mean_coef1, t, xt) * mu - _i(
self.posterior_mean_coef2 / self.posterior_mean_coef1, t,
xt) * xt
elif self.mean_type == 'x0':
x0 = out
mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
elif self.mean_type == 'eps':
x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - _i(
self.sqrt_recipm1_alphas_cumprod, t, xt) * out
mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)

# restrict the range of x0
if percentile is not None:
assert percentile > 0 and percentile <= 1 # e.g., 0.995
s = torch.quantile(
x0.flatten(1).abs(), percentile,
dim=1).clamp_(1.0).view(-1, 1, 1, 1)
x0 = torch.min(s, torch.max(-s, x0)) / s
elif clamp is not None:
x0 = x0.clamp(-clamp, clamp)
return mu, var, log_var, x0

@torch.no_grad()
def ddim_sample(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None,
ddim_timesteps=20,
eta=0.0):
stride = self.num_timesteps // ddim_timesteps

# predict distribution of p(x_{t-1} | x_t)
_, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp,
percentile, guide_scale)
if condition_fn is not None:
# x0 -> eps
alpha = _i(self.alphas_cumprod, t, xt)
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)
eps = eps - (1 - alpha).sqrt() * condition_fn(
xt, self._scale_timesteps(t), **model_kwargs)

# eps -> x0
x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - _i(
self.sqrt_recipm1_alphas_cumprod, t, xt) * eps

# derive variables
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)
alphas = _i(self.alphas_cumprod, t, xt)
alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
sigmas = eta * torch.sqrt((1 - alphas_prev) / # noqa W504
(1 - alphas) * # noqa W504
(1 - alphas / alphas_prev))

# random sample
noise = torch.randn_like(xt)
direction = torch.sqrt(1 - alphas_prev - sigmas**2) * eps
mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
xt_1 = torch.sqrt(alphas_prev) * x0 + direction + mask * sigmas * noise
return xt_1, x0

@torch.no_grad()
def ddim_sample_loop(self,
noise,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None,
ddim_timesteps=20,
eta=0.0):
# prepare input
b, c, h, w = noise.size()
xt = noise

# diffusion process (TODO: clamp is inaccurate! Consider replacing the stride by explicit prev/next steps)
steps = (1 + torch.arange(0, self.num_timesteps,
self.num_timesteps // ddim_timesteps)).clamp(
0, self.num_timesteps - 1).flip(0)
for step in steps:
t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
xt, _ = self.ddim_sample(xt, t, model, model_kwargs, clamp,
percentile, condition_fn, guide_scale,
ddim_timesteps, eta)
return xt

@torch.no_grad()
def ddim_reverse_sample(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
guide_scale=None,
ddim_timesteps=20):
r"""Sample from p(x_{t+1} | x_t) using DDIM reverse ODE (deterministic).
"""
stride = self.num_timesteps // ddim_timesteps

# predict distribution of p(x_{t-1} | x_t)
_, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp,
percentile, guide_scale)

# derive variables
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)
alphas_next = _i(
torch.cat(
[self.alphas_cumprod,
self.alphas_cumprod.new_zeros([1])]),
(t + stride).clamp(0, self.num_timesteps), xt)

# reverse sample
mu = torch.sqrt(alphas_next) * x0 + torch.sqrt(1 - alphas_next) * eps
return mu, x0

@torch.no_grad()
def ddim_reverse_sample_loop(self,
x0,
model,
model_kwargs={},
clamp=None,
percentile=None,
guide_scale=None,
ddim_timesteps=20):
# prepare input
b, c, h, w = x0.size()
xt = x0

# reconstruction steps
steps = torch.arange(0, self.num_timesteps,
self.num_timesteps // ddim_timesteps)
for step in steps:
t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
xt, _ = self.ddim_reverse_sample(xt, t, model, model_kwargs, clamp,
percentile, guide_scale,
ddim_timesteps)
return xt

@torch.no_grad()
def plms_sample(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None,
plms_timesteps=20,
eps_cache=[]):
r"""Sample from p(x_{t-1} | x_t) using PLMS.
- condition_fn: for classifier-based guidance (guided-diffusion).
- guide_scale: for classifier-free guidance (glide/dalle-2).
"""
stride = self.num_timesteps // plms_timesteps

# function for compute eps
def compute_eps(xt, t):
# predict distribution of p(x_{t-1} | x_t)
_, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
clamp, percentile, guide_scale)

# condition
if condition_fn is not None:
# x0 -> eps
alpha = _i(self.alphas_cumprod, t, xt)
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
- x0) / _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
eps = eps - (1 - alpha).sqrt() * condition_fn(
xt, self._scale_timesteps(t), **model_kwargs)

# eps -> x0
x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - _i(
self.sqrt_recipm1_alphas_cumprod, t, xt) * eps

# derive eps
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)
return eps

# function for compute x_0 and x_{t-1}
def compute_x0(eps, t):
# eps -> x0
x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - _i(
self.sqrt_recipm1_alphas_cumprod, t, xt) * eps

# deterministic sample
alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
direction = torch.sqrt(1 - alphas_prev) * eps
# mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
xt_1 = torch.sqrt(alphas_prev) * x0 + direction
return xt_1, x0

# PLMS sample
eps = compute_eps(xt, t)
if len(eps_cache) == 0:
# 2nd order pseudo improved Euler
xt_1, x0 = compute_x0(eps, t)
eps_next = compute_eps(xt_1, (t - stride).clamp(0))
eps_prime = (eps + eps_next) / 2.0
elif len(eps_cache) == 1:
# 2nd order pseudo linear multistep (Adams-Bashforth)
eps_prime = (3 * eps - eps_cache[-1]) / 2.0
elif len(eps_cache) == 2:
# 3rd order pseudo linear multistep (Adams-Bashforth)
eps_prime = (23 * eps - 16 * eps_cache[-1]
+ 5 * eps_cache[-2]) / 12.0
elif len(eps_cache) >= 3:
# 4th order pseudo linear multistep (Adams-Bashforth)
eps_prime = (55 * eps - 59 * eps_cache[-1] + 37 * eps_cache[-2]
- 9 * eps_cache[-3]) / 24.0
xt_1, x0 = compute_x0(eps_prime, t)
return xt_1, x0, eps

@torch.no_grad()
def plms_sample_loop(self,
noise,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None,
plms_timesteps=20):
# prepare input
b, c, h, w = noise.size()
xt = noise

# diffusion process
steps = (1 + torch.arange(0, self.num_timesteps,
self.num_timesteps // plms_timesteps)).clamp(
0, self.num_timesteps - 1).flip(0)
eps_cache = []
for step in steps:
# PLMS sampling step
t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
xt, _, eps = self.plms_sample(xt, t, model, model_kwargs, clamp,
percentile, condition_fn,
guide_scale, plms_timesteps,
eps_cache)

# update eps cache
eps_cache.append(eps)
if len(eps_cache) >= 4:
eps_cache.pop(0)
return xt

def loss(self, x0, t, model, model_kwargs={}, noise=None):
noise = torch.randn_like(x0) if noise is None else noise
xt = self.q_sample(x0, t, noise=noise)

# compute loss
if self.loss_type in ['kl', 'rescaled_kl']:
loss, _ = self.variational_lower_bound(x0, xt, t, model,
model_kwargs)
if self.loss_type == 'rescaled_kl':
loss = loss * self.num_timesteps
elif self.loss_type in ['mse', 'rescaled_mse', 'l1', 'rescaled_l1']:
out = model(xt, self._scale_timesteps(t), **model_kwargs)

# VLB for variation
loss_vlb = 0.0
if self.var_type in ['learned', 'learned_range']:
out, var = out.chunk(2, dim=1)
frozen = torch.cat([
out.detach(), var
], dim=1) # learn var without affecting the prediction of mean
loss_vlb, _ = self.variational_lower_bound(
x0, xt, t, model=lambda *args, **kwargs: frozen)
if self.loss_type.startswith('rescaled_'):
loss_vlb = loss_vlb * self.num_timesteps / 1000.0

# MSE/L1 for x0/eps
target = {
'eps': noise,
'x0': x0,
'x_{t-1}': self.q_posterior_mean_variance(x0, xt, t)[0]
}[self.mean_type]
loss = (out - target).pow(1 if self.loss_type.endswith('l1') else 2
).abs().flatten(1).mean(dim=1)

# total loss
loss = loss + loss_vlb
return loss

def variational_lower_bound(self,
x0,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None):
# compute groundtruth and predicted distributions
mu1, _, log_var1 = self.q_posterior_mean_variance(x0, xt, t)
mu2, _, log_var2, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
clamp, percentile)

# compute KL loss
kl = kl_divergence(mu1, log_var1, mu2, log_var2)
kl = kl.flatten(1).mean(dim=1) / math.log(2.0)

# compute discretized NLL loss (for p(x0 | x1) only)
nll = -discretized_gaussian_log_likelihood(
x0, mean=mu2, log_scale=0.5 * log_var2)
nll = nll.flatten(1).mean(dim=1) / math.log(2.0)

# NLL for p(x0 | x1) and KL otherwise
vlb = torch.where(t == 0, nll, kl)
return vlb, x0

@torch.no_grad()
def variational_lower_bound_loop(self,
x0,
model,
model_kwargs={},
clamp=None,
percentile=None):
r"""Compute the entire variational lower bound, measured in bits-per-dim.
"""
# prepare input and output
b, c, h, w = x0.size()
metrics = {'vlb': [], 'mse': [], 'x0_mse': []}

# loop
for step in torch.arange(self.num_timesteps).flip(0):
# compute VLB
t = torch.full((b, ), step, dtype=torch.long, device=x0.device)
noise = torch.randn_like(x0)
xt = self.q_sample(x0, t, noise)
vlb, pred_x0 = self.variational_lower_bound(
x0, xt, t, model, model_kwargs, clamp, percentile)

# predict eps from x0
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)

# collect metrics
metrics['vlb'].append(vlb)
metrics['x0_mse'].append(
(pred_x0 - x0).square().flatten(1).mean(dim=1))
metrics['mse'].append(
(eps - noise).square().flatten(1).mean(dim=1))
metrics = {k: torch.stack(v, dim=1) for k, v in metrics.items()}

# compute the prior KL term for VLB, measured in bits-per-dim
mu, _, log_var = self.q_mean_variance(x0, t)
kl_prior = kl_divergence(mu, log_var, torch.zeros_like(mu),
torch.zeros_like(log_var))
kl_prior = kl_prior.flatten(1).mean(dim=1) / math.log(2.0)

# update metrics
metrics['prior_bits_per_dim'] = kl_prior
metrics['total_bits_per_dim'] = metrics['vlb'].sum(dim=1) + kl_prior
return metrics

def _scale_timesteps(self, t):
if self.rescale_timesteps:
return t.float() * 1000.0 / self.num_timesteps
return t

+35 -0   modelscope/models/cv/image_to_image_generation/ops/losses.py   View File

@@ -0,0 +1,35 @@
import math

import torch

__all__ = ['kl_divergence', 'discretized_gaussian_log_likelihood']


def kl_divergence(mu1, logvar1, mu2, logvar2):
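# elementwise KL divergence between two diagonal Gaussians,
# KL(N(mu1, exp(logvar1)) || N(mu2, exp(logvar2)))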
return 0.5 * (
-1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + # noqa W504
((mu1 - mu2)**2) * torch.exp(-logvar2))


def standard_normal_cdf(x):
r"""A fast approximation of the cumulative distribution function of the standard normal.
"""
return 0.5 * (1.0 + torch.tanh(
math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def discretized_gaussian_log_likelihood(x0, mean, log_scale):
assert x0.shape == mean.shape == log_scale.shape
cx = x0 - mean
inv_stdv = torch.exp(-log_scale)
cdf_plus = standard_normal_cdf(inv_stdv * (cx + 1.0 / 255.0))
cdf_min = standard_normal_cdf(inv_stdv * (cx - 1.0 / 255.0))
log_cdf_plus = torch.log(cdf_plus.clamp(min=1e-12))
log_one_minus_cdf_min = torch.log((1.0 - cdf_min).clamp(min=1e-12))
cdf_delta = cdf_plus - cdf_min
log_probs = torch.where(
x0 < -0.999, log_cdf_plus,
torch.where(x0 > 0.999, log_one_minus_cdf_min,
torch.log(cdf_delta.clamp(min=1e-12))))
assert log_probs.shape == x0.shape
return log_probs

+3 -0   modelscope/pipelines/builder.py   View File

@@ -118,6 +118,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_classification_dailylife:
(Pipelines.daily_image_classification,
'damo/cv_vit-base_image-classification_Dailylife-labels'),
Tasks.image_to_image_generation:
(Pipelines.image_to_image_generation,
'damo/cv_latent_diffusion_image2image_generate'),
}




+3 -0   modelscope/pipelines/cv/__init__.py   View File

@@ -20,6 +20,7 @@ if TYPE_CHECKING:
from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline
from .image_matting_pipeline import ImageMattingPipeline
from .image_super_resolution_pipeline import ImageSuperResolutionPipeline
from .image_to_image_generate_pipeline import Image2ImageGenerationePipeline
from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline
from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline
from .style_transfer_pipeline import StyleTransferPipeline
@@ -51,6 +52,8 @@ else:
'product_retrieval_embedding_pipeline':
['ProductRetrievalEmbeddingPipeline'],
'live_category_pipeline': ['LiveCategoryPipeline'],
'image_to_image_generate_pipeline':
['Image2ImageGenerationePipeline'],
'ocr_detection_pipeline': ['OCRDetectionPipeline'],
'style_transfer_pipeline': ['StyleTransferPipeline'],
'video_category_pipeline': ['VideoCategoryPipeline'],


+250 -0   modelscope/pipelines/cv/image_to_image_generate_pipeline.py   View File

@@ -0,0 +1,250 @@
import os.path as osp
from typing import Any, Dict

import cv2
import numpy as np
import PIL
import torch
import torch.nn.functional as F
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from PIL import Image
from torchvision.utils import save_image

import modelscope.models.cv.image_to_image_generation.data as data
import modelscope.models.cv.image_to_image_generation.models as models
import modelscope.models.cv.image_to_image_generation.ops as ops
from modelscope.metainfo import Pipelines
from modelscope.models.cv.image_to_image_generation.model import UNet
from modelscope.models.cv.image_to_image_generation.models.clip import \
VisionTransformer
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.image_to_image_generation,
module_name=Pipelines.image_to_image_generation)
class Image2ImageGenerationePipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create an image-to-image generation pipeline
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model)
config_path = osp.join(self.model, ModelFile.CONFIGURATION)
logger.info(f'loading config from {config_path}')
self.cfg = Config.from_file(config_path)
if torch.cuda.is_available():
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')
self.repetition = 4
# load vit model
vit_model_path = osp.join(self.model,
self.cfg.ModelPath.vit_model_path)
logger.info(f'loading vit model from {vit_model_path}')
self.vit = VisionTransformer(
image_size=self.cfg.Params.vit.vit_image_size,
patch_size=self.cfg.Params.vit.vit_patch_size,
dim=self.cfg.Params.vit.vit_dim,
out_dim=self.cfg.Params.vit.vit_out_dim,
num_heads=self.cfg.Params.vit.vit_num_heads,
num_layers=self.cfg.Params.vit.vit_num_layers).eval(
).requires_grad_(False).to(self._device) # noqa E123
state = torch.load(vit_model_path)
state = {
k[len('visual.'):]: v
for k, v in state.items() if k.startswith('visual.')
}
self.vit.load_state_dict(state)
logger.info('load vit model done')

# load autoencoder model
ae_model_path = osp.join(self.model, self.cfg.ModelPath.ae_model_path)
logger.info(f'loading autoencoder model from {ae_model_path}')
self.autoencoder = models.VQAutoencoder(
dim=self.cfg.Params.ae.ae_dim,
z_dim=self.cfg.Params.ae.ae_z_dim,
dim_mult=self.cfg.Params.ae.ae_dim_mult,
attn_scales=self.cfg.Params.ae.ae_attn_scales,
codebook_size=self.cfg.Params.ae.ae_codebook_size).eval(
).requires_grad_(False).to(self._device) # noqa E123
self.autoencoder.load_state_dict(
torch.load(ae_model_path, map_location=self._device))
logger.info('load autoencoder model done')

# load decoder model
decoder_model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
logger.info(f'loading decoder model from {decoder_model_path}')
self.decoder = UNet(
resolution=self.cfg.Params.unet.unet_resolution,
in_dim=self.cfg.Params.unet.unet_in_dim,
dim=self.cfg.Params.unet.unet_dim,
label_dim=self.cfg.Params.vit.vit_out_dim,
context_dim=self.cfg.Params.unet.unet_context_dim,
out_dim=self.cfg.Params.unet.unet_out_dim,
dim_mult=self.cfg.Params.unet.unet_dim_mult,
num_heads=self.cfg.Params.unet.unet_num_heads,
head_dim=None,
num_res_blocks=self.cfg.Params.unet.unet_res_blocks,
attn_scales=self.cfg.Params.unet.unet_attn_scales,
dropout=self.cfg.Params.unet.unet_dropout).eval().requires_grad_(
False).to(self._device)
self.decoder.load_state_dict(
torch.load(decoder_model_path, map_location=self._device))
logger.info('load decoder model done')

# diffusion
logger.info('Initializing diffusion ...')
betas = ops.beta_schedule(self.cfg.Params.diffusion.schedule,
self.cfg.Params.diffusion.num_timesteps)
self.diffusion = ops.GaussianDiffusion(
betas=betas,
mean_type=self.cfg.Params.diffusion.mean_type,
var_type=self.cfg.Params.diffusion.var_type,
loss_type=self.cfg.Params.diffusion.loss_type,
rescale_timesteps=False)

def preprocess(self, input: Input) -> Dict[str, Any]:
input_img_list = []
if isinstance(input, str):
input_img_list = [input]
input_type = 0
elif isinstance(input, tuple) and len(input) == 2:
input_img_list = list(input)
input_type = 1
else:
raise TypeError(
f'modelscope error: Only support "str" or "tuple (img1, img2)", but got {type(input)}'
)

if input_type == 0:
logger.info('Processing Similar Image Generation mode')
if input_type == 1:
logger.info('Processing Interpolation mode')

img_list = []
for i, input_img in enumerate(input_img_list):
img = LoadImage.convert_to_img(input_img)
logger.info(f'Load {i}-th image done')
img_list.append(img)

transforms = T.Compose([
data.PadToSquare(),
T.Resize(
self.cfg.DATA.scale_size,
interpolation=T.InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=self.cfg.DATA.mean, std=self.cfg.DATA.std)
])

y_list = []
for img in img_list:
img = transforms(img)
imgs = torch.unsqueeze(img, 0)
imgs = imgs.to(self._device)
imgs_x0 = self.autoencoder.encode(imgs)
b, c, h, w = imgs_x0.shape
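# map the [-1, 1] image back to [0, 1], resize it to the ViT input
# resolution and re-normalize with the CLIP statistics before extracting
# the conditioning embedding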
aug_imgs = TF.normalize(
F.interpolate(
imgs.add(1).div(2), (self.cfg.Params.vit.vit_image_size,
self.cfg.Params.vit.vit_image_size),
mode='bilinear',
align_corners=True), self.cfg.Params.vit.vit_mean,
self.cfg.Params.vit.vit_std)
uy = self.vit(aug_imgs)
y = F.normalize(uy, p=2, dim=1)
y_list.append(y)

if input_type == 0:
result = {
'image_data': y_list[0],
'c': c,
'h': h,
'w': w,
'type': input_type
}
elif input_type == 1:
result = {
'image_data': y_list[0],
'image_data_s': y_list[1],
'c': c,
'h': h,
'w': w,
'type': input_type
}
return result

@torch.no_grad()
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
type_ = input['type']
if type_ == 0:
# Similar Image Generation #
y = input['image_data']

# fix seed
torch.manual_seed(1 * 8888)
torch.cuda.manual_seed(1 * 8888)
i_y = y.repeat(self.repetition, 1)

# sample images
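# guide_scale=1.0 reduces classifier-free guidance to the plain conditional
# prediction, and eta=1.0 makes the DDIM sampler fully stochastic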
x0 = self.diffusion.ddim_sample_loop(
noise=torch.randn(self.repetition, input['c'], input['h'],
input['w']).to(self._device),
model=self.decoder,
model_kwargs=[{
'y': i_y
}, {
'y': torch.zeros_like(i_y)
}],
guide_scale=1.0,
clamp=None,
ddim_timesteps=50,
eta=1.0)
i_gen_imgs = self.autoencoder.decode(x0)
return {OutputKeys.OUTPUT_IMG: i_gen_imgs}
else:
# Interpolation #
# get content-style pairs
y = input['image_data']
y_s = input['image_data_s']

# fix seed
torch.manual_seed(1 * 8888)
torch.cuda.manual_seed(1 * 8888)
noise = torch.randn(self.repetition, input['c'], input['h'],
input['w']).to(self._device)

# interpolate between the two CLIP embeddings y and y_s
factors = torch.linspace(0, 1, self.repetition).unsqueeze(1).to(
self._device)
i_y = (1 - factors) * y + factors * y_s

# sample images
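# guide_scale=3.0 amplifies the conditional signal of the interpolated
# embedding, and eta=0.0 makes DDIM sampling deterministic given the noise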
x0 = self.diffusion.ddim_sample_loop(
noise=noise,
model=self.decoder,
model_kwargs=[{
'y': i_y
}, {
'y': torch.zeros_like(i_y)
}],
guide_scale=3.0,
clamp=None,
ddim_timesteps=50,
eta=0.0)
i_gen_imgs = self.autoencoder.decode(x0)
return {OutputKeys.OUTPUT_IMG: i_gen_imgs}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

+1 -0   modelscope/utils/constant.py   View File

@@ -42,6 +42,7 @@ class CVTasks(object):
video_category = 'video-category'
image_classification_imagenet = 'image-classification-imagenet'
image_classification_dailylife = 'image-classification-dailylife'
image_to_image_generation = 'image-to-image-generation'


class NLPTasks(object):


+48 -0   tests/pipelines/test_image2image_generation.py   View File

@@ -0,0 +1,48 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
import shutil
import unittest

from torchvision.utils import save_image

from modelscope.fileio import File
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.test_utils import test_level


class Image2ImageGenerationTest(unittest.TestCase):

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_modelhub(self):
r"""We provide two generation modes, i.e., Similar Image Generation and Interpolation.
You can pass the following parameters for different mode.
1. Similar Image Generation Mode:
2. Interpolation Mode:
"""
img2img_gen_pipeline = pipeline(
Tasks.image_to_image_generation,
model='damo/cv_latent_diffusion_image2image_generate')

# Similar Image Generation mode
result1 = img2img_gen_pipeline('data/test/images/img2img_input.jpg')
# Interpolation Mode
result2 = img2img_gen_pipeline(('data/test/images/img2img_input.jpg',
'data/test/images/img2img_style.jpg'))
save_image(
result1['output_img'].clamp(-1, 1),
'result1.jpg',
range=(-1, 1),
normalize=True,
nrow=4)
save_image(
result2['output_img'].clamp(-1, 1),
'result2.jpg',
range=(-1, 1),
normalize=True,
nrow=4)


if __name__ == '__main__':
unittest.main()
