ya235025 yingda.chen · 3 years ago
commit 4c08bd752a
12 changed files with 1791 additions and 469 deletions
  1. modelscope/models/multi_modal/__init__.py (+2, -0)
  2. modelscope/models/multi_modal/clip/__init__.py (+1, -1)
  3. modelscope/models/multi_modal/clip/bert_tokenizer.py (+422, -0)
  4. modelscope/models/multi_modal/clip/clip_bert.py (+0, -29)
  5. modelscope/models/multi_modal/clip/clip_model.py (+0, -216)
  6. modelscope/models/multi_modal/clip/clip_vit.py (+0, -131)
  7. modelscope/models/multi_modal/clip/configuration_bert.py (+82, -0)
  8. modelscope/models/multi_modal/clip/model.py (+677, -0)
  9. modelscope/models/multi_modal/clip/modeling_bert.py (+507, -0)
  10. modelscope/models/multi_modal/mplug/clip/clip.py (+61, -1)
  11. tests/pipelines/test_multi_modal_embedding.py (+39, -31)
  12. tests/trainers/test_clip_multi_modal_embedding_trainer.py (+0, -60)

modelscope/models/multi_modal/__init__.py (+2, -0)

@@ -12,6 +12,8 @@ if TYPE_CHECKING:
from .mplug_for_visual_question_answering import \
MPlugForVisualQuestionAnswering
from .ofa_for_all_tasks import OfaForAllTasks
from .ofa_for_text_to_image_synthesis_model import \
OfaForTextToImageSynthesis

else:
_import_structure = {


modelscope/models/multi_modal/clip/__init__.py (+1, -1)

@@ -1 +1 @@
from .clip_model import CLIPForMultiModalEmbedding
from .model import CLIPForMultiModalEmbedding

modelscope/models/multi_modal/clip/bert_tokenizer.py (+422, -0)

@@ -0,0 +1,422 @@
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

from __future__ import absolute_import, division, print_function
import collections
import os
import re
import unicodedata

import six


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
"""Checks whether the casing config is consistent with the checkpoint name."""

# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.

if not init_checkpoint:
return

m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint)
if m is None:
return

model_name = m.group(1)

lower_models = [
'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12',
'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12'
]

cased_models = [
'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16',
'multi_cased_L-12_H-768_A-12'
]

is_bad_config = False
if model_name in lower_models and not do_lower_case:
is_bad_config = True
actual_flag = 'False'
case_name = 'lowercased'
opposite_flag = 'True'

if model_name in cased_models and do_lower_case:
is_bad_config = True
actual_flag = 'True'
case_name = 'cased'
opposite_flag = 'False'

if is_bad_config:
raise ValueError(
'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. '
'However, `%s` seems to be a %s model, so you '
'should pass in `--do_lower_case=%s` so that the fine-tuning matches '
'how the model was pre-trained. If this error is wrong, please '
'just comment out this check.' %
(actual_flag, init_checkpoint, model_name, case_name,
opposite_flag))


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode('utf-8', 'ignore')
elif isinstance(text, unicode):
return text
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
else:
raise ValueError('Not running on Python 2 or Python 3?')


def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""

# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode('utf-8')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
else:
raise ValueError('Not running on Python 2 or Python 3?')


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, 'r') as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab


def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output


def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens


class FullTokenizer(object):
"""Runs end-to-end tokenziation."""

def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)

return split_tokens

def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)

def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)

@staticmethod
def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
""" Converts a sequence of tokens (string) in a single string. """

def clean_up_tokenization(out_string):
""" Clean up a list of simple English tokenization artifacts
like spaces before punctuations and abreviated forms.
"""
out_string = (
out_string.replace(' .', '.').replace(' ?', '?').replace(
' !', '!').replace(' ,', ',').replace(" ' ", "'").replace(
" n't", "n't").replace(" 'm", "'m").replace(
" 's", "'s").replace(" 've",
"'ve").replace(" 're", "'re"))
return out_string

text = ' '.join(tokens).replace(' ##', '').strip()
if clean_up_tokenization_spaces:
clean_text = clean_up_tokenization(text)
return clean_text
else:
return text

def vocab_size(self):
return len(self.vocab)


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.

Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)

# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)

orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))

output_tokens = whitespace_tokenize(' '.join(split_tokens))
return output_tokens

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return [''.join(x) for x in output]

def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(' ')
output.append(char)
output.append(' ')
else:
output.append(char)
return ''.join(output)

def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)):
return True

return False

def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(' ')
else:
output.append(char)
return ''.join(output)


class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""

def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word

def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.

This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.

For example:
input = "unaffable"
output = ["un", "##aff", "##able"]

Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
"""

text = convert_to_unicode(text)

output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue

is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = ''.join(chars[start:end])
if start > 0:
substr = '##' + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat in ('Cc', 'Cf'):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False
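
For orientation, here is a minimal usage sketch of the tokenizer added above. The vocab path is a placeholder, and the exact word pieces depend on the vocabulary file shipped with the model:

from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer

tokenizer = FullTokenizer(vocab_file='vocab.txt', do_lower_case=True)  # placeholder path
tokens = tokenizer.tokenize('unaffable')               # e.g. ['un', '##aff', '##able'] if those pieces are in the vocab
ids = tokenizer.convert_tokens_to_ids(tokens)          # greedy longest-match-first WordPiece ids
text = FullTokenizer.convert_tokens_to_string(tokens)  # 'unaffable'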

modelscope/models/multi_modal/clip/clip_bert.py (+0, -29)

@@ -1,29 +0,0 @@
import torch.nn as nn
from transformers import BertConfig, BertForMaskedLM


class TextTransformer(nn.Module):

def __init__(self, config_dict, feat_dim=768, use_grad_ckp=True):
super(TextTransformer, self).__init__()
bert_config = BertConfig.from_dict(config_dict)
if use_grad_ckp:
bert_config.gradient_checkpointing = True

self.bert = BertForMaskedLM(bert_config).bert

self.projector = nn.Linear(
bert_config.hidden_size, feat_dim, bias=False)

def forward(self, input_ids, attention_mask):
trans_features = {
'input_ids': input_ids,
'attention_mask': attention_mask
}

output_states = self.bert(**trans_features, return_dict=False)
output_tokens = output_states[0]

cls_tokens = output_tokens[:, 0, :]

return self.projector(cls_tokens)

modelscope/models/multi_modal/clip/clip_model.py (+0, -216)

@@ -1,216 +0,0 @@
from typing import Any, Dict

import cv2
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from tokenizers import BertWordPieceTokenizer
from torch.distributed.nn.functional import \
all_gather as all_gather_with_backprop
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.multi_modal.clip.clip_bert import TextTransformer
from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer
from modelscope.utils.constant import ModeKeys, ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['CLIPForMultiModalEmbedding']


class CLIPModel(nn.Module):

def __init__(self, model_dir):
super(CLIPModel, self).__init__()
# including vision config and text config
model_config = json.load(
open('{}/encoder_config.json'.format(model_dir)))

# vision encoder
vision_config = model_config['vision_config']
self.img_size = vision_config['input_resolution']
self.vision_encoder = VisionTransformer(
input_resolution=self.img_size,
patch_size=vision_config['patch_size'],
width=vision_config['width'],
layers=vision_config['layers'],
heads=vision_config['heads'],
output_dim=vision_config['feat_dim'],
use_grad_ckp=True)

# text encoder
text_config = model_config['text_config']
self.text_encoder = TextTransformer(
text_config['bert_config'], feat_dim=text_config['feat_dim'])

self.logit_scale = nn.Parameter(torch.ones([]) * 4.6)

def contrastive_loss(self, logits, dim):
neg_ce = torch.diag(F.log_softmax(logits, dim=dim))
return -neg_ce.mean()

def clip_loss(self, t2i_sim, i2t_sim, img_idx=None, all_img_idx=None):
if img_idx is not None and all_img_idx is not None:
with torch.no_grad():
false_neg_indicator = (
img_idx[:, None] == all_img_idx[None, :])
false_neg_indicator.fill_diagonal_(False)
t2i_sim.masked_fill_(false_neg_indicator, float('-inf'))
i2t_sim.masked_fill_(false_neg_indicator, float('-inf'))
caption_loss = self.contrastive_loss(t2i_sim, dim=1)
image_loss = self.contrastive_loss(i2t_sim, dim=1)
else:
caption_loss = self.contrastive_loss(t2i_sim, dim=1)
image_loss = self.contrastive_loss(i2t_sim, dim=1)
return (caption_loss + image_loss) / 2.0

def get_loss(self, img_tensor, text_ids_tensor, text_masks_tensor,
img_id_list):
img_feat = self.forward(img_tensor, input_type='img')
text_feat = self.forward((text_ids_tensor, text_masks_tensor),
input_type='text')

global_img_feat = torch.cat(all_gather_with_backprop(img_feat), dim=0)
global_text_feat = torch.cat(
all_gather_with_backprop(text_feat), dim=0)
global_img_id_list = torch.cat(
all_gather_with_backprop(img_id_list), dim=0)

t2i_sim_mat = text_feat @ global_img_feat.t()
i2t_sim_mat = img_feat @ global_text_feat.t()

logit_scale = self.logit_scale.exp().clamp(max=100.0)
t2i_sim_mat_logits = t2i_sim_mat * logit_scale
i2t_sim_mat_logits = i2t_sim_mat * logit_scale

loss = self.clip_loss(
t2i_sim_mat_logits,
i2t_sim_mat_logits,
img_idx=img_id_list,
all_img_idx=global_img_id_list)

return loss

def forward(self, input_data, input_type):
if input_type == 'img':
img_embedding = self.vision_encoder(input_data)
img_embedding = F.normalize(img_embedding, p=2.0, dim=1)
return img_embedding
elif input_type == 'text':
text_ids_tensor, text_mask_tensor = input_data
text_embedding = self.text_encoder(text_ids_tensor,
text_mask_tensor)
text_embedding = F.normalize(text_embedding, p=2.0, dim=1)
return text_embedding
elif input_type == ModeKeys.TRAIN:
return self.get_loss(*input_data)
else:
raise ValueError('Unknown input type')


@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
class CLIPForMultiModalEmbedding(TorchModel):

def __init__(self, model_dir, device_id=-1):
super().__init__(model_dir=model_dir, device_id=device_id)
self.clip_model = CLIPModel(model_dir=model_dir)
pretrained_params = torch.load(
'{}/pytorch_model.bin'.format(model_dir), 'cpu')
self.clip_model.load_state_dict(pretrained_params)
self.clip_model.eval()

self.device_id = device_id
if self.device_id >= 0:
self.clip_model.to('cuda:{}'.format(self.device_id))
logger.info('Use GPU: {}'.format(self.device_id))
else:
logger.info('Use CPU for inference')

# image preprocessor
norm_op = Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711))
self.img_preprocessor = Compose([
Resize((self.clip_model.img_size, self.clip_model.img_size),
interpolation=Image.BICUBIC),
ToTensor(), norm_op
])

# text tokenizer
vocab_path = f'{model_dir}/{ModelFile.VOCAB_FILE}'
self.text_tokenizer = BertWordPieceTokenizer(
vocab_path, lowercase=False)
self.text_tokenizer.enable_truncation(max_length=30)

def tokenize_text(self, text_str):
tokens = self.text_tokenizer.encode(text_str)
max_tokens = 30
text_ids_tensor = torch.zeros((1, max_tokens)).long()
text_mask_tensor = torch.zeros((1, max_tokens))

text_ids, text_mask = tokens.ids, tokens.attention_mask
text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids)
text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask)

return text_ids_tensor, text_mask_tensor

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from modelscope.outputs import OutputKeys
output = {
OutputKeys.IMG_EMBEDDING: None,
OutputKeys.TEXT_EMBEDDING: None
}
if 'img' in input and input['img'] is not None:
input_img = input['img']
if isinstance(input_img, Image.Image):
img_tensor = self.img_preprocessor(input_img)[None, ...]
elif isinstance(input_img, np.ndarray):
if len(input_img.shape) == 2:
input_img = cv2.cvtColor(input_img, cv2.COLOR_GRAY2BGR)
input_img = input_img[:, :, ::-1] # in rgb order
input_img = Image.fromarray(
input_img.astype('uint8')).convert('RGB')
img_tensor = self.img_preprocessor(input_img)[None, ...]
else:
raise TypeError(
f'img should be either PIL.Image or np.array, but got {type(input_img)}'
)

if self.device_id >= 0:
img_tensor = img_tensor.to('cuda:{}'.format(self.device_id))

img_embedding = self.clip_model(
input_data=img_tensor, input_type='img')
from modelscope.outputs import OutputKeys
output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy()

if 'text' in input and input['text'] is not None:
text_str = input['text']
if isinstance(text_str, str):
text_ids_tensor, text_mask_tensor = self.tokenize_text(
text_str)
else:
raise TypeError(
f'text should be str, but got {type(text_str)}')

if self.device_id >= 0:
text_ids_tensor = text_ids_tensor.to('cuda:{}'.format(
self.device_id))
text_mask_tensor = text_mask_tensor.to('cuda:{}'.format(
self.device_id))

text_embedding = self.clip_model(
input_data=(text_ids_tensor, text_mask_tensor),
input_type='text')
output['text_embedding'] = text_embedding.data.cpu().numpy()

return output

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

modelscope/models/multi_modal/clip/clip_vit.py (+0, -131)

@@ -1,131 +0,0 @@
# Copyright 2021 The OpenAI CLIP Authors. All rights reserved.

from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_grad_ckp: bool = True):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

self.use_grad_ckp = use_grad_ckp

def forward(self, x: torch.Tensor):
if self.use_grad_ckp:
for each_block in self.resblocks:
x = checkpoint.checkpoint(each_block, x)
return x
else:
return self.resblocks(x)


class VisionTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int, use_grad_ckp: bool):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(
width, layers, heads, use_grad_ckp=use_grad_ckp)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
class_embeddings = self.class_embedding.to(x.dtype) + \
torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
x = torch.cat([class_embeddings, x], dim=1)
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x

modelscope/models/multi_modal/clip/configuration_bert.py (+82, -0)

@@ -0,0 +1,82 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """

from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging

logger = logging.getLogger(__name__)


class BertConfig(object):
r"""
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
`BertModel`.


Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
"""

def __init__(self,
vocab_size_or_config_json_file=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act='gelu',
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
output_attentions=False,
output_hidden_states=False):
self.vocab_size = vocab_size_or_config_json_file
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.output_attentions = output_attentions
self.output_hidden_states = output_hidden_states
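
As a quick illustration of how this class is consumed (the CLIP model below builds it from the text_* fields of its config), a sketch with assumed values rather than the shipped configuration:

text_config = BertConfig(
    vocab_size_or_config_json_file=21128,   # assumed vocab size of a Chinese BERT vocabulary
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act='gelu',
    max_position_embeddings=512,
    type_vocab_size=2)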

modelscope/models/multi_modal/clip/model.py (+677, -0)

@@ -0,0 +1,677 @@
import os
from collections import OrderedDict
from typing import Any, Dict, Iterable, List, Tuple, Union

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer
from modelscope.models.multi_modal.clip.configuration_bert import BertConfig
from modelscope.models.multi_modal.clip.modeling_bert import BertModel
from modelscope.utils.constant import ModeKeys, ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['CLIPForMultiModalEmbedding']


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)

self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)

self.relu = nn.ReLU(inplace=True)
self.downsample = None
self.stride = stride

if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict([('-1', nn.AvgPool2d(stride)),
('0',
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False)),
('1', nn.BatchNorm2d(planes * self.expansion))]))

def forward(self, x: torch.Tensor):
identity = x

out = self.relu(self.bn1(self.conv1(x)))
out = self.relu(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)
return out


class AttentionPool2d(nn.Module):

def __init__(self,
spacial_dim: int,
embed_dim: int,
num_heads: int,
output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(
torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads

def forward(self, x):
x = x.reshape(x.shape[0], x.shape[1],
x.shape[2] * x.shape[3]).permute(2, 0,
1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
x, _ = F.multi_head_attention_forward(
query=x,
key=x,
value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat(
[self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False)

return x[0]


class ModifiedResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim,
heads,
input_resolution=224,
width=64):
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(2)
self.relu = nn.ReLU(inplace=True)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

embed_dim = width * 32 # the ResNet feature dimension
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
(self.conv3, self.bn3)]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.attnpool(x)

return x
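
# Illustrative sketch only: RN50-style hyperparameters (layers/width/output_dim are assumptions;
# the real values come from vision_model_config.json). heads follows the
# `vision_heads = vision_width * 32 // 64` rule used in CLIP.__init__ below.
#   import torch
#   visual = ModifiedResNet(layers=[3, 4, 6, 3], output_dim=1024, heads=64 * 32 // 64,
#                           input_resolution=224, width=64)
#   feat = visual(torch.randn(1, 3, 224, 224))   # -> tensor of shape [1, 1024]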


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

def forward(self, x: torch.Tensor):
return self.resblocks(x)


class VisualTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(width, layers, heads)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
x = torch.cat(
[ # noqa
self.class_embedding.to(x.dtype) + torch.zeros( # noqa
x.shape[0],
1,
x.shape[-1],
dtype=x.dtype,
device=x.device),
x # noqa
],
dim=1) # noqa shape = [*, grid ** 2 + 1, width]
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x


class CLIP(nn.Module):

def __init__(
self,
embed_dim: int,
# vision
image_resolution: int,
vision_layers: Union[Tuple[int, int, int, int], int],
vision_width: int,
vision_patch_size: int,
# text
vocab_size: int,
text_attention_probs_dropout_prob: float,
text_hidden_act: str,
text_hidden_dropout_prob: float,
text_hidden_size: int,
text_initializer_range: float,
text_intermediate_size: int,
text_max_position_embeddings: int,
text_num_attention_heads: int,
text_num_hidden_layers: int,
text_type_vocab_size: int,
tokenizer: FullTokenizer,
):
super().__init__()

if isinstance(vision_layers, (tuple, list)):
vision_heads = vision_width * 32 // 64
self.visual = ModifiedResNet(
layers=vision_layers,
output_dim=embed_dim,
heads=vision_heads,
input_resolution=image_resolution,
width=vision_width)
else:
vision_heads = vision_width // 64
self.visual = VisualTransformer(
input_resolution=image_resolution,
patch_size=vision_patch_size,
width=vision_width,
layers=vision_layers,
heads=vision_heads,
output_dim=embed_dim)

self.bert_config = BertConfig(
vocab_size_or_config_json_file=vocab_size,
hidden_size=text_hidden_size,
num_hidden_layers=text_num_hidden_layers,
num_attention_heads=text_num_attention_heads,
intermediate_size=text_intermediate_size,
hidden_act=text_hidden_act,
hidden_dropout_prob=text_hidden_dropout_prob,
attention_probs_dropout_prob=text_attention_probs_dropout_prob,
max_position_embeddings=text_max_position_embeddings,
type_vocab_size=text_type_vocab_size,
initializer_range=text_initializer_range,
layer_norm_eps=1e-12,
)
self.bert = BertModel(self.bert_config)

self.text_projection = nn.Parameter(
torch.empty(text_hidden_size, embed_dim))
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

self.tokenizer = tokenizer

self.initialize_parameters()

def initialize_parameters(self):
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

if isinstance(self.visual, ModifiedResNet):
if self.visual.attnpool is not None:
std = self.visual.attnpool.c_proj.in_features**-0.5
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

for resnet_block in [
self.visual.layer1, self.visual.layer2, self.visual.layer3,
self.visual.layer4
]:
for name, param in resnet_block.named_parameters():
if name.endswith('bn3.weight'):
nn.init.zeros_(param)

if self.text_projection is not None:
nn.init.normal_(
self.text_projection, std=self.bert_config.hidden_size**-0.5)

@property
def dtype(self):
return self.visual.conv1.weight.dtype

def encode_image(self, image):
return self.visual(image.type(self.dtype))

def encode_text(self, text):
pad_index = self.tokenizer.vocab['[PAD]']
attn_mask = text.ne(pad_index).type(self.dtype)
x = self.bert(
text, attention_mask=attn_mask)[0].type(
self.dtype) # [batch_size, seq_length, hidden_size]
return x[:, 0, :] @ self.text_projection

def forward(self, image, text):
assert image is not None or text is not None, 'text and image cannot both be None!'

if image is None:
return self.encode_text(text)
elif text is None:
return self.encode_image(image)
image_features = self.encode_image(image)
text_features = self.encode_text(text)

image_features = image_features / image_features.norm(
dim=-1, keepdim=True)
text_features = text_features / text_features.norm(
dim=-1, keepdim=True)

return image_features, text_features, self.logit_scale.exp()

def get_similarity(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)

# normalized features
image_features = image_features / image_features.norm(
dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)

# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()

# shape = [global_batch_size, global_batch_size]
return logits_per_image, logits_per_text


def convert_models_to_fp32(model):
for p in model.parameters():
p.data = p.data.float()
if p.grad is not None:
p.grad.data = p.grad.data.float()


def convert_weights(model: nn.Module):
"""Convert applicable model parameters to fp16"""

def _convert_weights_to_fp16(module):
if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)):
module.weight.data = module.weight.data.half()
if module.bias is not None:
module.bias.data = module.bias.data.half()

if isinstance(module, nn.MultiheadAttention):
for attr in [
*[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
'in_proj_bias', 'bias_k', 'bias_v'
]:
tensor = getattr(module, attr)
if tensor is not None:
tensor.data = tensor.data.half()

if isinstance(module, BertModel):
module.to(torch.half)

for name in ['text_projection', 'proj']:
if hasattr(module, name):
attr = getattr(module, name)
if attr is not None:
attr.data = attr.data.half()

model.apply(_convert_weights_to_fp16)


def _convert_to_rgb(image):
return image.convert('RGB')


def image_transform(image_size=224):
transform = Compose([
_convert_to_rgb,
Resize((image_size, image_size)),
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711)),
])
return transform


@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
class CLIPForMultiModalEmbedding(TorchModel):

def __init__(self, model_dir, device_id=-1):
super().__init__(model_dir=model_dir, device_id=device_id)

# Initialize the model.
vision_model_config_file = '{}/vision_model_config.json'.format(
model_dir)
logger.info(
f'Loading vision model config from {vision_model_config_file}')
assert os.path.exists(vision_model_config_file)

text_model_config_file = '{}/text_model_config.json'.format(model_dir)
logger.info(f'Loading text model config from {text_model_config_file}')
assert os.path.exists(text_model_config_file)

with open(vision_model_config_file,
'r') as fv, open(text_model_config_file, 'r') as ft:
model_info = json.load(fv)
for k, v in json.load(ft).items():
model_info[k] = v

# image preprocess
self.img_preprocess = image_transform(model_info['image_resolution'])

# text tokenizer
vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}'
self.tokenizer = FullTokenizer(vocab_file=vocab_file)

# initialize the model
self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer)
convert_weights(self.clip_model)

# restore the pretrained weight
checkpoint = torch.load(
f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu')
sd = checkpoint['state_dict']
if next(iter(sd.items()))[0].startswith('module'):
sd = {k[len('module.'):]: v for k, v in sd.items()}
self.clip_model.load_state_dict(sd)
self.clip_model.eval()

# place the model
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
if self.device == 'cuda':
self.clip_model.to(self.device)
logger.info('Use GPU for inference')
else:
self.clip_model.float()
logger.info('Use CPU for inference')

def tokenize(self,
texts: Union[str, List[str]],
context_length: int = 52) -> torch.LongTensor:
"""
Returns the tokenized representation of given input string(s)
Parameters
----------
texts : Union[str, List[str]]
An input string or a list of input strings to tokenize
context_length : int
The context length to use (defaults to 52)
Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
"""
if isinstance(texts, str):
texts = [texts]

all_tokens = []
for text in texts:
all_tokens.append(
[self.tokenizer.vocab['[CLS]']]
+ self.tokenizer.convert_tokens_to_ids(
self.tokenizer.tokenize(text))[:context_length - 2]
+ [self.tokenizer.vocab['[SEP]']])

result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

for i, tokens in enumerate(all_tokens):
assert len(tokens) <= context_length
result[i, :len(tokens)] = torch.tensor(tokens)

return result
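
# Worked example (ids are hypothetical; they depend on the vocab file):
#   self.tokenize(['一只猫', '一只狗'], context_length=52) returns a LongTensor of shape [2, 52];
#   each row is the [CLS] id, at most context_length - 2 WordPiece ids, the [SEP] id,
#   and zero padding up to the context length.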

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from modelscope.outputs import OutputKeys
output = {
OutputKeys.IMG_EMBEDDING: None,
OutputKeys.TEXT_EMBEDDING: None
}
if 'img' in input and input['img'] is not None:
image_input = input['img']

# single image input
if isinstance(image_input, Image.Image):
image_tensor = self.img_preprocess(image_input).unsqueeze(0)
# multi images input
elif isinstance(image_input, list):
if all([isinstance(elem, Image.Image)
for elem in image_input]):
image_tensor = torch.stack(
[self.img_preprocess(elem) for elem in image_input],
dim=0)
else:
unsupported_elem_type = [
type(elem) for elem in image_input
if not isinstance(elem, Image.Image)
][0]
raise TypeError(
f'img should be PIL.Image or List[PIL.Image], \
but got a List containing one {unsupported_elem_type}'
)
# others
else:
raise TypeError(
f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}'
)

image_tensor = image_tensor.to(self.device)

with torch.no_grad():
image_features = self.clip_model.encode_image(image_tensor)
image_features /= image_features.norm(
dim=-1, keepdim=True) # l2-normalize

output[OutputKeys.IMG_EMBEDDING] = image_features

if 'text' in input and input['text'] is not None:
text_input = input['text']

# single text input
if isinstance(text_input, str):
text_tensor = self.tokenize(text_input)
# multi texts input
elif isinstance(text_input, list):
if all([isinstance(elem, str) for elem in text_input]):
text_tensor = self.tokenize(text_input)
else:
unsupported_elem_type = [
type(elem) for elem in text_input
if not isinstance(elem, str)
][0]
raise TypeError(
f'text should be str or List[str], but got a List containing one {unsupported_elem_type}'
)
# others
else:
raise TypeError(
f'text should be str or List[str], but got {type(text_input)}'
)

text_tensor = text_tensor.to(self.device)

with torch.no_grad():
text_features = self.clip_model.encode_text(text_tensor)
text_features /= text_features.norm(
dim=-1, keepdim=True) # l2-normalize
output[OutputKeys.TEXT_EMBEDDING] = text_features

return output

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

@property
def temperature(self):
return 1.0 / self.clip_model.logit_scale.exp()
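
A minimal end-to-end sketch of the embedding model above. The model directory and image path are placeholders; the directory must contain the two model-config JSON files, the vocab file and the checkpoint that __init__ loads:

from PIL import Image

from modelscope.models.multi_modal.clip import CLIPForMultiModalEmbedding
from modelscope.outputs import OutputKeys

model = CLIPForMultiModalEmbedding(model_dir='/path/to/clip_model_dir')  # placeholder path
out = model({'img': Image.open('cat.jpg'), 'text': ['一只猫', '一只狗']})  # '一只猫' = 'a cat', '一只狗' = 'a dog'
img_emb = out[OutputKeys.IMG_EMBEDDING]    # [1, embed_dim], L2-normalized
txt_emb = out[OutputKeys.TEXT_EMBEDDING]   # [2, embed_dim], L2-normalized
scores = img_emb @ txt_emb.t() / model.temperature  # cosine similarities scaled by the learned temperature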

modelscope/models/multi_modal/clip/modeling_bert.py (+507, -0)

@@ -0,0 +1,507 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model. """

from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import math
import os
import sys
from io import open

import json
import torch
from torch import nn

from .configuration_bert import BertConfig

logger = logging.getLogger(__name__)


def gelu(x):
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1 + torch.tanh(
math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {
'gelu': gelu,
'relu': torch.nn.functional.relu,
'swish': swish,
'gelu_new': gelu_new
}
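
# Quick numeric sanity check (not part of the model): over a typical activation range the
# tanh-based approximation stays within roughly 1e-3 of the exact erf-based gelu.
#   import torch
#   x = torch.linspace(-5, 5, steps=1001)
#   print((gelu(x) - gelu_new(x)).abs().max())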

BertLayerNorm = torch.nn.LayerNorm


class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""

def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(
config.vocab_size, config.hidden_size, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings,
config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
config.hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None, position_ids=None):
seq_length = input_ids.size(1)
if position_ids is None:
position_ids = torch.arange(
seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):

def __init__(self, config):
super(BertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
'The hidden size (%d) is not a multiple of the number of attention '
'heads (%d)' %
(config.hidden_size, config.num_attention_heads))
self.output_attentions = config.output_attentions

self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size
/ config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)

self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads,
self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask=None, head_mask=None):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer,
key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(
self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask

context_layer = torch.matmul(attention_probs, value_layer)

context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (
self.all_head_size, )
context_layer = context_layer.view(*new_context_layer_shape)

outputs = (context_layer,
attention_probs) if self.output_attentions else (
context_layer, )
return outputs
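
# Shape walk-through with hypothetical sizes (batch B, sequence length L, hidden 768, 12 heads):
#   hidden_states:          [B, L, 768]
#   query/key/value layers: [B, 12, L, 64]  after transpose_for_scores
#   attention_scores:       [B, 12, L, L]   scaled by 1/sqrt(64), plus the additive attention_mask
#   context_layer:          [B, L, 768]     after permuting the heads back and merging them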


class BertSelfOutput(nn.Module):

def __init__(self, config):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):

def __init__(self, config):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(config)
self.output = BertSelfOutput(config)
self.pruned_heads = set()

def forward(self, input_tensor, attention_mask=None, head_mask=None):
self_outputs = self.self(input_tensor, attention_mask, head_mask)
attention_output = self.output(self_outputs[0], input_tensor)
outputs = (attention_output,
) + self_outputs[1:] # add attentions if we output them
return outputs


class BertIntermediate(nn.Module):

def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act,
str) or (sys.version_info[0] == 2
and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):

def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):

def __init__(self, config):
super(BertLayer, self).__init__()
self.attention = BertAttention(config)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)

def forward(self, hidden_states, attention_mask=None, head_mask=None):
attention_outputs = self.attention(hidden_states, attention_mask,
head_mask)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
outputs = (layer_output, ) + attention_outputs[
1:] # add attentions if we output them
return outputs


class BertEncoder(nn.Module):

def __init__(self, config):
super(BertEncoder, self).__init__()
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.layer = nn.ModuleList(
[BertLayer(config) for _ in range(config.num_hidden_layers)])

def forward(self, hidden_states, attention_mask=None, head_mask=None):
all_hidden_states = ()
all_attentions = ()
for i, layer_module in enumerate(self.layer):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )

layer_outputs = layer_module(hidden_states, attention_mask,
head_mask[i])
hidden_states = layer_outputs[0]

if self.output_attentions:
all_attentions = all_attentions + (layer_outputs[1], )

# Add last layer
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )

outputs = (hidden_states, )
if self.output_hidden_states:
outputs = outputs + (all_hidden_states, )
if self.output_attentions:
outputs = outputs + (all_attentions, )
return outputs # last-layer hidden state, (all hidden states), (all attentions)


class BertPooler(nn.Module):

def __init__(self, config):
super(BertPooler, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertPredictionHeadTransform(nn.Module):

def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act,
str) or (sys.version_info[0] == 2
and isinstance(config.hidden_act, unicode)):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states


class BertLMPredictionHead(nn.Module):

def __init__(self, config):
super(BertLMPredictionHead, self).__init__()
self.transform = BertPredictionHeadTransform(config)

# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(
config.hidden_size, config.vocab_size, bias=False)

self.bias = nn.Parameter(torch.zeros(config.vocab_size))

def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
return hidden_states


class BertOnlyMLMHead(nn.Module):

def __init__(self, config):
super(BertOnlyMLMHead, self).__init__()
self.predictions = BertLMPredictionHead(config)

def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores


class BertOnlyNSPHead(nn.Module):

def __init__(self, config):
super(BertOnlyNSPHead, self).__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score


class BertPreTrainingHeads(nn.Module):

def __init__(self, config):
super(BertPreTrainingHeads, self).__init__()
self.predictions = BertLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score


class BertPreTrainedModel(nn.Module):
config_class = BertConfig
base_model_prefix = 'bert'

def __init__(self, config):
super(BertPreTrainedModel, self).__init__()
self.config = config

def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(
mean=0.0, std=self.config.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()


class BertModel(BertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input; you're often better off averaging or pooling
            the sequence of hidden-states over the whole input sequence.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer)
of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax,
used to compute the weighted average in the self-attention heads.

Examples::

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple

"""

def __init__(self, config):
super(BertModel, self).__init__(config)

self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config)

self.apply(self._init_weights)

def forward(self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(
dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
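        # Illustrative example: attention_mask [[1, 1, 0]] becomes an extended mask
        # of shape [1, 1, 1, 3] with values [[[[0.0, 0.0, -10000.0]]]]; adding it to
        # the raw attention scores effectively drops the padded position from the softmax.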

# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1,
-1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(
-1) # We can specify head_mask for each layer
                head_mask = head_mask.to(dtype=next(self.parameters(
                )).dtype)  # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers

embedding_output = self.embeddings(
input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids)
encoder_outputs = self.encoder(
embedding_output, extended_attention_mask, head_mask=head_mask)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)

outputs = (
sequence_output,
pooled_output,
) + encoder_outputs[
1:] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
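A minimal smoke-test sketch for the BertModel defined above, assuming the file is importable as modelscope.models.multi_modal.clip.modeling_bert; the SimpleNamespace below stands in for the real BertConfig and carries only the attributes these modules read, so every field name and value is an illustrative assumption rather than the committed configuration.

# Hedged sketch: drive BertModel with a hand-rolled config namespace.
import types

import torch

from modelscope.models.multi_modal.clip.modeling_bert import BertModel

# Illustrative hyper-parameters; the committed BertConfig may differ.
cfg = types.SimpleNamespace(
    vocab_size=21128,
    hidden_size=768,
    num_hidden_layers=2,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act='gelu',
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    output_attentions=False,
    output_hidden_states=False)

model = BertModel(cfg).eval()
input_ids = torch.randint(0, cfg.vocab_size, (1, 16))  # batch of 1, 16 tokens
attention_mask = torch.ones_like(input_ids)

with torch.no_grad():
    sequence_output, pooled_output = model(
        input_ids, attention_mask=attention_mask)[:2]
print(sequence_output.shape)  # torch.Size([1, 16, 768])
print(pooled_output.shape)  # torch.Size([1, 768])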

+ 61
- 1
modelscope/models/multi_modal/mplug/clip/clip.py View File

@@ -5,9 +5,69 @@ from typing import Tuple, Union


import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn


from modelscope.models.multi_modal.clip.clip_vit import Transformer

class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()
self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_grad_ckp: bool = True):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])
self.use_grad_ckp = use_grad_ckp

def forward(self, x: torch.Tensor):
if self.use_grad_ckp:
for each_block in self.resblocks:
x = checkpoint.checkpoint(each_block, x)
return x
else:
return self.resblocks(x)




class Bottleneck(nn.Module):
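
A short usage sketch for the Transformer added in this hunk. It is a hedged example, not part of the commit: it assumes the module is importable as modelscope.models.multi_modal.mplug.clip.clip and that LayerNorm and OrderedDict are defined or imported elsewhere in that file; widths, depths, and shapes are illustrative.

# Hedged sketch: exercise the gradient-checkpointing path of the new Transformer.
import torch

from modelscope.models.multi_modal.mplug.clip.clip import Transformer

blocks = Transformer(width=64, layers=2, heads=4, use_grad_ckp=True)

# checkpoint.checkpoint recomputes activations during backward, trading compute
# for memory; the input must require grad for the backward pass to reach it.
x = torch.randn(10, 2, 64, requires_grad=True)  # (seq_len, batch, width)
y = blocks(x)
y.sum().backward()
print(y.shape)  # torch.Size([10, 2, 64])
print(x.grad is not None)  # True

# For inference, disable checkpointing and run under no_grad.
eval_blocks = Transformer(width=64, layers=2, heads=4, use_grad_ckp=False).eval()
with torch.no_grad():
    print(eval_blocks(torch.randn(10, 2, 64)).shape)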


+ 39
- 31
tests/pipelines/test_multi_modal_embedding.py View File

@@ -2,50 +2,58 @@
 import unittest

-import numpy as np
+import torch

 from modelscope.models import Model
+from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level


 class MultiModalEmbeddingTest(unittest.TestCase):
-    model_id = 'damo/multi-modal_clip-vit-large-patch14_zh'
-    test_text = {'text': '一张风景图'}
+    model_id = 'damo/multi-modal_clip-vit-base-patch16_zh'
+    test_input = {'text': '皮卡丘'}
+    model_version = 'dev'

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run(self):
-        pipe_line_multi_modal_embedding = pipeline(
-            Tasks.multi_modal_embedding, model=self.model_id)
-        test_str_embedding = pipe_line_multi_modal_embedding(
-            self.test_text)['text_embedding']
-        print(np.sum(np.abs(test_str_embedding)))
+        pipeline_multi_modal_embedding = pipeline(
+            Tasks.multi_modal_embedding,
+            model=self.model_id,
+            model_revision=self.model_version)
+        text_embedding = pipeline_multi_modal_embedding(
+            self.test_input)[OutputKeys.TEXT_EMBEDDING]
+        print('l1-norm: {}'.format(
+            torch.norm(text_embedding, p=1, dim=-1).item()))
+        print('l2-norm: {}'.format(torch.norm(text_embedding,
+                                              dim=-1).item()))  # should be 1.0

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        pipe_line_multi_modal_embedding = pipeline(
-            task=Tasks.multi_modal_embedding, model=model)
-        test_str_embedding = pipe_line_multi_modal_embedding(
-            self.test_text)['text_embedding']
-        print(np.sum(np.abs(test_str_embedding)))
+        pipeline_multi_modal_embedding = pipeline(
+            task=Tasks.multi_modal_embedding,
+            model=model,
+            model_revision=self.model_version)
+        text_embedding = pipeline_multi_modal_embedding(
+            self.test_input)[OutputKeys.TEXT_EMBEDDING]
+        print('l1-norm: {}'.format(
+            torch.norm(text_embedding, p=1, dim=-1).item()))
+        print('l2-norm: {}'.format(torch.norm(text_embedding,
+                                              dim=-1).item()))  # should be 1.0

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_model_name(self):
-        pipe_line_multi_modal_embedding = pipeline(
-            task=Tasks.multi_modal_embedding, model=self.model_id)
-        test_str_embedding = pipe_line_multi_modal_embedding(
-            self.test_text)['text_embedding']
-        print(np.sum(np.abs(test_str_embedding)))
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_default_model(self):
-        pipe_line_multi_modal_embedding = pipeline(
-            task=Tasks.multi_modal_embedding)
-        test_str_embedding = pipe_line_multi_modal_embedding(
-            self.test_text)['text_embedding']
-        print(np.sum(np.abs(test_str_embedding)))
+        pipeline_multi_modal_embedding = pipeline(
+            task=Tasks.multi_modal_embedding,
+            model_revision=self.model_version)
+        text_embedding = pipeline_multi_modal_embedding(
+            self.test_input)[OutputKeys.TEXT_EMBEDDING]
+        print('l1-norm: {}'.format(
+            torch.norm(text_embedding, p=1, dim=-1).item()))
+        print('l2-norm: {}'.format(torch.norm(text_embedding,
+                                              dim=-1).item()))  # should be 1.0


 if __name__ == '__main__':


+ 0
- 60
tests/trainers/test_clip_multi_modal_embedding_trainer.py View File

@@ -1,60 +0,0 @@
import os
import tempfile
import unittest

import requests
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import test_level

logger = get_logger()


def clip_train_worker(local_rank, ngpus, node_size, node_rank):
global_rank = local_rank + node_rank * ngpus
dist_world_size = node_size * ngpus

dist.init_process_group(
backend='nccl', world_size=dist_world_size, rank=global_rank)

model_id = 'damo/multi-modal_clip-vit-large-patch14_zh'
local_model_dir = snapshot_download(model_id)

default_args = dict(
cfg_file='{}/{}'.format(local_model_dir, ModelFile.CONFIGURATION),
model=model_id,
device_id=local_rank)
trainer = build_trainer(
name=Trainers.clip_multi_modal_embedding, default_args=default_args)

trainer.train()
trainer.evaluate()


class CLIPMultiModalEmbeddingTrainerTest(unittest.TestCase):

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_trainer(self):
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '2001'
NODE_SIZE, NODE_RANK = 1, 0
logger.info('Train clip with {} machines'.format(NODE_SIZE))
ngpus = torch.cuda.device_count()
logger.info('Machine: {} has {} GPUs'.format(NODE_RANK, ngpus))
mp.spawn(
clip_train_worker,
nprocs=ngpus,
args=(ngpus, NODE_SIZE, NODE_RANK))
logger.info('Training done')


if __name__ == '__main__':
unittest.main()