diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index af39f3f4..af89cf33 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -24,6 +24,7 @@ class Models(object): # multi-modal models ofa = 'ofa' + clip = 'clip-multi-modal-embedding' class Pipelines(object): @@ -55,6 +56,7 @@ class Pipelines(object): # multi-modal tasks image_caption = 'image-caption' + multi_modal_embedding = 'multi-modal-embedding' class Trainers(object): diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py index f873dcca..06380035 100644 --- a/modelscope/models/__init__.py +++ b/modelscope/models/__init__.py @@ -4,5 +4,5 @@ from .audio.tts.am import SambertNetHifi16k from .audio.tts.vocoder import Hifigan16k from .base import Model from .builder import MODELS, build_model -from .multi_model import OfaForImageCaptioning +from .multi_modal import OfaForImageCaptioning from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity diff --git a/modelscope/models/multi_model/__init__.py b/modelscope/models/multi_modal/__init__.py similarity index 50% rename from modelscope/models/multi_model/__init__.py rename to modelscope/models/multi_modal/__init__.py index 02e8d6ab..2e6cc3bf 100644 --- a/modelscope/models/multi_model/__init__.py +++ b/modelscope/models/multi_modal/__init__.py @@ -1 +1,2 @@ +from .clip.clip_model import CLIPForMultiModalEmbedding from .image_captioning_model import OfaForImageCaptioning diff --git a/modelscope/models/multi_modal/clip/__init__.py b/modelscope/models/multi_modal/clip/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/multi_modal/clip/clip_bert.py b/modelscope/models/multi_modal/clip/clip_bert.py new file mode 100644 index 00000000..50ddba99 --- /dev/null +++ b/modelscope/models/multi_modal/clip/clip_bert.py @@ -0,0 +1,26 @@ +import torch.nn as nn +from transformers import BertConfig, BertForMaskedLM + + +class TextTransformer(nn.Module): + + def __init__(self, config_dict, feat_dim=768): + super(TextTransformer, self).__init__() + bert_config = BertConfig.from_dict(config_dict) + self.bert = BertForMaskedLM(bert_config).bert + + self.projector = nn.Linear( + bert_config.hidden_size, feat_dim, bias=False) + + def forward(self, input_ids, attention_mask): + trans_features = { + 'input_ids': input_ids, + 'attention_mask': attention_mask + } + + output_states = self.bert(**trans_features, return_dict=False) + output_tokens = output_states[0] + + cls_tokens = output_tokens[:, 0, :] + + return self.projector(cls_tokens) diff --git a/modelscope/models/multi_modal/clip/clip_model.py b/modelscope/models/multi_modal/clip/clip_model.py new file mode 100644 index 00000000..4283886f --- /dev/null +++ b/modelscope/models/multi_modal/clip/clip_model.py @@ -0,0 +1,158 @@ +import os.path as osp +from typing import Any, Dict + +import json +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from tokenizers import BertWordPieceTokenizer +from torchvision.transforms import Compose, Normalize, Resize, ToTensor + +from modelscope.metainfo import Models +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.models.multi_modal.clip.clip_bert import TextTransformer +from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['CLIPForMultiModalEmbedding'] + + 
+class CLIPModel(nn.Module):
+
+    def __init__(self, model_dir):
+        super(CLIPModel, self).__init__()
+        # including vision config and text config
+        model_config = json.load(
+            open('{}/encoder_config.json'.format(model_dir)))
+
+        # vision encoder
+        vision_config = model_config['vision_config']
+        self.img_size = vision_config['input_resolution']
+        self.vision_encoder = VisionTransformer(
+            input_resolution=self.img_size,
+            patch_size=vision_config['patch_size'],
+            width=vision_config['width'],
+            layers=vision_config['layers'],
+            heads=vision_config['heads'],
+            output_dim=vision_config['feat_dim'])
+
+        # text encoder
+        text_config = model_config['text_config']
+        self.text_encoder = TextTransformer(
+            text_config['bert_config'], feat_dim=text_config['feat_dim'])
+
+    def forward(self, input_data, input_type):
+        if input_type == 'img':
+            img_embedding = self.vision_encoder(input_data)
+            img_embedding = F.normalize(img_embedding, p=2.0, dim=1)
+            return img_embedding
+        elif input_type == 'text':
+            text_ids_tensor, text_mask_tensor = input_data
+            text_embedding = self.text_encoder(text_ids_tensor,
+                                               text_mask_tensor)
+            text_embedding = F.normalize(text_embedding, p=2.0, dim=1)
+            return text_embedding
+        else:
+            raise ValueError('Unknown input type')
+
+
+@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
+class CLIPForMultiModalEmbedding(Model):
+
+    def __init__(self, model_dir, device_id=-1):
+        super().__init__(model_dir=model_dir, device_id=device_id)
+        self.clip_model = CLIPModel(model_dir=model_dir)
+        pretrained_params = torch.load(
+            '{}/pytorch_model.bin'.format(model_dir), 'cpu')
+        self.clip_model.load_state_dict(pretrained_params)
+        self.clip_model.eval()
+
+        self.device_id = device_id
+        if self.device_id >= 0:
+            self.clip_model.to('cuda:{}'.format(self.device_id))
+            logger.info('Use GPU: {}'.format(self.device_id))
+        else:
+            logger.info('Use CPU for inference')
+
+        # image preprocessor
+        norm_op = Normalize((0.48145466, 0.4578275, 0.40821073),
+                            (0.26862954, 0.26130258, 0.27577711))
+        self.img_preprocessor = Compose([
+            Resize((self.clip_model.img_size, self.clip_model.img_size),
+                   interpolation=Image.BICUBIC),
+            ToTensor(), norm_op
+        ])
+
+        # text tokenizer
+        vocab_path = '{}/vocab.txt'.format(model_dir)
+        self.text_tokenizer = BertWordPieceTokenizer(
+            vocab_path, lowercase=False)
+        self.text_tokenizer.enable_truncation(max_length=30)
+
+    def tokenize_text(self, text_str):
+        tokens = self.text_tokenizer.encode(text_str)
+        max_tokens = 30
+        text_ids_tensor = torch.zeros((1, max_tokens)).long()
+        text_mask_tensor = torch.zeros((1, max_tokens))
+
+        text_ids, text_mask = tokens.ids, tokens.attention_mask
+        text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids)
+        text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask)
+
+        return text_ids_tensor, text_mask_tensor
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        output = {'img_embedding': None, 'text_embedding': None}
+        if 'img' in input and input['img'] is not None:
+            input_img = input['img']
+            if isinstance(input_img, Image.Image):
+                img_tensor = self.img_preprocessor(input_img)[None, ...]
+            elif isinstance(input_img, np.ndarray):
+                if len(input_img.shape) == 2:
+                    input_img = np.stack([input_img] * 3, axis=-1)
+                input_img = input_img[:, :, ::-1]  # assume BGR, convert to RGB
+                input_img = Image.fromarray(
+                    input_img.astype('uint8')).convert('RGB')
+                img_tensor = self.img_preprocessor(input_img)[None, ...]
+ else: + raise TypeError( + f'img should be either PIL.Image or np.array, but got {type(input_img)}' + ) + + if self.device_id >= 0: + img_tensor = img_tensor.to('cuda:{}'.format(self.device_id)) + + img_embedding = self.clip_model( + input_data=img_tensor, input_type='img') + output['img_embedding'] = img_embedding.data.cpu().numpy() + + if 'text' in input and input['text'] is not None: + text_str = input['text'] + if isinstance(text_str, str): + text_ids_tensor, text_mask_tensor = self.tokenize_text( + text_str) + else: + raise TypeError( + f'text should be str, but got {type(text_str)}') + + if self.device_id >= 0: + text_ids_tensor = text_ids_tensor.to('cuda:{}'.format( + self.device_id)) + text_mask_tensor = text_mask_tensor.to('cuda:{}'.format( + self.device_id)) + + text_embedding = self.clip_model( + input_data=(text_ids_tensor, text_mask_tensor), + input_type='text') + output['text_embedding'] = text_embedding.data.cpu().numpy() + + return output + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/models/multi_modal/clip/clip_vit.py b/modelscope/models/multi_modal/clip/clip_vit.py new file mode 100644 index 00000000..95bb1adc --- /dev/null +++ b/modelscope/models/multi_modal/clip/clip_vit.py @@ -0,0 +1,121 @@ +# Copyright 2021 The OpenAI CLIP Authors. All rights reserved. + +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + 
self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + class_embeddings = self.class_embedding.to(x.dtype) + \ + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([class_embeddings, x], dim=1) + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x diff --git a/modelscope/models/multi_model/image_captioning_model.py b/modelscope/models/multi_modal/image_captioning_model.py similarity index 100% rename from modelscope/models/multi_model/image_captioning_model.py rename to modelscope/models/multi_modal/image_captioning_model.py diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index d3be06bc..41cd73da 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -21,8 +21,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.sentence_similarity: (Pipelines.sentence_similarity, 'damo/nlp_structbert_sentence-similarity_chinese-base'), - Tasks.image_matting: - (Pipelines.image_matting, 'damo/cv_unet_image-matting'), + Tasks.image_matting: (Pipelines.image_matting, + 'damo/cv_unet_image-matting'), Tasks.text_classification: (Pipelines.sentiment_analysis, 'damo/bert-base-sst2'), Tasks.text_generation: (Pipelines.text_generation, @@ -37,6 +37,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'), Tasks.action_recognition: (Pipelines.action_recognition, 'damo/cv_TAdaConv_action-recognition'), + Tasks.multi_modal_embedding: + (Pipelines.multi_modal_embedding, + 'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding') } diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py index b7402b93..6c96d843 100644 --- a/modelscope/pipelines/multi_modal/__init__.py +++ b/modelscope/pipelines/multi_modal/__init__.py @@ -1 +1,2 @@ from .image_captioning_pipeline import ImageCaptionPipeline +from .multi_modal_embedding_pipeline import MultiModalEmbeddingPipeline diff --git a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py new file mode 100644 index 00000000..a21ecc79 --- /dev/null +++ b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py @@ -0,0 +1,34 @@ +from typing import Any, Dict, Union + +from modelscope.metainfo import Pipelines +from modelscope.pipelines.base import Input +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from ..base import Model, Pipeline +from ..builder import PIPELINES + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.multi_modal_embedding, module_name=Pipelines.multi_modal_embedding) +class MultiModalEmbeddingPipeline(Pipeline): + + def __init__(self, model: str, device_id: int = -1): + if isinstance(model, str): + pipe_model = 
Model.from_pretrained(model) + elif isinstance(model, Model): + pipe_model = model + else: + raise NotImplementedError('model must be a single str') + + super().__init__(model=pipe_model) + + def preprocess(self, input: Input) -> Dict[str, Any]: + return input + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + return self.model(input) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/outputs.py b/modelscope/pipelines/outputs.py index 3b1c67de..52b7eeae 100644 --- a/modelscope/pipelines/outputs.py +++ b/modelscope/pipelines/outputs.py @@ -117,6 +117,13 @@ TASK_OUTPUTS = { # } Tasks.image_captioning: ['caption'], + # multi-modal embedding result for single sample + # { + # "img_embedding": np.array with shape [1, D], + # "text_embedding": np.array with shape [1, D] + # } + Tasks.multi_modal_embedding: ['img_embedding', 'text_embedding'], + # visual grounding result for single sample # { # "boxes": [ diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index e824db9a..2045efb6 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -57,6 +57,7 @@ class Tasks(object): image_captioning = 'image-captioning' visual_grounding = 'visual-grounding' text_to_image_synthesis = 'text-to-image-synthesis' + multi_modal_embedding = 'multi-modal-embedding' class InputFields(object): diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py new file mode 100644 index 00000000..001bf951 --- /dev/null +++ b/tests/pipelines/test_multi_modal_embedding.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +import numpy as np + +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class MultiModalEmbeddingTest(unittest.TestCase): + model_id = 'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding' + test_text = {'text': '一张风景图'} + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run(self): + pipe_line_multi_modal_embedding = pipeline( + Tasks.multi_modal_embedding, model=self.model_id) + test_str_embedding = pipe_line_multi_modal_embedding( + self.test_text)['text_embedding'] + print(np.sum(np.abs(test_str_embedding))) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + pipe_line_multi_modal_embedding = pipeline( + task=Tasks.multi_modal_embedding, model=model) + test_str_embedding = pipe_line_multi_modal_embedding( + self.test_text)['text_embedding'] + print(np.sum(np.abs(test_str_embedding))) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_name(self): + pipe_line_multi_modal_embedding = pipeline( + task=Tasks.multi_modal_embedding, model=self.model_id) + test_str_embedding = pipe_line_multi_modal_embedding( + self.test_text)['text_embedding'] + print(np.sum(np.abs(test_str_embedding))) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipe_line_multi_modal_embedding = pipeline( + task=Tasks.multi_modal_embedding) + test_str_embedding = pipe_line_multi_modal_embedding( + self.test_text)['text_embedding'] + print(np.sum(np.abs(test_str_embedding))) + + +if __name__ == 
'__main__': + unittest.main()
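
A minimal usage sketch of the new pipeline, assuming only names introduced in this diff: Tasks.multi_modal_embedding, the default model id registered in builder.py, and the {'img': ..., 'text': ...} input contract of CLIPForMultiModalEmbedding. The image path is a placeholder, and the similarity step is illustrative: both embeddings are L2-normalized in CLIPModel.forward, so a plain dot product already equals the cosine similarity.

import numpy as np
from PIL import Image

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build the pipeline from the default model registered for this task.
clip_emb = pipeline(
    Tasks.multi_modal_embedding,
    model='damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding')

# Either modality may be passed alone; the missing one comes back as None.
text_emb = clip_emb({'text': '一张风景图'})['text_embedding']  # np.ndarray, shape [1, D]
img_emb = clip_emb({'img': Image.open('landscape.jpg')})['img_embedding']  # placeholder path

# Embeddings are L2-normalized, so the dot product is the cosine similarity.
score = np.dot(img_emb, text_emb.T).item()
print(score)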