# Conflicts:
#	modelscope/preprocessors/ofa/base.py
#	modelscope/preprocessors/ofa/image_captioning.py
#	modelscope/preprocessors/ofa/image_classification.py
#	modelscope/preprocessors/ofa/summarization.py
#	modelscope/preprocessors/ofa/text_classification.py
#	modelscope/preprocessors/ofa/text_to_image_synthesis.py
#	modelscope/preprocessors/ofa/visual_entailment.py
#	modelscope/preprocessors/ofa/visual_grounding.py
#	modelscope/preprocessors/ofa/visual_question_answering.py
| @@ -4,3 +4,4 @@ | |||
| *.wav filter=lfs diff=lfs merge=lfs -text | |||
| *.JPEG filter=lfs diff=lfs merge=lfs -text | |||
| *.jpeg filter=lfs diff=lfs merge=lfs -text | |||
| *.avi filter=lfs diff=lfs merge=lfs -text | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:3ad1a268c614076614a2ae6528abc29cc85ae35826d172079d7d9b26a0299559 | |||
| size 4325096 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:3637ee0628d0953f77d5a32327980af542c43230c4127d2a72b4df1ea2ffb0be | |||
| size 320042 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:cc116af609a66f431f94df6b385ff2aa362f8a2d437c2279f5401e47f9178469 | |||
| size 320042 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:9354345a6297f4522e690d337546aa9a686a7e61eefcd935478a2141b924db8f | |||
| size 76770 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:70ea0c06f9cfe3882253f7175221d47e394ab9c469076ab220e880b17dbcdd02 | |||
| size 48552 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:29f2ad929c852f6456367054d13e113078cf06b763fe54d73fd324f789331aa3 | |||
| size 61611 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:469090fb217a34a2c096cfd42c251da69dca9fcd1a3c1faae7d29183c1816c14 | |||
| size 12834294 | |||
| @@ -362,8 +362,10 @@ class HubApi: | |||
| dataset_name: str, | |||
| namespace: str, | |||
| revision: Optional[str] = DEFAULT_DATASET_REVISION): | |||
| return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||
| f'Revision={revision}&FilePath={file_name}' | |||
| if file_name.endswith('.csv'): | |||
| file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||
| f'Revision={revision}&FilePath={file_name}' | |||
| return file_name | |||
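To make the change above concrete: the helper now expands only `.csv` paths into full dataset-repo URLs and returns every other file name untouched. A minimal standalone sketch of that logic, assuming an arbitrary endpoint and example dataset names (all hypothetical):

    DEFAULT_DATASET_REVISION = 'master'  # assumed default revision

    def get_dataset_file_url(file_name, dataset_name, namespace,
                             revision=DEFAULT_DATASET_REVISION,
                             dataset_endpoint='https://example-dataset-endpoint'):
        # Mirrors the reworked method body in the hunk above.
        if file_name.endswith('.csv'):
            file_name = f'{dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                        f'Revision={revision}&FilePath={file_name}'
        return file_name

    print(get_dataset_file_url('annotations/train.csv', 'demo_ds', 'demo_ns'))  # rewritten to a repo URL
    print(get_dataset_file_url('images/0001.jpg', 'demo_ds', 'demo_ns'))        # returned unchanged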
| def get_dataset_access_config( | |||
| self, | |||
| @@ -38,6 +38,7 @@ class Models(object): | |||
| # audio models | |||
| sambert_hifigan = 'sambert-hifigan' | |||
| speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' | |||
| speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' | |||
| kws_kwsbp = 'kws-kwsbp' | |||
| generic_asr = 'generic-asr' | |||
| @@ -86,6 +87,7 @@ class Pipelines(object): | |||
| body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' | |||
| human_detection = 'resnet18-human-detection' | |||
| object_detection = 'vit-object-detection' | |||
| salient_detection = 'u2net-salient-detection' | |||
| image_classification = 'image-classification' | |||
| face_detection = 'resnet-face-detection-scrfd10gkps' | |||
| live_category = 'live-category' | |||
| @@ -109,6 +111,7 @@ class Pipelines(object): | |||
| skin_retouching = 'unet-skin-retouching' | |||
| tinynas_classification = 'tinynas-classification' | |||
| crowd_counting = 'hrnet-crowd-counting' | |||
| video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' | |||
| # nlp tasks | |||
| sentence_similarity = 'sentence-similarity' | |||
| @@ -132,6 +135,7 @@ class Pipelines(object): | |||
| sambert_hifigan_tts = 'sambert-hifigan-tts' | |||
| speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k' | |||
| speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' | |||
| speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' | |||
| kws_kwsbp = 'kws-kwsbp' | |||
| asr_inference = 'asr-inference' | |||
| @@ -215,7 +219,7 @@ class Preprocessors(object): | |||
| # multi-modal preprocessor | |||
| ofa_tasks_preprocessor = 'ofa-tasks-preprocessor' | |||
| mplug_visual_question_answering = 'mplug-visual-question-answering' | |||
| mplug_tasks_preprocessor = 'mplug-tasks-preprocessor' | |||
| class Metrics(object): | |||
| @@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .generic_key_word_spotting import GenericKeyWordSpotting | |||
| from .farfield.model import FSMNSeleNetV2Decorator | |||
| else: | |||
| _import_structure = { | |||
| 'generic_key_word_spotting': ['GenericKeyWordSpotting'], | |||
| 'farfield.model': ['FSMNSeleNetV2Decorator'], | |||
| } | |||
| import sys | |||
| @@ -0,0 +1,495 @@ | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from .model_def import (HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32, | |||
| printNeonMatrix, printNeonVector) | |||
| DEBUG = False | |||
| def to_kaldi_matrix(np_mat): | |||
| """ function that transform as str numpy mat to standard kaldi str matrix | |||
| Args: | |||
| np_mat: numpy mat | |||
| Returns: str | |||
| """ | |||
| np.set_printoptions(threshold=np.inf, linewidth=np.nan) | |||
| out_str = str(np_mat) | |||
| out_str = out_str.replace('[', '') | |||
| out_str = out_str.replace(']', '') | |||
| return '[ %s ]\n' % out_str | |||
| def print_tensor(torch_tensor): | |||
| """ print torch tensor for debug | |||
| Args: | |||
| torch_tensor: a tensor | |||
| """ | |||
| re_str = '' | |||
| x = torch_tensor.detach().squeeze().numpy() | |||
| re_str += to_kaldi_matrix(x) | |||
| re_str += '<!EndOfComponent>\n' | |||
| print(re_str) | |||
| class LinearTransform(nn.Module): | |||
| def __init__(self, input_dim, output_dim): | |||
| super(LinearTransform, self).__init__() | |||
| self.input_dim = input_dim | |||
| self.output_dim = output_dim | |||
| self.linear = nn.Linear(input_dim, output_dim, bias=False) | |||
| self.debug = False | |||
| self.dataout = None | |||
| def forward(self, input): | |||
| output = self.linear(input) | |||
| if self.debug: | |||
| self.dataout = output | |||
| return output | |||
| def print_model(self): | |||
| printNeonMatrix(self.linear.weight) | |||
| def to_kaldi_nnet(self): | |||
| re_str = '' | |||
| re_str += '<LinearTransform> %d %d\n' % (self.output_dim, | |||
| self.input_dim) | |||
| re_str += '<LearnRateCoef> 1\n' | |||
| linear_weights = self.state_dict()['linear.weight'] | |||
| x = linear_weights.squeeze().numpy() | |||
| re_str += to_kaldi_matrix(x) | |||
| re_str += '<!EndOfComponent>\n' | |||
| return re_str | |||
| class AffineTransform(nn.Module): | |||
| def __init__(self, input_dim, output_dim): | |||
| super(AffineTransform, self).__init__() | |||
| self.input_dim = input_dim | |||
| self.output_dim = output_dim | |||
| self.linear = nn.Linear(input_dim, output_dim) | |||
| self.debug = False | |||
| self.dataout = None | |||
| def forward(self, input): | |||
| output = self.linear(input) | |||
| if self.debug: | |||
| self.dataout = output | |||
| return output | |||
| def print_model(self): | |||
| printNeonMatrix(self.linear.weight) | |||
| printNeonVector(self.linear.bias) | |||
| def to_kaldi_nnet(self): | |||
| re_str = '' | |||
| re_str += '<AffineTransform> %d %d\n' % (self.output_dim, | |||
| self.input_dim) | |||
| re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n' | |||
| linear_weights = self.state_dict()['linear.weight'] | |||
| x = linear_weights.squeeze().numpy() | |||
| re_str += to_kaldi_matrix(x) | |||
| linear_bias = self.state_dict()['linear.bias'] | |||
| x = linear_bias.squeeze().numpy() | |||
| re_str += to_kaldi_matrix(x) | |||
| re_str += '<!EndOfComponent>\n' | |||
| return re_str | |||
| class Fsmn(nn.Module): | |||
| """ | |||
| FSMN implementation. | |||
| """ | |||
| def __init__(self, | |||
| input_dim, | |||
| output_dim, | |||
| lorder=None, | |||
| rorder=None, | |||
| lstride=None, | |||
| rstride=None): | |||
| super(Fsmn, self).__init__() | |||
| self.dim = input_dim | |||
| if lorder is None: | |||
| return | |||
| self.lorder = lorder | |||
| self.rorder = rorder | |||
| self.lstride = lstride | |||
| self.rstride = rstride | |||
| self.conv_left = nn.Conv2d( | |||
| self.dim, | |||
| self.dim, (lorder, 1), | |||
| dilation=(lstride, 1), | |||
| groups=self.dim, | |||
| bias=False) | |||
| if rorder > 0: | |||
| self.conv_right = nn.Conv2d( | |||
| self.dim, | |||
| self.dim, (rorder, 1), | |||
| dilation=(rstride, 1), | |||
| groups=self.dim, | |||
| bias=False) | |||
| else: | |||
| self.conv_right = None | |||
| self.debug = False | |||
| self.dataout = None | |||
| def forward(self, input): | |||
| x = torch.unsqueeze(input, 1) | |||
| x_per = x.permute(0, 3, 2, 1) | |||
| y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0]) | |||
| if self.conv_right is not None: | |||
| y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride]) | |||
| y_right = y_right[:, :, self.rstride:, :] | |||
| out = x_per + self.conv_left(y_left) + self.conv_right(y_right) | |||
| else: | |||
| out = x_per + self.conv_left(y_left) | |||
| out1 = out.permute(0, 3, 2, 1) | |||
| output = out1.squeeze(1) | |||
| if self.debug: | |||
| self.dataout = output | |||
| return output | |||
| def print_model(self): | |||
| tmpw = self.conv_left.weight | |||
| tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) | |||
| for j in range(tmpw.shape[0]): | |||
| tmpwm[:, j] = tmpw[j, 0, :, 0] | |||
| printNeonMatrix(tmpwm) | |||
| if self.conv_right is not None: | |||
| tmpw = self.conv_right.weight | |||
| tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) | |||
| for j in range(tmpw.shape[0]): | |||
| tmpwm[:, j] = tmpw[j, 0, :, 0] | |||
| printNeonMatrix(tmpwm) | |||
| def to_kaldi_nnet(self): | |||
| re_str = '' | |||
| re_str += '<Fsmn> %d %d\n' % (self.dim, self.dim) | |||
| re_str += '<LearnRateCoef> %d <LOrder> %d <ROrder> %d <LStride> %d <RStride> %d <MaxNorm> 0\n' % ( | |||
| 1, self.lorder, self.rorder, self.lstride, self.rstride) | |||
| lfiters = self.state_dict()['conv_left.weight'] | |||
| x = np.flipud(lfiters.squeeze().numpy().T) | |||
| re_str += to_kaldi_matrix(x) | |||
| if self.conv_right is not None: | |||
| rfiters = self.state_dict()['conv_right.weight'] | |||
| x = (rfiters.squeeze().numpy().T) | |||
| re_str += to_kaldi_matrix(x) | |||
| re_str += '<!EndOfComponent>\n' | |||
| return re_str | |||
| class RectifiedLinear(nn.Module): | |||
| def __init__(self, input_dim, output_dim): | |||
| super(RectifiedLinear, self).__init__() | |||
| self.dim = input_dim | |||
| self.relu = nn.ReLU() | |||
| def forward(self, input): | |||
| return self.relu(input) | |||
| def to_kaldi_nnet(self): | |||
| re_str = '' | |||
| re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim) | |||
| re_str += '<!EndOfComponent>\n' | |||
| return re_str | |||
| class FSMNNet(nn.Module): | |||
| """ | |||
| FSMN net for keyword spotting | |||
| """ | |||
| def __init__(self, | |||
| input_dim=200, | |||
| linear_dim=128, | |||
| proj_dim=128, | |||
| lorder=10, | |||
| rorder=1, | |||
| num_syn=5, | |||
| fsmn_layers=4): | |||
| """ | |||
| Args: | |||
| input_dim: input dimension | |||
| linear_dim: fsmn input dimension | |||
| proj_dim: fsmn projection dimension | |||
| lorder: fsmn left order | |||
| rorder: fsmn right order | |||
| num_syn: output dimension | |||
| fsmn_layers: no. of sequential fsmn layers | |||
| """ | |||
| super(FSMNNet, self).__init__() | |||
| self.input_dim = input_dim | |||
| self.linear_dim = linear_dim | |||
| self.proj_dim = proj_dim | |||
| self.lorder = lorder | |||
| self.rorder = rorder | |||
| self.num_syn = num_syn | |||
| self.fsmn_layers = fsmn_layers | |||
| self.linear1 = AffineTransform(input_dim, linear_dim) | |||
| self.relu = RectifiedLinear(linear_dim, linear_dim) | |||
| self.fsmn = self._build_repeats(linear_dim, proj_dim, lorder, rorder, | |||
| fsmn_layers) | |||
| self.linear2 = AffineTransform(linear_dim, num_syn) | |||
| @staticmethod | |||
| def _build_repeats(linear_dim=136, | |||
| proj_dim=68, | |||
| lorder=3, | |||
| rorder=2, | |||
| fsmn_layers=5): | |||
| repeats = [ | |||
| nn.Sequential( | |||
| LinearTransform(linear_dim, proj_dim), | |||
| Fsmn(proj_dim, proj_dim, lorder, rorder, 1, 1), | |||
| AffineTransform(proj_dim, linear_dim), | |||
| RectifiedLinear(linear_dim, linear_dim)) | |||
| for i in range(fsmn_layers) | |||
| ] | |||
| return nn.Sequential(*repeats) | |||
| def forward(self, input): | |||
| x1 = self.linear1(input) | |||
| x2 = self.relu(x1) | |||
| x3 = self.fsmn(x2) | |||
| x4 = self.linear2(x3) | |||
| return x4 | |||
| def print_model(self): | |||
| self.linear1.print_model() | |||
| for layer in self.fsmn: | |||
| layer[0].print_model() | |||
| layer[1].print_model() | |||
| layer[2].print_model() | |||
| self.linear2.print_model() | |||
| def print_header(self): | |||
| # | |||
| # write total header | |||
| # | |||
| header = [0.0] * HEADER_BLOCK_SIZE * 4 | |||
| # numins | |||
| header[0] = 0.0 | |||
| # numouts | |||
| header[1] = 0.0 | |||
| # dimins | |||
| header[2] = self.input_dim | |||
| # dimouts | |||
| header[3] = self.num_syn | |||
| # numlayers | |||
| header[4] = 3 | |||
| # | |||
| # write each layer's header | |||
| # | |||
| hidx = 1 | |||
| header[HEADER_BLOCK_SIZE * hidx + 0] = float( | |||
| LayerType.LAYER_DENSE.value) | |||
| header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 2] = self.input_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 3] = self.linear_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 5] = float( | |||
| ActivationType.ACTIVATION_RELU.value) | |||
| hidx += 1 | |||
| header[HEADER_BLOCK_SIZE * hidx + 0] = float( | |||
| LayerType.LAYER_SEQUENTIAL_FSMN.value) | |||
| header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 3] = self.proj_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 4] = self.lorder | |||
| header[HEADER_BLOCK_SIZE * hidx + 5] = self.rorder | |||
| header[HEADER_BLOCK_SIZE * hidx + 6] = self.fsmn_layers | |||
| header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0 | |||
| hidx += 1 | |||
| header[HEADER_BLOCK_SIZE * hidx + 0] = float( | |||
| LayerType.LAYER_DENSE.value) | |||
| header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 3] = self.num_syn | |||
| header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 5] = float( | |||
| ActivationType.ACTIVATION_SOFTMAX.value) | |||
| for h in header: | |||
| print(f32ToI32(h)) | |||
| def to_kaldi_nnet(self): | |||
| re_str = '' | |||
| re_str += '<Nnet>\n' | |||
| re_str += self.linear1.to_kaldi_nnet() | |||
| re_str += self.relu.to_kaldi_nnet() | |||
| for fsmn in self.fsmn: | |||
| re_str += fsmn[0].to_kaldi_nnet() | |||
| re_str += fsmn[1].to_kaldi_nnet() | |||
| re_str += fsmn[2].to_kaldi_nnet() | |||
| re_str += fsmn[3].to_kaldi_nnet() | |||
| re_str += self.linear2.to_kaldi_nnet() | |||
| re_str += '<Softmax> %d %d\n' % (self.num_syn, self.num_syn) | |||
| re_str += '<!EndOfComponent>\n' | |||
| re_str += '</Nnet>\n' | |||
| return re_str | |||
| class DFSMN(nn.Module): | |||
| """ | |||
| One deep fsmn layer | |||
| """ | |||
| def __init__(self, | |||
| dimproj=64, | |||
| dimlinear=128, | |||
| lorder=20, | |||
| rorder=1, | |||
| lstride=1, | |||
| rstride=1): | |||
| """ | |||
| Args: | |||
| dimproj: projection dimension, input and output dimension of memory blocks | |||
| dimlinear: dimension of mapping layer | |||
| lorder: left order | |||
| rorder: right order | |||
| lstride: left stride | |||
| rstride: right stride | |||
| """ | |||
| super(DFSMN, self).__init__() | |||
| self.lorder = lorder | |||
| self.rorder = rorder | |||
| self.lstride = lstride | |||
| self.rstride = rstride | |||
| self.expand = AffineTransform(dimproj, dimlinear) | |||
| self.shrink = LinearTransform(dimlinear, dimproj) | |||
| self.conv_left = nn.Conv2d( | |||
| dimproj, | |||
| dimproj, (lorder, 1), | |||
| dilation=(lstride, 1), | |||
| groups=dimproj, | |||
| bias=False) | |||
| if rorder > 0: | |||
| self.conv_right = nn.Conv2d( | |||
| dimproj, | |||
| dimproj, (rorder, 1), | |||
| dilation=(rstride, 1), | |||
| groups=dimproj, | |||
| bias=False) | |||
| else: | |||
| self.conv_right = None | |||
| def forward(self, input): | |||
| f1 = F.relu(self.expand(input)) | |||
| p1 = self.shrink(f1) | |||
| x = torch.unsqueeze(p1, 1) | |||
| x_per = x.permute(0, 3, 2, 1) | |||
| y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0]) | |||
| if self.conv_right is not None: | |||
| y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride]) | |||
| y_right = y_right[:, :, self.rstride:, :] | |||
| out = x_per + self.conv_left(y_left) + self.conv_right(y_right) | |||
| else: | |||
| out = x_per + self.conv_left(y_left) | |||
| out1 = out.permute(0, 3, 2, 1) | |||
| output = input + out1.squeeze(1) | |||
| return output | |||
| def print_model(self): | |||
| self.expand.print_model() | |||
| self.shrink.print_model() | |||
| tmpw = self.conv_left.weight | |||
| tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) | |||
| for j in range(tmpw.shape[0]): | |||
| tmpwm[:, j] = tmpw[j, 0, :, 0] | |||
| printNeonMatrix(tmpwm) | |||
| if self.conv_right is not None: | |||
| tmpw = self.conv_right.weight | |||
| tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) | |||
| for j in range(tmpw.shape[0]): | |||
| tmpwm[:, j] = tmpw[j, 0, :, 0] | |||
| printNeonMatrix(tmpwm) | |||
| def build_dfsmn_repeats(linear_dim=128, | |||
| proj_dim=64, | |||
| lorder=20, | |||
| rorder=1, | |||
| fsmn_layers=6): | |||
| """ | |||
| Build stacked DFSMN layers. | |||
| Args: | |||
| linear_dim: dimension of the mapping (expand) layer | |||
| proj_dim: projection dimension of the memory blocks | |||
| lorder: left order | |||
| rorder: right order | |||
| fsmn_layers: number of stacked DFSMN layers | |||
| Returns: an nn.Sequential of the stacked DFSMN layers | |||
| """ | |||
| repeats = [ | |||
| nn.Sequential(DFSMN(proj_dim, linear_dim, lorder, rorder, 1, 1)) | |||
| for i in range(fsmn_layers) | |||
| ] | |||
| return nn.Sequential(*repeats) | |||
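A quick sanity-check sketch for the classes defined in this new file, assuming FSMNNet and build_dfsmn_repeats from above are in scope; batch and frame counts are arbitrary examples:

    import torch

    net = FSMNNet(input_dim=200, linear_dim=128, proj_dim=128,
                  lorder=10, rorder=1, num_syn=5, fsmn_layers=4)
    feats = torch.randn(2, 50, 200)          # (batch, frames, feature_dim)
    logits = net(feats)
    print(logits.shape)                      # torch.Size([2, 50, 5])

    # Stacked DFSMN layers keep the projection dimension of their input.
    dfsmn_stack = build_dfsmn_repeats(linear_dim=128, proj_dim=64,
                                      lorder=20, rorder=1, fsmn_layers=6)
    proj = torch.randn(2, 50, 64)
    print(dfsmn_stack(proj).shape)           # torch.Size([2, 50, 64])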
| @@ -0,0 +1,236 @@ | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from .fsmn import AffineTransform, Fsmn, LinearTransform, RectifiedLinear | |||
| from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32 | |||
| class FSMNUnit(nn.Module): | |||
| """ A multi-channel fsmn unit | |||
| """ | |||
| def __init__(self, dimlinear=128, dimproj=64, lorder=20, rorder=1): | |||
| """ | |||
| Args: | |||
| dimlinear: input / output dimension | |||
| dimproj: fsmn input / output dimension | |||
| lorder: left order | |||
| rorder: right order | |||
| """ | |||
| super(FSMNUnit, self).__init__() | |||
| self.shrink = LinearTransform(dimlinear, dimproj) | |||
| self.fsmn = Fsmn(dimproj, dimproj, lorder, rorder, 1, 1) | |||
| self.expand = AffineTransform(dimproj, dimlinear) | |||
| self.debug = False | |||
| self.dataout = None | |||
| ''' | |||
| batch, time, channel, feature | |||
| ''' | |||
| def forward(self, input): | |||
| if torch.cuda.is_available(): | |||
| out = torch.zeros(input.shape).cuda() | |||
| else: | |||
| out = torch.zeros(input.shape) | |||
| for n in range(input.shape[2]): | |||
| out1 = self.shrink(input[:, :, n, :]) | |||
| out2 = self.fsmn(out1) | |||
| out[:, :, n, :] = F.relu(self.expand(out2)) | |||
| if self.debug: | |||
| self.dataout = out | |||
| return out | |||
| def print_model(self): | |||
| self.shrink.print_model() | |||
| self.fsmn.print_model() | |||
| self.expand.print_model() | |||
| def to_kaldi_nnet(self): | |||
| re_str = self.shrink.to_kaldi_nnet() | |||
| re_str += self.fsmn.to_kaldi_nnet() | |||
| re_str += self.expand.to_kaldi_nnet() | |||
| relu = RectifiedLinear(self.expand.linear.out_features, | |||
| self.expand.linear.out_features) | |||
| re_str += relu.to_kaldi_nnet() | |||
| return re_str | |||
| class FSMNSeleNetV2(nn.Module): | |||
| """ FSMN model with channel selection. | |||
| """ | |||
| def __init__(self, | |||
| input_dim=120, | |||
| linear_dim=128, | |||
| proj_dim=64, | |||
| lorder=20, | |||
| rorder=1, | |||
| num_syn=5, | |||
| fsmn_layers=5, | |||
| sele_layer=0): | |||
| """ | |||
| Args: | |||
| input_dim: input dimension | |||
| linear_dim: fsmn input dimension | |||
| proj_dim: fsmn projection dimension | |||
| lorder: fsmn left order | |||
| rorder: fsmn right order | |||
| num_syn: output dimension | |||
| fsmn_layers: no. of fsmn units | |||
| sele_layer: channel selection layer index | |||
| """ | |||
| super(FSMNSeleNetV2, self).__init__() | |||
| self.sele_layer = sele_layer | |||
| self.featmap = AffineTransform(input_dim, linear_dim) | |||
| self.mem = [] | |||
| for i in range(fsmn_layers): | |||
| unit = FSMNUnit(linear_dim, proj_dim, lorder, rorder) | |||
| self.mem.append(unit) | |||
| self.add_module('mem_{:d}'.format(i), unit) | |||
| self.decision = AffineTransform(linear_dim, num_syn) | |||
| def forward(self, input): | |||
| # multi-channel feature mapping | |||
| if torch.cuda.is_available(): | |||
| x = torch.zeros(input.shape[0], input.shape[1], input.shape[2], | |||
| self.featmap.linear.out_features).cuda() | |||
| else: | |||
| x = torch.zeros(input.shape[0], input.shape[1], input.shape[2], | |||
| self.featmap.linear.out_features) | |||
| for n in range(input.shape[2]): | |||
| x[:, :, n, :] = F.relu(self.featmap(input[:, :, n, :])) | |||
| for i, unit in enumerate(self.mem): | |||
| y = unit(x) | |||
| # perform channel selection | |||
| if i == self.sele_layer: | |||
| pool = nn.MaxPool2d((y.shape[2], 1), stride=(y.shape[2], 1)) | |||
| y = pool(y) | |||
| x = y | |||
| # remove channel dimension | |||
| y = torch.squeeze(y, -2) | |||
| z = self.decision(y) | |||
| return z | |||
| def print_model(self): | |||
| self.featmap.print_model() | |||
| for unit in self.mem: | |||
| unit.print_model() | |||
| self.decision.print_model() | |||
| def print_header(self): | |||
| ''' | |||
| get FSMN params | |||
| ''' | |||
| input_dim = self.featmap.linear.in_features | |||
| linear_dim = self.featmap.linear.out_features | |||
| proj_dim = self.mem[0].shrink.linear.out_features | |||
| lorder = self.mem[0].fsmn.conv_left.kernel_size[0] | |||
| rorder = 0 | |||
| if self.mem[0].fsmn.conv_right is not None: | |||
| rorder = self.mem[0].fsmn.conv_right.kernel_size[0] | |||
| num_syn = self.decision.linear.out_features | |||
| fsmn_layers = len(self.mem) | |||
| # no. of output channels, 0.0 means the same as numins | |||
| # numouts = 0.0 | |||
| numouts = 1.0 | |||
| # | |||
| # write total header | |||
| # | |||
| header = [0.0] * HEADER_BLOCK_SIZE * 4 | |||
| # numins | |||
| header[0] = 0.0 | |||
| # numouts | |||
| header[1] = numouts | |||
| # dimins | |||
| header[2] = input_dim | |||
| # dimouts | |||
| header[3] = num_syn | |||
| # numlayers | |||
| header[4] = 3 | |||
| # | |||
| # write each layer's header | |||
| # | |||
| hidx = 1 | |||
| header[HEADER_BLOCK_SIZE * hidx + 0] = float( | |||
| LayerType.LAYER_DENSE.value) | |||
| header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 5] = float( | |||
| ActivationType.ACTIVATION_RELU.value) | |||
| hidx += 1 | |||
| header[HEADER_BLOCK_SIZE * hidx + 0] = float( | |||
| LayerType.LAYER_SEQUENTIAL_FSMN.value) | |||
| header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 3] = proj_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 4] = lorder | |||
| header[HEADER_BLOCK_SIZE * hidx + 5] = rorder | |||
| header[HEADER_BLOCK_SIZE * hidx + 6] = fsmn_layers | |||
| if numouts == 1.0: | |||
| header[HEADER_BLOCK_SIZE * hidx + 7] = float(self.sele_layer) | |||
| else: | |||
| header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0 | |||
| hidx += 1 | |||
| header[HEADER_BLOCK_SIZE * hidx + 0] = float( | |||
| LayerType.LAYER_DENSE.value) | |||
| header[HEADER_BLOCK_SIZE * hidx + 1] = numouts | |||
| header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim | |||
| header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn | |||
| header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 | |||
| header[HEADER_BLOCK_SIZE * hidx + 5] = float( | |||
| ActivationType.ACTIVATION_SOFTMAX.value) | |||
| for h in header: | |||
| print(f32ToI32(h)) | |||
| def to_kaldi_nnet(self): | |||
| re_str = '<Nnet>\n' | |||
| re_str += self.featmap.to_kaldi_nnet() | |||
| relu = RectifiedLinear(self.featmap.linear.out_features, | |||
| self.featmap.linear.out_features) | |||
| re_str += relu.to_kaldi_nnet() | |||
| for unit in self.mem: | |||
| re_str += unit.to_kaldi_nnet() | |||
| re_str += self.decision.to_kaldi_nnet() | |||
| re_str += '<Softmax> %d %d\n' % (self.decision.linear.out_features, | |||
| self.decision.linear.out_features) | |||
| re_str += '<!EndOfComponent>\n' | |||
| re_str += '</Nnet>\n' | |||
| return re_str | |||
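For reference, a minimal forward-pass sketch for FSMNSeleNetV2 above, assuming the class is in scope. The (batch, time, channel, feature) layout and the collapse of the channel dimension at the selection layer follow from the code; the concrete sizes are arbitrary. Because the forward pass allocates its buffers on CUDA whenever it is available, the model and input are moved together:

    import torch

    net = FSMNSeleNetV2(input_dim=120, linear_dim=128, proj_dim=64,
                        lorder=20, rorder=1, num_syn=5,
                        fsmn_layers=5, sele_layer=0)
    feats = torch.randn(2, 100, 3, 120)      # 2 utterances, 100 frames, 3 microphones
    if torch.cuda.is_available():            # keep buffers, weights and input on one device
        net, feats = net.cuda(), feats.cuda()
    scores = net(feats)
    print(scores.shape)                      # torch.Size([2, 100, 5])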
| @@ -0,0 +1,74 @@ | |||
| import os | |||
| from typing import Dict | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models import TorchModel | |||
| from modelscope.models.base import Tensor | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from .fsmn_sele_v2 import FSMNSeleNetV2 | |||
| @MODELS.register_module( | |||
| Tasks.keyword_spotting, module_name=Models.speech_dfsmn_kws_char_farfield) | |||
| class FSMNSeleNetV2Decorator(TorchModel): | |||
| r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """ | |||
| MODEL_TXT = 'model.txt' | |||
| SC_CONFIG = 'sound_connect.conf' | |||
| SC_CONF_ITEM_KWS_MODEL = '${kws_model}' | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """initialize the dfsmn model from the `model_dir` path. | |||
| Args: | |||
| model_dir (str): the model path. | |||
| """ | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| sc_config_file = os.path.join(model_dir, self.SC_CONFIG) | |||
| model_txt_file = os.path.join(model_dir, self.MODEL_TXT) | |||
| model_bin_file = os.path.join(model_dir, | |||
| ModelFile.TORCH_MODEL_BIN_FILE) | |||
| self._model = None | |||
| if os.path.exists(model_bin_file): | |||
| self._model = FSMNSeleNetV2(*args, **kwargs) | |||
| checkpoint = torch.load(model_bin_file) | |||
| self._model.load_state_dict(checkpoint, strict=False) | |||
| self._sc = None | |||
| if os.path.exists(model_txt_file): | |||
| with open(sc_config_file) as f: | |||
| lines = f.readlines() | |||
| with open(sc_config_file, 'w') as f: | |||
| for line in lines: | |||
| if self.SC_CONF_ITEM_KWS_MODEL in line: | |||
| line = line.replace(self.SC_CONF_ITEM_KWS_MODEL, | |||
| model_txt_file) | |||
| f.write(line) | |||
| import py_sound_connect | |||
| self._sc = py_sound_connect.SoundConnect(sc_config_file) | |||
| self.size_in = self._sc.bytesPerBlockIn() | |||
| self.size_out = self._sc.bytesPerBlockOut() | |||
| if self._model is None and self._sc is None: | |||
| raise Exception( | |||
| f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.' | |||
| ) | |||
| def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||
| ... | |||
| def forward_decode(self, data: bytes): | |||
| result = {'pcm': self._sc.process(data, self.size_out)} | |||
| state = self._sc.kwsState() | |||
| if state == 2: | |||
| result['kws'] = { | |||
| 'keyword': | |||
| self._sc.kwsKeyword(self._sc.kwsSpottedKeywordIndex()), | |||
| 'offset': self._sc.kwsKeywordOffset(), | |||
| 'length': self._sc.kwsKeywordLength(), | |||
| 'confidence': self._sc.kwsConfidence() | |||
| } | |||
| return result | |||
| @@ -0,0 +1,121 @@ | |||
| import math | |||
| import struct | |||
| from enum import Enum | |||
| HEADER_BLOCK_SIZE = 10 | |||
| class LayerType(Enum): | |||
| LAYER_DENSE = 1 | |||
| LAYER_GRU = 2 | |||
| LAYER_ATTENTION = 3 | |||
| LAYER_FSMN = 4 | |||
| LAYER_SEQUENTIAL_FSMN = 5 | |||
| LAYER_FSMN_SELE = 6 | |||
| LAYER_GRU_ATTENTION = 7 | |||
| LAYER_DFSMN = 8 | |||
| class ActivationType(Enum): | |||
| ACTIVATION_NONE = 0 | |||
| ACTIVATION_RELU = 1 | |||
| ACTIVATION_TANH = 2 | |||
| ACTIVATION_SIGMOID = 3 | |||
| ACTIVATION_SOFTMAX = 4 | |||
| ACTIVATION_LOGSOFTMAX = 5 | |||
| def f32ToI32(f): | |||
| """ | |||
| Reinterpret the raw bits of a float32 value as an int32. | |||
| """ | |||
| bs = struct.pack('f', f) | |||
| ba = bytearray() | |||
| ba.append(bs[0]) | |||
| ba.append(bs[1]) | |||
| ba.append(bs[2]) | |||
| ba.append(bs[3]) | |||
| return struct.unpack('i', ba)[0] | |||
| def printNeonMatrix(w): | |||
| """ | |||
| print matrix with neon padding | |||
| """ | |||
| numrows, numcols = w.shape | |||
| numnecols = math.ceil(numcols / 4) | |||
| for i in range(numrows): | |||
| for j in range(numcols): | |||
| print(f32ToI32(w[i, j])) | |||
| for j in range(numnecols * 4 - numcols): | |||
| print(0) | |||
| def printNeonVector(b): | |||
| """ | |||
| print vector with neon padding | |||
| """ | |||
| size = b.shape[0] | |||
| nesize = math.ceil(size / 4) | |||
| for i in range(size): | |||
| print(f32ToI32(b[i])) | |||
| for i in range(nesize * 4 - size): | |||
| print(0) | |||
| def printDense(layer): | |||
| """ | |||
| save dense layer | |||
| """ | |||
| statedict = layer.state_dict() | |||
| printNeonMatrix(statedict['weight']) | |||
| printNeonVector(statedict['bias']) | |||
| def printGRU(layer): | |||
| """ | |||
| save gru layer | |||
| """ | |||
| statedict = layer.state_dict() | |||
| weight = [statedict['weight_ih_l0'], statedict['weight_hh_l0']] | |||
| bias = [statedict['bias_ih_l0'], statedict['bias_hh_l0']] | |||
| numins, numouts = weight[0].shape | |||
| numins = numins // 3 | |||
| # output input weights | |||
| w_rx = weight[0][:numins, :] | |||
| w_zx = weight[0][numins:numins * 2, :] | |||
| w_x = weight[0][numins * 2:, :] | |||
| printNeonMatrix(w_zx) | |||
| printNeonMatrix(w_rx) | |||
| printNeonMatrix(w_x) | |||
| # output recurrent weights | |||
| w_rh = weight[1][:numins, :] | |||
| w_zh = weight[1][numins:numins * 2, :] | |||
| w_h = weight[1][numins * 2:, :] | |||
| printNeonMatrix(w_zh) | |||
| printNeonMatrix(w_rh) | |||
| printNeonMatrix(w_h) | |||
| # output input bias | |||
| b_rx = bias[0][:numins] | |||
| b_zx = bias[0][numins:numins * 2] | |||
| b_x = bias[0][numins * 2:] | |||
| printNeonVector(b_zx) | |||
| printNeonVector(b_rx) | |||
| printNeonVector(b_x) | |||
| # output recurrent bias | |||
| b_rh = bias[1][:numins] | |||
| b_zh = bias[1][numins:numins * 2] | |||
| b_h = bias[1][numins * 2:] | |||
| printNeonVector(b_zh) | |||
| printNeonVector(b_rh) | |||
| printNeonVector(b_h) | |||
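A small sketch of the export helpers above, assuming they are in scope: f32ToI32 reinterprets the raw float32 bits as an int32 (no rounding), and printNeonMatrix pads each row up to a multiple of four values before printing them to stdout:

    import struct
    import torch

    assert f32ToI32(0.0) == 0
    assert f32ToI32(1.0) == struct.unpack('i', struct.pack('f', 1.0))[0]  # 1065353216 == 0x3F800000

    w = torch.randn(2, 3)    # 3 columns are padded to 4, so 2 * 4 = 8 integers are printed
    printNeonMatrix(w)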
| @@ -5,4 +5,5 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints, | |||
| image_colorization, image_denoise, image_instance_segmentation, | |||
| image_portrait_enhancement, image_to_image_generation, | |||
| image_to_image_translation, object_detection, | |||
| product_retrieval_embedding, super_resolution, virual_tryon) | |||
| product_retrieval_embedding, salient_detection, | |||
| super_resolution, video_single_object_tracking, virual_tryon) | |||
| @@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel): | |||
| model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
| self.model = NAFNet(**self.config.model.network_g) | |||
| self.loss = PSNRLoss() | |||
| if torch.cuda.is_available(): | |||
| self._device = torch.device('cuda') | |||
| else: | |||
| self._device = torch.device('cpu') | |||
| self.model = self.model.to(self._device) | |||
| self.model = self._load_pretrained(self.model, model_path) | |||
| if self.training: | |||
| self.model.train() | |||
| else: | |||
| self.model.eval() | |||
| def _load_pretrained(self, | |||
| net, | |||
| load_path, | |||
| @@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel): | |||
| Returns: | |||
| Dict[str, Tensor]: results | |||
| """ | |||
| for key, value in inputs.items(): | |||
| inputs[key] = inputs[key].to(self._device) | |||
| if self.training: | |||
| return self._train_forward(**inputs) | |||
| elif 'target' in inputs: | |||
| @@ -7,13 +7,11 @@ if TYPE_CHECKING: | |||
| from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin | |||
| from .model import CascadeMaskRCNNSwinModel | |||
| from .postprocess_utils import get_img_ins_seg_result | |||
| from .datasets import ImageInstanceSegmentationCocoDataset | |||
| else: | |||
| _import_structure = { | |||
| 'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], | |||
| 'model': ['CascadeMaskRCNNSwinModel'], | |||
| 'postprocess_utils': ['get_img_ins_seg_result'], | |||
| 'datasets': ['ImageInstanceSegmentationCocoDataset'] | |||
| } | |||
| import sys | |||
| @@ -1,2 +1 @@ | |||
| from .dataset import ImageInstanceSegmentationCocoDataset | |||
| from .transforms import build_preprocess_transform | |||
| @@ -38,7 +38,7 @@ class DetectionModel(TorchModel): | |||
| self.model, model_path, map_location='cpu') | |||
| self.class_names = checkpoint['meta']['CLASSES'] | |||
| config.test_pipeline[0].type = 'LoadImageFromWebcam' | |||
| self.test_pipeline = Compose( | |||
| self.transform_input = Compose( | |||
| replace_ImageToTensor(config.test_pipeline)) | |||
| self.model.cfg = config | |||
| self.model.eval() | |||
| @@ -56,7 +56,7 @@ class DetectionModel(TorchModel): | |||
| from mmcv.parallel import collate, scatter | |||
| data = dict(img=image) | |||
| data = self.test_pipeline(data) | |||
| data = self.transform_input(data) | |||
| data = collate([data], samples_per_gpu=1) | |||
| data['img_metas'] = [ | |||
| img_metas.data[0] for img_metas in data['img_metas'] | |||
| @@ -0,0 +1,22 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .salient_model import SalientDetection | |||
| else: | |||
| _import_structure = { | |||
| 'salient_model': ['SalientDetection'], | |||
| } | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1 @@ | |||
| from .u2net import U2NET | |||
| @@ -0,0 +1,300 @@ | |||
| # Implementation in this file is modified from source code available via https://github.com/xuebinqin/U-2-Net | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class REBNCONV(nn.Module): | |||
| def __init__(self, in_ch=3, out_ch=3, dirate=1): | |||
| super(REBNCONV, self).__init__() | |||
| self.conv_s1 = nn.Conv2d( | |||
| in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate) | |||
| self.bn_s1 = nn.BatchNorm2d(out_ch) | |||
| self.relu_s1 = nn.ReLU(inplace=True) | |||
| def forward(self, x): | |||
| hx = x | |||
| xout = self.relu_s1(self.bn_s1(self.conv_s1(hx))) | |||
| return xout | |||
| def _upsample_like(src, tar): | |||
| """upsample tensor 'src' to have the same spatial size with tensor 'tar'.""" | |||
| src = F.upsample(src, size=tar.shape[2:], mode='bilinear') | |||
| return src | |||
| class RSU7(nn.Module): | |||
| def __init__(self, in_ch=3, mid_ch=12, out_ch=3): | |||
| super(RSU7, self).__init__() | |||
| self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) | |||
| self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) | |||
| self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2) | |||
| self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) | |||
| def forward(self, x): | |||
| hx = x | |||
| hxin = self.rebnconvin(hx) | |||
| hx1 = self.rebnconv1(hxin) | |||
| hx = self.pool1(hx1) | |||
| hx2 = self.rebnconv2(hx) | |||
| hx = self.pool2(hx2) | |||
| hx3 = self.rebnconv3(hx) | |||
| hx = self.pool3(hx3) | |||
| hx4 = self.rebnconv4(hx) | |||
| hx = self.pool4(hx4) | |||
| hx5 = self.rebnconv5(hx) | |||
| hx = self.pool5(hx5) | |||
| hx6 = self.rebnconv6(hx) | |||
| hx7 = self.rebnconv7(hx6) | |||
| hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1)) | |||
| hx6dup = _upsample_like(hx6d, hx5) | |||
| hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1)) | |||
| hx5dup = _upsample_like(hx5d, hx4) | |||
| hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) | |||
| hx4dup = _upsample_like(hx4d, hx3) | |||
| hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) | |||
| hx3dup = _upsample_like(hx3d, hx2) | |||
| hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) | |||
| hx2dup = _upsample_like(hx2d, hx1) | |||
| hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) | |||
| return hx1d + hxin | |||
| class RSU6(nn.Module): | |||
| def __init__(self, in_ch=3, mid_ch=12, out_ch=3): | |||
| super(RSU6, self).__init__() | |||
| self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) | |||
| self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) | |||
| self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2) | |||
| self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) | |||
| def forward(self, x): | |||
| hx = x | |||
| hxin = self.rebnconvin(hx) | |||
| hx1 = self.rebnconv1(hxin) | |||
| hx = self.pool1(hx1) | |||
| hx2 = self.rebnconv2(hx) | |||
| hx = self.pool2(hx2) | |||
| hx3 = self.rebnconv3(hx) | |||
| hx = self.pool3(hx3) | |||
| hx4 = self.rebnconv4(hx) | |||
| hx = self.pool4(hx4) | |||
| hx5 = self.rebnconv5(hx) | |||
| hx6 = self.rebnconv6(hx5) | |||
| hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1)) | |||
| hx5dup = _upsample_like(hx5d, hx4) | |||
| hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) | |||
| hx4dup = _upsample_like(hx4d, hx3) | |||
| hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) | |||
| hx3dup = _upsample_like(hx3d, hx2) | |||
| hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) | |||
| hx2dup = _upsample_like(hx2d, hx1) | |||
| hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) | |||
| return hx1d + hxin | |||
| class RSU5(nn.Module): | |||
| def __init__(self, in_ch=3, mid_ch=12, out_ch=3): | |||
| super(RSU5, self).__init__() | |||
| self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) | |||
| self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) | |||
| self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2) | |||
| self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) | |||
| def forward(self, x): | |||
| hx = x | |||
| hxin = self.rebnconvin(hx) | |||
| hx1 = self.rebnconv1(hxin) | |||
| hx = self.pool1(hx1) | |||
| hx2 = self.rebnconv2(hx) | |||
| hx = self.pool2(hx2) | |||
| hx3 = self.rebnconv3(hx) | |||
| hx = self.pool3(hx3) | |||
| hx4 = self.rebnconv4(hx) | |||
| hx5 = self.rebnconv5(hx4) | |||
| hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1)) | |||
| hx4dup = _upsample_like(hx4d, hx3) | |||
| hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) | |||
| hx3dup = _upsample_like(hx3d, hx2) | |||
| hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) | |||
| hx2dup = _upsample_like(hx2d, hx1) | |||
| hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) | |||
| return hx1d + hxin | |||
| class RSU4(nn.Module): | |||
| def __init__(self, in_ch=3, mid_ch=12, out_ch=3): | |||
| super(RSU4, self).__init__() | |||
| self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) | |||
| self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) | |||
| self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) | |||
| self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2) | |||
| self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) | |||
| self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) | |||
| def forward(self, x): | |||
| hx = x | |||
| hxin = self.rebnconvin(hx) | |||
| hx1 = self.rebnconv1(hxin) | |||
| hx = self.pool1(hx1) | |||
| hx2 = self.rebnconv2(hx) | |||
| hx = self.pool2(hx2) | |||
| hx3 = self.rebnconv3(hx) | |||
| hx4 = self.rebnconv4(hx3) | |||
| hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) | |||
| hx3dup = _upsample_like(hx3d, hx2) | |||
| hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) | |||
| hx2dup = _upsample_like(hx2d, hx1) | |||
| hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) | |||
| return hx1d + hxin | |||
| class RSU4F(nn.Module): | |||
| def __init__(self, in_ch=3, mid_ch=12, out_ch=3): | |||
| super(RSU4F, self).__init__() | |||
| self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) | |||
| self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) | |||
| self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2) | |||
| self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4) | |||
| self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8) | |||
| self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4) | |||
| self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2) | |||
| self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) | |||
| def forward(self, x): | |||
| hx = x | |||
| hxin = self.rebnconvin(hx) | |||
| hx1 = self.rebnconv1(hxin) | |||
| hx2 = self.rebnconv2(hx1) | |||
| hx3 = self.rebnconv3(hx2) | |||
| hx4 = self.rebnconv4(hx3) | |||
| hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) | |||
| hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1)) | |||
| hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1)) | |||
| return hx1d + hxin | |||
| class U2NET(nn.Module): | |||
| def __init__(self, in_ch=3, out_ch=1): | |||
| super(U2NET, self).__init__() | |||
| # encoder | |||
| self.stage1 = RSU7(in_ch, 32, 64) | |||
| self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.stage2 = RSU6(64, 32, 128) | |||
| self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.stage3 = RSU5(128, 64, 256) | |||
| self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.stage4 = RSU4(256, 128, 512) | |||
| self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.stage5 = RSU4F(512, 256, 512) | |||
| self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True) | |||
| self.stage6 = RSU4F(512, 256, 512) | |||
| # decoder | |||
| self.stage5d = RSU4F(1024, 256, 512) | |||
| self.stage4d = RSU4(1024, 128, 256) | |||
| self.stage3d = RSU5(512, 64, 128) | |||
| self.stage2d = RSU6(256, 32, 64) | |||
| self.stage1d = RSU7(128, 16, 64) | |||
| self.side1 = nn.Conv2d(64, out_ch, 3, padding=1) | |||
| self.side2 = nn.Conv2d(64, out_ch, 3, padding=1) | |||
| self.side3 = nn.Conv2d(128, out_ch, 3, padding=1) | |||
| self.side4 = nn.Conv2d(256, out_ch, 3, padding=1) | |||
| self.side5 = nn.Conv2d(512, out_ch, 3, padding=1) | |||
| self.side6 = nn.Conv2d(512, out_ch, 3, padding=1) | |||
| self.outconv = nn.Conv2d(6 * out_ch, out_ch, 1) | |||
| def forward(self, x): | |||
| hx = x | |||
| hx1 = self.stage1(hx) | |||
| hx = self.pool12(hx1) | |||
| hx2 = self.stage2(hx) | |||
| hx = self.pool23(hx2) | |||
| hx3 = self.stage3(hx) | |||
| hx = self.pool34(hx3) | |||
| hx4 = self.stage4(hx) | |||
| hx = self.pool45(hx4) | |||
| hx5 = self.stage5(hx) | |||
| hx = self.pool56(hx5) | |||
| hx6 = self.stage6(hx) | |||
| hx6up = _upsample_like(hx6, hx5) | |||
| hx5d = self.stage5d(torch.cat((hx6up, hx5), 1)) | |||
| hx5dup = _upsample_like(hx5d, hx4) | |||
| hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1)) | |||
| hx4dup = _upsample_like(hx4d, hx3) | |||
| hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1)) | |||
| hx3dup = _upsample_like(hx3d, hx2) | |||
| hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1)) | |||
| hx2dup = _upsample_like(hx2d, hx1) | |||
| hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1)) | |||
| d1 = self.side1(hx1d) | |||
| d2 = self.side2(hx2d) | |||
| d2 = _upsample_like(d2, d1) | |||
| d3 = self.side3(hx3d) | |||
| d3 = _upsample_like(d3, d1) | |||
| d4 = self.side4(hx4d) | |||
| d4 = _upsample_like(d4, d1) | |||
| d5 = self.side5(hx5d) | |||
| d5 = _upsample_like(d5, d1) | |||
| d6 = self.side6(hx6) | |||
| d6 = _upsample_like(d6, d1) | |||
| d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1)) | |||
| return torch.sigmoid(d0), torch.sigmoid(d1), torch.sigmoid( | |||
| d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid( | |||
| d5), torch.sigmoid(d6) | |||
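A minimal shape-check sketch for the U2NET defined above, assuming the class is in scope; 320 x 320 matches the resize used by the salient-detection preprocessor later in this diff:

    import torch

    net = U2NET(in_ch=3, out_ch=1).eval()
    with torch.no_grad():
        maps = net(torch.randn(1, 3, 320, 320))
    # Seven sigmoid maps: the fused output d0 plus six side outputs, all at input resolution.
    print(len(maps), maps[0].shape)          # 7 torch.Size([1, 1, 320, 320])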
| @@ -0,0 +1,63 @@ | |||
| import os.path as osp | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base.base_torch_model import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from .models import U2NET | |||
| @MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection) | |||
| class SalientDetection(TorchModel): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """str -- model file root.""" | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
| self.model = U2NET(3, 1) | |||
| checkpoint = torch.load(model_path, map_location='cpu') | |||
| self.transform_input = transforms.Compose([ | |||
| transforms.Resize((320, 320)), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize( | |||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
| ]) | |||
| self.model.load_state_dict(checkpoint) | |||
| self.model.eval() | |||
| def inference(self, data): | |||
| """data is tensor 3 * H * W ---> return tensor H * W .""" | |||
| data = data.unsqueeze(0) | |||
| if next(self.model.parameters()).is_cuda: | |||
| data = data.to( | |||
| torch.device([next(self.model.parameters()).device][0])) | |||
| with torch.no_grad(): | |||
| results = self.model(data) | |||
| if next(self.model.parameters()).is_cuda: | |||
| return results[0][0, 0, :, :].cpu() | |||
| return results[0][0, 0, :, :] | |||
| def preprocess(self, image): | |||
| """image is numpy.""" | |||
| data = self.transform_input(Image.fromarray(image)) | |||
| return data.float() | |||
| def postprocess(self, inputs): | |||
| """resize .""" | |||
| data = inputs['data'] | |||
| w = inputs['img_w'] | |||
| h = inputs['img_h'] | |||
| data_norm = (data - torch.min(data)) / ( | |||
| torch.max(data) - torch.min(data)) | |||
| data_norm_np = (data_norm.numpy() * 255).astype('uint8') | |||
| data_norm_rst = cv2.resize(data_norm_np, (w, h)) | |||
| return data_norm_rst | |||
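To show how the methods above fit together, a hedged end-to-end sketch, assuming SalientDetection is in scope and that a downloaded model directory and a test image exist locally (both paths are hypothetical placeholders):

    import cv2

    model = SalientDetection('/path/to/u2net_salient_detection_model_dir')   # hypothetical path
    image = cv2.cvtColor(cv2.imread('demo.jpg'), cv2.COLOR_BGR2RGB)          # H x W x 3 uint8

    data = model.preprocess(image)                         # 3 x 320 x 320 float tensor
    mask = model.inference(data)                           # 320 x 320 saliency tensor
    result = model.postprocess({'data': mask,
                                'img_w': image.shape[1],
                                'img_h': image.shape[0]})  # uint8 map at the original size
    cv2.imwrite('saliency.png', result)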
| @@ -0,0 +1,39 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| from easydict import EasyDict as edict | |||
| cfg = edict() | |||
| # MODEL | |||
| cfg.MODEL = edict() | |||
| # MODEL.BACKBONE | |||
| cfg.MODEL.BACKBONE = edict() | |||
| cfg.MODEL.BACKBONE.TYPE = 'vit_base_patch16_224_ce' | |||
| cfg.MODEL.BACKBONE.STRIDE = 16 | |||
| cfg.MODEL.BACKBONE.CAT_MODE = 'direct' | |||
| cfg.MODEL.BACKBONE.DROP_PATH_RATE = 0.1 | |||
| cfg.MODEL.BACKBONE.CE_LOC = [3, 6, 9] | |||
| cfg.MODEL.BACKBONE.CE_KEEP_RATIO = [0.7, 0.7, 0.7] | |||
| cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE = 'CTR_POINT' | |||
| # MODEL.HEAD | |||
| cfg.MODEL.HEAD = edict() | |||
| cfg.MODEL.HEAD.TYPE = 'CENTER' | |||
| cfg.MODEL.HEAD.NUM_CHANNELS = 256 | |||
| # DATA | |||
| cfg.DATA = edict() | |||
| cfg.DATA.MEAN = [0.485, 0.456, 0.406] | |||
| cfg.DATA.STD = [0.229, 0.224, 0.225] | |||
| cfg.DATA.SEARCH = edict() | |||
| cfg.DATA.SEARCH.SIZE = 384 | |||
| cfg.DATA.TEMPLATE = edict() | |||
| cfg.DATA.TEMPLATE.SIZE = 192 | |||
| # TEST | |||
| cfg.TEST = edict() | |||
| cfg.TEST.TEMPLATE_FACTOR = 2.0 | |||
| cfg.TEST.TEMPLATE_SIZE = 192 | |||
| cfg.TEST.SEARCH_FACTOR = 5.0 | |||
| cfg.TEST.SEARCH_SIZE = 384 | |||
| @@ -0,0 +1,54 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import torch.nn as nn | |||
| class Attention(nn.Module): | |||
| def __init__(self, | |||
| dim, | |||
| num_heads=8, | |||
| qkv_bias=False, | |||
| attn_drop=0., | |||
| proj_drop=0., | |||
| rpe=False, | |||
| z_size=7, | |||
| x_size=14): | |||
| super().__init__() | |||
| self.num_heads = num_heads | |||
| head_dim = dim // num_heads | |||
| self.scale = head_dim**-0.5 | |||
| self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) | |||
| self.attn_drop = nn.Dropout(attn_drop) | |||
| self.proj = nn.Linear(dim, dim) | |||
| self.proj_drop = nn.Dropout(proj_drop) | |||
| def forward(self, x, mask=None, return_attention=False): | |||
| # x: B, N, C | |||
| # mask: [B, N, ] torch.bool | |||
| B, N, C = x.shape | |||
| qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, | |||
| C // self.num_heads).permute(2, 0, 3, 1, 4) | |||
| q, k, v = qkv.unbind( | |||
| 0) # make torchscript happy (cannot use tensor as tuple) | |||
| attn = (q @ k.transpose(-2, -1)) * self.scale | |||
| if mask is not None: | |||
| attn = attn.masked_fill( | |||
| mask.unsqueeze(1).unsqueeze(2), | |||
| float('-inf'), | |||
| ) | |||
| attn = attn.softmax(dim=-1) | |||
| attn = self.attn_drop(attn) | |||
| x = (attn @ v).transpose(1, 2).reshape(B, N, C) | |||
| x = self.proj(x) | |||
| x = self.proj_drop(x) | |||
| if return_attention: | |||
| return x, attn | |||
| else: | |||
| return x | |||
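A minimal usage sketch for the Attention module above, assuming it is in scope. The 768-dimensional tokens and 12 heads match a ViT-Base style backbone; the token count is an arbitrary example of template plus search tokens:

    import torch

    attn = Attention(dim=768, num_heads=12, qkv_bias=True)
    tokens = torch.randn(2, 64 + 256, 768)            # (batch, tokens, dim)
    out, weights = attn(tokens, mask=None, return_attention=True)
    print(out.shape, weights.shape)                   # (2, 320, 768) and (2, 12, 320, 320)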
| @@ -0,0 +1,129 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import math | |||
| import torch | |||
| import torch.nn as nn | |||
| from timm.models.layers import DropPath, Mlp | |||
| from .attn import Attention | |||
| def candidate_elimination(attn: torch.Tensor, tokens: torch.Tensor, | |||
| lens_t: int, keep_ratio: float, | |||
| global_index: torch.Tensor, | |||
| box_mask_z: torch.Tensor): | |||
| """ | |||
| Eliminate potential background candidates for computation reduction and noise cancellation. | |||
| Args: | |||
| attn (torch.Tensor): [B, num_heads, L_t + L_s, L_t + L_s], attention weights | |||
| tokens (torch.Tensor): [B, L_t + L_s, C], template and search region tokens | |||
| lens_t (int): length of template | |||
| keep_ratio (float): keep ratio of search region tokens (candidates) | |||
| global_index (torch.Tensor): global index of search region tokens | |||
| box_mask_z (torch.Tensor): template mask used to accumulate attention weights | |||
| Returns: | |||
| tokens_new (torch.Tensor): tokens after candidate elimination | |||
| keep_index (torch.Tensor): indices of kept search region tokens | |||
| removed_index (torch.Tensor): indices of removed search region tokens | |||
| """ | |||
| lens_s = attn.shape[-1] - lens_t | |||
| bs, hn, _, _ = attn.shape | |||
| lens_keep = math.ceil(keep_ratio * lens_s) | |||
| if lens_keep == lens_s: | |||
| return tokens, global_index, None | |||
| attn_t = attn[:, :, :lens_t, lens_t:] | |||
| if box_mask_z is not None: | |||
| box_mask_z = box_mask_z.unsqueeze(1).unsqueeze(-1).expand( | |||
| -1, attn_t.shape[1], -1, attn_t.shape[-1]) | |||
| attn_t = attn_t[box_mask_z] | |||
| attn_t = attn_t.view(bs, hn, -1, lens_s) | |||
| attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s | |||
| else: | |||
| attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s | |||
| # use sort instead of topk, due to the speed issue | |||
| # https://github.com/pytorch/pytorch/issues/22812 | |||
| sorted_attn, indices = torch.sort(attn_t, dim=1, descending=True) | |||
| _, topk_idx = sorted_attn[:, :lens_keep], indices[:, :lens_keep] | |||
| _, non_topk_idx = sorted_attn[:, lens_keep:], indices[:, lens_keep:] | |||
| keep_index = global_index.gather(dim=1, index=topk_idx) | |||
| removed_index = global_index.gather(dim=1, index=non_topk_idx) | |||
| # separate template and search tokens | |||
| tokens_t = tokens[:, :lens_t] | |||
| tokens_s = tokens[:, lens_t:] | |||
| # obtain the attentive and inattentive tokens | |||
| B, L, C = tokens_s.shape | |||
| attentive_tokens = tokens_s.gather( | |||
| dim=1, index=topk_idx.unsqueeze(-1).expand(B, -1, C)) | |||
| # concatenate these tokens | |||
| tokens_new = torch.cat([tokens_t, attentive_tokens], dim=1) | |||
| return tokens_new, keep_index, removed_index | |||
| class CEBlock(nn.Module): | |||
| def __init__( | |||
| self, | |||
| dim, | |||
| num_heads, | |||
| mlp_ratio=4., | |||
| qkv_bias=False, | |||
| drop=0., | |||
| attn_drop=0., | |||
| drop_path=0., | |||
| act_layer=nn.GELU, | |||
| norm_layer=nn.LayerNorm, | |||
| keep_ratio_search=1.0, | |||
| ): | |||
| super().__init__() | |||
| self.norm1 = norm_layer(dim) | |||
| self.attn = Attention( | |||
| dim, | |||
| num_heads=num_heads, | |||
| qkv_bias=qkv_bias, | |||
| attn_drop=attn_drop, | |||
| proj_drop=drop) | |||
| # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here | |||
| self.drop_path = DropPath( | |||
| drop_path) if drop_path > 0. else nn.Identity() | |||
| self.norm2 = norm_layer(dim) | |||
| mlp_hidden_dim = int(dim * mlp_ratio) | |||
| self.mlp = Mlp( | |||
| in_features=dim, | |||
| hidden_features=mlp_hidden_dim, | |||
| act_layer=act_layer, | |||
| drop=drop) | |||
| self.keep_ratio_search = keep_ratio_search | |||
| def forward(self, | |||
| x, | |||
| global_index_template, | |||
| global_index_search, | |||
| mask=None, | |||
| ce_template_mask=None, | |||
| keep_ratio_search=None): | |||
| x_attn, attn = self.attn(self.norm1(x), mask, True) | |||
| x = x + self.drop_path(x_attn) | |||
| lens_t = global_index_template.shape[1] | |||
| removed_index_search = None | |||
| if self.keep_ratio_search < 1 and (keep_ratio_search is None | |||
| or keep_ratio_search < 1): | |||
| keep_ratio_search = self.keep_ratio_search if keep_ratio_search is None else keep_ratio_search | |||
| x, global_index_search, removed_index_search = candidate_elimination( | |||
| attn, x, lens_t, keep_ratio_search, global_index_search, | |||
| ce_template_mask) | |||
| x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
| return x, global_index_template, global_index_search, removed_index_search, attn | |||
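A small sketch of one CEBlock pass with candidate elimination active, assuming CEBlock from above is in scope. With 64 template and 256 search tokens and a keep ratio of 0.7, ceil(0.7 * 256) = 180 search tokens survive, so the sequence shrinks from 320 to 244 tokens; the sizes are arbitrary examples:

    import torch

    block = CEBlock(dim=768, num_heads=12, keep_ratio_search=0.7)
    lens_t, lens_s = 64, 256
    x = torch.randn(2, lens_t + lens_s, 768)
    idx_t = torch.arange(lens_t).unsqueeze(0).repeat(2, 1)   # global indices of template tokens
    idx_s = torch.arange(lens_s).unsqueeze(0).repeat(2, 1)   # global indices of search tokens

    x, idx_t, idx_s, removed, attn = block(x, idx_t, idx_s,
                                           mask=None, ce_template_mask=None)
    print(x.shape, idx_s.shape, removed.shape)   # (2, 244, 768), (2, 180), (2, 76)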
| @@ -0,0 +1,141 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import torch | |||
| import torch.nn as nn | |||
| def conv(in_planes, | |||
| out_planes, | |||
| kernel_size=3, | |||
| stride=1, | |||
| padding=1, | |||
| dilation=1): | |||
| return nn.Sequential( | |||
| nn.Conv2d( | |||
| in_planes, | |||
| out_planes, | |||
| kernel_size=kernel_size, | |||
| stride=stride, | |||
| padding=padding, | |||
| dilation=dilation, | |||
| bias=True), nn.BatchNorm2d(out_planes), nn.ReLU(inplace=True)) | |||
| class CenterPredictor(nn.Module): | |||
| def __init__(self, inplanes=64, channel=256, feat_sz=20, stride=16): | |||
| super(CenterPredictor, self).__init__() | |||
| self.feat_sz = feat_sz | |||
| self.stride = stride | |||
| self.img_sz = self.feat_sz * self.stride | |||
| # corner predict | |||
| self.conv1_ctr = conv(inplanes, channel) | |||
| self.conv2_ctr = conv(channel, channel // 2) | |||
| self.conv3_ctr = conv(channel // 2, channel // 4) | |||
| self.conv4_ctr = conv(channel // 4, channel // 8) | |||
| self.conv5_ctr = nn.Conv2d(channel // 8, 1, kernel_size=1) | |||
| # offset regress | |||
| self.conv1_offset = conv(inplanes, channel) | |||
| self.conv2_offset = conv(channel, channel // 2) | |||
| self.conv3_offset = conv(channel // 2, channel // 4) | |||
| self.conv4_offset = conv(channel // 4, channel // 8) | |||
| self.conv5_offset = nn.Conv2d(channel // 8, 2, kernel_size=1) | |||
| # size regress | |||
| self.conv1_size = conv(inplanes, channel) | |||
| self.conv2_size = conv(channel, channel // 2) | |||
| self.conv3_size = conv(channel // 2, channel // 4) | |||
| self.conv4_size = conv(channel // 4, channel // 8) | |||
| self.conv5_size = nn.Conv2d(channel // 8, 2, kernel_size=1) | |||
| for p in self.parameters(): | |||
| if p.dim() > 1: | |||
| nn.init.xavier_uniform_(p) | |||
| def forward(self, x, gt_score_map=None): | |||
| """ Forward pass with input x. """ | |||
| score_map_ctr, size_map, offset_map = self.get_score_map(x) | |||
| # assert gt_score_map is None | |||
| if gt_score_map is None: | |||
| bbox = self.cal_bbox(score_map_ctr, size_map, offset_map) | |||
| else: | |||
| bbox = self.cal_bbox( | |||
| gt_score_map.unsqueeze(1), size_map, offset_map) | |||
| return score_map_ctr, bbox, size_map, offset_map | |||
| def cal_bbox(self, | |||
| score_map_ctr, | |||
| size_map, | |||
| offset_map, | |||
| return_score=False): | |||
| max_score, idx = torch.max( | |||
| score_map_ctr.flatten(1), dim=1, keepdim=True) | |||
| idx_y = idx // self.feat_sz | |||
| idx_x = idx % self.feat_sz | |||
| idx = idx.unsqueeze(1).expand(idx.shape[0], 2, 1) | |||
| size = size_map.flatten(2).gather(dim=2, index=idx) | |||
| offset = offset_map.flatten(2).gather(dim=2, index=idx).squeeze(-1) | |||
| # cx, cy, w, h | |||
| bbox = torch.cat( | |||
| [(idx_x.to(torch.float) + offset[:, :1]) / self.feat_sz, | |||
| (idx_y.to(torch.float) + offset[:, 1:]) / self.feat_sz, | |||
| size.squeeze(-1)], | |||
| dim=1) | |||
| if return_score: | |||
| return bbox, max_score | |||
| return bbox | |||
| def get_score_map(self, x): | |||
| def _sigmoid(x): | |||
| y = torch.clamp(x.sigmoid_(), min=1e-4, max=1 - 1e-4) | |||
| return y | |||
| # ctr branch | |||
| x_ctr1 = self.conv1_ctr(x) | |||
| x_ctr2 = self.conv2_ctr(x_ctr1) | |||
| x_ctr3 = self.conv3_ctr(x_ctr2) | |||
| x_ctr4 = self.conv4_ctr(x_ctr3) | |||
| score_map_ctr = self.conv5_ctr(x_ctr4) | |||
| # offset branch | |||
| x_offset1 = self.conv1_offset(x) | |||
| x_offset2 = self.conv2_offset(x_offset1) | |||
| x_offset3 = self.conv3_offset(x_offset2) | |||
| x_offset4 = self.conv4_offset(x_offset3) | |||
| score_map_offset = self.conv5_offset(x_offset4) | |||
| # size branch | |||
| x_size1 = self.conv1_size(x) | |||
| x_size2 = self.conv2_size(x_size1) | |||
| x_size3 = self.conv3_size(x_size2) | |||
| x_size4 = self.conv4_size(x_size3) | |||
| score_map_size = self.conv5_size(x_size4) | |||
| return _sigmoid(score_map_ctr), _sigmoid( | |||
| score_map_size), score_map_offset | |||
| def build_box_head(cfg, hidden_dim): | |||
| stride = cfg.MODEL.BACKBONE.STRIDE | |||
| if cfg.MODEL.HEAD.TYPE == 'CENTER': | |||
| in_channel = hidden_dim | |||
| out_channel = cfg.MODEL.HEAD.NUM_CHANNELS | |||
| feat_sz = int(cfg.DATA.SEARCH.SIZE / stride) | |||
| center_head = CenterPredictor( | |||
| inplanes=in_channel, | |||
| channel=out_channel, | |||
| feat_sz=feat_sz, | |||
| stride=stride) | |||
| return center_head | |||
| else: | |||
| raise ValueError('HEAD TYPE %s is not supported.' | |||
| % cfg.MODEL.HEAD.TYPE) | |||
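A small usage sketch of CenterPredictor on a dummy search-region feature map; the channel count and feature-map size are illustrative:

```python
import torch

from modelscope.models.cv.video_single_object_tracking.models.layers.head import \
    CenterPredictor

head = CenterPredictor(inplanes=768, channel=256, feat_sz=16, stride=16)
feat = torch.randn(1, 768, 16, 16)  # (B, C, H, W) features of the search region
score_map_ctr, bbox, size_map, offset_map = head(feat)
print(score_map_ctr.shape)  # (1, 1, 16, 16) center heat map
print(bbox.shape)           # (1, 4) box as (cx, cy, w, h), normalized by feat_sz
```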
| @@ -0,0 +1,37 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import torch.nn as nn | |||
| from timm.models.layers import to_2tuple | |||
| class PatchEmbed(nn.Module): | |||
| """ 2D Image to Patch Embedding | |||
| """ | |||
| def __init__(self, | |||
| img_size=224, | |||
| patch_size=16, | |||
| in_chans=3, | |||
| embed_dim=768, | |||
| norm_layer=None, | |||
| flatten=True): | |||
| super().__init__() | |||
| img_size = to_2tuple(img_size) | |||
| patch_size = to_2tuple(patch_size) | |||
| self.img_size = img_size | |||
| self.patch_size = patch_size | |||
| self.grid_size = (img_size[0] // patch_size[0], | |||
| img_size[1] // patch_size[1]) | |||
| self.num_patches = self.grid_size[0] * self.grid_size[1] | |||
| self.flatten = flatten | |||
| self.proj = nn.Conv2d( | |||
| in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) | |||
| self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() | |||
| def forward(self, x): | |||
| x = self.proj(x) | |||
| if self.flatten: | |||
| x = x.flatten(2).transpose(1, 2) # BCHW -> BNC | |||
| x = self.norm(x) | |||
| return x | |||
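A quick shape check of PatchEmbed, using standard ViT-B/16 sizes purely as an illustration:

```python
import torch

from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \
    PatchEmbed

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
tokens = embed(torch.randn(1, 3, 224, 224))
print(tokens.shape)  # (1, 196, 768): a 14 x 14 grid of patches, each a 768-d token
```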
| @@ -0,0 +1,93 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import torch.nn as nn | |||
| from timm.models.layers import to_2tuple | |||
| from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \ | |||
| PatchEmbed | |||
| class BaseBackbone(nn.Module): | |||
| def __init__(self): | |||
| super().__init__() | |||
| # for original ViT | |||
| self.pos_embed = None | |||
| self.img_size = [224, 224] | |||
| self.patch_size = 16 | |||
| self.embed_dim = 384 | |||
| self.cat_mode = 'direct' | |||
| self.pos_embed_z = None | |||
| self.pos_embed_x = None | |||
| self.template_segment_pos_embed = None | |||
| self.search_segment_pos_embed = None | |||
| self.return_stage = [2, 5, 8, 11] | |||
| def finetune_track(self, cfg, patch_start_index=1): | |||
| search_size = to_2tuple(cfg.DATA.SEARCH.SIZE) | |||
| template_size = to_2tuple(cfg.DATA.TEMPLATE.SIZE) | |||
| new_patch_size = cfg.MODEL.BACKBONE.STRIDE | |||
| self.cat_mode = cfg.MODEL.BACKBONE.CAT_MODE | |||
| # resize patch embedding | |||
| if new_patch_size != self.patch_size: | |||
| print( | |||
| 'Inconsistent Patch Size With The Pretrained Weights, Interpolate The Weight!' | |||
| ) | |||
| old_patch_embed = {} | |||
| for name, param in self.patch_embed.named_parameters(): | |||
| if 'weight' in name: | |||
| param = nn.functional.interpolate( | |||
| param, | |||
| size=(new_patch_size, new_patch_size), | |||
| mode='bicubic', | |||
| align_corners=False) | |||
| param = nn.Parameter(param) | |||
| old_patch_embed[name] = param | |||
| self.patch_embed = PatchEmbed( | |||
| img_size=self.img_size, | |||
| patch_size=new_patch_size, | |||
| in_chans=3, | |||
| embed_dim=self.embed_dim) | |||
| self.patch_embed.proj.bias = old_patch_embed['proj.bias'] | |||
| self.patch_embed.proj.weight = old_patch_embed['proj.weight'] | |||
| # for patch embedding | |||
| patch_pos_embed = self.pos_embed[:, patch_start_index:, :] | |||
| patch_pos_embed = patch_pos_embed.transpose(1, 2) | |||
| B, E, Q = patch_pos_embed.shape | |||
| P_H, P_W = self.img_size[0] // self.patch_size, self.img_size[ | |||
| 1] // self.patch_size | |||
| patch_pos_embed = patch_pos_embed.view(B, E, P_H, P_W) | |||
| # for search region | |||
| H, W = search_size | |||
| new_P_H, new_P_W = H // new_patch_size, W // new_patch_size | |||
| search_patch_pos_embed = nn.functional.interpolate( | |||
| patch_pos_embed, | |||
| size=(new_P_H, new_P_W), | |||
| mode='bicubic', | |||
| align_corners=False) | |||
| search_patch_pos_embed = search_patch_pos_embed.flatten(2).transpose( | |||
| 1, 2) | |||
| # for template region | |||
| H, W = template_size | |||
| new_P_H, new_P_W = H // new_patch_size, W // new_patch_size | |||
| template_patch_pos_embed = nn.functional.interpolate( | |||
| patch_pos_embed, | |||
| size=(new_P_H, new_P_W), | |||
| mode='bicubic', | |||
| align_corners=False) | |||
| template_patch_pos_embed = template_patch_pos_embed.flatten( | |||
| 2).transpose(1, 2) | |||
| self.pos_embed_z = nn.Parameter(template_patch_pos_embed) | |||
| self.pos_embed_x = nn.Parameter(search_patch_pos_embed) | |||
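The core of finetune_track is resizing the pretrained patch positional embedding to the template and search resolutions. A standalone sketch of that step, with illustrative sizes:

```python
import torch
import torch.nn as nn

# A (1, N, C) patch pos-embed is reshaped to its 2D grid, bicubically resized to the
# new grid, and flattened back to a token sequence (sizes here are illustrative).
pos_embed = torch.randn(1, 14 * 14, 768)                # pretrained 224/16 grid
grid = pos_embed.transpose(1, 2).view(1, 768, 14, 14)   # (1, C, P_H, P_W)
search_grid = nn.functional.interpolate(
    grid, size=(24, 24), mode='bicubic', align_corners=False)  # e.g. a 384/16 search crop
pos_embed_x = search_grid.flatten(2).transpose(1, 2)    # (1, 576, 768)
print(pos_embed_x.shape)
```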
| @@ -0,0 +1,109 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import torch | |||
| from torch import nn | |||
| from modelscope.models.cv.video_single_object_tracking.models.layers.head import \ | |||
| build_box_head | |||
| from .vit_ce import vit_base_patch16_224_ce | |||
| class OSTrack(nn.Module): | |||
| """ This is the base class for OSTrack """ | |||
| def __init__(self, | |||
| transformer, | |||
| box_head, | |||
| aux_loss=False, | |||
| head_type='CORNER'): | |||
| """ Initializes the model. | |||
| Parameters: | |||
| transformer: torch module of the transformer architecture. | |||
| aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. | |||
| """ | |||
| super().__init__() | |||
| self.backbone = transformer | |||
| self.box_head = box_head | |||
| self.aux_loss = aux_loss | |||
| self.head_type = head_type | |||
| if head_type == 'CORNER' or head_type == 'CENTER': | |||
| self.feat_sz_s = int(box_head.feat_sz) | |||
| self.feat_len_s = int(box_head.feat_sz**2) | |||
| def forward( | |||
| self, | |||
| template: torch.Tensor, | |||
| search: torch.Tensor, | |||
| ce_template_mask=None, | |||
| ce_keep_rate=None, | |||
| ): | |||
| x, aux_dict = self.backbone( | |||
| z=template, | |||
| x=search, | |||
| ce_template_mask=ce_template_mask, | |||
| ce_keep_rate=ce_keep_rate, | |||
| ) | |||
| # Forward head | |||
| feat_last = x | |||
| if isinstance(x, list): | |||
| feat_last = x[-1] | |||
| out = self.forward_head(feat_last, None) | |||
| out.update(aux_dict) | |||
| out['backbone_feat'] = x | |||
| return out | |||
| def forward_head(self, cat_feature, gt_score_map=None): | |||
| """ | |||
| cat_feature: output embeddings of the backbone; its shape is (B, HW1+HW2, C) or (B, HW2, C) | |||
| """ | |||
| enc_opt = cat_feature[:, -self.feat_len_s:]  # encoder output for the search region (B, HW, C) | |||
| opt = (enc_opt.unsqueeze(-1)).permute((0, 3, 2, 1)).contiguous() | |||
| bs, Nq, C, HW = opt.size() | |||
| opt_feat = opt.view(-1, C, self.feat_sz_s, self.feat_sz_s) | |||
| if self.head_type == 'CENTER': | |||
| # run the center head | |||
| score_map_ctr, bbox, size_map, offset_map = self.box_head( | |||
| opt_feat, gt_score_map) | |||
| outputs_coord = bbox | |||
| outputs_coord_new = outputs_coord.view(bs, Nq, 4) | |||
| out = { | |||
| 'pred_boxes': outputs_coord_new, | |||
| 'score_map': score_map_ctr, | |||
| 'size_map': size_map, | |||
| 'offset_map': offset_map | |||
| } | |||
| return out | |||
| else: | |||
| raise NotImplementedError | |||
| def build_ostrack(cfg): | |||
| if cfg.MODEL.BACKBONE.TYPE == 'vit_base_patch16_224_ce': | |||
| backbone = vit_base_patch16_224_ce( | |||
| False, | |||
| drop_path_rate=cfg.MODEL.BACKBONE.DROP_PATH_RATE, | |||
| ce_loc=cfg.MODEL.BACKBONE.CE_LOC, | |||
| ce_keep_ratio=cfg.MODEL.BACKBONE.CE_KEEP_RATIO, | |||
| ) | |||
| hidden_dim = backbone.embed_dim | |||
| patch_start_index = 1 | |||
| else: | |||
| raise NotImplementedError | |||
| backbone.finetune_track(cfg=cfg, patch_start_index=patch_start_index) | |||
| box_head = build_box_head(cfg, hidden_dim) | |||
| model = OSTrack( | |||
| backbone, | |||
| box_head, | |||
| aux_loss=False, | |||
| head_type=cfg.MODEL.HEAD.TYPE, | |||
| ) | |||
| return model | |||
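A hedged sketch of building the network from the packaged config and running a dummy forward pass; the crop sizes are read from the config rather than hard-coded, and the weights here are randomly initialized:

```python
import torch

from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
    cfg
from modelscope.models.cv.video_single_object_tracking.models.ostrack.ostrack import \
    build_ostrack

model = build_ostrack(cfg).eval()
template = torch.randn(1, 3, cfg.DATA.TEMPLATE.SIZE, cfg.DATA.TEMPLATE.SIZE)
search = torch.randn(1, 3, cfg.DATA.SEARCH.SIZE, cfg.DATA.SEARCH.SIZE)
with torch.no_grad():
    out = model(template=template, search=search)
print(out['pred_boxes'].shape)  # (1, 1, 4): box in the search crop, (cx, cy, w, h)
```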
| @@ -0,0 +1,24 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import torch | |||
| def combine_tokens(template_tokens, | |||
| search_tokens, | |||
| mode='direct', | |||
| return_res=False): | |||
| if mode == 'direct': | |||
| merged_feature = torch.cat((template_tokens, search_tokens), dim=1) | |||
| else: | |||
| raise NotImplementedError | |||
| return merged_feature | |||
| def recover_tokens(merged_tokens, mode='direct'): | |||
| if mode == 'direct': | |||
| recovered_tokens = merged_tokens | |||
| else: | |||
| raise NotImplementedError | |||
| return recovered_tokens | |||
| @@ -0,0 +1,343 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| from functools import partial | |||
| import torch | |||
| import torch.nn as nn | |||
| from timm.models.layers import DropPath, Mlp, to_2tuple | |||
| from modelscope.models.cv.video_single_object_tracking.models.layers.attn_blocks import \ | |||
| CEBlock | |||
| from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \ | |||
| PatchEmbed | |||
| from .base_backbone import BaseBackbone | |||
| from .utils import combine_tokens, recover_tokens | |||
| class Attention(nn.Module): | |||
| def __init__(self, | |||
| dim, | |||
| num_heads=8, | |||
| qkv_bias=False, | |||
| attn_drop=0., | |||
| proj_drop=0.): | |||
| super().__init__() | |||
| self.num_heads = num_heads | |||
| head_dim = dim // num_heads | |||
| self.scale = head_dim**-0.5 | |||
| self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) | |||
| self.attn_drop = nn.Dropout(attn_drop) | |||
| self.proj = nn.Linear(dim, dim) | |||
| self.proj_drop = nn.Dropout(proj_drop) | |||
| class Block(nn.Module): | |||
| def __init__(self, | |||
| dim, | |||
| num_heads, | |||
| mlp_ratio=4., | |||
| qkv_bias=False, | |||
| drop=0., | |||
| attn_drop=0., | |||
| drop_path=0., | |||
| act_layer=nn.GELU, | |||
| norm_layer=nn.LayerNorm): | |||
| super().__init__() | |||
| self.norm1 = norm_layer(dim) | |||
| self.attn = Attention( | |||
| dim, | |||
| num_heads=num_heads, | |||
| qkv_bias=qkv_bias, | |||
| attn_drop=attn_drop, | |||
| proj_drop=drop) | |||
| # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here | |||
| self.drop_path = DropPath( | |||
| drop_path) if drop_path > 0. else nn.Identity() | |||
| self.norm2 = norm_layer(dim) | |||
| mlp_hidden_dim = int(dim * mlp_ratio) | |||
| self.mlp = Mlp( | |||
| in_features=dim, | |||
| hidden_features=mlp_hidden_dim, | |||
| act_layer=act_layer, | |||
| drop=drop) | |||
| class VisionTransformer(BaseBackbone): | |||
| """ Vision Transformer | |||
| A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` | |||
| - https://arxiv.org/abs/2010.11929 | |||
| Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` | |||
| - https://arxiv.org/abs/2012.12877 | |||
| """ | |||
| def __init__(self, | |||
| img_size=224, | |||
| patch_size=16, | |||
| in_chans=3, | |||
| num_classes=1000, | |||
| embed_dim=768, | |||
| depth=12, | |||
| num_heads=12, | |||
| mlp_ratio=4., | |||
| qkv_bias=True, | |||
| distilled=False, | |||
| drop_rate=0., | |||
| attn_drop_rate=0., | |||
| drop_path_rate=0., | |||
| embed_layer=PatchEmbed, | |||
| norm_layer=None, | |||
| act_layer=None): | |||
| """ | |||
| Args: | |||
| img_size (int, tuple): input image size | |||
| patch_size (int, tuple): patch size | |||
| in_chans (int): number of input channels | |||
| num_classes (int): number of classes for classification head | |||
| embed_dim (int): embedding dimension | |||
| depth (int): depth of transformer | |||
| num_heads (int): number of attention heads | |||
| mlp_ratio (int): ratio of mlp hidden dim to embedding dim | |||
| qkv_bias (bool): enable bias for qkv if True | |||
| distilled (bool): model includes a distillation token and head as in DeiT models | |||
| drop_rate (float): dropout rate | |||
| attn_drop_rate (float): attention dropout rate | |||
| drop_path_rate (float): stochastic depth rate | |||
| embed_layer (nn.Module): patch embedding layer | |||
| norm_layer: (nn.Module): normalization layer | |||
| """ | |||
| super().__init__() | |||
| self.num_classes = num_classes | |||
| self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models | |||
| self.num_tokens = 2 if distilled else 1 | |||
| norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) | |||
| act_layer = act_layer or nn.GELU | |||
| self.patch_embed = embed_layer( | |||
| img_size=img_size, | |||
| patch_size=patch_size, | |||
| in_chans=in_chans, | |||
| embed_dim=embed_dim) | |||
| num_patches = self.patch_embed.num_patches | |||
| self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
| self.dist_token = None | |||
| self.pos_embed = nn.Parameter( | |||
| torch.zeros(1, num_patches + self.num_tokens, embed_dim)) | |||
| self.pos_drop = nn.Dropout(p=drop_rate) | |||
| dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) | |||
| ] # stochastic depth decay rule | |||
| self.blocks = nn.Sequential(*[ | |||
| Block( | |||
| dim=embed_dim, | |||
| num_heads=num_heads, | |||
| mlp_ratio=mlp_ratio, | |||
| qkv_bias=qkv_bias, | |||
| drop=drop_rate, | |||
| attn_drop=attn_drop_rate, | |||
| drop_path=dpr[i], | |||
| norm_layer=norm_layer, | |||
| act_layer=act_layer) for i in range(depth) | |||
| ]) | |||
| self.norm = norm_layer(embed_dim) | |||
| class VisionTransformerCE(VisionTransformer): | |||
| """ Vision Transformer with candidate elimination (CE) module | |||
| A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` | |||
| - https://arxiv.org/abs/2010.11929 | |||
| Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` | |||
| - https://arxiv.org/abs/2012.12877 | |||
| """ | |||
| def __init__(self, | |||
| img_size=224, | |||
| patch_size=16, | |||
| in_chans=3, | |||
| num_classes=1000, | |||
| embed_dim=768, | |||
| depth=12, | |||
| num_heads=12, | |||
| mlp_ratio=4., | |||
| qkv_bias=True, | |||
| distilled=False, | |||
| drop_rate=0., | |||
| attn_drop_rate=0., | |||
| drop_path_rate=0., | |||
| embed_layer=PatchEmbed, | |||
| norm_layer=None, | |||
| act_layer=None, | |||
| ce_loc=None, | |||
| ce_keep_ratio=None): | |||
| """ | |||
| Args: | |||
| img_size (int, tuple): input image size | |||
| patch_size (int, tuple): patch size | |||
| in_chans (int): number of input channels | |||
| num_classes (int): number of classes for classification head | |||
| embed_dim (int): embedding dimension | |||
| depth (int): depth of transformer | |||
| num_heads (int): number of attention heads | |||
| mlp_ratio (int): ratio of mlp hidden dim to embedding dim | |||
| qkv_bias (bool): enable bias for qkv if True | |||
| distilled (bool): model includes a distillation token and head as in DeiT models | |||
| drop_rate (float): dropout rate | |||
| attn_drop_rate (float): attention dropout rate | |||
| drop_path_rate (float): stochastic depth rate | |||
| embed_layer (nn.Module): patch embedding layer | |||
| norm_layer: (nn.Module): normalization layer | |||
| """ | |||
| super().__init__() | |||
| if isinstance(img_size, tuple): | |||
| self.img_size = img_size | |||
| else: | |||
| self.img_size = to_2tuple(img_size) | |||
| self.patch_size = patch_size | |||
| self.in_chans = in_chans | |||
| self.num_classes = num_classes | |||
| self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models | |||
| self.num_tokens = 2 if distilled else 1 | |||
| norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) | |||
| act_layer = act_layer or nn.GELU | |||
| self.patch_embed = embed_layer( | |||
| img_size=img_size, | |||
| patch_size=patch_size, | |||
| in_chans=in_chans, | |||
| embed_dim=embed_dim) | |||
| num_patches = self.patch_embed.num_patches | |||
| self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
| self.dist_token = nn.Parameter(torch.zeros( | |||
| 1, 1, embed_dim)) if distilled else None | |||
| self.pos_embed = nn.Parameter( | |||
| torch.zeros(1, num_patches + self.num_tokens, embed_dim)) | |||
| self.pos_drop = nn.Dropout(p=drop_rate) | |||
| dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) | |||
| ] # stochastic depth decay rule | |||
| blocks = [] | |||
| ce_index = 0 | |||
| self.ce_loc = ce_loc | |||
| for i in range(depth): | |||
| ce_keep_ratio_i = 1.0 | |||
| if ce_loc is not None and i in ce_loc: | |||
| ce_keep_ratio_i = ce_keep_ratio[ce_index] | |||
| ce_index += 1 | |||
| blocks.append( | |||
| CEBlock( | |||
| dim=embed_dim, | |||
| num_heads=num_heads, | |||
| mlp_ratio=mlp_ratio, | |||
| qkv_bias=qkv_bias, | |||
| drop=drop_rate, | |||
| attn_drop=attn_drop_rate, | |||
| drop_path=dpr[i], | |||
| norm_layer=norm_layer, | |||
| act_layer=act_layer, | |||
| keep_ratio_search=ce_keep_ratio_i)) | |||
| self.blocks = nn.Sequential(*blocks) | |||
| self.norm = norm_layer(embed_dim) | |||
| def forward_features( | |||
| self, | |||
| z, | |||
| x, | |||
| mask_x=None, | |||
| ce_template_mask=None, | |||
| ce_keep_rate=None, | |||
| ): | |||
| B = x.shape[0] | |||
| x = self.patch_embed(x) | |||
| z = self.patch_embed(z) | |||
| z += self.pos_embed_z | |||
| x += self.pos_embed_x | |||
| x = combine_tokens(z, x, mode=self.cat_mode) | |||
| x = self.pos_drop(x) | |||
| lens_z = self.pos_embed_z.shape[1] | |||
| lens_x = self.pos_embed_x.shape[1] | |||
| global_index_t = torch.linspace(0, lens_z - 1, lens_z).to(x.device) | |||
| global_index_t = global_index_t.repeat(B, 1) | |||
| global_index_s = torch.linspace(0, lens_x - 1, lens_x).to(x.device) | |||
| global_index_s = global_index_s.repeat(B, 1) | |||
| removed_indexes_s = [] | |||
| for i, blk in enumerate(self.blocks): | |||
| x, global_index_t, global_index_s, removed_index_s, attn = \ | |||
| blk(x, global_index_t, global_index_s, mask_x, ce_template_mask, ce_keep_rate) | |||
| if self.ce_loc is not None and i in self.ce_loc: | |||
| removed_indexes_s.append(removed_index_s) | |||
| x = self.norm(x) | |||
| lens_x_new = global_index_s.shape[1] | |||
| lens_z_new = global_index_t.shape[1] | |||
| z = x[:, :lens_z_new] | |||
| x = x[:, lens_z_new:] | |||
| if removed_indexes_s and removed_indexes_s[0] is not None: | |||
| removed_indexes_cat = torch.cat(removed_indexes_s, dim=1) | |||
| pruned_lens_x = lens_x - lens_x_new | |||
| pad_x = torch.zeros([B, pruned_lens_x, x.shape[2]], | |||
| device=x.device) | |||
| x = torch.cat([x, pad_x], dim=1) | |||
| index_all = torch.cat([global_index_s, removed_indexes_cat], dim=1) | |||
| # recover original token order | |||
| C = x.shape[-1] | |||
| x = torch.zeros_like(x).scatter_( | |||
| dim=1, | |||
| index=index_all.unsqueeze(-1).expand(B, -1, C).to(torch.int64), | |||
| src=x) | |||
| x = recover_tokens(x, mode=self.cat_mode) | |||
| # re-concatenate with the template, which may be further used by other modules | |||
| x = torch.cat([z, x], dim=1) | |||
| aux_dict = { | |||
| 'attn': attn, | |||
| 'removed_indexes_s': removed_indexes_s, # used for visualization | |||
| } | |||
| return x, aux_dict | |||
| def forward(self, z, x, ce_template_mask=None, ce_keep_rate=None): | |||
| x, aux_dict = self.forward_features( | |||
| z, | |||
| x, | |||
| ce_template_mask=ce_template_mask, | |||
| ce_keep_rate=ce_keep_rate, | |||
| ) | |||
| return x, aux_dict | |||
| def _create_vision_transformer(pretrained=False, **kwargs): | |||
| model = VisionTransformerCE(**kwargs) | |||
| return model | |||
| def vit_base_patch16_224_ce(pretrained=False, **kwargs): | |||
| """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). | |||
| """ | |||
| model_kwargs = dict( | |||
| patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) | |||
| model = _create_vision_transformer(pretrained=pretrained, **model_kwargs) | |||
| return model | |||
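Note that vit_base_patch16_224_ce only constructs the backbone: pos_embed_z and pos_embed_x stay None until finetune_track resizes the positional embeddings, so finetune_track(cfg) must be called before any forward pass (as build_ostrack does). A short sketch; the ce_loc / ce_keep_ratio values are illustrative, and the vit_ce module path is inferred from the relative import in this file:

```python
from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
    cfg
from modelscope.models.cv.video_single_object_tracking.models.ostrack.vit_ce import \
    vit_base_patch16_224_ce

# Illustrative choice: prune search tokens at blocks 3, 6 and 9, keeping 70% each time.
backbone = vit_base_patch16_224_ce(ce_loc=[3, 6, 9], ce_keep_ratio=[0.7, 0.7, 0.7])
backbone.finetune_track(cfg=cfg, patch_start_index=1)  # sets pos_embed_z / pos_embed_x
```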
| @@ -0,0 +1,139 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import torch | |||
| from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ | |||
| cfg | |||
| from modelscope.models.cv.video_single_object_tracking.models.ostrack.ostrack import \ | |||
| build_ostrack | |||
| from modelscope.models.cv.video_single_object_tracking.utils.utils import ( | |||
| Preprocessor, clip_box, generate_mask_cond, hann2d, sample_target, | |||
| transform_image_to_crop) | |||
| class OSTrack(): | |||
| def __init__(self, ckpt_path, device): | |||
| network = build_ostrack(cfg) | |||
| network.load_state_dict( | |||
| torch.load(ckpt_path, map_location='cpu')['net'], strict=True) | |||
| self.cfg = cfg | |||
| if device.type == 'cuda': | |||
| self.network = network.to(device) | |||
| else: | |||
| self.network = network | |||
| self.network.eval() | |||
| self.preprocessor = Preprocessor(device) | |||
| self.state = None | |||
| self.feat_sz = self.cfg.TEST.SEARCH_SIZE // self.cfg.MODEL.BACKBONE.STRIDE | |||
| # motion constrain | |||
| if device.type == 'cuda': | |||
| self.output_window = hann2d( | |||
| torch.tensor([self.feat_sz, self.feat_sz]).long(), | |||
| centered=True).to(device) | |||
| else: | |||
| self.output_window = hann2d( | |||
| torch.tensor([self.feat_sz, self.feat_sz]).long(), | |||
| centered=True) | |||
| self.frame_id = 0 | |||
| # for save boxes from all queries | |||
| self.z_dict1 = {} | |||
| def initialize(self, image, info: dict): | |||
| # forward the template once | |||
| z_patch_arr, resize_factor, z_amask_arr = sample_target( | |||
| image, | |||
| info['init_bbox'], | |||
| self.cfg.TEST.TEMPLATE_FACTOR, | |||
| output_sz=self.cfg.TEST.TEMPLATE_SIZE) | |||
| self.z_patch_arr = z_patch_arr | |||
| template = self.preprocessor.process(z_patch_arr, z_amask_arr) | |||
| with torch.no_grad(): | |||
| self.z_dict1 = template | |||
| self.box_mask_z = None | |||
| if self.cfg.MODEL.BACKBONE.CE_LOC: | |||
| template_bbox = self.transform_bbox_to_crop( | |||
| info['init_bbox'], resize_factor, | |||
| template.tensors.device).squeeze(1) | |||
| self.box_mask_z = generate_mask_cond(self.cfg, 1, | |||
| template.tensors.device, | |||
| template_bbox) | |||
| # save states | |||
| self.state = info['init_bbox'] | |||
| self.frame_id = 0 | |||
| def track(self, image, info: dict = None): | |||
| H, W, _ = image.shape | |||
| self.frame_id += 1 | |||
| x_patch_arr, resize_factor, x_amask_arr = sample_target( | |||
| image, | |||
| self.state, | |||
| self.cfg.TEST.SEARCH_FACTOR, | |||
| output_sz=self.cfg.TEST.SEARCH_SIZE) # (x1, y1, w, h) | |||
| search = self.preprocessor.process(x_patch_arr, x_amask_arr) | |||
| with torch.no_grad(): | |||
| x_dict = search | |||
| # merge the template and the search | |||
| # run the transformer | |||
| out_dict = self.network.forward( | |||
| template=self.z_dict1.tensors, | |||
| search=x_dict.tensors, | |||
| ce_template_mask=self.box_mask_z) | |||
| # add hann windows | |||
| pred_score_map = out_dict['score_map'] | |||
| response = self.output_window * pred_score_map | |||
| pred_boxes = self.network.box_head.cal_bbox(response, | |||
| out_dict['size_map'], | |||
| out_dict['offset_map']) | |||
| pred_boxes = pred_boxes.view(-1, 4) | |||
| # Baseline: Take the mean of all pred boxes as the final result | |||
| pred_box = (pred_boxes.mean(dim=0) * self.cfg.TEST.SEARCH_SIZE | |||
| / resize_factor).tolist() # (cx, cy, w, h) [0,1] | |||
| # get the final box result | |||
| self.state = clip_box( | |||
| self.map_box_back(pred_box, resize_factor), H, W, margin=10) | |||
| x1, y1, w, h = self.state | |||
| x2 = x1 + w | |||
| y2 = y1 + h | |||
| return {'target_bbox': [x1, y1, x2, y2]} | |||
| def map_box_back(self, pred_box: list, resize_factor: float): | |||
| cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[ | |||
| 1] + 0.5 * self.state[3] | |||
| cx, cy, w, h = pred_box | |||
| half_side = 0.5 * self.cfg.TEST.SEARCH_SIZE / resize_factor | |||
| cx_real = cx + (cx_prev - half_side) | |||
| cy_real = cy + (cy_prev - half_side) | |||
| return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h] | |||
| def transform_bbox_to_crop(self, | |||
| box_in, | |||
| resize_factor, | |||
| device, | |||
| box_extract=None, | |||
| crop_type='template'): | |||
| if crop_type == 'template': | |||
| crop_sz = torch.Tensor( | |||
| [self.cfg.TEST.TEMPLATE_SIZE, self.cfg.TEST.TEMPLATE_SIZE]) | |||
| elif crop_type == 'search': | |||
| crop_sz = torch.Tensor( | |||
| [self.cfg.TEST.SEARCH_SIZE, self.cfg.TEST.SEARCH_SIZE]) | |||
| else: | |||
| raise NotImplementedError | |||
| box_in = torch.tensor(box_in) | |||
| if box_extract is None: | |||
| box_extract = box_in | |||
| else: | |||
| box_extract = torch.tensor(box_extract) | |||
| template_bbox = transform_image_to_crop( | |||
| box_in, box_extract, resize_factor, crop_sz, normalize=True) | |||
| template_bbox = template_bbox.view(1, 1, 4).to(device) | |||
| return template_bbox | |||
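A hedged end-to-end usage sketch of the tracker class defined above; the checkpoint and video paths are placeholders, and the class is assumed to be importable from wherever this file lives in the package:

```python
import cv2
import torch

# 'OSTrack' is the tracker class defined above; both paths are placeholders.
tracker = OSTrack(ckpt_path='path/to/ostrack_checkpoint.pth',
                  device=torch.device('cpu'))

cap = cv2.VideoCapture('path/to/video.mp4')
ok, frame = cap.read()                                        # H x W x 3 numpy array
tracker.initialize(frame, {'init_bbox': [100, 150, 80, 60]})  # (x, y, w, h) in frame 1

boxes = []
while True:
    ok, frame = cap.read()
    if not ok:
        break
    boxes.append(tracker.track(frame)['target_bbox'])         # [x1, y1, x2, y2]
cap.release()
```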
| @@ -0,0 +1,261 @@ | |||
| # The implementation is also open-sourced by the authors as OSTrack, and is available publicly on | |||
| # https://github.com/botaoye/OSTrack/ | |||
| import math | |||
| from typing import Optional | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from torch import Tensor | |||
| def hann1d(sz: int, centered=True) -> torch.Tensor: | |||
| """1D cosine window.""" | |||
| if centered: | |||
| return 0.5 * (1 - torch.cos( | |||
| (2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float())) | |||
| w = 0.5 * (1 + torch.cos( | |||
| (2 * math.pi / (sz + 2)) * torch.arange(0, sz // 2 + 1).float())) | |||
| return torch.cat([w, w[1:sz - sz // 2].flip((0, ))]) | |||
| def hann2d(sz: torch.Tensor, centered=True) -> torch.Tensor: | |||
| """2D cosine window.""" | |||
| return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d( | |||
| sz[1].item(), centered).reshape(1, 1, 1, -1) | |||
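hann2d builds the penalty window that the tracker multiplies onto the center score map. A quick check of its output shape:

```python
import torch

from modelscope.models.cv.video_single_object_tracking.utils.utils import hann2d

win = hann2d(torch.tensor([16, 16]).long(), centered=True)
print(win.shape)         # (1, 1, 16, 16), broadcastable over a (B, 1, H, W) score map
print(float(win.max()))  # peaks near 1.0 at the window center
```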
| class NestedTensor(object): | |||
| def __init__(self, tensors, mask: Optional[Tensor]): | |||
| self.tensors = tensors | |||
| self.mask = mask | |||
| class Preprocessor(object): | |||
| def __init__(self, device: torch.device): | |||
| self.device = device | |||
| self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)) | |||
| self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)) | |||
| if 'cuda' == self.device.type: | |||
| self.mean = self.mean.to(self.device) | |||
| self.std = self.std.to(self.device) | |||
| def process(self, img_arr: np.ndarray, amask_arr: np.ndarray): | |||
| # Deal with the image patch | |||
| if 'cuda' == self.device.type: | |||
| img_tensor = torch.tensor(img_arr).to(self.device).float().permute( | |||
| (2, 0, 1)).unsqueeze(dim=0) | |||
| else: | |||
| img_tensor = torch.tensor(img_arr).float().permute( | |||
| (2, 0, 1)).unsqueeze(dim=0) | |||
| img_tensor_norm = ( | |||
| (img_tensor / 255.0) - self.mean) / self.std # (1,3,H,W) | |||
| # Deal with the attention mask | |||
| if 'cuda' == self.device.type: | |||
| amask_tensor = torch.from_numpy(amask_arr).to(torch.bool).to( | |||
| self.device).unsqueeze(dim=0) # (1,H,W) | |||
| else: | |||
| amask_tensor = torch.from_numpy(amask_arr).to( | |||
| torch.bool).unsqueeze(dim=0) # (1,H,W) | |||
| return NestedTensor(img_tensor_norm, amask_tensor) | |||
| def clip_box(box: list, H, W, margin=0): | |||
| x1, y1, w, h = box | |||
| x2, y2 = x1 + w, y1 + h | |||
| x1 = min(max(0, x1), W - margin) | |||
| x2 = min(max(margin, x2), W) | |||
| y1 = min(max(0, y1), H - margin) | |||
| y2 = min(max(margin, y2), H) | |||
| w = max(margin, x2 - x1) | |||
| h = max(margin, y2 - y1) | |||
| if isinstance(x1, torch.Tensor): | |||
| x1 = x1.item() | |||
| y1 = y1.item() | |||
| w = w.item() | |||
| h = h.item() | |||
| return [x1, y1, w, h] | |||
| def generate_mask_cond(cfg, bs, device, gt_bbox): | |||
| template_size = cfg.DATA.TEMPLATE.SIZE | |||
| stride = cfg.MODEL.BACKBONE.STRIDE | |||
| template_feat_size = template_size // stride | |||
| if cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE == 'CTR_POINT': | |||
| if template_feat_size == 8: | |||
| index = slice(3, 4) | |||
| elif template_feat_size == 12: | |||
| index = slice(5, 6) | |||
| elif template_feat_size == 7: | |||
| index = slice(3, 4) | |||
| elif template_feat_size == 14: | |||
| index = slice(6, 7) | |||
| else: | |||
| raise NotImplementedError | |||
| box_mask_z = torch.zeros([bs, template_feat_size, template_feat_size], | |||
| device=device) | |||
| box_mask_z[:, index, index] = 1 | |||
| box_mask_z = box_mask_z.flatten(1).to(torch.bool) | |||
| else: | |||
| raise NotImplementedError | |||
| return box_mask_z | |||
| def sample_target(im, | |||
| target_bb, | |||
| search_area_factor, | |||
| output_sz=None, | |||
| mask=None): | |||
| """ Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area | |||
| args: | |||
| im - cv image | |||
| target_bb - target box [x, y, w, h] | |||
| search_area_factor - Ratio of crop size to target size | |||
| output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done. | |||
| returns: | |||
| cv image - extracted crop | |||
| float - the factor by which the crop has been resized to make the crop size equal to output_sz | |||
| """ | |||
| if not isinstance(target_bb, list): | |||
| x, y, w, h = target_bb.tolist() | |||
| else: | |||
| x, y, w, h = target_bb | |||
| # Crop image | |||
| crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor) | |||
| if crop_sz < 1: | |||
| raise Exception('Too small bounding box.') | |||
| x1 = round(x + 0.5 * w - crop_sz * 0.5) | |||
| x2 = x1 + crop_sz | |||
| y1 = round(y + 0.5 * h - crop_sz * 0.5) | |||
| y2 = y1 + crop_sz | |||
| x1_pad = max(0, -x1) | |||
| x2_pad = max(x2 - im.shape[1] + 1, 0) | |||
| y1_pad = max(0, -y1) | |||
| y2_pad = max(y2 - im.shape[0] + 1, 0) | |||
| # Crop target | |||
| im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] | |||
| if mask is not None: | |||
| mask_crop = mask[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad] | |||
| # Pad | |||
| im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, | |||
| x2_pad, cv2.BORDER_CONSTANT) | |||
| # deal with attention mask | |||
| H, W, _ = im_crop_padded.shape | |||
| att_mask = np.ones((H, W)) | |||
| end_x, end_y = -x2_pad, -y2_pad | |||
| if y2_pad == 0: | |||
| end_y = None | |||
| if x2_pad == 0: | |||
| end_x = None | |||
| att_mask[y1_pad:end_y, x1_pad:end_x] = 0 | |||
| if mask is not None: | |||
| mask_crop_padded = F.pad( | |||
| mask_crop, | |||
| pad=(x1_pad, x2_pad, y1_pad, y2_pad), | |||
| mode='constant', | |||
| value=0) | |||
| if output_sz is not None: | |||
| resize_factor = output_sz / crop_sz | |||
| im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz)) | |||
| att_mask = cv2.resize(att_mask, | |||
| (output_sz, output_sz)).astype(np.bool_) | |||
| if mask is None: | |||
| return im_crop_padded, resize_factor, att_mask | |||
| mask_crop_padded = \ | |||
| F.interpolate(mask_crop_padded[None, None], (output_sz, output_sz), | |||
| mode='bilinear', align_corners=False)[0, 0] | |||
| return im_crop_padded, resize_factor, att_mask, mask_crop_padded | |||
| else: | |||
| if mask is None: | |||
| return im_crop_padded, att_mask.astype(np.bool_), 1.0 | |||
| return im_crop_padded, 1.0, att_mask.astype(np.bool_), mask_crop_padded | |||
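An illustrative call to sample_target on a dummy frame: a 4x-area square crop around a 50x80 box, resized to 256x256 (the numbers are chosen only for the example):

```python
import numpy as np

from modelscope.models.cv.video_single_object_tracking.utils.utils import \
    sample_target

im = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy H x W x 3 frame
crop, resize_factor, att_mask = sample_target(
    im, [300, 200, 50, 80], search_area_factor=4.0, output_sz=256)
print(crop.shape, att_mask.shape, round(resize_factor, 3))  # (256, 256, 3) (256, 256) ~1.012
```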
| def transform_image_to_crop(box_in: torch.Tensor, | |||
| box_extract: torch.Tensor, | |||
| resize_factor: float, | |||
| crop_sz: torch.Tensor, | |||
| normalize=False) -> torch.Tensor: | |||
| """ Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image | |||
| args: | |||
| box_in - the box for which the co-ordinates are to be transformed | |||
| box_extract - the box about which the image crop has been extracted. | |||
| resize_factor - the ratio between the original image scale and the scale of the image crop | |||
| crop_sz - size of the cropped image | |||
| returns: | |||
| torch.Tensor - transformed co-ordinates of box_in | |||
| """ | |||
| box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4] | |||
| box_in_center = box_in[0:2] + 0.5 * box_in[2:4] | |||
| box_out_center = (crop_sz - 1) / 2 + (box_in_center | |||
| - box_extract_center) * resize_factor | |||
| box_out_wh = box_in[2:4] * resize_factor | |||
| box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh)) | |||
| if normalize: | |||
| return box_out / crop_sz[0] | |||
| else: | |||
| return box_out | |||
| def check_box(box: list, image_height, image_width) -> bool: | |||
| """ To check whether the box is within the image range or not | |||
| args: | |||
| box - the bounding box in the form of [x1, y1, x2, y2] | |||
| image_height - the height of the image | |||
| image_width - the width of the image | |||
| returns: | |||
| bool - if box is valid, return True. Otherwise, return False | |||
| """ | |||
| assert len(box) == 4, 'box must be in the form of: [x1, y1, x2, y2]' | |||
| if box[0] < 0 or box[0] >= image_width: | |||
| return False | |||
| if box[2] < 0 or box[2] >= image_width: | |||
| return False | |||
| if box[1] < 0 or box[1] >= image_height: | |||
| return False | |||
| if box[3] < 0 or box[3] >= image_height: | |||
| return False | |||
| return True | |||
| def show_tracking_result(video_in_path, bboxes, video_save_path): | |||
| cap = cv2.VideoCapture(video_in_path) | |||
| for i in range(len(bboxes)): | |||
| box = bboxes[i] | |||
| success, frame = cap.read() | |||
| if success is False: | |||
| raise Exception(video_in_path + ' cannot be correctly decoded by OpenCV.') | |||
| if i == 0: | |||
| size = (frame.shape[1], frame.shape[0]) | |||
| fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') | |||
| video_writer = cv2.VideoWriter(video_save_path, fourcc, | |||
| cap.get(cv2.CAP_PROP_FPS), size, | |||
| True) | |||
| cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), | |||
| 5) | |||
| video_writer.write(frame) | |||
| video_writer.release() | |||
| cap.release() | |||
| @@ -9,9 +9,10 @@ if TYPE_CHECKING: | |||
| from .gemm import GEMMForMultiModalEmbedding | |||
| from .diffusion import DiffusionForTextToImageSynthesis | |||
| from .mmr import VideoCLIPForMultiModalEmbedding | |||
| from .mplug_for_visual_question_answering import \ | |||
| MPlugForVisualQuestionAnswering | |||
| from .mplug_for_all_tasks import MPlugForAllTasks | |||
| from .ofa_for_all_tasks import OfaForAllTasks | |||
| from .ofa_for_text_to_image_synthesis_model import \ | |||
| OfaForTextToImageSynthesis | |||
| else: | |||
| _import_structure = { | |||
| @@ -19,8 +20,7 @@ else: | |||
| 'diffusion': ['DiffusionForTextToImageSynthesis'], | |||
| 'gemm': ['GEMMForMultiModalEmbedding'], | |||
| 'mmr': ['VideoCLIPForMultiModalEmbedding'], | |||
| 'mplug_for_visual_question_answering': | |||
| ['MPlugForVisualQuestionAnswering'], | |||
| 'mplug_for_all_tasks': ['MPlugForAllTasks'], | |||
| 'ofa_for_all_tasks': ['OfaForAllTasks'], | |||
| 'ofa_for_text_to_image_synthesis_model': | |||
| ['OfaForTextToImageSynthesis'] | |||
| @@ -1 +1 @@ | |||
| from .clip_model import CLIPForMultiModalEmbedding | |||
| from .model import CLIPForMultiModalEmbedding | |||
| @@ -0,0 +1,422 @@ | |||
| # Copyright 2018 The Google AI Language Team Authors. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Tokenization classes.""" | |||
| from __future__ import absolute_import, division, print_function | |||
| import collections | |||
| import os | |||
| import re | |||
| import unicodedata | |||
| import six | |||
| def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): | |||
| """Checks whether the casing config is consistent with the checkpoint name.""" | |||
| # The casing has to be passed in by the user and there is no explicit check | |||
| # as to whether it matches the checkpoint. The casing information probably | |||
| # should have been stored in the bert_config.json file, but it's not, so | |||
| # we have to heuristically detect it to validate. | |||
| if not init_checkpoint: | |||
| return | |||
| m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint) | |||
| if m is None: | |||
| return | |||
| model_name = m.group(1) | |||
| lower_models = [ | |||
| 'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12', | |||
| 'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12' | |||
| ] | |||
| cased_models = [ | |||
| 'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16', | |||
| 'multi_cased_L-12_H-768_A-12' | |||
| ] | |||
| is_bad_config = False | |||
| if model_name in lower_models and not do_lower_case: | |||
| is_bad_config = True | |||
| actual_flag = 'False' | |||
| case_name = 'lowercased' | |||
| opposite_flag = 'True' | |||
| if model_name in cased_models and do_lower_case: | |||
| is_bad_config = True | |||
| actual_flag = 'True' | |||
| case_name = 'cased' | |||
| opposite_flag = 'False' | |||
| if is_bad_config: | |||
| raise ValueError( | |||
| 'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. ' | |||
| 'However, `%s` seems to be a %s model, so you ' | |||
| 'should pass in `--do_lower_case=%s` so that the fine-tuning matches ' | |||
| 'how the model was pre-training. If this error is wrong, please ' | |||
| 'just comment out this check.' % | |||
| (actual_flag, init_checkpoint, model_name, case_name, | |||
| opposite_flag)) | |||
| def convert_to_unicode(text): | |||
| """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" | |||
| if six.PY3: | |||
| if isinstance(text, str): | |||
| return text | |||
| elif isinstance(text, bytes): | |||
| return text.decode('utf-8', 'ignore') | |||
| else: | |||
| raise ValueError('Unsupported string type: %s' % (type(text))) | |||
| elif six.PY2: | |||
| if isinstance(text, str): | |||
| return text.decode('utf-8', 'ignore') | |||
| elif isinstance(text, unicode): | |||
| return text | |||
| else: | |||
| raise ValueError('Unsupported string type: %s' % (type(text))) | |||
| else: | |||
| raise ValueError('Not running on Python2 or Python 3?') | |||
| def printable_text(text): | |||
| """Returns text encoded in a way suitable for print or `tf.logging`.""" | |||
| # These functions want `str` for both Python2 and Python3, but in one case | |||
| # it's a Unicode string and in the other it's a byte string. | |||
| if six.PY3: | |||
| if isinstance(text, str): | |||
| return text | |||
| elif isinstance(text, bytes): | |||
| return text.decode('utf-8', 'ignore') | |||
| else: | |||
| raise ValueError('Unsupported string type: %s' % (type(text))) | |||
| elif six.PY2: | |||
| if isinstance(text, str): | |||
| return text | |||
| elif isinstance(text, unicode): | |||
| return text.encode('utf-8') | |||
| else: | |||
| raise ValueError('Unsupported string type: %s' % (type(text))) | |||
| else: | |||
| raise ValueError('Not running on Python2 or Python 3?') | |||
| def load_vocab(vocab_file): | |||
| """Loads a vocabulary file into a dictionary.""" | |||
| vocab = collections.OrderedDict() | |||
| index = 0 | |||
| with open(vocab_file, 'r') as reader: | |||
| while True: | |||
| token = convert_to_unicode(reader.readline()) | |||
| if not token: | |||
| break | |||
| token = token.strip() | |||
| vocab[token] = index | |||
| index += 1 | |||
| return vocab | |||
| def convert_by_vocab(vocab, items): | |||
| """Converts a sequence of [tokens|ids] using the vocab.""" | |||
| output = [] | |||
| for item in items: | |||
| output.append(vocab[item]) | |||
| return output | |||
| def convert_tokens_to_ids(vocab, tokens): | |||
| return convert_by_vocab(vocab, tokens) | |||
| def convert_ids_to_tokens(inv_vocab, ids): | |||
| return convert_by_vocab(inv_vocab, ids) | |||
| def whitespace_tokenize(text): | |||
| """Runs basic whitespace cleaning and splitting on a piece of text.""" | |||
| text = text.strip() | |||
| if not text: | |||
| return [] | |||
| tokens = text.split() | |||
| return tokens | |||
| class FullTokenizer(object): | |||
| """Runs end-to-end tokenziation.""" | |||
| def __init__(self, vocab_file, do_lower_case=True): | |||
| self.vocab = load_vocab(vocab_file) | |||
| self.inv_vocab = {v: k for k, v in self.vocab.items()} | |||
| self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) | |||
| self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) | |||
| def tokenize(self, text): | |||
| split_tokens = [] | |||
| for token in self.basic_tokenizer.tokenize(text): | |||
| for sub_token in self.wordpiece_tokenizer.tokenize(token): | |||
| split_tokens.append(sub_token) | |||
| return split_tokens | |||
| def convert_tokens_to_ids(self, tokens): | |||
| return convert_by_vocab(self.vocab, tokens) | |||
| def convert_ids_to_tokens(self, ids): | |||
| return convert_by_vocab(self.inv_vocab, ids) | |||
| @staticmethod | |||
| def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True): | |||
| """ Converts a sequence of tokens (string) in a single string. """ | |||
| def clean_up_tokenization(out_string): | |||
| """ Clean up a list of simple English tokenization artifacts | |||
| like spaces before punctuation and abbreviated forms. | |||
| """ | |||
| out_string = ( | |||
| out_string.replace(' .', '.').replace(' ?', '?').replace( | |||
| ' !', '!').replace(' ,', ',').replace(" ' ", "'").replace( | |||
| " n't", "n't").replace(" 'm", "'m").replace( | |||
| " 's", "'s").replace(" 've", | |||
| "'ve").replace(" 're", "'re")) | |||
| return out_string | |||
| text = ' '.join(tokens).replace(' ##', '').strip() | |||
| if clean_up_tokenization_spaces: | |||
| clean_text = clean_up_tokenization(text) | |||
| return clean_text | |||
| else: | |||
| return text | |||
| def vocab_size(self): | |||
| return len(self.vocab) | |||
| class BasicTokenizer(object): | |||
| """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" | |||
| def __init__(self, do_lower_case=True): | |||
| """Constructs a BasicTokenizer. | |||
| Args: | |||
| do_lower_case: Whether to lower case the input. | |||
| """ | |||
| self.do_lower_case = do_lower_case | |||
| def tokenize(self, text): | |||
| """Tokenizes a piece of text.""" | |||
| text = convert_to_unicode(text) | |||
| text = self._clean_text(text) | |||
| # This was added on November 1st, 2018 for the multilingual and Chinese | |||
| # models. This is also applied to the English models now, but it doesn't | |||
| # matter since the English models were not trained on any Chinese data | |||
| # and generally don't have any Chinese data in them (there are Chinese | |||
| # characters in the vocabulary because Wikipedia does have some Chinese | |||
| # words in the English Wikipedia.). | |||
| text = self._tokenize_chinese_chars(text) | |||
| orig_tokens = whitespace_tokenize(text) | |||
| split_tokens = [] | |||
| for token in orig_tokens: | |||
| if self.do_lower_case: | |||
| token = token.lower() | |||
| token = self._run_strip_accents(token) | |||
| split_tokens.extend(self._run_split_on_punc(token)) | |||
| output_tokens = whitespace_tokenize(' '.join(split_tokens)) | |||
| return output_tokens | |||
| def _run_strip_accents(self, text): | |||
| """Strips accents from a piece of text.""" | |||
| text = unicodedata.normalize('NFD', text) | |||
| output = [] | |||
| for char in text: | |||
| cat = unicodedata.category(char) | |||
| if cat == 'Mn': | |||
| continue | |||
| output.append(char) | |||
| return ''.join(output) | |||
| def _run_split_on_punc(self, text): | |||
| """Splits punctuation on a piece of text.""" | |||
| chars = list(text) | |||
| i = 0 | |||
| start_new_word = True | |||
| output = [] | |||
| while i < len(chars): | |||
| char = chars[i] | |||
| if _is_punctuation(char): | |||
| output.append([char]) | |||
| start_new_word = True | |||
| else: | |||
| if start_new_word: | |||
| output.append([]) | |||
| start_new_word = False | |||
| output[-1].append(char) | |||
| i += 1 | |||
| return [''.join(x) for x in output] | |||
| def _tokenize_chinese_chars(self, text): | |||
| """Adds whitespace around any CJK character.""" | |||
| output = [] | |||
| for char in text: | |||
| cp = ord(char) | |||
| if self._is_chinese_char(cp): | |||
| output.append(' ') | |||
| output.append(char) | |||
| output.append(' ') | |||
| else: | |||
| output.append(char) | |||
| return ''.join(output) | |||
| def _is_chinese_char(self, cp): | |||
| """Checks whether CP is the codepoint of a CJK character.""" | |||
| # This defines a "chinese character" as anything in the CJK Unicode block: | |||
| # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) | |||
| # | |||
| # Note that the CJK Unicode block is NOT all Japanese and Korean characters, | |||
| # despite its name. The modern Korean Hangul alphabet is a different block, | |||
| # as is Japanese Hiragana and Katakana. Those alphabets are used to write | |||
| # space-separated words, so they are not treated specially and handled | |||
| # like all of the other languages. | |||
| if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) | |||
| or (cp >= 0x20000 and cp <= 0x2A6DF) | |||
| or (cp >= 0x2A700 and cp <= 0x2B73F) | |||
| or (cp >= 0x2B740 and cp <= 0x2B81F) | |||
| or (cp >= 0x2B820 and cp <= 0x2CEAF) | |||
| or (cp >= 0xF900 and cp <= 0xFAFF) | |||
| or (cp >= 0x2F800 and cp <= 0x2FA1F)): | |||
| return True | |||
| return False | |||
| def _clean_text(self, text): | |||
| """Performs invalid character removal and whitespace cleanup on text.""" | |||
| output = [] | |||
| for char in text: | |||
| cp = ord(char) | |||
| if cp == 0 or cp == 0xfffd or _is_control(char): | |||
| continue | |||
| if _is_whitespace(char): | |||
| output.append(' ') | |||
| else: | |||
| output.append(char) | |||
| return ''.join(output) | |||
| class WordpieceTokenizer(object): | |||
| """Runs WordPiece tokenziation.""" | |||
| def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200): | |||
| self.vocab = vocab | |||
| self.unk_token = unk_token | |||
| self.max_input_chars_per_word = max_input_chars_per_word | |||
| def tokenize(self, text): | |||
| """Tokenizes a piece of text into its word pieces. | |||
| This uses a greedy longest-match-first algorithm to perform tokenization | |||
| using the given vocabulary. | |||
| For example: | |||
| input = "unaffable" | |||
| output = ["un", "##aff", "##able"] | |||
| Args: | |||
| text: A single token or whitespace separated tokens. This should have | |||
| already been passed through `BasicTokenizer`. | |||
| Returns: | |||
| A list of wordpiece tokens. | |||
| """ | |||
| text = convert_to_unicode(text) | |||
| output_tokens = [] | |||
| for token in whitespace_tokenize(text): | |||
| chars = list(token) | |||
| if len(chars) > self.max_input_chars_per_word: | |||
| output_tokens.append(self.unk_token) | |||
| continue | |||
| is_bad = False | |||
| start = 0 | |||
| sub_tokens = [] | |||
| while start < len(chars): | |||
| end = len(chars) | |||
| cur_substr = None | |||
| while start < end: | |||
| substr = ''.join(chars[start:end]) | |||
| if start > 0: | |||
| substr = '##' + substr | |||
| if substr in self.vocab: | |||
| cur_substr = substr | |||
| break | |||
| end -= 1 | |||
| if cur_substr is None: | |||
| is_bad = True | |||
| break | |||
| sub_tokens.append(cur_substr) | |||
| start = end | |||
| if is_bad: | |||
| output_tokens.append(self.unk_token) | |||
| else: | |||
| output_tokens.extend(sub_tokens) | |||
| return output_tokens | |||
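A tiny self-contained check of the greedy longest-match-first behaviour described in the docstring, using an inline toy vocabulary instead of a real vocab file (it assumes the classes in this module are importable; the module path is not shown in this diff):

```python
import collections

# Toy vocabulary; a real model would build this with load_vocab(vocab_file).
vocab = collections.OrderedDict(
    (tok, i) for i, tok in enumerate(['[UNK]', 'un', '##aff', '##able']))
tokenizer = WordpieceTokenizer(vocab=vocab)
print(tokenizer.tokenize('unaffable'))  # ['un', '##aff', '##able']
print(tokenizer.tokenize('xyz'))        # ['[UNK]'] -- no piece matches
# And the inverse direction via the static detokenizer defined above:
print(FullTokenizer.convert_tokens_to_string(['un', '##aff', '##able']))  # 'unaffable'
```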
| def _is_whitespace(char): | |||
| """Checks whether `chars` is a whitespace character.""" | |||
| # \t, \n, and \r are technically control characters but we treat them | |||
| # as whitespace since they are generally considered as such. | |||
| if char == ' ' or char == '\t' or char == '\n' or char == '\r': | |||
| return True | |||
| cat = unicodedata.category(char) | |||
| if cat == 'Zs': | |||
| return True | |||
| return False | |||
| def _is_control(char): | |||
| """Checks whether `chars` is a control character.""" | |||
| # These are technically control characters but we count them as whitespace | |||
| # characters. | |||
| if char == '\t' or char == '\n' or char == '\r': | |||
| return False | |||
| cat = unicodedata.category(char) | |||
| if cat in ('Cc', 'Cf'): | |||
| return True | |||
| return False | |||
| def _is_punctuation(char): | |||
| """Checks whether `chars` is a punctuation character.""" | |||
| cp = ord(char) | |||
| # We treat all non-letter/number ASCII as punctuation. | |||
| # Characters such as "^", "$", and "`" are not in the Unicode | |||
| # Punctuation class but we treat them as punctuation anyways, for | |||
| # consistency. | |||
| if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) | |||
| or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): | |||
| return True | |||
| cat = unicodedata.category(char) | |||
| if cat.startswith('P'): | |||
| return True | |||
| return False | |||
| @@ -1,29 +0,0 @@ | |||
| import torch.nn as nn | |||
| from transformers import BertConfig, BertForMaskedLM | |||
| class TextTransformer(nn.Module): | |||
| def __init__(self, config_dict, feat_dim=768, use_grad_ckp=True): | |||
| super(TextTransformer, self).__init__() | |||
| bert_config = BertConfig.from_dict(config_dict) | |||
| if use_grad_ckp: | |||
| bert_config.gradient_checkpointing = True | |||
| self.bert = BertForMaskedLM(bert_config).bert | |||
| self.projector = nn.Linear( | |||
| bert_config.hidden_size, feat_dim, bias=False) | |||
| def forward(self, input_ids, attention_mask): | |||
| trans_features = { | |||
| 'input_ids': input_ids, | |||
| 'attention_mask': attention_mask | |||
| } | |||
| output_states = self.bert(**trans_features, return_dict=False) | |||
| output_tokens = output_states[0] | |||
| cls_tokens = output_tokens[:, 0, :] | |||
| return self.projector(cls_tokens) | |||
| @@ -1,216 +0,0 @@ | |||
| from typing import Any, Dict | |||
| import cv2 | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from PIL import Image | |||
| from tokenizers import BertWordPieceTokenizer | |||
| from torch.distributed.nn.functional import \ | |||
| all_gather as all_gather_with_backprop | |||
| from torchvision.transforms import Compose, Normalize, Resize, ToTensor | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.models.multi_modal.clip.clip_bert import TextTransformer | |||
| from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer | |||
| from modelscope.utils.constant import ModeKeys, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| __all__ = ['CLIPForMultiModalEmbedding'] | |||
| class CLIPModel(nn.Module): | |||
| def __init__(self, model_dir): | |||
| super(CLIPModel, self).__init__() | |||
| # including vision config and text config | |||
| model_config = json.load( | |||
| open('{}/encoder_config.json'.format(model_dir))) | |||
| # vision encoder | |||
| vision_config = model_config['vision_config'] | |||
| self.img_size = vision_config['input_resolution'] | |||
| self.vision_encoder = VisionTransformer( | |||
| input_resolution=self.img_size, | |||
| patch_size=vision_config['patch_size'], | |||
| width=vision_config['width'], | |||
| layers=vision_config['layers'], | |||
| heads=vision_config['heads'], | |||
| output_dim=vision_config['feat_dim'], | |||
| use_grad_ckp=True) | |||
| # text encoder | |||
| text_config = model_config['text_config'] | |||
| self.text_encoder = TextTransformer( | |||
| text_config['bert_config'], feat_dim=text_config['feat_dim']) | |||
| self.logit_scale = nn.Parameter(torch.ones([]) * 4.6) | |||
| def contrastive_loss(self, logits, dim): | |||
| neg_ce = torch.diag(F.log_softmax(logits, dim=dim)) | |||
| return -neg_ce.mean() | |||
| def clip_loss(self, t2i_sim, i2t_sim, img_idx=None, all_img_idx=None): | |||
| if img_idx is not None and all_img_idx is not None: | |||
| with torch.no_grad(): | |||
| false_neg_indicator = ( | |||
| img_idx[:, None] == all_img_idx[None, :]) | |||
| false_neg_indicator.fill_diagonal_(False) | |||
| t2i_sim.masked_fill_(false_neg_indicator, float('-inf')) | |||
| i2t_sim.masked_fill_(false_neg_indicator, float('-inf')) | |||
| caption_loss = self.contrastive_loss(t2i_sim, dim=1) | |||
| image_loss = self.contrastive_loss(i2t_sim, dim=1) | |||
| else: | |||
| caption_loss = self.contrastive_loss(t2i_sim, dim=1) | |||
| image_loss = self.contrastive_loss(i2t_sim, dim=1) | |||
| return (caption_loss + image_loss) / 2.0 | |||
| def get_loss(self, img_tensor, text_ids_tensor, text_masks_tensor, | |||
| img_id_list): | |||
| img_feat = self.forward(img_tensor, input_type='img') | |||
| text_feat = self.forward((text_ids_tensor, text_masks_tensor), | |||
| input_type='text') | |||
| global_img_feat = torch.cat(all_gather_with_backprop(img_feat), dim=0) | |||
| global_text_feat = torch.cat( | |||
| all_gather_with_backprop(text_feat), dim=0) | |||
| global_img_id_list = torch.cat( | |||
| all_gather_with_backprop(img_id_list), dim=0) | |||
| t2i_sim_mat = text_feat @ global_img_feat.t() | |||
| i2t_sim_mat = img_feat @ global_text_feat.t() | |||
| logit_scale = self.logit_scale.exp().clamp(max=100.0) | |||
| t2i_sim_mat_logits = t2i_sim_mat * logit_scale | |||
| i2t_sim_mat_logits = i2t_sim_mat * logit_scale | |||
| loss = self.clip_loss( | |||
| t2i_sim_mat_logits, | |||
| i2t_sim_mat_logits, | |||
| img_idx=img_id_list, | |||
| all_img_idx=global_img_id_list) | |||
| return loss | |||
| def forward(self, input_data, input_type): | |||
| if input_type == 'img': | |||
| img_embedding = self.vision_encoder(input_data) | |||
| img_embedding = F.normalize(img_embedding, p=2.0, dim=1) | |||
| return img_embedding | |||
| elif input_type == 'text': | |||
| text_ids_tensor, text_mask_tensor = input_data | |||
| text_embedding = self.text_encoder(text_ids_tensor, | |||
| text_mask_tensor) | |||
| text_embedding = F.normalize(text_embedding, p=2.0, dim=1) | |||
| return text_embedding | |||
| elif input_type == ModeKeys.TRAIN: | |||
| return self.get_loss(*input_data) | |||
| else: | |||
| raise ValueError('Unknown input type') | |||
| @MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) | |||
| class CLIPForMultiModalEmbedding(TorchModel): | |||
| def __init__(self, model_dir, device_id=-1): | |||
| super().__init__(model_dir=model_dir, device_id=device_id) | |||
| self.clip_model = CLIPModel(model_dir=model_dir) | |||
| pretrained_params = torch.load( | |||
| '{}/pytorch_model.bin'.format(model_dir), 'cpu') | |||
| self.clip_model.load_state_dict(pretrained_params) | |||
| self.clip_model.eval() | |||
| self.device_id = device_id | |||
| if self.device_id >= 0: | |||
| self.clip_model.to('cuda:{}'.format(self.device_id)) | |||
| logger.info('Use GPU: {}'.format(self.device_id)) | |||
| else: | |||
| logger.info('Use CPU for inference') | |||
| # image preprocessor | |||
| norm_op = Normalize((0.48145466, 0.4578275, 0.40821073), | |||
| (0.26862954, 0.26130258, 0.27577711)) | |||
| self.img_preprocessor = Compose([ | |||
| Resize((self.clip_model.img_size, self.clip_model.img_size), | |||
| interpolation=Image.BICUBIC), | |||
| ToTensor(), norm_op | |||
| ]) | |||
| # text tokenizer | |||
| vocab_path = '{}/vocab.txt'.format(model_dir) | |||
| self.text_tokenizer = BertWordPieceTokenizer( | |||
| vocab_path, lowercase=False) | |||
| self.text_tokenizer.enable_truncation(max_length=30) | |||
| def tokenize_text(self, text_str): | |||
| tokens = self.text_tokenizer.encode(text_str) | |||
| max_tokens = 30 | |||
| text_ids_tensor = torch.zeros((1, max_tokens)).long() | |||
| text_mask_tensor = torch.zeros((1, max_tokens)) | |||
| text_ids, text_mask = tokens.ids, tokens.attention_mask | |||
| text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids) | |||
| text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask) | |||
| return text_ids_tensor, text_mask_tensor | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| from modelscope.outputs import OutputKeys | |||
| output = { | |||
| OutputKeys.IMG_EMBEDDING: None, | |||
| OutputKeys.TEXT_EMBEDDING: None | |||
| } | |||
| if 'img' in input and input['img'] is not None: | |||
| input_img = input['img'] | |||
| if isinstance(input_img, Image.Image): | |||
| img_tensor = self.img_preprocessor(input_img)[None, ...] | |||
| elif isinstance(input_img, np.ndarray): | |||
| if len(input_img.shape) == 2: | |||
| input_img = cv2.cvtColor(input_img, cv2.COLOR_GRAY2BGR) | |||
| input_img = input_img[:, :, ::-1] # in rgb order | |||
| input_img = Image.fromarray( | |||
| input_img.astype('uint8')).convert('RGB') | |||
| img_tensor = self.img_preprocessor(input_img)[None, ...] | |||
| else: | |||
| raise TypeError( | |||
| f'img should be either PIL.Image or np.array, but got {type(input_img)}' | |||
| ) | |||
| if self.device_id >= 0: | |||
| img_tensor = img_tensor.to('cuda:{}'.format(self.device_id)) | |||
| img_embedding = self.clip_model( | |||
| input_data=img_tensor, input_type='img') | |||
| from modelscope.outputs import OutputKeys | |||
| output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy() | |||
| if 'text' in input and input['text'] is not None: | |||
| text_str = input['text'] | |||
| if isinstance(text_str, str): | |||
| text_ids_tensor, text_mask_tensor = self.tokenize_text( | |||
| text_str) | |||
| else: | |||
| raise TypeError( | |||
| f'text should be str, but got {type(text_str)}') | |||
| if self.device_id >= 0: | |||
| text_ids_tensor = text_ids_tensor.to('cuda:{}'.format( | |||
| self.device_id)) | |||
| text_mask_tensor = text_mask_tensor.to('cuda:{}'.format( | |||
| self.device_id)) | |||
| text_embedding = self.clip_model( | |||
| input_data=(text_ids_tensor, text_mask_tensor), | |||
| input_type='text') | |||
| output['text_embedding'] = text_embedding.data.cpu().numpy() | |||
| return output | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -1,131 +0,0 @@ | |||
| # Copyright 2021 The OpenAI CLIP Authors. All rights reserved. | |||
| from collections import OrderedDict | |||
| from typing import Tuple, Union | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn.functional as F | |||
| import torch.utils.checkpoint as checkpoint | |||
| from torch import nn | |||
| class LayerNorm(nn.LayerNorm): | |||
| """Subclass torch's LayerNorm to handle fp16.""" | |||
| def forward(self, x: torch.Tensor): | |||
| orig_type = x.dtype | |||
| ret = super().forward(x.type(torch.float32)) | |||
| return ret.type(orig_type) | |||
| class QuickGELU(nn.Module): | |||
| def forward(self, x: torch.Tensor): | |||
| return x * torch.sigmoid(1.702 * x) | |||
| class ResidualAttentionBlock(nn.Module): | |||
| def __init__(self, | |||
| d_model: int, | |||
| n_head: int, | |||
| attn_mask: torch.Tensor = None): | |||
| super().__init__() | |||
| self.attn = nn.MultiheadAttention(d_model, n_head) | |||
| self.ln_1 = LayerNorm(d_model) | |||
| self.mlp = nn.Sequential( | |||
| OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), | |||
| ('gelu', QuickGELU()), | |||
| ('c_proj', nn.Linear(d_model * 4, d_model))])) | |||
| self.ln_2 = LayerNorm(d_model) | |||
| self.attn_mask = attn_mask | |||
| def attention(self, x: torch.Tensor): | |||
| self.attn_mask = self.attn_mask.to( | |||
| dtype=x.dtype, | |||
| device=x.device) if self.attn_mask is not None else None | |||
| return self.attn( | |||
| x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] | |||
| def forward(self, x: torch.Tensor): | |||
| x = x + self.attention(self.ln_1(x)) | |||
| x = x + self.mlp(self.ln_2(x)) | |||
| return x | |||
| class Transformer(nn.Module): | |||
| def __init__(self, | |||
| width: int, | |||
| layers: int, | |||
| heads: int, | |||
| attn_mask: torch.Tensor = None, | |||
| use_grad_ckp: bool = True): | |||
| super().__init__() | |||
| self.width = width | |||
| self.layers = layers | |||
| self.resblocks = nn.Sequential(*[ | |||
| ResidualAttentionBlock(width, heads, attn_mask) | |||
| for _ in range(layers) | |||
| ]) | |||
| self.use_grad_ckp = use_grad_ckp | |||
| def forward(self, x: torch.Tensor): | |||
| if self.use_grad_ckp: | |||
| for each_block in self.resblocks: | |||
| x = checkpoint.checkpoint(each_block, x) | |||
| return x | |||
| else: | |||
| return self.resblocks(x) | |||
| class VisionTransformer(nn.Module): | |||
| def __init__(self, input_resolution: int, patch_size: int, width: int, | |||
| layers: int, heads: int, output_dim: int, use_grad_ckp: bool): | |||
| super().__init__() | |||
| self.input_resolution = input_resolution | |||
| self.output_dim = output_dim | |||
| self.conv1 = nn.Conv2d( | |||
| in_channels=3, | |||
| out_channels=width, | |||
| kernel_size=patch_size, | |||
| stride=patch_size, | |||
| bias=False) | |||
| scale = width**-0.5 | |||
| self.class_embedding = nn.Parameter(scale * torch.randn(width)) | |||
| self.positional_embedding = nn.Parameter(scale * torch.randn( | |||
| (input_resolution // patch_size)**2 + 1, width)) | |||
| self.ln_pre = LayerNorm(width) | |||
| self.transformer = Transformer( | |||
| width, layers, heads, use_grad_ckp=use_grad_ckp) | |||
| self.ln_post = LayerNorm(width) | |||
| self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) | |||
| def forward(self, x: torch.Tensor): | |||
| x = self.conv1(x) # shape = [*, width, grid, grid] | |||
| x = x.reshape(x.shape[0], x.shape[1], | |||
| -1) # shape = [*, width, grid ** 2] | |||
| x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] | |||
| class_embeddings = self.class_embedding.to(x.dtype) + \ | |||
| torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) | |||
| x = torch.cat([class_embeddings, x], dim=1) | |||
| x = x + self.positional_embedding.to(x.dtype) | |||
| x = self.ln_pre(x) | |||
| x = x.permute(1, 0, 2) # NLD -> LND | |||
| x = self.transformer(x) | |||
| x = x.permute(1, 0, 2) # LND -> NLD | |||
| x = self.ln_post(x[:, 0, :]) | |||
| if self.proj is not None: | |||
| x = x @ self.proj | |||
| return x | |||
| @@ -0,0 +1,82 @@ | |||
| # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. | |||
| # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ BERT model configuration """ | |||
| from __future__ import (absolute_import, division, print_function, | |||
| unicode_literals) | |||
| import logging | |||
| logger = logging.getLogger(__name__) | |||
| class BertConfig(object): | |||
| r""" | |||
| :class:`~transformers.BertConfig` is the configuration class to store the configuration of a | |||
| `BertModel`. | |||
| Arguments: | |||
| vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. | |||
| hidden_size: Size of the encoder layers and the pooler layer. | |||
| num_hidden_layers: Number of hidden layers in the Transformer encoder. | |||
| num_attention_heads: Number of attention heads for each attention layer in | |||
| the Transformer encoder. | |||
| intermediate_size: The size of the "intermediate" (i.e., feed-forward) | |||
| layer in the Transformer encoder. | |||
| hidden_act: The non-linear activation function (function or string) in the | |||
| encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. | |||
| hidden_dropout_prob: The dropout probability for all fully connected | |||
| layers in the embeddings, encoder, and pooler. | |||
| attention_probs_dropout_prob: The dropout ratio for the attention | |||
| probabilities. | |||
| max_position_embeddings: The maximum sequence length that this model might | |||
| ever be used with. Typically set this to something large just in case | |||
| (e.g., 512 or 1024 or 2048). | |||
| type_vocab_size: The vocabulary size of the `token_type_ids` passed into | |||
| `BertModel`. | |||
| initializer_range: The stddev of the truncated_normal_initializer for | |||
| initializing all weight matrices. | |||
| layer_norm_eps: The epsilon used by LayerNorm. | |||
| """ | |||
| def __init__(self, | |||
| vocab_size_or_config_json_file=30522, | |||
| hidden_size=768, | |||
| num_hidden_layers=12, | |||
| num_attention_heads=12, | |||
| intermediate_size=3072, | |||
| hidden_act='gelu', | |||
| hidden_dropout_prob=0.1, | |||
| attention_probs_dropout_prob=0.1, | |||
| max_position_embeddings=512, | |||
| type_vocab_size=2, | |||
| initializer_range=0.02, | |||
| layer_norm_eps=1e-12, | |||
| output_attentions=False, | |||
| output_hidden_states=False): | |||
| self.vocab_size = vocab_size_or_config_json_file | |||
| self.hidden_size = hidden_size | |||
| self.num_hidden_layers = num_hidden_layers | |||
| self.num_attention_heads = num_attention_heads | |||
| self.hidden_act = hidden_act | |||
| self.intermediate_size = intermediate_size | |||
| self.hidden_dropout_prob = hidden_dropout_prob | |||
| self.attention_probs_dropout_prob = attention_probs_dropout_prob | |||
| self.max_position_embeddings = max_position_embeddings | |||
| self.type_vocab_size = type_vocab_size | |||
| self.initializer_range = initializer_range | |||
| self.layer_norm_eps = layer_norm_eps | |||
| self.output_attentions = output_attentions | |||
| self.output_hidden_states = output_hidden_states | |||
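For orientation, a minimal sketch of how this configuration class is typically consumed: build a `BertConfig` with the text hyperparameters and hand it to `BertModel` from the accompanying `modeling_bert` module added later in this change. The import paths mirror those used by the new CLIP module; the vocabulary size and sequence length are illustrative values, not ones read from any checkpoint:

```python
import torch

from modelscope.models.multi_modal.clip.configuration_bert import BertConfig
from modelscope.models.multi_modal.clip.modeling_bert import BertModel

# Defaults shown above, with an illustrative WordPiece vocab size.
config = BertConfig(
    vocab_size_or_config_json_file=21128,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072)

model = BertModel(config)                                  # randomly initialized
input_ids = torch.randint(0, config.vocab_size, (2, 16))   # batch of 2, seq length 16
sequence_output, pooled_output = model(input_ids)[:2]
print(sequence_output.shape)  # torch.Size([2, 16, 768])
print(pooled_output.shape)    # torch.Size([2, 768])
```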
| @@ -0,0 +1,677 @@ | |||
| import os | |||
| from collections import OrderedDict | |||
| from typing import Any, Dict, Iterable, List, Tuple, Union | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from PIL import Image | |||
| from torchvision.transforms import Compose, Normalize, Resize, ToTensor | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer | |||
| from modelscope.models.multi_modal.clip.configuration_bert import BertConfig | |||
| from modelscope.models.multi_modal.clip.modeling_bert import BertModel | |||
| from modelscope.utils.constant import ModeKeys, ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| __all__ = ['CLIPForMultiModalEmbedding'] | |||
| class Bottleneck(nn.Module): | |||
| expansion = 4 | |||
| def __init__(self, inplanes, planes, stride=1): | |||
| super().__init__() | |||
| # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 | |||
| self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) | |||
| self.bn1 = nn.BatchNorm2d(planes) | |||
| self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) | |||
| self.bn2 = nn.BatchNorm2d(planes) | |||
| self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() | |||
| self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) | |||
| self.bn3 = nn.BatchNorm2d(planes * self.expansion) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| self.downsample = None | |||
| self.stride = stride | |||
| if stride > 1 or inplanes != planes * Bottleneck.expansion: | |||
| # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 | |||
| self.downsample = nn.Sequential( | |||
| OrderedDict([('-1', nn.AvgPool2d(stride)), | |||
| ('0', | |||
| nn.Conv2d( | |||
| inplanes, | |||
| planes * self.expansion, | |||
| 1, | |||
| stride=1, | |||
| bias=False)), | |||
| ('1', nn.BatchNorm2d(planes * self.expansion))])) | |||
| def forward(self, x: torch.Tensor): | |||
| identity = x | |||
| out = self.relu(self.bn1(self.conv1(x))) | |||
| out = self.relu(self.bn2(self.conv2(out))) | |||
| out = self.avgpool(out) | |||
| out = self.bn3(self.conv3(out)) | |||
| if self.downsample is not None: | |||
| identity = self.downsample(x) | |||
| out += identity | |||
| out = self.relu(out) | |||
| return out | |||
| class AttentionPool2d(nn.Module): | |||
| def __init__(self, | |||
| spacial_dim: int, | |||
| embed_dim: int, | |||
| num_heads: int, | |||
| output_dim: int = None): | |||
| super().__init__() | |||
| self.positional_embedding = nn.Parameter( | |||
| torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) | |||
| self.k_proj = nn.Linear(embed_dim, embed_dim) | |||
| self.q_proj = nn.Linear(embed_dim, embed_dim) | |||
| self.v_proj = nn.Linear(embed_dim, embed_dim) | |||
| self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) | |||
| self.num_heads = num_heads | |||
| def forward(self, x): | |||
| x = x.reshape(x.shape[0], x.shape[1], | |||
| x.shape[2] * x.shape[3]).permute(2, 0, | |||
| 1) # NCHW -> (HW)NC | |||
| x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC | |||
| x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC | |||
| x, _ = F.multi_head_attention_forward( | |||
| query=x, | |||
| key=x, | |||
| value=x, | |||
| embed_dim_to_check=x.shape[-1], | |||
| num_heads=self.num_heads, | |||
| q_proj_weight=self.q_proj.weight, | |||
| k_proj_weight=self.k_proj.weight, | |||
| v_proj_weight=self.v_proj.weight, | |||
| in_proj_weight=None, | |||
| in_proj_bias=torch.cat( | |||
| [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), | |||
| bias_k=None, | |||
| bias_v=None, | |||
| add_zero_attn=False, | |||
| dropout_p=0, | |||
| out_proj_weight=self.c_proj.weight, | |||
| out_proj_bias=self.c_proj.bias, | |||
| use_separate_proj_weight=True, | |||
| training=self.training, | |||
| need_weights=False) | |||
| return x[0] | |||
| class ModifiedResNet(nn.Module): | |||
| """ | |||
| A ResNet class that is similar to torchvision's but contains the following changes: | |||
| - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. | |||
| - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 | |||
| - The final pooling layer is a QKV attention instead of an average pool | |||
| """ | |||
| def __init__(self, | |||
| layers, | |||
| output_dim, | |||
| heads, | |||
| input_resolution=224, | |||
| width=64): | |||
| super().__init__() | |||
| self.output_dim = output_dim | |||
| self.input_resolution = input_resolution | |||
| # the 3-layer stem | |||
| self.conv1 = nn.Conv2d( | |||
| 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) | |||
| self.bn1 = nn.BatchNorm2d(width // 2) | |||
| self.conv2 = nn.Conv2d( | |||
| width // 2, width // 2, kernel_size=3, padding=1, bias=False) | |||
| self.bn2 = nn.BatchNorm2d(width // 2) | |||
| self.conv3 = nn.Conv2d( | |||
| width // 2, width, kernel_size=3, padding=1, bias=False) | |||
| self.bn3 = nn.BatchNorm2d(width) | |||
| self.avgpool = nn.AvgPool2d(2) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| # residual layers | |||
| self._inplanes = width # this is a *mutable* variable used during construction | |||
| self.layer1 = self._make_layer(width, layers[0]) | |||
| self.layer2 = self._make_layer(width * 2, layers[1], stride=2) | |||
| self.layer3 = self._make_layer(width * 4, layers[2], stride=2) | |||
| self.layer4 = self._make_layer(width * 8, layers[3], stride=2) | |||
| embed_dim = width * 32 # the ResNet feature dimension | |||
| self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, | |||
| heads, output_dim) | |||
| def _make_layer(self, planes, blocks, stride=1): | |||
| layers = [Bottleneck(self._inplanes, planes, stride)] | |||
| self._inplanes = planes * Bottleneck.expansion | |||
| for _ in range(1, blocks): | |||
| layers.append(Bottleneck(self._inplanes, planes)) | |||
| return nn.Sequential(*layers) | |||
| def forward(self, x): | |||
| def stem(x): | |||
| for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), | |||
| (self.conv3, self.bn3)]: | |||
| x = self.relu(bn(conv(x))) | |||
| x = self.avgpool(x) | |||
| return x | |||
| x = x.type(self.conv1.weight.dtype) | |||
| x = stem(x) | |||
| x = self.layer1(x) | |||
| x = self.layer2(x) | |||
| x = self.layer3(x) | |||
| x = self.layer4(x) | |||
| x = self.attnpool(x) | |||
| return x | |||
| class LayerNorm(nn.LayerNorm): | |||
| """Subclass torch's LayerNorm to handle fp16.""" | |||
| def forward(self, x: torch.Tensor): | |||
| orig_type = x.dtype | |||
| ret = super().forward(x.type(torch.float32)) | |||
| return ret.type(orig_type) | |||
| class QuickGELU(nn.Module): | |||
| def forward(self, x: torch.Tensor): | |||
| return x * torch.sigmoid(1.702 * x) | |||
| class ResidualAttentionBlock(nn.Module): | |||
| def __init__(self, | |||
| d_model: int, | |||
| n_head: int, | |||
| attn_mask: torch.Tensor = None): | |||
| super().__init__() | |||
| self.attn = nn.MultiheadAttention(d_model, n_head) | |||
| self.ln_1 = LayerNorm(d_model) | |||
| self.mlp = nn.Sequential( | |||
| OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), | |||
| ('gelu', QuickGELU()), | |||
| ('c_proj', nn.Linear(d_model * 4, d_model))])) | |||
| self.ln_2 = LayerNorm(d_model) | |||
| self.attn_mask = attn_mask | |||
| def attention(self, x: torch.Tensor): | |||
| self.attn_mask = self.attn_mask.to( | |||
| dtype=x.dtype, | |||
| device=x.device) if self.attn_mask is not None else None | |||
| return self.attn( | |||
| x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] | |||
| def forward(self, x: torch.Tensor): | |||
| x = x + self.attention(self.ln_1(x)) | |||
| x = x + self.mlp(self.ln_2(x)) | |||
| return x | |||
| class Transformer(nn.Module): | |||
| def __init__(self, | |||
| width: int, | |||
| layers: int, | |||
| heads: int, | |||
| attn_mask: torch.Tensor = None): | |||
| super().__init__() | |||
| self.width = width | |||
| self.layers = layers | |||
| self.resblocks = nn.Sequential(*[ | |||
| ResidualAttentionBlock(width, heads, attn_mask) | |||
| for _ in range(layers) | |||
| ]) | |||
| def forward(self, x: torch.Tensor): | |||
| return self.resblocks(x) | |||
| class VisualTransformer(nn.Module): | |||
| def __init__(self, input_resolution: int, patch_size: int, width: int, | |||
| layers: int, heads: int, output_dim: int): | |||
| super().__init__() | |||
| self.input_resolution = input_resolution | |||
| self.output_dim = output_dim | |||
| self.conv1 = nn.Conv2d( | |||
| in_channels=3, | |||
| out_channels=width, | |||
| kernel_size=patch_size, | |||
| stride=patch_size, | |||
| bias=False) | |||
| scale = width**-0.5 | |||
| self.class_embedding = nn.Parameter(scale * torch.randn(width)) | |||
| self.positional_embedding = nn.Parameter(scale * torch.randn( | |||
| (input_resolution // patch_size)**2 + 1, width)) | |||
| self.ln_pre = LayerNorm(width) | |||
| self.transformer = Transformer(width, layers, heads) | |||
| self.ln_post = LayerNorm(width) | |||
| self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) | |||
| def forward(self, x: torch.Tensor): | |||
| x = self.conv1(x) # shape = [*, width, grid, grid] | |||
| x = x.reshape(x.shape[0], x.shape[1], | |||
| -1) # shape = [*, width, grid ** 2] | |||
| x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] | |||
| class_embeddings = self.class_embedding.to(x.dtype) + torch.zeros( | |||
| x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) | |||
| x = torch.cat([class_embeddings, x], dim=1) # shape = [*, grid ** 2 + 1, width] | |||
| x = x + self.positional_embedding.to(x.dtype) | |||
| x = self.ln_pre(x) | |||
| x = x.permute(1, 0, 2) # NLD -> LND | |||
| x = self.transformer(x) | |||
| x = x.permute(1, 0, 2) # LND -> NLD | |||
| x = self.ln_post(x[:, 0, :]) | |||
| if self.proj is not None: | |||
| x = x @ self.proj | |||
| return x | |||
| class CLIP(nn.Module): | |||
| def __init__( | |||
| self, | |||
| embed_dim: int, | |||
| # vision | |||
| image_resolution: int, | |||
| vision_layers: Union[Tuple[int, int, int, int], int], | |||
| vision_width: int, | |||
| vision_patch_size: int, | |||
| # text | |||
| vocab_size: int, | |||
| text_attention_probs_dropout_prob: float, | |||
| text_hidden_act: str, | |||
| text_hidden_dropout_prob: float, | |||
| text_hidden_size: int, | |||
| text_initializer_range: float, | |||
| text_intermediate_size: int, | |||
| text_max_position_embeddings: int, | |||
| text_num_attention_heads: int, | |||
| text_num_hidden_layers: int, | |||
| text_type_vocab_size: int, | |||
| tokenizer: FullTokenizer, | |||
| ): | |||
| super().__init__() | |||
| if isinstance(vision_layers, (tuple, list)): | |||
| vision_heads = vision_width * 32 // 64 | |||
| self.visual = ModifiedResNet( | |||
| layers=vision_layers, | |||
| output_dim=embed_dim, | |||
| heads=vision_heads, | |||
| input_resolution=image_resolution, | |||
| width=vision_width) | |||
| else: | |||
| vision_heads = vision_width // 64 | |||
| self.visual = VisualTransformer( | |||
| input_resolution=image_resolution, | |||
| patch_size=vision_patch_size, | |||
| width=vision_width, | |||
| layers=vision_layers, | |||
| heads=vision_heads, | |||
| output_dim=embed_dim) | |||
| self.bert_config = BertConfig( | |||
| vocab_size_or_config_json_file=vocab_size, | |||
| hidden_size=text_hidden_size, | |||
| num_hidden_layers=text_num_hidden_layers, | |||
| num_attention_heads=text_num_attention_heads, | |||
| intermediate_size=text_intermediate_size, | |||
| hidden_act=text_hidden_act, | |||
| hidden_dropout_prob=text_hidden_dropout_prob, | |||
| attention_probs_dropout_prob=text_attention_probs_dropout_prob, | |||
| max_position_embeddings=text_max_position_embeddings, | |||
| type_vocab_size=text_type_vocab_size, | |||
| initializer_range=text_initializer_range, | |||
| layer_norm_eps=1e-12, | |||
| ) | |||
| self.bert = BertModel(self.bert_config) | |||
| self.text_projection = nn.Parameter( | |||
| torch.empty(text_hidden_size, embed_dim)) | |||
| self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) | |||
| self.tokenizer = tokenizer | |||
| self.initialize_parameters() | |||
| def initialize_parameters(self): | |||
| self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) | |||
| if isinstance(self.visual, ModifiedResNet): | |||
| if self.visual.attnpool is not None: | |||
| std = self.visual.attnpool.c_proj.in_features**-0.5 | |||
| nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) | |||
| nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) | |||
| nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) | |||
| nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) | |||
| for resnet_block in [ | |||
| self.visual.layer1, self.visual.layer2, self.visual.layer3, | |||
| self.visual.layer4 | |||
| ]: | |||
| for name, param in resnet_block.named_parameters(): | |||
| if name.endswith('bn3.weight'): | |||
| nn.init.zeros_(param) | |||
| if self.text_projection is not None: | |||
| nn.init.normal_( | |||
| self.text_projection, std=self.bert_config.hidden_size**-0.5) | |||
| @property | |||
| def dtype(self): | |||
| return self.visual.conv1.weight.dtype | |||
| def encode_image(self, image): | |||
| return self.visual(image.type(self.dtype)) | |||
| def encode_text(self, text): | |||
| pad_index = self.tokenizer.vocab['[PAD]'] | |||
| attn_mask = text.ne(pad_index).type(self.dtype) | |||
| x = self.bert( | |||
| text, attention_mask=attn_mask)[0].type( | |||
| self.dtype) # [batch_size, seq_length, hidden_size] | |||
| return x[:, 0, :] @ self.text_projection | |||
| def forward(self, image, text): | |||
| assert image is not None or text is not None, 'text and image cannot both be None!' | |||
| if image is None: | |||
| return self.encode_text(text) | |||
| elif text is None: | |||
| return self.encode_image(image) | |||
| image_features = self.encode_image(image) | |||
| text_features = self.encode_text(text) | |||
| image_features = image_features / image_features.norm( | |||
| dim=-1, keepdim=True) | |||
| text_features = text_features / text_features.norm( | |||
| dim=-1, keepdim=True) | |||
| return image_features, text_features, self.logit_scale.exp() | |||
| def get_similarity(self, image, text): | |||
| image_features = self.encode_image(image) | |||
| text_features = self.encode_text(text) | |||
| # normalized features | |||
| image_features = image_features / image_features.norm( | |||
| dim=1, keepdim=True) | |||
| text_features = text_features / text_features.norm(dim=1, keepdim=True) | |||
| # cosine similarity as logits | |||
| logit_scale = self.logit_scale.exp() | |||
| logits_per_image = logit_scale * image_features @ text_features.t() | |||
| logits_per_text = logits_per_image.t() | |||
| # shape = [global_batch_size, global_batch_size] | |||
| return logits_per_image, logits_per_text | |||
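The similarity computation above is just scaled cosine similarity between L2-normalized embeddings; a tiny standalone sketch of the same arithmetic, with random features and hypothetical sizes:

```python
import torch

image_features = torch.randn(4, 512)   # 4 images, 512-d embeddings (illustrative)
text_features = torch.randn(4, 512)    # 4 texts
logit_scale = 1 / 0.07                 # the initial value of logit_scale.exp() above

# Normalize, then take scaled dot products: cosine similarity as logits.
image_features = image_features / image_features.norm(dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)

logits_per_image = logit_scale * image_features @ text_features.t()  # [4, 4]
logits_per_text = logits_per_image.t()
print(logits_per_image.shape, logits_per_text.shape)
```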
| def convert_models_to_fp32(model): | |||
| for p in model.parameters(): | |||
| p.data = p.data.float() | |||
| if p.grad is not None: | |||
| p.grad.data = p.grad.data.float() | |||
| def convert_weights(model: nn.Module): | |||
| """Convert applicable model parameters to fp16""" | |||
| def _convert_weights_to_fp16(module): | |||
| if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)): | |||
| module.weight.data = module.weight.data.half() | |||
| if module.bias is not None: | |||
| module.bias.data = module.bias.data.half() | |||
| if isinstance(module, nn.MultiheadAttention): | |||
| for attr in [ | |||
| *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], | |||
| 'in_proj_bias', 'bias_k', 'bias_v' | |||
| ]: | |||
| tensor = getattr(module, attr) | |||
| if tensor is not None: | |||
| tensor.data = tensor.data.half() | |||
| if isinstance(module, BertModel): | |||
| module.to(torch.half) | |||
| for name in ['text_projection', 'proj']: | |||
| if hasattr(module, name): | |||
| attr = getattr(module, name) | |||
| if attr is not None: | |||
| attr.data = attr.data.half() | |||
| model.apply(_convert_weights_to_fp16) | |||
| def _convert_to_rgb(image): | |||
| return image.convert('RGB') | |||
| def image_transform(image_size=224): | |||
| transform = Compose([ | |||
| _convert_to_rgb, | |||
| Resize((image_size, image_size)), | |||
| ToTensor(), | |||
| Normalize((0.48145466, 0.4578275, 0.40821073), | |||
| (0.26862954, 0.26130258, 0.27577711)), | |||
| ]) | |||
| return transform | |||
| @MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) | |||
| class CLIPForMultiModalEmbedding(TorchModel): | |||
| def __init__(self, model_dir, device_id=-1): | |||
| super().__init__(model_dir=model_dir, device_id=device_id) | |||
| # Initialize the model. | |||
| vision_model_config_file = '{}/vision_model_config.json'.format( | |||
| model_dir) | |||
| logger.info( | |||
| f'Loading vision model config from {vision_model_config_file}') | |||
| assert os.path.exists(vision_model_config_file) | |||
| text_model_config_file = '{}/text_model_config.json'.format(model_dir) | |||
| logger.info(f'Loading text model config from {text_model_config_file}') | |||
| assert os.path.exists(text_model_config_file) | |||
| with open(vision_model_config_file, | |||
| 'r') as fv, open(text_model_config_file, 'r') as ft: | |||
| model_info = json.load(fv) | |||
| for k, v in json.load(ft).items(): | |||
| model_info[k] = v | |||
| # image preprocess | |||
| self.img_preprocess = image_transform(model_info['image_resolution']) | |||
| # text tokenizer | |||
| vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}' | |||
| self.tokenizer = FullTokenizer(vocab_file=vocab_file) | |||
| # initialize the model | |||
| self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer) | |||
| convert_weights(self.clip_model) | |||
| # restore the pretrained weight | |||
| checkpoint = torch.load( | |||
| f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu') | |||
| sd = checkpoint['state_dict'] | |||
| if next(iter(sd.items()))[0].startswith('module'): | |||
| sd = {k[len('module.'):]: v for k, v in sd.items()} | |||
| self.clip_model.load_state_dict(sd) | |||
| self.clip_model.eval() | |||
| # place the model | |||
| self.device = 'cuda' if torch.cuda.is_available() else 'cpu' | |||
| if self.device == 'cuda': | |||
| self.clip_model.to(self.device) | |||
| logger.info('Use GPU for inference') | |||
| else: | |||
| self.clip_model.float() | |||
| logger.info('Use CPU for inference') | |||
| def tokenize(self, | |||
| texts: Union[str, List[str]], | |||
| context_length: int = 52) -> torch.LongTensor: | |||
| """ | |||
| Returns the tokenized representation of given input string(s) | |||
| Parameters | |||
| ---------- | |||
| texts : Union[str, List[str]] | |||
| An input string or a list of input strings to tokenize | |||
| context_length : int | |||
| The context length to use (defaults to 52) | |||
| Returns | |||
| ------- | |||
| A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] | |||
| """ | |||
| if isinstance(texts, str): | |||
| texts = [texts] | |||
| all_tokens = [] | |||
| for text in texts: | |||
| all_tokens.append( | |||
| [self.tokenizer.vocab['[CLS]']] | |||
| + self.tokenizer.convert_tokens_to_ids( | |||
| self.tokenizer.tokenize(text))[:context_length - 2] | |||
| + [self.tokenizer.vocab['[SEP]']]) | |||
| result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) | |||
| for i, tokens in enumerate(all_tokens): | |||
| assert len(tokens) <= context_length | |||
| result[i, :len(tokens)] = torch.tensor(tokens) | |||
| return result | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| from modelscope.outputs import OutputKeys | |||
| output = { | |||
| OutputKeys.IMG_EMBEDDING: None, | |||
| OutputKeys.TEXT_EMBEDDING: None | |||
| } | |||
| if 'img' in input and input['img'] is not None: | |||
| image_input = input['img'] | |||
| # single image input | |||
| if isinstance(image_input, Image.Image): | |||
| image_tensor = self.img_preprocess(image_input).unsqueeze(0) | |||
| # multi images input | |||
| elif isinstance(image_input, list): | |||
| if all([isinstance(elem, Image.Image) | |||
| for elem in image_input]): | |||
| image_tensor = torch.stack( | |||
| [self.img_preprocess(elem) for elem in image_input], | |||
| dim=0) | |||
| else: | |||
| unsupported_elem_type = [ | |||
| type(elem) for elem in image_input | |||
| if not isinstance(elem, Image.Image) | |||
| ][0] | |||
| raise TypeError( | |||
| 'img should be PIL.Image or List[PIL.Image], ' | |||
| f'but got a List containing one {unsupported_elem_type}') | |||
| # others | |||
| else: | |||
| raise TypeError( | |||
| f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}' | |||
| ) | |||
| image_tensor = image_tensor.to(self.device) | |||
| with torch.no_grad(): | |||
| image_features = self.clip_model.encode_image(image_tensor) | |||
| image_features /= image_features.norm( | |||
| dim=-1, keepdim=True) # l2-normalize | |||
| output[OutputKeys.IMG_EMBEDDING] = image_features | |||
| if 'text' in input and input['text'] is not None: | |||
| text_input = input['text'] | |||
| # single text input | |||
| if isinstance(text_input, str): | |||
| text_tensor = self.tokenize(text_input) | |||
| # multi texts input | |||
| elif isinstance(text_input, list): | |||
| if all([isinstance(elem, str) for elem in text_input]): | |||
| text_tensor = self.tokenize(text_input) | |||
| else: | |||
| unsupported_elem_type = [ | |||
| type(elem) for elem in text_input | |||
| if not isinstance(elem, str) | |||
| ][0] | |||
| raise TypeError( | |||
| f'text should be str or List[str], but got a List containing one {unsupported_elem_type}' | |||
| ) | |||
| # others | |||
| else: | |||
| raise TypeError( | |||
| f'text should be str or List[str], but got {type(text_input)}' | |||
| ) | |||
| text_tensor = text_tensor.to(self.device) | |||
| with torch.no_grad(): | |||
| text_features = self.clip_model.encode_text(text_tensor) | |||
| text_features /= text_features.norm( | |||
| dim=-1, keepdim=True) # l2-normalize | |||
| output[OutputKeys.TEXT_EMBEDDING] = text_features | |||
| return output | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @property | |||
| def temperature(self): | |||
| return 1.0 / self.clip_model.logit_scale.exp() | |||
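A hedged usage sketch for the class above, assuming it is importable from `modelscope.models.multi_modal` (import path assumed) and that `model_dir` points at a local directory containing the config, vocab, and checkpoint files the constructor expects; the directory and image file names are placeholders:

```python
from PIL import Image

from modelscope.models.multi_modal import CLIPForMultiModalEmbedding  # assumed import path
from modelscope.outputs import OutputKeys

# Hypothetical local directory with vision_model_config.json, text_model_config.json,
# the vocab file and the torch checkpoint described above.
model = CLIPForMultiModalEmbedding(model_dir='/path/to/clip_model_dir')

inputs = {
    'img': Image.open('demo.jpg'),           # or a List[PIL.Image]
    'text': ['a running dog', 'a red car'],  # or a single str
}
output = model(inputs)

img_emb = output[OutputKeys.IMG_EMBEDDING]    # [1, embed_dim], L2-normalized
text_emb = output[OutputKeys.TEXT_EMBEDDING]  # [2, embed_dim], L2-normalized
print(img_emb @ text_emb.t())                 # cosine similarities, shape [1, 2]
```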
| @@ -0,0 +1,507 @@ | |||
| # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. | |||
| # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """PyTorch BERT model. """ | |||
| from __future__ import (absolute_import, division, print_function, | |||
| unicode_literals) | |||
| import logging | |||
| import math | |||
| import os | |||
| import sys | |||
| from io import open | |||
| import json | |||
| import torch | |||
| from torch import nn | |||
| from .configuration_bert import BertConfig | |||
| logger = logging.getLogger(__name__) | |||
| def gelu(x): | |||
| """ Original Implementation of the gelu activation function in Google Bert repo when initially created. | |||
| For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): | |||
| 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) | |||
| Also see https://arxiv.org/abs/1606.08415 | |||
| """ | |||
| return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) | |||
| def gelu_new(x): | |||
| """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). | |||
| Also see https://arxiv.org/abs/1606.08415 | |||
| """ | |||
| return 0.5 * x * (1 + torch.tanh( | |||
| math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) | |||
| def swish(x): | |||
| return x * torch.sigmoid(x) | |||
| ACT2FN = { | |||
| 'gelu': gelu, | |||
| 'relu': torch.nn.functional.relu, | |||
| 'swish': swish, | |||
| 'gelu_new': gelu_new | |||
| } | |||
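The GELU variants registered above differ only in how the Gaussian CDF is computed; a small illustrative check shows the exact `erf` form and the tanh approximation agree closely:

```python
import math

import torch

x = torch.linspace(-3, 3, 7)

gelu_exact = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
gelu_tanh = 0.5 * x * (1 + torch.tanh(
    math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

# Small approximation error (on the order of 1e-3 or less over this range).
print(torch.max(torch.abs(gelu_exact - gelu_tanh)))
```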
| BertLayerNorm = torch.nn.LayerNorm | |||
| class BertEmbeddings(nn.Module): | |||
| """Construct the embeddings from word, position and token_type embeddings. | |||
| """ | |||
| def __init__(self, config): | |||
| super(BertEmbeddings, self).__init__() | |||
| self.word_embeddings = nn.Embedding( | |||
| config.vocab_size, config.hidden_size, padding_idx=0) | |||
| self.position_embeddings = nn.Embedding(config.max_position_embeddings, | |||
| config.hidden_size) | |||
| self.token_type_embeddings = nn.Embedding(config.type_vocab_size, | |||
| config.hidden_size) | |||
| # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load | |||
| # any TensorFlow checkpoint file | |||
| self.LayerNorm = BertLayerNorm( | |||
| config.hidden_size, eps=config.layer_norm_eps) | |||
| self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||
| def forward(self, input_ids, token_type_ids=None, position_ids=None): | |||
| seq_length = input_ids.size(1) | |||
| if position_ids is None: | |||
| position_ids = torch.arange( | |||
| seq_length, dtype=torch.long, device=input_ids.device) | |||
| position_ids = position_ids.unsqueeze(0).expand_as(input_ids) | |||
| if token_type_ids is None: | |||
| token_type_ids = torch.zeros_like(input_ids) | |||
| words_embeddings = self.word_embeddings(input_ids) | |||
| position_embeddings = self.position_embeddings(position_ids) | |||
| token_type_embeddings = self.token_type_embeddings(token_type_ids) | |||
| embeddings = words_embeddings + position_embeddings + token_type_embeddings | |||
| embeddings = self.LayerNorm(embeddings) | |||
| embeddings = self.dropout(embeddings) | |||
| return embeddings | |||
| class BertSelfAttention(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertSelfAttention, self).__init__() | |||
| if config.hidden_size % config.num_attention_heads != 0: | |||
| raise ValueError( | |||
| 'The hidden size (%d) is not a multiple of the number of attention ' | |||
| 'heads (%d)' % | |||
| (config.hidden_size, config.num_attention_heads)) | |||
| self.output_attentions = config.output_attentions | |||
| self.num_attention_heads = config.num_attention_heads | |||
| self.attention_head_size = int(config.hidden_size | |||
| / config.num_attention_heads) | |||
| self.all_head_size = self.num_attention_heads * self.attention_head_size | |||
| self.query = nn.Linear(config.hidden_size, self.all_head_size) | |||
| self.key = nn.Linear(config.hidden_size, self.all_head_size) | |||
| self.value = nn.Linear(config.hidden_size, self.all_head_size) | |||
| self.dropout = nn.Dropout(config.attention_probs_dropout_prob) | |||
| def transpose_for_scores(self, x): | |||
| new_x_shape = x.size()[:-1] + (self.num_attention_heads, | |||
| self.attention_head_size) | |||
| x = x.view(*new_x_shape) | |||
| return x.permute(0, 2, 1, 3) | |||
| def forward(self, hidden_states, attention_mask=None, head_mask=None): | |||
| mixed_query_layer = self.query(hidden_states) | |||
| mixed_key_layer = self.key(hidden_states) | |||
| mixed_value_layer = self.value(hidden_states) | |||
| query_layer = self.transpose_for_scores(mixed_query_layer) | |||
| key_layer = self.transpose_for_scores(mixed_key_layer) | |||
| value_layer = self.transpose_for_scores(mixed_value_layer) | |||
| # Take the dot product between "query" and "key" to get the raw attention scores. | |||
| attention_scores = torch.matmul(query_layer, | |||
| key_layer.transpose(-1, -2)) | |||
| attention_scores = attention_scores / math.sqrt( | |||
| self.attention_head_size) | |||
| if attention_mask is not None: | |||
| # Apply the attention mask is (precomputed for all layers in BertModel forward() function) | |||
| attention_scores = attention_scores + attention_mask | |||
| # Normalize the attention scores to probabilities. | |||
| attention_probs = nn.Softmax(dim=-1)(attention_scores) | |||
| # This is actually dropping out entire tokens to attend to, which might | |||
| # seem a bit unusual, but is taken from the original Transformer paper. | |||
| attention_probs = self.dropout(attention_probs) | |||
| # Mask heads if we want to | |||
| if head_mask is not None: | |||
| attention_probs = attention_probs * head_mask | |||
| context_layer = torch.matmul(attention_probs, value_layer) | |||
| context_layer = context_layer.permute(0, 2, 1, 3).contiguous() | |||
| new_context_layer_shape = context_layer.size()[:-2] + ( | |||
| self.all_head_size, ) | |||
| context_layer = context_layer.view(*new_context_layer_shape) | |||
| outputs = (context_layer, | |||
| attention_probs) if self.output_attentions else ( | |||
| context_layer, ) | |||
| return outputs | |||
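To make the head reshaping in `transpose_for_scores` explicit, here is a quick shape trace with illustrative sizes (batch 2, sequence 16, 12 heads of size 64):

```python
import torch

batch, seq_len, hidden = 2, 16, 768
num_heads, head_size = 12, 768 // 12  # 64

x = torch.randn(batch, seq_len, hidden)
# [batch, seq, hidden] -> [batch, heads, seq, head_size]
x = x.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 12, 16, 64])

# Raw attention scores per head: [batch, heads, seq, seq]
scores = x @ x.transpose(-1, -2) / head_size ** 0.5
print(scores.shape)  # torch.Size([2, 12, 16, 16])
```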
| class BertSelfOutput(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertSelfOutput, self).__init__() | |||
| self.dense = nn.Linear(config.hidden_size, config.hidden_size) | |||
| self.LayerNorm = BertLayerNorm( | |||
| config.hidden_size, eps=config.layer_norm_eps) | |||
| self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||
| def forward(self, hidden_states, input_tensor): | |||
| hidden_states = self.dense(hidden_states) | |||
| hidden_states = self.dropout(hidden_states) | |||
| hidden_states = self.LayerNorm(hidden_states + input_tensor) | |||
| return hidden_states | |||
| class BertAttention(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertAttention, self).__init__() | |||
| self.self = BertSelfAttention(config) | |||
| self.output = BertSelfOutput(config) | |||
| self.pruned_heads = set() | |||
| def forward(self, input_tensor, attention_mask=None, head_mask=None): | |||
| self_outputs = self.self(input_tensor, attention_mask, head_mask) | |||
| attention_output = self.output(self_outputs[0], input_tensor) | |||
| outputs = (attention_output, | |||
| ) + self_outputs[1:] # add attentions if we output them | |||
| return outputs | |||
| class BertIntermediate(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertIntermediate, self).__init__() | |||
| self.dense = nn.Linear(config.hidden_size, config.intermediate_size) | |||
| if isinstance(config.hidden_act, | |||
| str) or (sys.version_info[0] == 2 | |||
| and isinstance(config.hidden_act, unicode)): | |||
| self.intermediate_act_fn = ACT2FN[config.hidden_act] | |||
| else: | |||
| self.intermediate_act_fn = config.hidden_act | |||
| def forward(self, hidden_states): | |||
| hidden_states = self.dense(hidden_states) | |||
| hidden_states = self.intermediate_act_fn(hidden_states) | |||
| return hidden_states | |||
| class BertOutput(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertOutput, self).__init__() | |||
| self.dense = nn.Linear(config.intermediate_size, config.hidden_size) | |||
| self.LayerNorm = BertLayerNorm( | |||
| config.hidden_size, eps=config.layer_norm_eps) | |||
| self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||
| def forward(self, hidden_states, input_tensor): | |||
| hidden_states = self.dense(hidden_states) | |||
| hidden_states = self.dropout(hidden_states) | |||
| hidden_states = self.LayerNorm(hidden_states + input_tensor) | |||
| return hidden_states | |||
| class BertLayer(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertLayer, self).__init__() | |||
| self.attention = BertAttention(config) | |||
| self.intermediate = BertIntermediate(config) | |||
| self.output = BertOutput(config) | |||
| def forward(self, hidden_states, attention_mask=None, head_mask=None): | |||
| attention_outputs = self.attention(hidden_states, attention_mask, | |||
| head_mask) | |||
| attention_output = attention_outputs[0] | |||
| intermediate_output = self.intermediate(attention_output) | |||
| layer_output = self.output(intermediate_output, attention_output) | |||
| outputs = (layer_output, ) + attention_outputs[ | |||
| 1:] # add attentions if we output them | |||
| return outputs | |||
| class BertEncoder(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertEncoder, self).__init__() | |||
| self.output_attentions = config.output_attentions | |||
| self.output_hidden_states = config.output_hidden_states | |||
| self.layer = nn.ModuleList( | |||
| [BertLayer(config) for _ in range(config.num_hidden_layers)]) | |||
| def forward(self, hidden_states, attention_mask=None, head_mask=None): | |||
| all_hidden_states = () | |||
| all_attentions = () | |||
| for i, layer_module in enumerate(self.layer): | |||
| if self.output_hidden_states: | |||
| all_hidden_states = all_hidden_states + (hidden_states, ) | |||
| layer_outputs = layer_module(hidden_states, attention_mask, | |||
| head_mask[i]) | |||
| hidden_states = layer_outputs[0] | |||
| if self.output_attentions: | |||
| all_attentions = all_attentions + (layer_outputs[1], ) | |||
| # Add last layer | |||
| if self.output_hidden_states: | |||
| all_hidden_states = all_hidden_states + (hidden_states, ) | |||
| outputs = (hidden_states, ) | |||
| if self.output_hidden_states: | |||
| outputs = outputs + (all_hidden_states, ) | |||
| if self.output_attentions: | |||
| outputs = outputs + (all_attentions, ) | |||
| return outputs # last-layer hidden state, (all hidden states), (all attentions) | |||
| class BertPooler(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertPooler, self).__init__() | |||
| self.dense = nn.Linear(config.hidden_size, config.hidden_size) | |||
| self.activation = nn.Tanh() | |||
| def forward(self, hidden_states): | |||
| # We "pool" the model by simply taking the hidden state corresponding | |||
| # to the first token. | |||
| first_token_tensor = hidden_states[:, 0] | |||
| pooled_output = self.dense(first_token_tensor) | |||
| pooled_output = self.activation(pooled_output) | |||
| return pooled_output | |||
| class BertPredictionHeadTransform(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertPredictionHeadTransform, self).__init__() | |||
| self.dense = nn.Linear(config.hidden_size, config.hidden_size) | |||
| if isinstance(config.hidden_act, | |||
| str) or (sys.version_info[0] == 2 | |||
| and isinstance(config.hidden_act, unicode)): | |||
| self.transform_act_fn = ACT2FN[config.hidden_act] | |||
| else: | |||
| self.transform_act_fn = config.hidden_act | |||
| self.LayerNorm = BertLayerNorm( | |||
| config.hidden_size, eps=config.layer_norm_eps) | |||
| def forward(self, hidden_states): | |||
| hidden_states = self.dense(hidden_states) | |||
| hidden_states = self.transform_act_fn(hidden_states) | |||
| hidden_states = self.LayerNorm(hidden_states) | |||
| return hidden_states | |||
| class BertLMPredictionHead(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertLMPredictionHead, self).__init__() | |||
| self.transform = BertPredictionHeadTransform(config) | |||
| # The output weights are the same as the input embeddings, but there is | |||
| # an output-only bias for each token. | |||
| self.decoder = nn.Linear( | |||
| config.hidden_size, config.vocab_size, bias=False) | |||
| self.bias = nn.Parameter(torch.zeros(config.vocab_size)) | |||
| def forward(self, hidden_states): | |||
| hidden_states = self.transform(hidden_states) | |||
| hidden_states = self.decoder(hidden_states) + self.bias | |||
| return hidden_states | |||
| class BertOnlyMLMHead(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertOnlyMLMHead, self).__init__() | |||
| self.predictions = BertLMPredictionHead(config) | |||
| def forward(self, sequence_output): | |||
| prediction_scores = self.predictions(sequence_output) | |||
| return prediction_scores | |||
| class BertOnlyNSPHead(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertOnlyNSPHead, self).__init__() | |||
| self.seq_relationship = nn.Linear(config.hidden_size, 2) | |||
| def forward(self, pooled_output): | |||
| seq_relationship_score = self.seq_relationship(pooled_output) | |||
| return seq_relationship_score | |||
| class BertPreTrainingHeads(nn.Module): | |||
| def __init__(self, config): | |||
| super(BertPreTrainingHeads, self).__init__() | |||
| self.predictions = BertLMPredictionHead(config) | |||
| self.seq_relationship = nn.Linear(config.hidden_size, 2) | |||
| def forward(self, sequence_output, pooled_output): | |||
| prediction_scores = self.predictions(sequence_output) | |||
| seq_relationship_score = self.seq_relationship(pooled_output) | |||
| return prediction_scores, seq_relationship_score | |||
| class BertPreTrainedModel(nn.Module): | |||
| config_class = BertConfig | |||
| base_model_prefix = 'bert' | |||
| def __init__(self, config): | |||
| super(BertPreTrainedModel, self).__init__() | |||
| self.config = config | |||
| def _init_weights(self, module): | |||
| """ Initialize the weights """ | |||
| if isinstance(module, (nn.Linear, nn.Embedding)): | |||
| # Slightly different from the TF version which uses truncated_normal for initialization | |||
| # cf https://github.com/pytorch/pytorch/pull/5617 | |||
| module.weight.data.normal_( | |||
| mean=0.0, std=self.config.initializer_range) | |||
| elif isinstance(module, BertLayerNorm): | |||
| module.bias.data.zero_() | |||
| module.weight.data.fill_(1.0) | |||
| if isinstance(module, nn.Linear) and module.bias is not None: | |||
| module.bias.data.zero_() | |||
| class BertModel(BertPreTrainedModel): | |||
| r""" | |||
| Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: | |||
| **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` | |||
| Sequence of hidden-states at the output of the last layer of the model. | |||
| **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` | |||
| Last layer hidden-state of the first token of the sequence (classification token) | |||
| further processed by a Linear layer and a Tanh activation function. The Linear | |||
| layer weights are trained from the next sentence prediction (classification) | |||
| objective during Bert pretraining. This output is usually *not* a good summary | |||
| of the semantic content of the input; you're often better off averaging or pooling | |||
| the sequence of hidden-states for the whole input sequence. | |||
| **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) | |||
| list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) | |||
| of shape ``(batch_size, sequence_length, hidden_size)``: | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| **attentions**: (`optional`, returned when ``config.output_attentions=True``) | |||
| list of ``torch.FloatTensor`` (one for each layer) | |||
| of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: | |||
| Attentions weights after the attention softmax, | |||
| used to compute the weighted average in the self-attention heads. | |||
| Examples:: | |||
| tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') | |||
| model = BertModel.from_pretrained('bert-base-uncased') | |||
| input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 | |||
| outputs = model(input_ids) | |||
| last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple | |||
| """ | |||
| def __init__(self, config): | |||
| super(BertModel, self).__init__(config) | |||
| self.embeddings = BertEmbeddings(config) | |||
| self.encoder = BertEncoder(config) | |||
| self.pooler = BertPooler(config) | |||
| self.apply(self._init_weights) | |||
| def forward(self, | |||
| input_ids, | |||
| attention_mask=None, | |||
| token_type_ids=None, | |||
| position_ids=None, | |||
| head_mask=None): | |||
| if attention_mask is None: | |||
| attention_mask = torch.ones_like(input_ids) | |||
| if token_type_ids is None: | |||
| token_type_ids = torch.zeros_like(input_ids) | |||
| # We create a 3D attention mask from a 2D tensor mask. | |||
| # Sizes are [batch_size, 1, 1, to_seq_length] | |||
| # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] | |||
| # this attention mask is simpler than the triangular masking of causal attention | |||
| # used in OpenAI GPT, we just need to prepare the broadcast dimension here. | |||
| extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) | |||
| # Since attention_mask is 1.0 for positions we want to attend and 0.0 for | |||
| # masked positions, this operation will create a tensor which is 0.0 for | |||
| # positions we want to attend and -10000.0 for masked positions. | |||
| # Since we are adding it to the raw scores before the softmax, this is | |||
| # effectively the same as removing these entirely. | |||
| extended_attention_mask = extended_attention_mask.to( | |||
| dtype=next(self.parameters()).dtype) # fp16 compatibility | |||
| extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | |||
| # Prepare head mask if needed | |||
| # 1.0 in head_mask indicate we keep the head | |||
| # attention_probs has shape bsz x n_heads x N x N | |||
| # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] | |||
| # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] | |||
| if head_mask is not None: | |||
| if head_mask.dim() == 1: | |||
| head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( | |||
| -1).unsqueeze(-1) | |||
| head_mask = head_mask.expand(self.config.num_hidden_layers, -1, | |||
| -1, -1, -1) | |||
| elif head_mask.dim() == 2: | |||
| head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( | |||
| -1) # We can specify head_mask for each layer | |||
| head_mask = head_mask.to(dtype=next(self.parameters( | |||
| )).dtype) # switch to float if needed + fp16 compatibility | |||
| else: | |||
| head_mask = [None] * self.config.num_hidden_layers | |||
| embedding_output = self.embeddings( | |||
| input_ids, | |||
| position_ids=position_ids, | |||
| token_type_ids=token_type_ids) | |||
| encoder_outputs = self.encoder( | |||
| embedding_output, extended_attention_mask, head_mask=head_mask) | |||
| sequence_output = encoder_outputs[0] | |||
| pooled_output = self.pooler(sequence_output) | |||
| outputs = ( | |||
| sequence_output, | |||
| pooled_output, | |||
| ) + encoder_outputs[ | |||
| 1:] # add hidden_states and attentions if they are here | |||
| return outputs # sequence_output, pooled_output, (hidden_states), (attentions) | |||
| @@ -136,7 +136,7 @@ class DiffusionForTextToImageSynthesis(Model): | |||
| self.unet_upsampler_1024 = diffusion_model.unet_upsampler_1024 | |||
| # text tokenizer | |||
| vocab_path = '{}/vocab.txt'.format(model_dir) | |||
| vocab_path = f'{model_dir}/{ModelFile.VOCAB_FILE}' | |||
| self.tokenizer = Tokenizer(vocab_file=vocab_path, seq_len=64) | |||
| # diffusion process | |||
| @@ -491,7 +491,9 @@ class GEVL(nn.Module): | |||
| gen_logits = self.to_logits(out_embs[-1:, ...]) | |||
| probs = F.softmax(self.gen_logit_scale.exp() * gen_logits, dim=-1) | |||
| pred = torch.argmax( | |||
| probs * (1.0 + torch.rand_like(probs)), axis=-1) | |||
| probs * (2.0 + torch.rand_like(probs)), axis=-1) | |||
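| # Note: scaling probs by a random factor in [2, 3) before argmax injects a small amount | |||
| # of sampling noise into decoding instead of always taking the greedy (highest-prob) token. | |||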
| if int(pred) >= eot_token or int(pred) <= 0: | |||
| break | |||
| pred_tokens.append(pred) | |||
| text_input = torch.cat( | |||
| [text_input, pred.permute(1, 0).contiguous()], axis=1) | |||
| @@ -500,8 +502,6 @@ class GEVL(nn.Module): | |||
| for out_tokens in pred_text_tokens: | |||
| tokens = [] | |||
| for x in out_tokens: | |||
| if x >= eot_token or x <= 0: | |||
| break | |||
| tokens.append(int(x)) | |||
| out_text = self.tokenizer.decode(tokens) | |||
| out_text = out_text.strip() | |||
| @@ -14,5 +14,4 @@ | |||
| # limitations under the License. | |||
| from .configuration_mplug import MPlugConfig | |||
| from .modeling_mplug import (CONFIG_NAME, VOCAB_NAME, | |||
| MPlugForVisualQuestionAnswering) | |||
| from .modeling_mplug import CONFIG_NAME, MPlug | |||
| @@ -5,9 +5,69 @@ from typing import Tuple, Union | |||
| import torch | |||
| import torch.nn.functional as F | |||
| import torch.utils.checkpoint as checkpoint | |||
| from torch import nn | |||
| from modelscope.models.multi_modal.clip.clip_vit import Transformer | |||
| class QuickGELU(nn.Module): | |||
| def forward(self, x: torch.Tensor): | |||
| return x * torch.sigmoid(1.702 * x) | |||
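| # QuickGELU approximates GELU(x) with x * sigmoid(1.702 * x); it is cheaper to compute | |||
| # and is the activation used in the original CLIP transformer blocks. | |||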
| class ResidualAttentionBlock(nn.Module): | |||
| def __init__(self, | |||
| d_model: int, | |||
| n_head: int, | |||
| attn_mask: torch.Tensor = None): | |||
| super().__init__() | |||
| self.attn = nn.MultiheadAttention(d_model, n_head) | |||
| self.ln_1 = LayerNorm(d_model) | |||
| self.mlp = nn.Sequential( | |||
| OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), | |||
| ('gelu', QuickGELU()), | |||
| ('c_proj', nn.Linear(d_model * 4, d_model))])) | |||
| self.ln_2 = LayerNorm(d_model) | |||
| self.attn_mask = attn_mask | |||
| def attention(self, x: torch.Tensor): | |||
| self.attn_mask = self.attn_mask.to( | |||
| dtype=x.dtype, | |||
| device=x.device) if self.attn_mask is not None else None | |||
| return self.attn( | |||
| x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] | |||
| def forward(self, x: torch.Tensor): | |||
| x = x + self.attention(self.ln_1(x)) | |||
| x = x + self.mlp(self.ln_2(x)) | |||
| return x | |||
| class Transformer(nn.Module): | |||
| def __init__(self, | |||
| width: int, | |||
| layers: int, | |||
| heads: int, | |||
| attn_mask: torch.Tensor = None, | |||
| use_grad_ckp: bool = True): | |||
| super().__init__() | |||
| self.width = width | |||
| self.layers = layers | |||
| self.resblocks = nn.Sequential(*[ | |||
| ResidualAttentionBlock(width, heads, attn_mask) | |||
| for _ in range(layers) | |||
| ]) | |||
| self.use_grad_ckp = use_grad_ckp | |||
| def forward(self, x: torch.Tensor): | |||
| if self.use_grad_ckp: | |||
| for each_block in self.resblocks: | |||
| x = checkpoint.checkpoint(each_block, x) | |||
| return x | |||
| else: | |||
| return self.resblocks(x) | |||
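| # With use_grad_ckp=True, each block's intermediate activations are discarded in the | |||
| # forward pass and recomputed during backward (torch.utils.checkpoint), trading extra | |||
| # compute for lower activation memory; otherwise the blocks run as a plain nn.Sequential. | |||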
| class Bottleneck(nn.Module): | |||
| @@ -15,14 +15,14 @@ | |||
| # limitations under the License. | |||
| """ MPLUG model configuration """ | |||
| import os | |||
| from collections import OrderedDict | |||
| from typing import Any, Dict, Mapping, Union | |||
| from typing import Any, Dict, Union | |||
| import yaml | |||
| from transformers import PretrainedConfig | |||
| from transformers.onnx import OnnxConfig | |||
| from transformers.utils import logging | |||
| from modelscope.utils.constant import Tasks | |||
| logger = logging.get_logger(__name__) | |||
| @@ -32,6 +32,7 @@ class MPlugConfig(PretrainedConfig): | |||
| def __init__( | |||
| self, | |||
| task=Tasks.visual_question_answering, | |||
| bert_config='config_bert.json', | |||
| image_res=504, | |||
| batch_size_train=128, | |||
| @@ -64,7 +65,9 @@ class MPlugConfig(PretrainedConfig): | |||
| clip_transformer_heads=12, | |||
| clip_transformer_layers=12, | |||
| **kwargs): | |||
| super().__init__(**kwargs) | |||
| self.task = task | |||
| self.bert_config = bert_config | |||
| self.image_res = image_res | |||
| self.batch_size_train = batch_size_train | |||
| @@ -103,23 +106,3 @@ class MPlugConfig(PretrainedConfig): | |||
| with open(yaml_file, 'r') as reader: | |||
| config_dict = yaml.load(reader, Loader=yaml.Loader) | |||
| return cls(**config_dict) | |||
| class MPlugOnnxConfig(OnnxConfig): | |||
| @property | |||
| def inputs(self) -> Mapping[str, Mapping[int, str]]: | |||
| return OrderedDict([ | |||
| ('input_ids', { | |||
| 0: 'batch', | |||
| 1: 'sequence' | |||
| }), | |||
| ('attention_mask', { | |||
| 0: 'batch', | |||
| 1: 'sequence' | |||
| }), | |||
| ('token_type_ids', { | |||
| 0: 'batch', | |||
| 1: 'sequence' | |||
| }), | |||
| ]) | |||
| @@ -42,14 +42,13 @@ from transformers.utils import logging | |||
| from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig | |||
| from modelscope.models.multi_modal.mplug.predictor import TextGenerator | |||
| from modelscope.utils.constant import ModelFile | |||
| transformers.logging.set_verbosity_error() | |||
| logger = logging.get_logger(__name__) | |||
| CONFIG_NAME = 'config.yaml' | |||
| WEIGHTS_NAME = 'pytorch_model.bin' | |||
| VOCAB_NAME = 'vocab.txt' | |||
| _CONFIG_FOR_DOC = 'BertConfig' | |||
| _TOKENIZER_FOR_DOC = 'BertTokenizer' | |||
| @@ -1726,32 +1725,145 @@ class BertLMHeadModel(BertPreTrainedModel): | |||
| return reordered_past | |||
| class MPlugForVisualQuestionAnswering(PreTrainedModel): | |||
| class BertPrefixModel(BertPreTrainedModel): | |||
| _keys_to_ignore_on_load_unexpected = [r'pooler'] | |||
| _keys_to_ignore_on_load_missing = [ | |||
| r'position_ids', r'predictions.decoder.bias' | |||
| ] | |||
| def __init__(self, config): | |||
| super().__init__(config) | |||
| self.bert = BertModel(config, add_pooling_layer=False) | |||
| self.cls = BertOnlyMLMHead(config) | |||
| self.init_weights() | |||
| def get_output_embeddings(self): | |||
| return self.cls.predictions.decoder | |||
| def set_output_embeddings(self, new_embeddings): | |||
| self.cls.predictions.decoder = new_embeddings | |||
| @add_start_docstrings_to_model_forward( | |||
| BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) | |||
| @add_code_sample_docstrings( | |||
| processor_class=_TOKENIZER_FOR_DOC, | |||
| checkpoint='bert-base-uncased', | |||
| output_type=CausalLMOutputWithCrossAttentions, | |||
| config_class=_CONFIG_FOR_DOC, | |||
| ) | |||
| def forward( | |||
| self, | |||
| input_ids=None, | |||
| attention_mask=None, | |||
| token_type_ids=None, | |||
| position_ids=None, | |||
| head_mask=None, | |||
| inputs_embeds=None, | |||
| encoder_hidden_states=None, | |||
| encoder_attention_mask=None, | |||
| labels=None, | |||
| past_key_values=None, | |||
| use_cache=None, | |||
| output_attentions=None, | |||
| output_hidden_states=None, | |||
| return_dict=None, | |||
| is_decoder=True, | |||
| reduction='mean', | |||
| soft_labels=None, | |||
| alpha=0, | |||
| return_logits=False, | |||
| ): | |||
| return_dict = return_dict if return_dict is not None else self.config.use_return_dict | |||
| if labels is not None: | |||
| use_cache = False | |||
| outputs = self.bert( | |||
| input_ids, | |||
| attention_mask=attention_mask, | |||
| token_type_ids=token_type_ids, | |||
| position_ids=position_ids, | |||
| head_mask=head_mask, | |||
| inputs_embeds=inputs_embeds, | |||
| encoder_hidden_states=encoder_hidden_states, | |||
| encoder_attention_mask=encoder_attention_mask, | |||
| past_key_values=past_key_values, | |||
| use_cache=use_cache, | |||
| output_attentions=output_attentions, | |||
| output_hidden_states=output_hidden_states, | |||
| return_dict=return_dict, | |||
| is_decoder=is_decoder, | |||
| ) | |||
| sequence_output = outputs[0] | |||
| prediction_scores = self.cls(sequence_output) | |||
| if return_logits: | |||
| return prediction_scores[:, :-1, :].contiguous() | |||
| lm_loss = None | |||
| if labels is not None: | |||
| # we are doing next-token prediction; shift prediction scores and input ids by one | |||
| shifted_prediction_scores = prediction_scores[:, : | |||
| -1, :].contiguous() | |||
| labels = labels[:, 1:].contiguous() | |||
| loss_fct = CrossEntropyLoss() | |||
| lm_loss = loss_fct( | |||
| shifted_prediction_scores.view(-1, self.config.vocab_size), | |||
| labels.view(-1)) | |||
| if soft_labels is not None: | |||
| loss_distill = -torch.sum( | |||
| F.log_softmax(shifted_prediction_scores, dim=1) * soft_labels, | |||
| dim=-1) | |||
| loss_distill = loss_distill[labels != -100].mean() | |||
| lm_loss = (1 - alpha) * lm_loss + alpha * loss_distill | |||
| if not return_dict: | |||
| output = (prediction_scores, ) + outputs[2:] | |||
| return ((lm_loss, ) + output) if lm_loss is not None else output | |||
| return CausalLMOutputWithCrossAttentions( | |||
| loss=lm_loss, | |||
| logits=prediction_scores, | |||
| past_key_values=outputs.past_key_values, | |||
| hidden_states=outputs.hidden_states, | |||
| attentions=outputs.attentions, | |||
| cross_attentions=outputs.cross_attentions, | |||
| ) | |||
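| # Loss summary: next-token cross-entropy on shifted logits, optionally blended with a | |||
| # soft-label distillation term as (1 - alpha) * lm_loss + alpha * loss_distill, where | |||
| # soft_labels are typically produced by a momentum (EMA) teacher and positions with | |||
| # label == -100 are ignored. | |||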
| class MPlug(PreTrainedModel): | |||
| config_class = MPlugConfig | |||
| def __init__(self, config): | |||
| super().__init__(config) | |||
| self.config = config | |||
| self.tokenizer = BertTokenizer.from_pretrained( | |||
| os.path.join(config.model_dir, VOCAB_NAME)) | |||
| os.path.join(config.model_dir, ModelFile.VOCAB_FILE)) | |||
| self.module_setting(config) | |||
| self.visual_encoder = self._initialize_clip(config) | |||
| self.text_encoder = BertModel( | |||
| self.config_encoder, add_pooling_layer=False) | |||
| self.fusion_encoder = FusionModel( | |||
| self.config_fusion, add_pooling_layer=False) | |||
| self.text_decoder = BertLMHeadModel(self.config_decoder) | |||
| self.init_distill(config) | |||
| self.beam_generator = TextGenerator(config, self.text_decoder) | |||
| @classmethod | |||
| def from_pretrained(cls, model_dir, load_checkpoint=True): | |||
| config = MPlugConfig.from_yaml_file( | |||
| from modelscope.utils.constant import Tasks | |||
| task_mapping = { | |||
| Tasks.visual_question_answering: MPlugForVisualQuestionAnswering, | |||
| Tasks.image_captioning: MPLUGForImageCaption | |||
| } | |||
| config = cls.config_class.from_yaml_file( | |||
| os.path.join(model_dir, CONFIG_NAME)) | |||
| config.model_dir = model_dir | |||
| model = cls(config) | |||
| model = task_mapping[config.task](config) | |||
| if load_checkpoint: | |||
| checkpoint_path = os.path.join(model_dir, WEIGHTS_NAME) | |||
| checkpoint_path = os.path.join(model_dir, | |||
| ModelFile.TORCH_MODEL_BIN_FILE) | |||
| checkpoint = torch.load(checkpoint_path, map_location='cpu') | |||
| if 'model' in checkpoint: | |||
| state_dict = checkpoint['model'] | |||
| @@ -1803,6 +1915,161 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel): | |||
| clip_model.visual.positional_embedding = pos_embed | |||
| return clip_model | |||
| def forward(self, *args, **kwargs): | |||
| raise NotImplementedError | |||
| def module_setting(self, config): | |||
| bert_config_path = os.path.join(config.model_dir, config.bert_config) | |||
| self.config_encoder = BertConfig.from_json_file(bert_config_path) | |||
| self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers | |||
| self.config_fusion = BertConfig.from_json_file(bert_config_path) | |||
| self.config_decoder = BertConfig.from_json_file(bert_config_path) | |||
| self.config_decoder.add_cross_attention = True | |||
| self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers | |||
| self.large = False | |||
| if self.config_encoder.hidden_size != config.vision_width: | |||
| self.visn_fc = nn.Linear(config.vision_width, | |||
| self.config_encoder.hidden_size) | |||
| self.visn_layer_norm = nn.LayerNorm( | |||
| self.config_encoder.hidden_size, eps=1e-12) | |||
| self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob) | |||
| self.large = True | |||
| @torch.no_grad() | |||
| def copy_params(self): | |||
| for model_pair in self.model_pairs: | |||
| for param, param_m in zip(model_pair[0].parameters(), | |||
| model_pair[1].parameters()): | |||
| param_m.data.copy_(param.data) # initialize | |||
| param_m.requires_grad = False # not update by gradient | |||
| @torch.no_grad() | |||
| def _momentum_update(self): | |||
| for model_pair in self.model_pairs: | |||
| for param, param_m in zip(model_pair[0].parameters(), | |||
| model_pair[1].parameters()): | |||
| param_m.data = param_m.data * self.momentum + param.data * ( | |||
| 1. - self.momentum) | |||
| def generation(self, question_states, question_atts, out_size=1): | |||
| encoder_inputs = [question_states, question_atts] | |||
| topk_ids, topk_scores = self.beam_generator.translate_batch( | |||
| encoder_inputs, out_size=out_size) | |||
| return topk_ids, topk_scores | |||
| @staticmethod | |||
| def _tile(x, dim, n_tile): | |||
| import numpy as np | |||
| init_dim = x.size(dim) | |||
| repeat_idx = [1] * x.dim() | |||
| repeat_idx[dim] = n_tile | |||
| x = x.repeat(*(repeat_idx)) | |||
| order_index = torch.LongTensor( | |||
| np.concatenate( | |||
| [init_dim * np.arange(n_tile) + i for i in range(init_dim)])) | |||
| return torch.index_select(x, dim, order_index.to(x.device)) | |||
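| # e.g. _tile(x, 0, 2) with x = [[a], [b]] gives [[a], [a], [b], [b]]: each row is repeated | |||
| # n_tile times consecutively, used to expand per-question states to per-candidate answers. | |||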
| def rank_answer(self, question_states, question_atts, answer_ids, | |||
| answer_atts, k): | |||
| num_ques = question_states.size(0) | |||
| start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token | |||
| start_output = self.text_decoder( | |||
| start_ids, | |||
| encoder_hidden_states=question_states, | |||
| encoder_attention_mask=question_atts, | |||
| return_dict=True, | |||
| reduction='none') | |||
| logits = start_output.logits[:, 0, :] # first token's logit | |||
| # topk_probs: top-k probability | |||
| # topk_ids: [num_question, k] | |||
| answer_first_token = answer_ids[:, 1] | |||
| prob_first_token = F.softmax( | |||
| logits, dim=1).index_select( | |||
| dim=1, index=answer_first_token) | |||
| topk_probs, topk_ids = prob_first_token.topk(k, dim=1) | |||
| # answer input: [num_question*k, answer_len] | |||
| input_ids = [] | |||
| input_atts = [] | |||
| for b, topk_id in enumerate(topk_ids): | |||
| input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) | |||
| input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) | |||
| input_ids = torch.cat(input_ids, dim=0) | |||
| input_atts = torch.cat(input_atts, dim=0) | |||
| targets_ids = input_ids.masked_fill( | |||
| input_ids == self.tokenizer.pad_token_id, -100) | |||
| # repeat encoder's output for top-k answers | |||
| question_states = self._tile(question_states, 0, k) | |||
| question_atts = self._tile(question_atts, 0, k) | |||
| output = self.text_decoder( | |||
| input_ids, | |||
| attention_mask=input_atts, | |||
| encoder_hidden_states=question_states, | |||
| encoder_attention_mask=question_atts, | |||
| labels=targets_ids, | |||
| return_dict=True, | |||
| reduction='none') | |||
| answer_loss = output.loss | |||
| answer_loss = answer_loss.view(input_ids.size(0), -1) | |||
| # topk_prob: first token probability | |||
| topk_probs = topk_probs.view(-1, 1) | |||
| log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1) | |||
| # re-calculate log probabilities for the answer sequences using chain rule | |||
| log_probs_sum = log_probs.sum(1) | |||
| log_probs_sum = log_probs_sum.view(num_ques, k) | |||
| topk_probs = F.softmax(log_probs_sum, dim=-1) | |||
| # get top-k after re-ranking | |||
| topk_probs, rerank_id = topk_probs.topk(k, dim=1) | |||
| topk_ids = torch.gather(topk_ids, 1, rerank_id) | |||
| return topk_ids, topk_probs | |||
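| # rank_answer in short: keep the k candidate answers whose first token is most probable | |||
| # given the question states, re-score each full candidate with the decoder's per-token | |||
| # loss, and return the re-ranked top-k ids and their normalized probabilities. | |||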
| class MPlugForVisualQuestionAnswering(MPlug): | |||
| def __init__(self, config): | |||
| super().__init__(config) | |||
| self.text_decoder = BertLMHeadModel(self.config_decoder) | |||
| self.beam_generator = TextGenerator(config, self.text_decoder) | |||
| self.init_distill(config) | |||
| def init_distill(self, config): | |||
| self.distill = config.distill | |||
| if self.distill: | |||
| self.visual_encoder_m = self._initialize_clip(config) | |||
| self.text_encoder_m = BertModel( | |||
| self.config_encoder, add_pooling_layer=False) | |||
| self.fusion_encoder_m = FusionModel( | |||
| self.config_fusion, add_pooling_layer=False) | |||
| self.text_decoder_m = BertLMHeadModel(self.config_decoder) | |||
| self.model_pairs = [ | |||
| [self.visual_encoder, self.visual_encoder_m], | |||
| [self.text_encoder, self.text_encoder_m], | |||
| [self.text_decoder, self.text_decoder_m], | |||
| ] | |||
| if self.config_encoder.hidden_size != config.vision_width: | |||
| self.visn_fc_m = nn.Linear(config.vision_width, | |||
| self.config_encoder.hidden_size) | |||
| self.visn_layer_norm_m = nn.LayerNorm( | |||
| self.config_encoder.hidden_size, eps=1e-12) | |||
| self.dropout_m = nn.Dropout( | |||
| self.config_encoder.hidden_dropout_prob) | |||
| self.model_pairs.extend( | |||
| [[self.visn_fc, self.visn_fc_m], | |||
| [self.visn_layer_norm, self.visn_layer_norm_m]]) | |||
| self.copy_params() | |||
| self.momentum = 0.995 | |||
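| # The *_m modules act as a momentum (EMA) teacher: copy_params initializes them from the | |||
| # online modules and freezes them, and _momentum_update keeps each teacher parameter as | |||
| # param_m = 0.995 * param_m + 0.005 * param at every training step. | |||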
| def forward(self, | |||
| image, | |||
| question, | |||
| @@ -1935,145 +2202,110 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel): | |||
| merge_text_attention) | |||
| return topk_ids, topk_probs | |||
| def module_setting(self, config): | |||
| bert_config_path = os.path.join(config.model_dir, config.bert_config) | |||
| self.config_encoder = BertConfig.from_json_file(bert_config_path) | |||
| self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers | |||
| self.config_fusion = BertConfig.from_json_file(bert_config_path) | |||
| self.config_decoder = BertConfig.from_json_file(bert_config_path) | |||
| self.config_decoder.add_cross_attention = True | |||
| self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers | |||
| self.large = False | |||
| if self.config_encoder.hidden_size != config.vision_width: | |||
| self.visn_fc = nn.Linear(config.vision_width, | |||
| self.config_encoder.hidden_size) | |||
| self.visn_layer_norm = nn.LayerNorm( | |||
| self.config_encoder.hidden_size, eps=1e-12) | |||
| self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob) | |||
| self.large = True | |||
| def init_distill(self, config): | |||
| self.distill = config.distill | |||
| if self.distill: | |||
| self.visual_encoder_m = self._initialize_clip(config) | |||
| self.text_encoder_m = BertModel( | |||
| self.config_encoder, add_pooling_layer=False) | |||
| self.fusion_encoder_m = FusionModel( | |||
| self.config_fusion, add_pooling_layer=False) | |||
| self.text_decoder_m = BertLMHeadModel(self.config_decoder) | |||
| self.model_pairs = [ | |||
| [self.visual_encoder, self.visual_encoder_m], | |||
| [self.text_encoder, self.text_encoder_m], | |||
| [self.text_decoder, self.text_decoder_m], | |||
| ] | |||
| if self.config_encoder.hidden_size != config.vision_width: | |||
| self.visn_fc_m = nn.Linear(config.vision_width, | |||
| self.config_encoder.hidden_size) | |||
| self.visn_layer_norm_m = nn.LayerNorm( | |||
| self.config_encoder.hidden_size, eps=1e-12) | |||
| self.dropout_m = nn.Dropout( | |||
| self.config_encoder.hidden_dropout_prob) | |||
| self.model_pairs.extend( | |||
| [[self.visn_fc, self.visn_fc_m], | |||
| [self.visn_layer_norm, self.visn_layer_norm_m]]) | |||
| self.copy_params() | |||
| self.momentum = 0.995 | |||
| @torch.no_grad() | |||
| def copy_params(self): | |||
| for model_pair in self.model_pairs: | |||
| for param, param_m in zip(model_pair[0].parameters(), | |||
| model_pair[1].parameters()): | |||
| param_m.data.copy_(param.data) # initialize | |||
| param_m.requires_grad = False # not update by gradient | |||
| @torch.no_grad() | |||
| def _momentum_update(self): | |||
| for model_pair in self.model_pairs: | |||
| for param, param_m in zip(model_pair[0].parameters(), | |||
| model_pair[1].parameters()): | |||
| param_m.data = param_m.data * self.momentum + param.data * ( | |||
| 1. - self.momentum) | |||
| def generation(self, question_states, question_atts): | |||
| encoder_inputs = [question_states, question_atts] | |||
| topk_ids, topk_scores = self.beam_generator.translate_batch( | |||
| encoder_inputs) | |||
| return topk_ids, topk_scores | |||
| @staticmethod | |||
| def _tile(x, dim, n_tile): | |||
| import numpy as np | |||
| init_dim = x.size(dim) | |||
| repeat_idx = [1] * x.dim() | |||
| repeat_idx[dim] = n_tile | |||
| x = x.repeat(*(repeat_idx)) | |||
| order_index = torch.LongTensor( | |||
| np.concatenate( | |||
| [init_dim * np.arange(n_tile) + i for i in range(init_dim)])) | |||
| return torch.index_select(x, dim, order_index.to(x.device)) | |||
| def rank_answer(self, question_states, question_atts, answer_ids, | |||
| answer_atts, k): | |||
| num_ques = question_states.size(0) | |||
| start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token | |||
| start_output = self.text_decoder( | |||
| start_ids, | |||
| encoder_hidden_states=question_states, | |||
| encoder_attention_mask=question_atts, | |||
| return_dict=True, | |||
| reduction='none') | |||
| logits = start_output.logits[:, 0, :] # first token's logit | |||
| class MPLUGForImageCaption(MPlug): | |||
| # topk_probs: top-k probability | |||
| # topk_ids: [num_question, k] | |||
| answer_first_token = answer_ids[:, 1] | |||
| prob_first_token = F.softmax( | |||
| logits, dim=1).index_select( | |||
| dim=1, index=answer_first_token) | |||
| topk_probs, topk_ids = prob_first_token.topk(k, dim=1) | |||
| # answer input: [num_question*k, answer_len] | |||
| input_ids = [] | |||
| input_atts = [] | |||
| for b, topk_id in enumerate(topk_ids): | |||
| input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) | |||
| input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) | |||
| input_ids = torch.cat(input_ids, dim=0) | |||
| input_atts = torch.cat(input_atts, dim=0) | |||
| targets_ids = input_ids.masked_fill( | |||
| input_ids == self.tokenizer.pad_token_id, -100) | |||
| # repeat encoder's output for top-k answers | |||
| question_states = self._tile(question_states, 0, k) | |||
| question_atts = self._tile(question_atts, 0, k) | |||
| def __init__(self, config): | |||
| super().__init__(config) | |||
| self.text_decoder = BertPrefixModel(self.config_decoder) | |||
| self.beam_generator = TextGenerator(config, self.text_decoder) | |||
| output = self.text_decoder( | |||
| input_ids, | |||
| attention_mask=input_atts, | |||
| encoder_hidden_states=question_states, | |||
| encoder_attention_mask=question_atts, | |||
| labels=targets_ids, | |||
| return_dict=True, | |||
| reduction='none') | |||
| def beam_search(self, | |||
| image, | |||
| question, | |||
| answer=None, | |||
| train=True, | |||
| out_size=5): | |||
| image_embeds = self.visual_encoder.visual(image, skip_last_layer=True) | |||
| if self.large: | |||
| image_embeds = self.dropout( | |||
| self.visn_layer_norm(self.visn_fc(image_embeds))) | |||
| image_atts = torch.ones( | |||
| image_embeds.size()[:-1], dtype=torch.long).to(image.device) | |||
| text_output = self.text_encoder( | |||
| question.input_ids, | |||
| attention_mask=question.attention_mask, | |||
| return_dict=True) | |||
| text_embeds = text_output.last_hidden_state | |||
| fusion_output = self.fusion_encoder( | |||
| encoder_embeds=text_embeds, | |||
| attention_mask=question.attention_mask, | |||
| encoder_hidden_states=image_embeds, | |||
| encoder_attention_mask=image_atts, | |||
| return_dict=False) | |||
| image_output, question_output = fusion_output | |||
| question_output = torch.cat([image_output, question_output], 1) | |||
| merge_text_attention = torch.cat([image_atts, question.attention_mask], | |||
| 1) | |||
| topk_ids, topk_probs = self.generation( | |||
| question_output, merge_text_attention, out_size=out_size) | |||
| return topk_ids, topk_probs | |||
| answer_loss = output.loss | |||
| answer_loss = answer_loss.view(input_ids.size(0), -1) | |||
| def forward(self, | |||
| image, | |||
| question, | |||
| answer=None, | |||
| train=True, | |||
| out_size=5, | |||
| scst=False): | |||
| if (scst): | |||
| return self.beam_search( | |||
| image, question, answer, train=True, out_size=out_size) | |||
| image = image.to(dtype=next(self.parameters()).dtype) | |||
| image_embeds = self.visual_encoder.visual(image, skip_last_layer=True) | |||
| if self.large: | |||
| image_embeds = self.dropout( | |||
| self.visn_layer_norm(self.visn_fc(image_embeds))) | |||
| image_atts = torch.ones( | |||
| image_embeds.size()[:-1], dtype=torch.long).to(image.device) | |||
| # topk_prob: first token probability | |||
| topk_probs = topk_probs.view(-1, 1) | |||
| log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1) | |||
| if train: | |||
| answer_targets = answer.input_ids.masked_fill( | |||
| answer.input_ids == self.tokenizer.pad_token_id, -100) | |||
| text_output = self.text_encoder( | |||
| question.input_ids, | |||
| attention_mask=question.attention_mask, | |||
| return_dict=True) | |||
| text_embeds = text_output.last_hidden_state | |||
| fusion_output = self.fusion_encoder( | |||
| encoder_embeds=text_embeds, | |||
| attention_mask=question.attention_mask, | |||
| encoder_hidden_states=image_embeds, | |||
| encoder_attention_mask=image_atts, | |||
| return_dict=False) | |||
| # re-calculate log probabilities for the answer sequences using chain rule | |||
| log_probs_sum = log_probs.sum(1) | |||
| log_probs_sum = log_probs_sum.view(num_ques, k) | |||
| image_output, question_output = fusion_output | |||
| topk_probs = F.softmax(log_probs_sum, dim=-1) | |||
| # get top-k after re-ranking | |||
| topk_probs, rerank_id = topk_probs.topk(k, dim=1) | |||
| topk_ids = torch.gather(topk_ids, 1, rerank_id) | |||
| question_output = torch.cat([image_output, question_output], 1) | |||
| merge_text_attention = torch.cat( | |||
| [image_atts, question.attention_mask], 1) | |||
| return topk_ids, topk_probs | |||
| answer_output = self.text_decoder( | |||
| answer.input_ids, | |||
| attention_mask=answer.attention_mask, | |||
| encoder_hidden_states=question_output, | |||
| encoder_attention_mask=merge_text_attention, | |||
| labels=answer_targets, | |||
| return_dict=True, | |||
| reduction='none') | |||
| loss = answer_output.loss | |||
| return loss | |||
| else: | |||
| text_output = self.text_encoder( | |||
| question.input_ids, | |||
| attention_mask=question.attention_mask, | |||
| return_dict=True) | |||
| text_embeds = text_output.last_hidden_state | |||
| fusion_output = self.fusion_encoder( | |||
| encoder_embeds=text_embeds, | |||
| attention_mask=question.attention_mask, | |||
| encoder_hidden_states=image_embeds, | |||
| encoder_attention_mask=image_atts, | |||
| return_dict=False) | |||
| image_output, question_output = fusion_output | |||
| question_output = torch.cat([image_output, question_output], 1) | |||
| merge_text_attention = torch.cat( | |||
| [image_atts, question.attention_mask], 1) | |||
| topk_ids, topk_probs = self.generation(question_output, | |||
| merge_text_attention) | |||
| return topk_ids, topk_probs | |||
| @@ -6,12 +6,13 @@ from modelscope.models.base import Tensor | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import Tasks | |||
| __all__ = ['MPlugForVisualQuestionAnswering'] | |||
| __all__ = ['MPlugForAllTasks'] | |||
| @MODELS.register_module( | |||
| Tasks.visual_question_answering, module_name=Models.mplug) | |||
| class MPlugForVisualQuestionAnswering(TorchModel): | |||
| @MODELS.register_module(Tasks.image_captioning, module_name=Models.mplug) | |||
| class MPlugForAllTasks(TorchModel): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """initialize the mplug model from the `model_dir` path. | |||
| @@ -20,8 +21,8 @@ class MPlugForVisualQuestionAnswering(TorchModel): | |||
| """ | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| from modelscope.models.multi_modal.mplug import MPlugForVisualQuestionAnswering | |||
| self.model = MPlugForVisualQuestionAnswering.from_pretrained(model_dir) | |||
| from modelscope.models.multi_modal.mplug import MPlug | |||
| self.model = MPlug.from_pretrained(model_dir) | |||
| self.tokenizer = self.model.tokenizer | |||
| def train(self): | |||
| @@ -44,4 +45,13 @@ class MPlugForVisualQuestionAnswering(TorchModel): | |||
| } | |||
| """ | |||
| return self.model(**input)[0] | |||
| topk_ids, _ = self.model(**input) | |||
| replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), | |||
| ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), | |||
| ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) | |||
| pred_string = self.tokenizer.decode(topk_ids[0][0]) | |||
| for _old, _new in replace_tokens_bert: | |||
| pred_string = pred_string.replace(_old, _new) | |||
| pred_string = pred_string.strip() | |||
| return pred_string | |||
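| # A minimal usage sketch (the model id below is a placeholder, not taken from this diff): | |||
| #   from modelscope.pipelines import pipeline | |||
| #   captioner = pipeline(Tasks.image_captioning, model='<some mplug image-captioning model id>') | |||
| #   result = captioner('demo.jpg')  # the forward above produces the cleaned caption string | |||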
| @@ -22,6 +22,8 @@ from transformers.models.bert.tokenization_bert import (BasicTokenizer, | |||
| WordpieceTokenizer) | |||
| from transformers.utils import logging | |||
| from modelscope.utils.constant import ModelFile | |||
| logger = logging.get_logger(__name__) | |||
| VOCAB_FILES_NAMES = {'vocab_file': 'vocab.json', 'merges_file': 'merges.txt'} | |||
| @@ -42,7 +44,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { | |||
| 'ofa-base': 1024, | |||
| } | |||
| VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'} | |||
| VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE} | |||
| PRETRAINED_VOCAB_FILES_MAP_ZH = { | |||
| 'vocab_file': { | |||
| @@ -20,6 +20,7 @@ from transformers import PreTrainedTokenizerFast | |||
| from transformers.models.bart.tokenization_bart_fast import BartTokenizerFast | |||
| from transformers.utils import logging | |||
| from modelscope.utils.constant import ModelFile | |||
| from .tokenization_ofa import OFATokenizer, OFATokenizerZH | |||
| logger = logging.get_logger(__name__) | |||
| @@ -50,7 +51,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { | |||
| 'ofa-base': 1024, | |||
| } | |||
| VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'} | |||
| VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE} | |||
| PRETRAINED_VOCAB_FILES_MAP_ZH = { | |||
| 'vocab_file': { | |||
| @@ -23,11 +23,12 @@ from typing import List, Optional, Tuple | |||
| from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control, | |||
| _is_punctuation, _is_whitespace) | |||
| from modelscope.utils.constant import ModelFile | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger(__name__) | |||
| VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} | |||
| VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE} | |||
| PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} | |||
| @@ -22,13 +22,14 @@ import transformers | |||
| from tokenizers import normalizers | |||
| from transformers.tokenization_utils_fast import PreTrainedTokenizerFast | |||
| from modelscope.utils.constant import ModelFile | |||
| from modelscope.utils.logger import get_logger | |||
| from .tokenization_sbert import SbertTokenizer | |||
| logger = get_logger(__name__) | |||
| VOCAB_FILES_NAMES = { | |||
| 'vocab_file': 'vocab.txt', | |||
| 'vocab_file': ModelFile.VOCAB_FILE, | |||
| 'tokenizer_file': 'tokenizer.json' | |||
| } | |||
| @@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path, | |||
| relative_to_absolute_path) | |||
| from modelscope.msdatasets.config import MS_DATASETS_CACHE | |||
| from modelscope.utils.config import ConfigDict | |||
| from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, | |||
| DatasetFormations, DownloadMode, Hubs) | |||
| from modelscope.utils.logger import get_logger | |||
| from .task_datasets.builder import build_task_dataset | |||
| from .utils.dataset_builder import ExternalDataset | |||
| from .utils.dataset_utils import (get_dataset_files, | |||
| get_target_dataset_structure, | |||
| load_dataset_builder) | |||
| @@ -67,9 +70,16 @@ class MsDataset: | |||
| def __len__(self): | |||
| return len(self._hf_ds) | |||
| @property | |||
| def config_kwargs(self): | |||
| if isinstance(self._hf_ds, ExternalDataset): | |||
| return self._hf_ds.config_kwargs | |||
| else: | |||
| return None | |||
| @classmethod | |||
| def from_hf_dataset(cls, | |||
| hf_ds: Union[Dataset, DatasetDict], | |||
| hf_ds: Union[Dataset, DatasetDict, ExternalDataset], | |||
| target: str = None) -> Union[dict, 'MsDataset']: | |||
| if isinstance(hf_ds, Dataset): | |||
| return cls(hf_ds, target) | |||
| @@ -77,6 +87,8 @@ class MsDataset: | |||
| if len(hf_ds.keys()) == 1: | |||
| return cls(next(iter(hf_ds.values())), target) | |||
| return {k: cls(v, target) for k, v in hf_ds.items()} | |||
| elif isinstance(hf_ds, ExternalDataset): | |||
| return cls(hf_ds) | |||
| else: | |||
| raise TypeError( | |||
| f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' | |||
| @@ -96,7 +108,8 @@ class MsDataset: | |||
| Mapping[str, Union[str, | |||
| Sequence[str]]]]] = None, | |||
| download_mode: Optional[DownloadMode] = DownloadMode. | |||
| REUSE_DATASET_IF_EXISTS | |||
| REUSE_DATASET_IF_EXISTS, | |||
| **config_kwargs, | |||
| ) -> Union[dict, 'MsDataset']: | |||
| """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. | |||
| Args: | |||
| @@ -113,6 +126,7 @@ class MsDataset: | |||
| hub (Hubs or str, optional): Which hub to load the dataset from. Default: Hubs.modelscope | |||
| download_mode (DownloadMode or str, optional): How to treat existing datasets. default | |||
| DownloadMode.REUSE_DATASET_IF_EXISTS | |||
| **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the underlying dataset builder. | |||
| Returns: | |||
| MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. | |||
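| Examples (illustrative; the dataset name and arguments below are placeholders, not taken from this diff): | |||
| ds = MsDataset.load('some_dataset', namespace='some_namespace', split='train') | |||
| # extra keyword arguments (**config_kwargs) are forwarded to the underlying dataset builder | |||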
| @@ -128,7 +142,8 @@ class MsDataset: | |||
| split=split, | |||
| data_dir=data_dir, | |||
| data_files=data_files, | |||
| download_mode=download_mode.value) | |||
| download_mode=download_mode.value, | |||
| **config_kwargs) | |||
| return MsDataset.from_hf_dataset(dataset, target=target) | |||
| elif hub == Hubs.modelscope: | |||
| return MsDataset._load_ms_dataset( | |||
| @@ -140,22 +155,22 @@ class MsDataset: | |||
| split=split, | |||
| data_dir=data_dir, | |||
| data_files=data_files, | |||
| download_mode=download_mode) | |||
| download_mode=download_mode, | |||
| **config_kwargs) | |||
| @staticmethod | |||
| def _load_ms_dataset( | |||
| dataset_name: Union[str, list], | |||
| namespace: Optional[str] = None, | |||
| target: Optional[str] = None, | |||
| version: Optional[str] = DEFAULT_DATASET_REVISION, | |||
| subset_name: Optional[str] = None, | |||
| split: Optional[str] = None, | |||
| data_dir: Optional[str] = None, | |||
| data_files: Optional[Union[str, Sequence[str], | |||
| Mapping[str, Union[str, | |||
| Sequence[str]]]]] = None, | |||
| download_mode: Optional[DownloadMode] = None | |||
| ) -> Union[dict, 'MsDataset']: | |||
| def _load_ms_dataset(dataset_name: Union[str, list], | |||
| namespace: Optional[str] = None, | |||
| target: Optional[str] = None, | |||
| version: Optional[str] = DEFAULT_DATASET_REVISION, | |||
| subset_name: Optional[str] = None, | |||
| split: Optional[str] = None, | |||
| data_dir: Optional[str] = None, | |||
| data_files: Optional[Union[ | |||
| str, Sequence[str], | |||
| Mapping[str, Union[str, Sequence[str]]]]] = None, | |||
| download_mode: Optional[DownloadMode] = None, | |||
| **config_kwargs) -> Union[dict, 'MsDataset']: | |||
| if isinstance(dataset_name, str): | |||
| dataset_formation = DatasetFormations.native | |||
| if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ | |||
| @@ -184,7 +199,8 @@ class MsDataset: | |||
| data_dir=data_dir, | |||
| data_files=data_files, | |||
| cache_dir=MS_DATASETS_CACHE, | |||
| download_mode=download_mode.value) | |||
| download_mode=download_mode.value, | |||
| **config_kwargs) | |||
| else: | |||
| dataset = MsDataset._load_from_ms( | |||
| dataset_name, | |||
| @@ -195,7 +211,7 @@ class MsDataset: | |||
| subset_name=subset_name, | |||
| split=split, | |||
| download_mode=download_mode, | |||
| ) | |||
| **config_kwargs) | |||
| elif isinstance(dataset_name, list): | |||
| if target is None: | |||
| target = 'target' | |||
| @@ -206,16 +222,15 @@ class MsDataset: | |||
| return MsDataset.from_hf_dataset(dataset, target=target) | |||
| @staticmethod | |||
| def _load_from_ms( | |||
| dataset_name: str, | |||
| dataset_files: dict, | |||
| download_dir: str, | |||
| namespace: Optional[str] = None, | |||
| version: Optional[str] = DEFAULT_DATASET_REVISION, | |||
| subset_name: Optional[str] = None, | |||
| split: Optional[str] = None, | |||
| download_mode: Optional[DownloadMode] = None, | |||
| ) -> Union[Dataset, DatasetDict]: | |||
| def _load_from_ms(dataset_name: str, | |||
| dataset_files: dict, | |||
| download_dir: str, | |||
| namespace: Optional[str] = None, | |||
| version: Optional[str] = DEFAULT_DATASET_REVISION, | |||
| subset_name: Optional[str] = None, | |||
| split: Optional[str] = None, | |||
| download_mode: Optional[DownloadMode] = None, | |||
| **config_kwargs) -> Union[Dataset, DatasetDict]: | |||
| for json_path in dataset_files['.json']: | |||
| if json_path.endswith(f'{dataset_name}.json'): | |||
| with open(json_path, encoding='utf-8') as dataset_json_file: | |||
| @@ -226,7 +241,6 @@ class MsDataset: | |||
| meta_map, file_map = get_dataset_files(target_dataset_structure, | |||
| dataset_name, namespace, | |||
| version) | |||
| builder = load_dataset_builder( | |||
| dataset_name, | |||
| subset_name, | |||
| @@ -235,7 +249,8 @@ class MsDataset: | |||
| zip_data_files=file_map, | |||
| cache_dir=MS_DATASETS_CACHE, | |||
| version=version, | |||
| split=list(target_dataset_structure.keys())) | |||
| split=list(target_dataset_structure.keys()), | |||
| **config_kwargs) | |||
| download_config = DownloadConfig( | |||
| cache_dir=download_dir, | |||
| @@ -253,7 +268,6 @@ class MsDataset: | |||
| data_dir=download_dir, | |||
| ) | |||
| builder.download_and_prepare( | |||
| download_config=download_config, | |||
| dl_manager=dl_manager, | |||
| download_mode=download_mode.value, | |||
| try_from_hf_gcs=False) | |||
| @@ -338,6 +352,8 @@ class MsDataset: | |||
| self, | |||
| columns: Union[str, List[str]] = None, | |||
| preprocessors: Union[Callable, List[Callable]] = None, | |||
| task_name: str = None, | |||
| task_data_config: ConfigDict = None, | |||
| **format_kwargs, | |||
| ): | |||
| """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to | |||
| @@ -350,6 +366,8 @@ class MsDataset: | |||
| columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the | |||
| preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, | |||
| the output fields of processors will also be added. | |||
| task_name (str, default None): task name, refer to :obj:`Tasks` for more details | |||
| task_data_config (ConfigDict, default None): config dict used to build the task-specific dataset. | |||
| format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. | |||
| Returns: | |||
| @@ -360,6 +378,10 @@ class MsDataset: | |||
| raise ImportError( | |||
| 'The function to_torch_dataset requires pytorch to be installed' | |||
| ) | |||
| if isinstance(self._hf_ds, ExternalDataset): | |||
| task_data_config.update({'preprocessor': preprocessors}) | |||
| return build_task_dataset(task_data_config, task_name, | |||
| self._hf_ds.config_kwargs) | |||
| if preprocessors is not None: | |||
| return self.to_torch_dataset_with_processors( | |||
| preprocessors, columns=columns) | |||
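| # Illustrative call (names are placeholders): for an ExternalDataset-backed MsDataset, | |||
| #   ms_ds.to_torch_dataset(preprocessors=my_preprocessor, | |||
| #                          task_name=Tasks.image_segmentation, | |||
| #                          task_data_config=cfg.dataset) | |||
| # builds the registered task dataset, passing the external split paths via config_kwargs. | |||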
| @@ -8,6 +8,7 @@ if TYPE_CHECKING: | |||
| from .builder import TASK_DATASETS, build_task_dataset | |||
| from .torch_base_dataset import TorchTaskDataset | |||
| from .veco_dataset import VecoDataset | |||
| from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset | |||
| else: | |||
| _import_structure = { | |||
| @@ -15,6 +16,8 @@ else: | |||
| 'builder': ['TASK_DATASETS', 'build_task_dataset'], | |||
| 'torch_base_dataset': ['TorchTaskDataset'], | |||
| 'veco_dataset': ['VecoDataset'], | |||
| 'image_instance_segmentation_coco_dataset': | |||
| ['ImageInstanceSegmentationCocoDataset'] | |||
| } | |||
| import sys | |||
| @@ -2,14 +2,32 @@ import os.path as osp | |||
| import numpy as np | |||
| from pycocotools.coco import COCO | |||
| from torch.utils.data import Dataset | |||
| class ImageInstanceSegmentationCocoDataset(Dataset): | |||
| from modelscope.metainfo import Models | |||
| from modelscope.utils.constant import Tasks | |||
| from .builder import TASK_DATASETS | |||
| from .torch_base_dataset import TorchTaskDataset | |||
| DATASET_STRUCTURE = { | |||
| 'train': { | |||
| 'annotation': 'annotations/instances_train.json', | |||
| 'images': 'images/train' | |||
| }, | |||
| 'validation': { | |||
| 'annotation': 'annotations/instances_val.json', | |||
| 'images': 'images/val' | |||
| } | |||
| } | |||
| @TASK_DATASETS.register_module( | |||
| module_name=Models.cascade_mask_rcnn_swin, | |||
| group_key=Tasks.image_segmentation) | |||
| class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): | |||
| """Coco-style dataset for image instance segmentation. | |||
| Args: | |||
| ann_file (str): Annotation file path. | |||
| split_config (dict): Mapping from split name to the data root directory, e.g. {"train": "/path/to/data"}. | |||
| classes (Sequence[str], optional): Specify classes to load. | |||
| If None, ``cls.CLASSES`` will be used. Default: None. | |||
| data_root (str, optional): Data root for ``ann_file``, | |||
| @@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||
| 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') | |||
| def __init__(self, | |||
| ann_file, | |||
| split_config: dict, | |||
| preprocessor=None, | |||
| classes=None, | |||
| data_root=None, | |||
| img_prefix='', | |||
| seg_prefix=None, | |||
| test_mode=False, | |||
| filter_empty_gt=True): | |||
| self.ann_file = ann_file | |||
| self.data_root = data_root | |||
| self.img_prefix = img_prefix | |||
| filter_empty_gt=True, | |||
| **kwargs): | |||
| self.data_root = next(iter(split_config.values())) | |||
| self.split = next(iter(split_config.keys())) | |||
| self.preprocessor = preprocessor | |||
| self.ann_file = osp.join(self.data_root, | |||
| DATASET_STRUCTURE[self.split]['annotation']) | |||
| self.img_prefix = osp.join(self.data_root, | |||
| DATASET_STRUCTURE[self.split]['images']) | |||
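| # e.g. split_config={'train': '/data/coco'} resolves to | |||
| #   ann_file   = '/data/coco/annotations/instances_train.json' | |||
| #   img_prefix = '/data/coco/images/train' | |||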
| self.seg_prefix = seg_prefix | |||
| self.test_mode = test_mode | |||
| self.filter_empty_gt = filter_empty_gt | |||
| self.CLASSES = self.get_classes(classes) | |||
| # join paths if data_root is specified | |||
| if self.data_root is not None: | |||
| if not osp.isabs(self.ann_file): | |||
| self.ann_file = osp.join(self.data_root, self.ann_file) | |||
| if not (self.img_prefix is None or osp.isabs(self.img_prefix)): | |||
| self.img_prefix = osp.join(self.data_root, self.img_prefix) | |||
| if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): | |||
| self.seg_prefix = osp.join(self.data_root, self.seg_prefix) | |||
| # load annotations | |||
| self.data_infos = self.load_annotations(self.ann_file) | |||
| @@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||
| # set group flag for the sampler | |||
| self._set_group_flag() | |||
| self.preprocessor = None | |||
| def __len__(self): | |||
| """Total number of samples of data.""" | |||
| return len(self.data_infos) | |||
| @@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||
| raise ValueError(f'Unsupported type {type(classes)} of classes.') | |||
| return class_names | |||
| def to_torch_dataset(self, preprocessors=None): | |||
| self.preprocessor = preprocessors | |||
| return self | |||
| @@ -8,6 +8,7 @@ from datasets.info import DatasetInfo | |||
| from datasets.packaged_modules import csv | |||
| from datasets.utils.filelock import FileLock | |||
| from modelscope.utils.constant import DownloadMode | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, | |||
| **config_kwargs, | |||
| ): | |||
| self.namespace = namespace | |||
| super().__init__( | |||
| cache_dir=cache_dir, | |||
| name=subset_name, | |||
| hash=hash, | |||
| namespace=namespace, | |||
| data_files=meta_data_files, | |||
| **config_kwargs) | |||
| @@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| os.rmdir(self._cache_dir) | |||
| self.zip_data_files = zip_data_files | |||
| def _relative_data_dir(self, with_version=True, with_hash=True) -> str: | |||
| """Relative path of this dataset in cache_dir: | |||
| Will be: | |||
| self.name/self.config.version/self.hash/ | |||
| or if a namespace has been specified: | |||
| self.namespace___self.name/self.config.version/self.hash/ | |||
| """ | |||
| builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}' | |||
| builder_config = self.config | |||
| hash = self.hash | |||
| if builder_config: | |||
| builder_data_dir = os.path.join(builder_data_dir, self.config_id) | |||
| if with_version: | |||
| builder_data_dir = os.path.join(builder_data_dir, | |||
| str(self.config.version)) | |||
| if with_hash and hash and isinstance(hash, str): | |||
| builder_data_dir = os.path.join(builder_data_dir, hash) | |||
| return builder_data_dir | |||
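| # e.g. namespace='damo', name='my_dataset', config_id='default', version='1.0.0', | |||
| # hash='abc123' (all values illustrative) -> 'damo___my_dataset/default/1.0.0/abc123' | |||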
| def _build_cache_dir(self): | |||
| builder_data_dir = os.path.join( | |||
| self._cache_dir_root, | |||
| @@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| datasets.SplitGenerator( | |||
| name=split_name, | |||
| gen_kwargs={ | |||
| 'files': dl_manager.iter_files(files), | |||
| 'base_dir': zip_data_files.get(split_name) | |||
| 'files': | |||
| dl_manager.iter_files(files), | |||
| 'base_dir': | |||
| os.path.join( | |||
| zip_data_files.get(split_name), | |||
| os.path.splitext( | |||
| self.zip_data_files.get(split_name))[0]) | |||
| if self.zip_data_files.get(split_name) else | |||
| zip_data_files.get(split_name) | |||
| })) | |||
| return splits | |||
| @@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
| logger.error( | |||
| f"Failed to read file '{file}' with error {type(e)}: {e}") | |||
| raise | |||
| class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): | |||
| def __init__( | |||
| self, | |||
| dataset_name: str, | |||
| cache_dir: str, | |||
| namespace: str, | |||
| subset_name: str, | |||
| hash: str, | |||
| meta_data_files: Mapping[str, Union[str, Sequence[str]]], | |||
| zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, | |||
| **config_kwargs, | |||
| ): | |||
| self.name = dataset_name | |||
| self.subset_name = subset_name | |||
| self.namespace = namespace | |||
| self.hash = hash | |||
| self.data_files = meta_data_files | |||
| self.zip_data_files = zip_data_files | |||
| self.split_path_dict = None | |||
| self.config = None | |||
| self._cache_dir_root = os.path.expanduser(cache_dir) | |||
| self._cache_dir = self._build_cache_dir() | |||
| self._config_kwargs = config_kwargs | |||
| def download_and_prepare(self, download_mode, dl_manager, | |||
| **download_kwargs): | |||
| # Prevent parallel disk operations | |||
| lock_path = os.path.join( | |||
| self._cache_dir_root, | |||
| self._cache_dir.replace(os.sep, '_') + '.lock') | |||
| with FileLock(lock_path): | |||
| data_exists = os.path.exists(self._cache_dir) | |||
| if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: | |||
| logger.warning( | |||
| f'Reusing dataset {self.name} ({self._cache_dir})') | |||
| return | |||
| logger.info(f'Generating dataset {self.name} ({self._cache_dir})') | |||
| self._download_and_prepare(dl_manager=dl_manager) | |||
| def _download_and_prepare(self, dl_manager): | |||
| split_path_dict = dl_manager.download_and_extract(self.zip_data_files) | |||
| self.split_path_dict = { | |||
| k: os.path.join(v, | |||
| os.path.splitext(self.zip_data_files[k])[0]) | |||
| for k, v in split_path_dict.items() | |||
| } | |||
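| # e.g. zip_data_files = {'train': 'train_images.zip'} downloaded and extracted under | |||
| # '<cache>/extracted/<hash>' gives split_path_dict = {'train': '<cache>/extracted/<hash>/train_images'} | |||
| # (paths are illustrative) | |||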
| def as_dataset(self): | |||
| return ExternalDataset(self.split_path_dict, self._config_kwargs) | |||
| class ExternalDataset(object): | |||
| def __init__(self, split_path_dict, config_kwargs): | |||
| config_kwargs.update({'split_config': split_path_dict}) | |||
| self.config_kwargs = config_kwargs | |||
| def __len__(self): | |||
| return len(self.config_kwargs['split_config']) | |||
| @@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder | |||
| from modelscope.utils.constant import DEFAULT_DATASET_REVISION | |||
| from modelscope.utils.logger import get_logger | |||
| from .dataset_builder import MsCsvDatasetBuilder | |||
| from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder | |||
| logger = get_logger() | |||
| @@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict, | |||
| modelscope_api = HubApi() | |||
| for split, info in subset_split_into.items(): | |||
| meta_map[split] = modelscope_api.get_dataset_file_url( | |||
| info['meta'], dataset_name, namespace, revision) | |||
| info.get('meta', ''), dataset_name, namespace, revision) | |||
| if info.get('file'): | |||
| file_map[split] = info['file'] | |||
| return meta_map, file_map | |||
| @@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, | |||
| zip_data_files: Mapping[str, Union[str, | |||
| Sequence[str]]], | |||
| cache_dir: str, version: Optional[Union[str]], | |||
| split: Sequence[str]) -> DatasetBuilder: | |||
| split: Sequence[str], | |||
| **config_kwargs) -> DatasetBuilder: | |||
| sub_dir = os.path.join(version, '_'.join(split)) | |||
| builder_instance = MsCsvDatasetBuilder( | |||
| dataset_name=dataset_name, | |||
| namespace=namespace, | |||
| cache_dir=cache_dir, | |||
| subset_name=subset_name, | |||
| meta_data_files=meta_data_files, | |||
| zip_data_files=zip_data_files, | |||
| hash=sub_dir) | |||
| meta_data_file = next(iter(meta_data_files.values())) | |||
| if not meta_data_file: | |||
| builder_instance = TaskSpecificDatasetBuilder( | |||
| dataset_name=dataset_name, | |||
| namespace=namespace, | |||
| cache_dir=cache_dir, | |||
| subset_name=subset_name, | |||
| meta_data_files=meta_data_files, | |||
| zip_data_files=zip_data_files, | |||
| hash=sub_dir, | |||
| **config_kwargs) | |||
| elif meta_data_file.endswith('.csv'): | |||
| builder_instance = MsCsvDatasetBuilder( | |||
| dataset_name=dataset_name, | |||
| namespace=namespace, | |||
| cache_dir=cache_dir, | |||
| subset_name=subset_name, | |||
| meta_data_files=meta_data_files, | |||
| zip_data_files=zip_data_files, | |||
| hash=sub_dir) | |||
| else: | |||
| raise NotImplementedError( | |||
| f'Dataset meta file extension "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet' | |||
| ) | |||
| return builder_instance | |||
| @@ -188,6 +188,16 @@ TASK_OUTPUTS = { | |||
| Tasks.body_2d_keypoints: | |||
| [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES], | |||
| # video single object tracking result for single video | |||
| # { | |||
| # "boxes": [ | |||
| # [x1, y1, x2, y2], | |||
| # [x1, y1, x2, y2], | |||
| # [x1, y1, x2, y2], | |||
| # ] | |||
| # } | |||
| Tasks.video_single_object_tracking: [OutputKeys.BOXES], | |||
| # live category recognition result for single video | |||
| # { | |||
| # "scores": [0.885272, 0.014790631, 0.014558001], | |||
| @@ -405,7 +415,7 @@ TASK_OUTPUTS = { | |||
| # audio processed for single file in PCM format | |||
| # { | |||
| # "output_pcm": np.array with shape(samples,) and dtype float32 | |||
| # "output_pcm": pcm encoded audio bytes | |||
| # } | |||
| Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM], | |||
| Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM], | |||
| @@ -417,6 +427,19 @@ TASK_OUTPUTS = { | |||
| # } | |||
| Tasks.text_to_speech: [OutputKeys.OUTPUT_PCM], | |||
| # { | |||
| # "kws_list": [ | |||
| # { | |||
| # 'keyword': '', # the keyword spotted | |||
| # 'offset': 19.4, # start time of the keyword, in seconds | |||
| # 'length': 0.68, # duration of the keyword, in seconds | |||
| # 'confidence': 0.85 # confidence that it is the keyword | |||
| # }, | |||
| # ... | |||
| # ] | |||
| # } | |||
| Tasks.keyword_spotting: [OutputKeys.KWS_LIST], | |||
| # ============ multi-modal tasks =================== | |||
| # image caption result for single sample | |||
| @@ -6,6 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .ans_pipeline import ANSPipeline | |||
| from .asr_inference_pipeline import AutomaticSpeechRecognitionPipeline | |||
| from .kws_farfield_pipeline import KWSFarfieldPipeline | |||
| from .kws_kwsbp_pipeline import KeyWordSpottingKwsbpPipeline | |||
| from .linear_aec_pipeline import LinearAECPipeline | |||
| from .text_to_speech_pipeline import TextToSpeechSambertHifiganPipeline | |||
| @@ -14,6 +15,7 @@ else: | |||
| _import_structure = { | |||
| 'ans_pipeline': ['ANSPipeline'], | |||
| 'asr_inference_pipeline': ['AutomaticSpeechRecognitionPipeline'], | |||
| 'kws_farfield_pipeline': ['KWSFarfieldPipeline'], | |||
| 'kws_kwsbp_pipeline': ['KeyWordSpottingKwsbpPipeline'], | |||
| 'linear_aec_pipeline': ['LinearAECPipeline'], | |||
| 'text_to_speech_pipeline': ['TextToSpeechSambertHifiganPipeline'], | |||
| @@ -0,0 +1,81 @@ | |||
| import io | |||
| import wave | |||
| from typing import Any, Dict | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.utils.constant import Tasks | |||
| @PIPELINES.register_module( | |||
| Tasks.keyword_spotting, | |||
| module_name=Pipelines.speech_dfsmn_kws_char_farfield) | |||
| class KWSFarfieldPipeline(Pipeline): | |||
| r"""A Keyword Spotting Inference Pipeline . | |||
| When invoke the class with pipeline.__call__(), it accept only one parameter: | |||
| inputs(str): the path of wav file | |||
| """ | |||
| SAMPLE_RATE = 16000 | |||
| SAMPLE_WIDTH = 2 | |||
| INPUT_CHANNELS = 3 | |||
| OUTPUT_CHANNELS = 2 | |||
| def __init__(self, model, **kwargs): | |||
| """ | |||
| Use `model` to create a far-field keyword spotting pipeline for prediction. | |||
| Args: | |||
| model: model id on the ModelScope hub. | |||
| """ | |||
| super().__init__(model=model, **kwargs) | |||
| self.model = self.model.to(self.device) | |||
| self.model.eval() | |||
| frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH | |||
| self._nframe = self.model.size_in // frame_size | |||
| self.frame_count = 0 | |||
| def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: | |||
| if isinstance(inputs, bytes): | |||
| return dict(input_file=inputs) | |||
| elif isinstance(inputs, Dict): | |||
| return inputs | |||
| else: | |||
| raise ValueError(f'Not supported input type: {type(inputs)}') | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| input_file = inputs['input_file'] | |||
| if isinstance(input_file, bytes): | |||
| input_file = io.BytesIO(input_file) | |||
| self.frame_count = 0 | |||
| kws_list = [] | |||
| with wave.open(input_file, 'rb') as fin: | |||
| if 'output_file' in inputs: | |||
| with wave.open(inputs['output_file'], 'wb') as fout: | |||
| fout.setframerate(self.SAMPLE_RATE) | |||
| fout.setnchannels(self.OUTPUT_CHANNELS) | |||
| fout.setsampwidth(self.SAMPLE_WIDTH) | |||
| self._process(fin, kws_list, fout) | |||
| else: | |||
| self._process(fin, kws_list) | |||
| return {OutputKeys.KWS_LIST: kws_list} | |||
| def _process(self, | |||
| fin: wave.Wave_read, | |||
| kws_list, | |||
| fout: wave.Wave_write = None): | |||
| data = fin.readframes(self._nframe) | |||
| while len(data) >= self.model.size_in: | |||
| self.frame_count += self._nframe | |||
| result = self.model.forward_decode(data) | |||
| if fout: | |||
| fout.writeframes(result['pcm']) | |||
| if 'kws' in result: | |||
| result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE | |||
| kws_list.append(result['kws']) | |||
| data = fin.readframes(self._nframe) | |||
| def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: | |||
| return inputs | |||
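A hedged usage sketch for this new far-field KWS pipeline, based on the preprocess/forward signatures above; the model id and wav path are placeholders, and the input is assumed to be a 3-channel, 16 kHz, 16-bit PCM wav as implied by the class constants:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# '<far-field-kws-model-id>' is a placeholder; substitute a real model id from the hub.
kws = pipeline(Tasks.keyword_spotting, model='<far-field-kws-model-id>')

with open('farfield_3ch_16k.wav', 'rb') as f:   # placeholder path; 3-channel, 16 kHz, 16-bit PCM
    wav_bytes = f.read()

# Raw bytes go straight to preprocess(); the dict form can also request a processed output file.
result = kws(wav_bytes)
# result = kws({'input_file': wav_bytes, 'output_file': 'processed.wav'})
print(result['kws_list'])
```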
| @@ -255,7 +255,7 @@ class Pipeline(ABC): | |||
| return self._collate_fn(torch.from_numpy(data)) | |||
| elif isinstance(data, torch.Tensor): | |||
| return data.to(self.device) | |||
| elif isinstance(data, (str, int, float, bool, type(None))): | |||
| elif isinstance(data, (bytes, str, int, float, bool, type(None))): | |||
| return data | |||
| elif isinstance(data, InputFeatures): | |||
| return data | |||
| @@ -124,12 +124,16 @@ DEFAULT_MODEL_FOR_PIPELINE = { | |||
| Tasks.image_classification: | |||
| (Pipelines.daily_image_classification, | |||
| 'damo/cv_vit-base_image-classification_Dailylife-labels'), | |||
| Tasks.ocr_recognition: (Pipelines.ocr_recognition, | |||
| 'damo/cv_convnextTiny_ocr-recognition_damo'), | |||
| Tasks.ocr_recognition: | |||
| (Pipelines.ocr_recognition, | |||
| 'damo/cv_convnextTiny_ocr-recognition-general_damo'), | |||
| Tasks.skin_retouching: (Pipelines.skin_retouching, | |||
| 'damo/cv_unet_skin-retouching'), | |||
| Tasks.crowd_counting: (Pipelines.crowd_counting, | |||
| 'damo/cv_hrnet_crowd-counting_dcanet'), | |||
| Tasks.video_single_object_tracking: | |||
| (Pipelines.video_single_object_tracking, | |||
| 'damo/cv_vitb_video-single-object-tracking_ostrack'), | |||
| } | |||
| @@ -10,6 +10,7 @@ if TYPE_CHECKING: | |||
| from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline | |||
| from .crowd_counting_pipeline import CrowdCountingPipeline | |||
| from .image_detection_pipeline import ImageDetectionPipeline | |||
| from .image_salient_detection_pipeline import ImageSalientDetectionPipeline | |||
| from .face_detection_pipeline import FaceDetectionPipeline | |||
| from .face_image_generation_pipeline import FaceImageGenerationPipeline | |||
| from .face_recognition_pipeline import FaceRecognitionPipeline | |||
| @@ -43,6 +44,7 @@ else: | |||
| 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], | |||
| 'crowd_counting_pipeline': ['CrowdCountingPipeline'], | |||
| 'image_detection_pipeline': ['ImageDetectionPipeline'], | |||
| 'image_salient_detection_pipeline': ['ImageSalientDetectionPipeline'], | |||
| 'face_detection_pipeline': ['FaceDetectionPipeline'], | |||
| 'face_image_generation_pipeline': ['FaceImageGenerationPipeline'], | |||
| 'face_recognition_pipeline': ['FaceRecognitionPipeline'], | |||
| @@ -0,0 +1,47 @@ | |||
| from typing import Any, Dict | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import LoadImage | |||
| from modelscope.utils.constant import Tasks | |||
| @PIPELINES.register_module( | |||
| Tasks.image_segmentation, module_name=Pipelines.salient_detection) | |||
| class ImageSalientDetectionPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model, auto_collate=False, **kwargs) | |||
| def preprocess(self, input: Input) -> Dict[str, Any]: | |||
| img = LoadImage.convert_to_ndarray(input) | |||
| img_h, img_w, _ = img.shape | |||
| img = self.model.preprocess(img) | |||
| result = {'img': img, 'img_w': img_w, 'img_h': img_h} | |||
| return result | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| outputs = self.model.inference(input['img']) | |||
| result = { | |||
| 'data': outputs, | |||
| 'img_w': input['img_w'], | |||
| 'img_h': input['img_h'] | |||
| } | |||
| return result | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| data = self.model.postprocess(inputs) | |||
| outputs = { | |||
| OutputKeys.SCORES: None, | |||
| OutputKeys.LABELS: None, | |||
| OutputKeys.MASKS: data | |||
| } | |||
| return outputs | |||
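A possible invocation of this salient detection pipeline, sketched from the registration above; the model id is a placeholder and 'demo.jpg' is an assumed local image:

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# '<u2net-salient-detection-model-id>' is a placeholder for the registered model on the hub.
salient = pipeline(Tasks.image_segmentation, model='<u2net-salient-detection-model-id>')

result = salient('demo.jpg')            # placeholder image path
mask = result[OutputKeys.MASKS]         # saliency mask; SCORES and LABELS are None here
```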
| @@ -0,0 +1,80 @@ | |||
| import os.path as osp | |||
| from typing import Any, Dict | |||
| import cv2 | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ | |||
| cfg | |||
| from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \ | |||
| OSTrack | |||
| from modelscope.models.cv.video_single_object_tracking.utils.utils import \ | |||
| check_box | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.video_single_object_tracking, | |||
| module_name=Pipelines.video_single_object_tracking) | |||
| class VideoSingleObjectTrackingPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` to create a single object tracking pipeline | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model, **kwargs) | |||
| self.cfg = cfg | |||
| ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE) | |||
| logger.info(f'loading model from {ckpt_path}') | |||
| self.tracker = OSTrack(ckpt_path, self.device) | |||
| logger.info('init tracker done') | |||
| def preprocess(self, input) -> Input: | |||
| self.video_path = input[0] | |||
| self.init_bbox = input[1] | |||
| return input | |||
| def forward(self, input: Input) -> Dict[str, Any]: | |||
| output_boxes = [] | |||
| cap = cv2.VideoCapture(self.video_path) | |||
| success, frame = cap.read() | |||
| if success is False: | |||
| raise Exception( | |||
| 'modelscope error: %s can not be decoded by OpenCV.' % | |||
| (self.video_path)) | |||
| init_box = self.init_bbox | |||
| frame_h, frame_w = frame.shape[0:2] | |||
| if not check_box(init_box, frame_h, frame_w): | |||
| raise Exception('modelscope error: init_box out of image range ', | |||
| init_box) | |||
| output_boxes.append(init_box.copy()) | |||
| init_box[2] = init_box[2] - init_box[0] | |||
| init_box[3] = init_box[3] - init_box[1] | |||
| self.tracker.initialize(frame, {'init_bbox': init_box}) | |||
| logger.info('init bbox done') | |||
| while True: | |||
| ret, frame = cap.read() | |||
| if frame is None: | |||
| break | |||
| out = self.tracker.track(frame) | |||
| state = [int(s) for s in out['target_bbox']] | |||
| output_boxes.append(state) | |||
| cap.release() | |||
| logger.info('tracking process done') | |||
| return { | |||
| OutputKeys.BOXES: output_boxes, | |||
| } | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
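A sketch of how the tracking pipeline above could be driven end to end; the model id comes from the default-model table earlier in this diff, while the video path and the initial box are illustrative:

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

tracker = pipeline(Tasks.video_single_object_tracking,
                   model='damo/cv_vitb_video-single-object-tracking_ostrack')

init_box = [414, 343, 514, 449]           # [x1, y1, x2, y2] of the target in the first frame (illustrative)
result = tracker(('dog.avi', init_box))   # placeholder video path
print(result[OutputKeys.BOXES][:3])       # one [x1, y1, x2, y2] box per decoded frame
```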
| @@ -1,11 +1,15 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Optional, Union | |||
| import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.multi_modal import OfaForAllTasks | |||
| from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Model, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import OfaPreprocessor, Preprocessor | |||
| from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, | |||
| Preprocessor) | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| @@ -35,9 +39,19 @@ class ImageCaptioningPipeline(Pipeline): | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| if preprocessor is None: | |||
| if isinstance(pipe_model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(pipe_model.model_dir) | |||
| elif isinstance(pipe_model, MPlugForAllTasks): | |||
| preprocessor = MPlugPreprocessor(pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| with torch.no_grad(): | |||
| return super().forward(inputs, **forward_params) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| if isinstance(self.model, OfaForAllTasks): | |||
| return inputs | |||
| return {OutputKeys.CAPTION: inputs} | |||
| @@ -5,13 +5,12 @@ import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models import Model | |||
| from modelscope.models.multi_modal import (MPlugForVisualQuestionAnswering, | |||
| OfaForAllTasks) | |||
| from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Pipeline, Tensor | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import (MPlugVisualQuestionAnsweringPreprocessor, | |||
| OfaPreprocessor) | |||
| from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, | |||
| Preprocessor) | |||
| from modelscope.utils.constant import Tasks | |||
| __all__ = ['VisualQuestionAnsweringPipeline'] | |||
| @@ -23,9 +22,8 @@ __all__ = ['VisualQuestionAnsweringPipeline'] | |||
| class VisualQuestionAnsweringPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[MPlugForVisualQuestionAnswering, str], | |||
| preprocessor: Optional[ | |||
| MPlugVisualQuestionAnsweringPreprocessor] = None, | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """use `model` and `preprocessor` to create a visual question answering pipeline for prediction | |||
| @@ -35,18 +33,12 @@ class VisualQuestionAnsweringPipeline(Pipeline): | |||
| """ | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| self.tokenizer = None | |||
| if preprocessor is None: | |||
| if isinstance(model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(model.model_dir) | |||
| elif isinstance(model, MPlugForVisualQuestionAnswering): | |||
| preprocessor = MPlugVisualQuestionAnsweringPreprocessor( | |||
| model.model_dir) | |||
| if isinstance(model, MPlugForVisualQuestionAnswering): | |||
| model.eval() | |||
| self.tokenizer = model.tokenizer | |||
| else: | |||
| model.model.eval() | |||
| elif isinstance(model, MPlugForAllTasks): | |||
| preprocessor = MPlugPreprocessor(model.model_dir) | |||
| model.model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| def forward(self, inputs: Dict[str, Any], | |||
| @@ -64,14 +56,6 @@ class VisualQuestionAnsweringPipeline(Pipeline): | |||
| Returns: | |||
| Dict[str, str]: the prediction results | |||
| """ | |||
| if self.tokenizer is None: | |||
| if isinstance(self.model, OfaForAllTasks): | |||
| return inputs | |||
| replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), | |||
| ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), | |||
| ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) | |||
| pred_string = self.tokenizer.decode(inputs[0][0]) | |||
| for _old, _new in replace_tokens_bert: | |||
| pred_string = pred_string.replace(_old, _new) | |||
| pred_string.strip() | |||
| return {OutputKeys.TEXT: pred_string} | |||
| return {OutputKeys.TEXT: inputs} | |||
| @@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .base import Preprocessor | |||
| from .builder import PREPROCESSORS, build_preprocessor | |||
| from .common import Compose | |||
| from .common import Compose, ToTensor, Filter | |||
| from .asr import WavToScp | |||
| from .audio import LinearAECAndFbank | |||
| from .image import (LoadImage, load_image, | |||
| @@ -14,8 +14,7 @@ if TYPE_CHECKING: | |||
| ImageInstanceSegmentationPreprocessor, | |||
| ImageDenoisePreprocessor) | |||
| from .kws import WavToLists | |||
| from .multi_modal import (OfaPreprocessor, | |||
| MPlugVisualQuestionAnsweringPreprocessor) | |||
| from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) | |||
| from .nlp import (Tokenize, SequenceClassificationPreprocessor, | |||
| TextGenerationPreprocessor, | |||
| TokenClassificationPreprocessor, | |||
| @@ -33,7 +32,7 @@ else: | |||
| _import_structure = { | |||
| 'base': ['Preprocessor'], | |||
| 'builder': ['PREPROCESSORS', 'build_preprocessor'], | |||
| 'common': ['Compose'], | |||
| 'common': ['Compose', 'ToTensor', 'Filter'], | |||
| 'audio': ['LinearAECAndFbank'], | |||
| 'asr': ['WavToScp'], | |||
| 'video': ['ReadVideoData'], | |||
| @@ -42,8 +41,7 @@ else: | |||
| 'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor' | |||
| ], | |||
| 'kws': ['WavToLists'], | |||
| 'multi_modal': | |||
| ['OfaPreprocessor', 'MPlugVisualQuestionAnsweringPreprocessor'], | |||
| 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], | |||
| 'nlp': [ | |||
| 'Tokenize', 'SequenceClassificationPreprocessor', | |||
| 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', | |||
| @@ -2,6 +2,10 @@ | |||
| import time | |||
| from collections.abc import Sequence | |||
| from typing import Mapping | |||
| import numpy as np | |||
| import torch | |||
| from .builder import PREPROCESSORS, build_preprocessor | |||
| @@ -25,12 +29,18 @@ class Compose(object): | |||
| if isinstance(transform, dict): | |||
| if self.field_name is None: | |||
| transform = build_preprocessor(transform, field_name) | |||
| self.transforms.append(transform) | |||
| else: | |||
| # if not found key in field_name, try field_name=None(default_group) | |||
| try: | |||
| transform = build_preprocessor(transform, field_name) | |||
| except KeyError: | |||
| transform = build_preprocessor(transform, None) | |||
| elif callable(transform): | |||
| self.transforms.append(transform) | |||
| pass | |||
| else: | |||
| raise TypeError('transform must be callable or a dict, but got' | |||
| f' {type(transform)}') | |||
| self.transforms.append(transform) | |||
| def __call__(self, data): | |||
| for t in self.transforms: | |||
| @@ -52,3 +62,82 @@ class Compose(object): | |||
| format_string += f'\n {t}' | |||
| format_string += '\n)' | |||
| return format_string | |||
| def to_tensor(data): | |||
| """Convert objects of various python types to :obj:`torch.Tensor`. | |||
| Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, | |||
| :class:`Sequence`, :class:`int` and :class:`float`. | |||
| Args: | |||
| data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to | |||
| be converted. | |||
| """ | |||
| if isinstance(data, torch.Tensor): | |||
| return data | |||
| elif isinstance(data, np.ndarray): | |||
| return torch.from_numpy(data) | |||
| elif isinstance(data, Sequence) and not isinstance(data, str): | |||
| return torch.tensor(data) | |||
| elif isinstance(data, int): | |||
| return torch.LongTensor([data]) | |||
| elif isinstance(data, float): | |||
| return torch.FloatTensor([data]) | |||
| else: | |||
| raise TypeError(f'type {type(data)} cannot be converted to tensor.') | |||
| @PREPROCESSORS.register_module() | |||
| class ToTensor(object): | |||
| """Convert target object to tensor. | |||
| Args: | |||
| keys (Sequence[str]): Keys of the data to be converted to tensors. | |||
| Only valid when the data is a `Mapping`. If `keys` is None, | |||
| all values will be converted to tensors by default. | |||
| """ | |||
| def __init__(self, keys=None): | |||
| self.keys = keys | |||
| def __call__(self, data): | |||
| if isinstance(data, Mapping): | |||
| if self.keys is None: | |||
| self.keys = list(data.keys()) | |||
| for key in self.keys: | |||
| data[key] = to_tensor(data[key]) | |||
| else: | |||
| data = to_tensor(data) | |||
| return data | |||
| def __repr__(self): | |||
| return self.__class__.__name__ + f'(keys={self.keys})' | |||
| @PREPROCESSORS.register_module() | |||
| class Filter(object): | |||
| """This is usually the last stage of the dataloader transform. | |||
| Only data under the reserved keys is kept and passed directly to the model; everything else is removed. | |||
| Args: | |||
| reserved_keys (Sequence[str]): Keys of the data to keep; all other keys will be removed. | |||
| """ | |||
| def __init__(self, reserved_keys): | |||
| self.reserved_keys = reserved_keys | |||
| def __call__(self, data): | |||
| assert isinstance(data, Mapping) | |||
| reserved_data = {} | |||
| for key in self.reserved_keys: | |||
| reserved_data[key] = data[key] | |||
| return reserved_data | |||
| def __repr__(self): | |||
| return self.__class__.__name__ + f'(keys={self.reserved_keys})' | |||
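The two new transforms compose naturally: `ToTensor` converts selected values, then `Filter` strips everything the model does not need. A small self-contained sketch (the key names are illustrative):

```python
import numpy as np

from modelscope.preprocessors import Filter, ToTensor  # exported via the updated __init__ above

sample = {
    'input': np.zeros((3, 8, 8), dtype=np.float32),
    'target': np.ones((3, 8, 8), dtype=np.float32),
    'meta': 'frame_0001.png',            # illustrative extra field
}

sample = ToTensor(keys=['input', 'target'])(sample)         # selected values become torch.Tensor
sample = Filter(reserved_keys=['input', 'target'])(sample)  # 'meta' is dropped
print(sorted(sample.keys()))  # ['input', 'target']
```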
| @@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor): | |||
| super().__init__(*args, **kwargs) | |||
| self.model_dir: str = model_dir | |||
| from .common import Filter | |||
| # TODO: `Filter` should be moved to the configuration file of each model | |||
| self._transforms = [Filter(reserved_keys=['input', 'target'])] | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| """process the raw input data | |||
| @@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor): | |||
| Returns: | |||
| Dict[str, Any]: the preprocessed data | |||
| """ | |||
| for t in self._transforms: | |||
| data = t(data) | |||
| return data | |||
| @@ -19,7 +19,7 @@ from .ofa.utils.collate import collate_fn | |||
| __all__ = [ | |||
| 'OfaPreprocessor', | |||
| 'MPlugVisualQuestionAnsweringPreprocessor', | |||
| 'MPlugPreprocessor', | |||
| ] | |||
| @@ -28,7 +28,7 @@ __all__ = [ | |||
| class OfaPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -102,39 +102,55 @@ class OfaPreprocessor(Preprocessor): | |||
| @PREPROCESSORS.register_module( | |||
| Fields.multi_modal, | |||
| module_name=Preprocessors.mplug_visual_question_answering) | |||
| class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor): | |||
| Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor) | |||
| class MPlugPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via 'bert-base-uncased' tokenizer and configuration | |||
| """ | |||
| from transformers import BertTokenizer | |||
| from modelscope.models.multi_modal.mplug import CONFIG_NAME, VOCAB_NAME, MPlugConfig | |||
| super().__init__(*args, **kwargs) | |||
| self.model_dir = model_dir | |||
| # tokenizer | |||
| self.tokenizer = BertTokenizer.from_pretrained( | |||
| osp.join(model_dir, VOCAB_NAME)) | |||
| self._tokenizer = None | |||
| self._patch_resize_transform = None | |||
| # load configuration | |||
| config = MPlugConfig.from_yaml_file(osp.join(model_dir, CONFIG_NAME)) | |||
| @property | |||
| def tokenizer(self): | |||
| from transformers import BertTokenizer | |||
| # Initialize transform | |||
| from torchvision import transforms | |||
| mean = (0.48145466, 0.4578275, 0.40821073) | |||
| std = (0.26862954, 0.26130258, 0.27577711) | |||
| if self._tokenizer is None: | |||
| self._tokenizer = BertTokenizer.from_pretrained(self.model_dir) | |||
| return self._tokenizer | |||
| @property | |||
| def patch_resize_transform(self): | |||
| if self._patch_resize_transform is None: | |||
| from torchvision import transforms | |||
| from modelscope.models.multi_modal.mplug import CONFIG_NAME, MPlugConfig | |||
| config = MPlugConfig.from_yaml_file( | |||
| osp.join(self.model_dir, CONFIG_NAME)) | |||
| mean = (0.48145466, 0.4578275, 0.40821073) | |||
| std = (0.26862954, 0.26130258, 0.27577711) | |||
| self._patch_resize_transform = transforms.Compose([ | |||
| transforms.Resize((config.image_res, config.image_res), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=mean, std=std), | |||
| ]) | |||
| return self._patch_resize_transform | |||
| def __call__(self, *args, **kwargs): | |||
| call_mapping = { | |||
| Tasks.visual_question_answering: self.vqa_call, | |||
| Tasks.image_captioning: self.caption_call | |||
| } | |||
| self.patch_resize_transform = transforms.Compose([ | |||
| transforms.Resize((config.image_res, config.image_res), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=mean, std=std), | |||
| ]) | |||
| self.cfg = Config.from_file( | |||
| osp.join(self.model_dir, ModelFile.CONFIGURATION)) | |||
| return call_mapping[self.cfg.task](*args, **kwargs) | |||
| def __call__(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]: | |||
| def vqa_call(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]: | |||
| image: Image.Image = data[0] if isinstance(data, | |||
| tuple) else data['image'] | |||
| question: str = data[1] if isinstance(data, | |||
| @@ -147,3 +163,19 @@ class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor): | |||
| return_tensors='pt') | |||
| return {'image': image, 'question': question, 'train': False} | |||
| def caption_call( | |||
| self, data: Union[Image.Image, tuple, | |||
| Dict[str, Any]]) -> Dict[str, Any]: | |||
| if isinstance(data, Image.Image): | |||
| image = data | |||
| elif isinstance(data, tuple): | |||
| image = data[0] | |||
| else: | |||
| image = data['image'] | |||
| image = image.convert('RGB') | |||
| image = self.patch_resize_transform(image) | |||
| image = torch.stack([image], dim=0) | |||
| question = self.tokenizer('', return_tensors='pt') | |||
| return {'image': image, 'question': question, 'train': False} | |||
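A hedged sketch of using the merged `MPlugPreprocessor`: the task-specific call (`vqa_call` vs `caption_call`) is resolved from the model's configuration file, so the behaviour below depends on what `cfg.task` says in the assumed `model_dir`:

```python
from PIL import Image

from modelscope.preprocessors import MPlugPreprocessor

# '/path/to/mplug_model' is a placeholder; it is assumed to contain the MPlug
# configuration, vocab and yaml config that the lazy properties above read.
preprocessor = MPlugPreprocessor('/path/to/mplug_model')

image = Image.open('demo.jpg')                            # placeholder image
inputs = preprocessor((image, 'what is the dog doing?'))  # routed to vqa_call for a VQA model
# For an image-captioning model, a bare image is enough:
# inputs = preprocessor(image)                            # routed to caption_call
```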
| @@ -4,6 +4,7 @@ import os.path as osp | |||
| import uuid | |||
| from typing import Any, Dict, Iterable, Optional, Tuple, Union | |||
| import numpy as np | |||
| from transformers import AutoTokenizer | |||
| from modelscope.metainfo import Models, Preprocessors | |||
| @@ -43,7 +44,7 @@ class Tokenize(Preprocessor): | |||
| class SequenceClassificationPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor): | |||
| text_b, | |||
| return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, | |||
| **self.tokenize_kwargs) | |||
| output = { | |||
| k: np.array(v) if isinstance(v, list) else v | |||
| for k, v in output.items() | |||
| } | |||
| self.labels_to_id(labels, output) | |||
| return output | |||
| @@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor): | |||
| if labels is not None: | |||
| if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ | |||
| and self.label2id is not None: | |||
| output[OutputKeys.LABEL] = [ | |||
| output[OutputKeys.LABELS] = [ | |||
| self.label2id[str(label)] for label in labels | |||
| ] | |||
| elif label_can_be_mapped(labels) and self.label2id is not None: | |||
| output[OutputKeys.LABEL] = self.label2id[str(labels)] | |||
| output[OutputKeys.LABELS] = self.label2id[str(labels)] | |||
| else: | |||
| output[OutputKeys.LABEL] = labels | |||
| output[OutputKeys.LABELS] = labels | |||
| @PREPROCESSORS.register_module( | |||
| @@ -286,7 +291,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): | |||
| """ | |||
| def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -517,7 +522,7 @@ class NERPreprocessor(Preprocessor): | |||
| """ | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -609,7 +614,7 @@ class TextErrorCorrectionPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| from fairseq.data import Dictionary | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data via the vocab file from the `model_dir` path | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -22,7 +22,7 @@ __all__ = ['DialogIntentPredictionPreprocessor'] | |||
| class DialogIntentPredictionPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -20,7 +20,7 @@ __all__ = ['DialogModelingPreprocessor'] | |||
| class DialogModelingPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -17,7 +17,7 @@ __all__ = ['DialogStateTrackingPreprocessor'] | |||
| class DialogStateTrackingPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -8,6 +8,7 @@ from itertools import chain | |||
| import numpy as np | |||
| from modelscope.preprocessors.space.tokenizer import Tokenizer | |||
| from modelscope.utils.constant import ModelFile | |||
| from modelscope.utils.logger import get_logger | |||
| from modelscope.utils.nlp.space import ontology, utils | |||
| from modelscope.utils.nlp.space.db_ops import MultiWozDB | |||
| @@ -343,7 +344,7 @@ class MultiWOZBPETextField(BPETextField): | |||
| ] | |||
| special_tokens.extend(self.add_sepcial_tokens()) | |||
| self.tokenizer = Tokenizer( | |||
| vocab_path=os.path.join(model_dir, 'vocab.txt'), | |||
| vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE), | |||
| special_tokens=special_tokens, | |||
| tokenizer_type=config.BPETextField.tokenizer_type) | |||
| self.understand_ids = self.tokenizer.convert_tokens_to_ids( | |||
| @@ -14,6 +14,7 @@ import numpy as np | |||
| from tqdm import tqdm | |||
| from modelscope.preprocessors.space.tokenizer import Tokenizer | |||
| from modelscope.utils.constant import ModelFile | |||
| from modelscope.utils.nlp.space import ontology | |||
| from modelscope.utils.nlp.space.scores import hierarchical_set_score | |||
| from modelscope.utils.nlp.space.utils import list2np | |||
| @@ -50,7 +51,7 @@ class BPETextField(object): | |||
| ] | |||
| special_tokens.extend(self.add_sepcial_tokens()) | |||
| self.tokenizer = Tokenizer( | |||
| vocab_path=os.path.join(model_dir, 'vocab.txt'), | |||
| vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE), | |||
| special_tokens=special_tokens, | |||
| tokenizer_type=config.BPETextField.tokenizer_type) | |||
| self.understand_ids = self.numericalize(self.understand_tokens) | |||
| @@ -28,7 +28,7 @@ __all__ = ['ConversationalTextToSqlPreprocessor'] | |||
| class ConversationalTextToSqlPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| """preprocess the data | |||
| Args: | |||
| model_dir (str): model path | |||
| @@ -193,6 +193,15 @@ class SubPreprocessor(): | |||
| from nltk import data | |||
| data.path.append(os.path.join(self.model_dir, 'nltk_data')) | |||
| zippath = os.path.join(self.model_dir, 'nltk_data/tokenizers/punkt') | |||
| if os.path.exists(zippath): | |||
| print('punkt already exists!') | |||
| else: | |||
| import zipfile | |||
| with zipfile.ZipFile(zippath + '.zip') as zf: | |||
| zf.extractall( | |||
| os.path.join(self.model_dir, 'nltk_data/tokenizers/')) | |||
| question = nltk.word_tokenize(question) | |||
| question = mwtokenizer.tokenize(question) | |||
| @@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer): | |||
| def prediction_step(self, model, inputs): | |||
| pass | |||
| def to_task_dataset(self, datasets, mode, preprocessor=None): | |||
| # wait for dataset interface to become stable... | |||
| return datasets.to_torch_dataset(preprocessor) | |||
| @@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer): | |||
| train_outputs = dict() | |||
| self._mode = ModeKeys.TRAIN | |||
| inputs = self.collate_fn(inputs) | |||
| # call model forward but not __call__ to skip postprocess | |||
| if isinstance(inputs, Mapping): | |||
| d_loss = model._train_forward_d(**inputs) | |||
| @@ -192,7 +192,7 @@ class Hook: | |||
| Whether to reach the end of every epoch | |||
| Returns: bool | |||
| """ | |||
| return trainer.inner_iter + 1 == len(trainer.data_loader) | |||
| return trainer.inner_iter + 1 == trainer.iters_per_epoch | |||
| def is_last_epoch(self, trainer): | |||
| """ | |||
| @@ -93,7 +93,7 @@ class TextLoggerHook(LoggerHook): | |||
| lr_str = f'{lr_key}: {log_dict[lr_key]:.3e}' | |||
| if self.by_epoch: | |||
| log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{len(trainer.data_loader)}]\t' | |||
| log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{trainer.iters_per_epoch}]\t' | |||
| else: | |||
| log_str = f'{iter_key} [{log_dict[iter_key]}/{trainer.max_iters}]\t' | |||
| log_str += f'{lr_str}, ' | |||
| @@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||
| self.train_keys = build_dataset_keys( | |||
| self.cfg.dataset.train if hasattr(self.cfg, 'dataset') | |||
| and hasattr(self.cfg.dataset, 'train') else None) | |||
| # TODO eval may have special keys, which is not supported yet. | |||
| # because there is only one preprocessor in the trainer, and it only supports one group of keys. | |||
| self.eval_keys = self.train_keys | |||
| self.eval_keys = build_dataset_keys( | |||
| self.cfg.dataset.val if hasattr(self.cfg, 'dataset') | |||
| and hasattr(self.cfg.dataset, 'val') else None) | |||
| if len(self.eval_keys) == 0: | |||
| self.eval_keys = self.train_keys | |||
| super().__init__( | |||
| model=model_dir, | |||
| @@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||
| elif isinstance(model, nn.Module): | |||
| return model | |||
| def build_preprocessor(self) -> Preprocessor: | |||
| def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: | |||
| """Build the preprocessor. | |||
| Users can override this method to implement custom logic. | |||
| @@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||
| model_args = {} if self.label2id is None else { | |||
| 'label2id': self.label2id | |||
| } | |||
| cfg = ConfigDict({ | |||
| **getattr(self.cfg, 'preprocessor'), | |||
| 'model_dir': | |||
| self.model_dir, | |||
| **model_args, | |||
| 'mode': | |||
| ModeKeys.TRAIN, | |||
| **self.train_keys, | |||
| }) | |||
| return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) | |||
| field_name = Tasks.find_field_by_task(self.cfg.task) | |||
| train_preprocessor, eval_preprocessor = None, None | |||
| _train_cfg, _eval_cfg = {}, {} | |||
| if 'type' not in self.cfg.preprocessor and ( | |||
| 'train' in self.cfg.preprocessor | |||
| or 'val' in self.cfg.preprocessor): | |||
| if 'train' in self.cfg.preprocessor: | |||
| _train_cfg = self.cfg.preprocessor.train | |||
| if 'val' in self.cfg.preprocessor: | |||
| _eval_cfg = self.cfg.preprocessor.val | |||
| else: | |||
| _train_cfg = self.cfg.preprocessor | |||
| _eval_cfg = self.cfg.preprocessor | |||
| if len(_train_cfg): | |||
| _train_cfg.update({ | |||
| 'model_dir': self.model_dir, | |||
| **model_args, | |||
| **self.train_keys, 'mode': ModeKeys.TRAIN | |||
| }) | |||
| train_preprocessor = build_preprocessor(_train_cfg, field_name) | |||
| if len(_eval_cfg): | |||
| _eval_cfg.update({ | |||
| 'model_dir': self.model_dir, | |||
| **model_args, | |||
| **self.eval_keys, 'mode': ModeKeys.EVAL | |||
| }) | |||
| eval_preprocessor = build_preprocessor(_eval_cfg, field_name) | |||
| return train_preprocessor, eval_preprocessor | |||
| @TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer) | |||
| @@ -178,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer): | |||
| """Veco evaluates the datasets one by one. | |||
| """ | |||
| from modelscope.task_datasets import VecoDataset | |||
| from modelscope.msdatasets.task_datasets import VecoDataset | |||
| self.model.eval() | |||
| self._mode = ModeKeys.EVAL | |||
| metric_values = {} | |||
| @@ -5,15 +5,15 @@ import time | |||
| from collections.abc import Mapping | |||
| from distutils.version import LooseVersion | |||
| from functools import partial | |||
| from typing import Callable, List, Optional, Tuple, Union | |||
| from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| from addict import Dict | |||
| from torch import distributed as dist | |||
| from torch import nn | |||
| from torch.utils.data import DataLoader, Dataset | |||
| from torch.utils.data.dataloader import default_collate | |||
| from torch.utils.data.distributed import DistributedSampler | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| @@ -21,23 +21,26 @@ from modelscope.metainfo import Trainers | |||
| from modelscope.metrics import build_metric, task_default_metrics | |||
| from modelscope.models.base import Model, TorchModel | |||
| from modelscope.msdatasets.ms_dataset import MsDataset | |||
| from modelscope.preprocessors import build_preprocessor | |||
| from modelscope.msdatasets.task_datasets.builder import build_task_dataset | |||
| from modelscope.msdatasets.task_datasets.torch_base_dataset import \ | |||
| TorchTaskDataset | |||
| from modelscope.preprocessors.base import Preprocessor | |||
| from modelscope.task_datasets.builder import build_task_dataset | |||
| from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset | |||
| from modelscope.preprocessors.builder import build_preprocessor | |||
| from modelscope.preprocessors.common import Compose | |||
| from modelscope.trainers.hooks.builder import HOOKS | |||
| from modelscope.trainers.hooks.priority import Priority, get_priority | |||
| from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||
| from modelscope.trainers.optimizer.builder import build_optimizer | |||
| from modelscope.utils.config import Config, ConfigDict | |||
| from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, | |||
| ModelFile, Tasks, TrainerStages) | |||
| from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, | |||
| ConfigKeys, Hubs, ModeKeys, ModelFile, | |||
| Tasks, TrainerStages) | |||
| from modelscope.utils.data_utils import to_device | |||
| from modelscope.utils.file_utils import func_receive_dict_inputs | |||
| from modelscope.utils.logger import get_logger | |||
| from modelscope.utils.registry import build_from_cfg | |||
| from modelscope.utils.tensor_utils import torch_default_data_collator | |||
| from modelscope.utils.torch_utils import (broadcast, create_device, | |||
| get_dist_info, init_dist) | |||
| from modelscope.utils.torch_utils import (create_device, get_dist_info, | |||
| init_dist) | |||
| from .base import BaseTrainer | |||
| from .builder import TRAINERS | |||
| from .default_config import DEFAULT_CONFIG | |||
| @@ -83,7 +86,8 @@ class EpochBasedTrainer(BaseTrainer): | |||
| data_collator: Optional[Callable] = None, | |||
| train_dataset: Optional[Union[MsDataset, Dataset]] = None, | |||
| eval_dataset: Optional[Union[MsDataset, Dataset]] = None, | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| preprocessor: Optional[Union[Preprocessor, | |||
| Dict[str, Preprocessor]]] = None, | |||
| optimizers: Tuple[torch.optim.Optimizer, | |||
| torch.optim.lr_scheduler._LRScheduler] = (None, | |||
| None), | |||
| @@ -120,24 +124,46 @@ class EpochBasedTrainer(BaseTrainer): | |||
| else: | |||
| self.work_dir = self.cfg.train.get('work_dir', './work_dir') | |||
| self.preprocessor = None | |||
| self.train_preprocessor, self.eval_preprocessor = None, None | |||
| if isinstance(preprocessor, Preprocessor): | |||
| self.preprocessor = preprocessor | |||
| elif hasattr(self.cfg, 'preprocessor'): | |||
| self.preprocessor = self.build_preprocessor() | |||
| if self.preprocessor is not None: | |||
| self.preprocessor.mode = ModeKeys.TRAIN | |||
| self.train_preprocessor = preprocessor | |||
| self.eval_preprocessor = preprocessor | |||
| elif isinstance(preprocessor, Mapping): | |||
| if not (ConfigKeys.train in preprocessor | |||
| or ConfigKeys.val in preprocessor): | |||
| raise ValueError( | |||
| f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' | |||
| ) | |||
| if ConfigKeys.train in preprocessor: | |||
| assert isinstance(preprocessor[ConfigKeys.train], Preprocessor) | |||
| self.train_preprocessor = preprocessor[ConfigKeys.train] | |||
| if ConfigKeys.val in preprocessor: | |||
| assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) | |||
| self.eval_preprocessor = preprocessor[ConfigKeys.val] | |||
| elif hasattr(self.cfg, ConfigFields.preprocessor): | |||
| self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( | |||
| ) | |||
| if self.train_preprocessor is not None: | |||
| self.train_preprocessor.mode = ModeKeys.TRAIN | |||
| if self.eval_preprocessor is not None: | |||
| self.eval_preprocessor.mode = ModeKeys.EVAL | |||
| device_name = kwargs.get('device', 'gpu') | |||
| assert device_name in ['gpu', | |||
| 'cpu'], 'device should be either cpu or gpu.' | |||
| self.device = create_device(device_name == 'cpu') | |||
| self.train_dataset = self.to_task_dataset( | |||
| train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) | |||
| train_dataset, | |||
| mode=ModeKeys.TRAIN, | |||
| preprocessor=self.train_preprocessor) | |||
| self.eval_dataset = self.to_task_dataset( | |||
| eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) | |||
| eval_dataset, | |||
| mode=ModeKeys.EVAL, | |||
| preprocessor=self.eval_preprocessor) | |||
| self.data_collator = data_collator if data_collator is not None else torch_default_data_collator | |||
| self.data_collator = data_collator if data_collator is not None else default_collate | |||
| self.metrics = self.get_metrics() | |||
| self._metric_values = None | |||
| self.optimizers = optimizers | |||
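With this change the trainer also accepts a mapping of ready-made preprocessors keyed by `ConfigKeys.train` / `ConfigKeys.val`. A hedged construction sketch (the model path, datasets and preprocessor instances are placeholders defined elsewhere):

```python
from modelscope.trainers import EpochBasedTrainer  # assumed public import path
from modelscope.utils.constant import ConfigKeys

# train_preprocessor / eval_preprocessor are assumed Preprocessor instances built
# elsewhere; train_ds / eval_ds are assumed datasets.
trainer = EpochBasedTrainer(
    model='/path/to/model_dir',                      # placeholder model directory
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    preprocessor={
        ConfigKeys.train: train_preprocessor,        # used for the training dataset
        ConfigKeys.val: eval_preprocessor,           # used for evaluation
    },
)
```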
| @@ -155,6 +181,16 @@ class EpochBasedTrainer(BaseTrainer): | |||
| else: | |||
| self._max_epochs = kwargs['max_epochs'] | |||
| self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) | |||
| self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) | |||
| if self._train_iters_per_epoch is None and hasattr( | |||
| self.cfg.train, 'train_iters_per_epoch'): | |||
| self._train_iters_per_epoch = self.cfg.train.train_iters_per_epoch | |||
| if self._eval_iters_per_epoch is None and hasattr( | |||
| self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, | |||
| 'val_iters_per_epoch'): | |||
| self._eval_iters_per_epoch = self.cfg.evaluation.val_iters_per_epoch | |||
| self.use_fp16 = kwargs.get('use_fp16', False) | |||
| # TODO @wenmeng.zwm add seed init fn | |||
| @@ -211,7 +247,32 @@ class EpochBasedTrainer(BaseTrainer): | |||
| @property | |||
| def max_iters(self): | |||
| """int: Maximum training iterations.""" | |||
| return self._max_epochs * len(self.data_loader) | |||
| return self._max_epochs * self.iters_per_epoch | |||
| @property | |||
| def iters_per_epoch(self): | |||
| """int: Total iterations of one epoch""" | |||
| def _get_data_len(data_loader): | |||
| try: | |||
| return len(data_loader) | |||
| except Exception as e: | |||
| self.logger.error(e) | |||
| raise ValueError( | |||
| 'Please implement the ``__len__`` method for your dataset, ' | |||
| 'or add `train_iters_per_epoch` and `val_iters_per_epoch` ' | |||
| 'to your configuration file or kwargs') | |||
| if self.mode == ModeKeys.TRAIN: | |||
| if self._train_iters_per_epoch is not None: | |||
| return self._train_iters_per_epoch | |||
| else: | |||
| return _get_data_len(self.train_dataloader) | |||
| elif self.mode == ModeKeys.EVAL: | |||
| if self._eval_iters_per_epoch is not None: | |||
| return self._eval_iters_per_epoch | |||
| else: | |||
| return _get_data_len(self.eval_dataloader) | |||
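For datasets without `__len__` (e.g. streaming or iterable datasets), the iteration caps can come either from trainer kwargs or from the configuration that the property above falls back to. A hedged sketch of that configuration fragment (key names follow the code; values are illustrative):

```python
# Illustrative configuration fragment consumed by the iters_per_epoch fallbacks.
cfg_fragment = {
    'train': {
        'train_iters_per_epoch': 1000,   # cap training iterations per epoch
    },
    'evaluation': {
        'val_iters_per_epoch': 100,      # cap evaluation iterations per epoch
    },
}
```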
| def to_task_dataset(self, | |||
| datasets: Union[Dataset, List[Dataset]], | |||
| @@ -228,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer): | |||
| if isinstance(datasets, TorchTaskDataset): | |||
| return datasets | |||
| elif isinstance(datasets, MsDataset): | |||
| datasets = datasets.to_torch_dataset( | |||
| preprocessors=self.preprocessor) | |||
| return datasets | |||
| cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ | |||
| else ConfigDict(type=None, mode=mode) | |||
| return datasets.to_torch_dataset( | |||
| task_data_config=cfg, | |||
| task_name=self.cfg.task, | |||
| preprocessors=preprocessor) | |||
| elif isinstance(datasets, List) and isinstance( | |||
| datasets[0], MsDataset): | |||
| cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ | |||
| else ConfigDict(type=None, mode=mode) | |||
| datasets = [ | |||
| d.to_torch_dataset(preprocessor=self.preprocessor) | |||
| for d in datasets | |||
| d.to_torch_dataset( | |||
| task_data_config=cfg, | |||
| task_name=self.cfg.task, | |||
| preprocessors=preprocessor) for d in datasets | |||
| ] | |||
| cfg = ConfigDict( | |||
| type=self.cfg.task, mode=mode, datasets=datasets) | |||
| @@ -258,24 +326,44 @@ class EpochBasedTrainer(BaseTrainer): | |||
| else: | |||
| return datasets | |||
| def build_preprocessor(self) -> Preprocessor: | |||
| """Build the preprocessor. | |||
| def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: | |||
| """Build train and eval preprocessor. | |||
| Users can override this method to implement custom logic. | |||
| Returns: The preprocessor instance. | |||
| Returns: The train and eval preprocessor instances. | |||
| """ | |||
| # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor | |||
| # when they are different ones in training and evaluation | |||
| cfg = ConfigDict({ | |||
| **getattr(self.cfg, 'preprocessor'), | |||
| 'model_dir': | |||
| self.model_dir, | |||
| 'mode': | |||
| ModeKeys.TRAIN, | |||
| }) | |||
| return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) | |||
| field_name = Tasks.find_field_by_task(self.cfg.task) | |||
| train_preprocessor, eval_preprocessor = None, None | |||
| _train_cfg, _eval_cfg = {}, {} | |||
| _dafault_args = {'model_dir': self.model_dir} | |||
| if 'type' not in self.cfg.preprocessor and ( | |||
| 'train' in self.cfg.preprocessor | |||
| or 'val' in self.cfg.preprocessor): | |||
| if 'train' in self.cfg.preprocessor: | |||
| _train_cfg = self.cfg.preprocessor.train | |||
| if 'val' in self.cfg.preprocessor: | |||
| _eval_cfg = self.cfg.preprocessor.val | |||
| else: | |||
| _train_cfg = self.cfg.preprocessor | |||
| _eval_cfg = self.cfg.preprocessor | |||
| if len(_train_cfg): | |||
| if isinstance(_train_cfg, Sequence): | |||
| # TODO: for Sequence, we need to adapt the `mode` and `model_dir` args, | |||
| # and add mode for Compose or other plans | |||
| raise NotImplementedError('Not supported yet!') | |||
| _train_cfg.update(_dafault_args) | |||
| train_preprocessor = build_preprocessor(_train_cfg, field_name) | |||
| if len(_eval_cfg): | |||
| if isinstance(_eval_cfg, Sequence): | |||
| raise NotImplementedError('Not supported yet!') | |||
| _eval_cfg.update(_dafault_args) | |||
| eval_preprocessor = build_preprocessor(_eval_cfg, field_name) | |||
| return train_preprocessor, eval_preprocessor | |||
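To make the branching above concrete, here is a hedged sketch of the two preprocessor layouts this method now accepts from the configuration; the `type` values are placeholders, not registered preprocessor names:

```python
# 1) A single shared config (has a top-level 'type'): used for both train and eval.
preprocessor_shared = {
    'type': 'some-preprocessor',                     # placeholder registry name
}

# 2) Split configs (no top-level 'type'): separate 'train' and 'val' sub-configs.
preprocessor_split = {
    'train': {'type': 'some-train-preprocessor'},    # placeholder
    'val': {'type': 'some-eval-preprocessor'},       # placeholder
}
```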
| def get_metrics(self) -> List[str]: | |||
| """Get the metric class types. | |||
| @@ -373,34 +461,6 @@ class EpochBasedTrainer(BaseTrainer): | |||
| return build_parallel(dp_cfg) | |||
| def collate_fn(self, data): | |||
| """Prepare the input just before the forward function. | |||
| This method will move the tensors to the right device. | |||
| Usually this method does not need to be overridden. | |||
| Args: | |||
| data: The data out of the dataloader. | |||
| Returns: The processed data. | |||
| """ | |||
| from torch.utils.data.dataloader import default_collate | |||
| if isinstance(data, dict) or isinstance(data, Mapping): | |||
| return type(data)({k: self.collate_fn(v) for k, v in data.items()}) | |||
| elif isinstance(data, (tuple, list)): | |||
| if isinstance(data[0], (int, float)): | |||
| return default_collate(data).to(self.device) | |||
| else: | |||
| return type(data)(self.collate_fn(v) for v in data) | |||
| elif isinstance(data, np.ndarray): | |||
| return self.collate_fn(torch.from_numpy(data)) | |||
| elif isinstance(data, torch.Tensor): | |||
| return data.to(self.device) | |||
| elif isinstance(data, (str, int, float, bool)): | |||
| return data | |||
| else: | |||
| raise ValueError(f'Unsupported data type {type(data)}') | |||
| def train_step(self, model, inputs): | |||
| """ Perform a training step on a batch of inputs. | |||
| @@ -421,7 +481,6 @@ class EpochBasedTrainer(BaseTrainer): | |||
| # TODO: find more pretty way to change mode | |||
| model.train() | |||
| self._mode = ModeKeys.TRAIN | |||
| inputs = self.collate_fn(inputs) | |||
| # call model forward but not __call__ to skip postprocess | |||
| if isinstance(inputs, | |||
| Mapping) and not func_receive_dict_inputs(model.forward): | |||
| @@ -486,7 +545,9 @@ class EpochBasedTrainer(BaseTrainer): | |||
| if self.train_dataset is None: | |||
| train_data = self.cfg.dataset.train | |||
| self.train_dataset = self.build_dataset( | |||
| train_data, mode=ModeKeys.TRAIN) | |||
| train_data, | |||
| mode=ModeKeys.TRAIN, | |||
| preprocessor=self.train_preprocessor) | |||
| data_loader = self._build_dataloader_with_dataset( | |||
| self.train_dataset, | |||
| @@ -505,7 +566,9 @@ class EpochBasedTrainer(BaseTrainer): | |||
| if self.eval_dataset is None: | |||
| val_data = self.cfg.dataset.val | |||
| self.eval_dataset = self.build_dataset( | |||
| val_data, mode=ModeKeys.EVAL) | |||
| val_data, | |||
| mode=ModeKeys.EVAL, | |||
| preprocessor=self.eval_preprocessor) | |||
| batch_size = self.cfg.evaluation.batch_size | |||
| workers = self.cfg.evaluation.workers | |||
| @@ -521,7 +584,7 @@ class EpochBasedTrainer(BaseTrainer): | |||
| ) | |||
| return data_loader | |||
| def build_dataset(self, data_cfg, mode): | |||
| def build_dataset(self, data_cfg, mode, preprocessor=None): | |||
| """ Build torch dataset object using data config | |||
| """ | |||
| dataset = MsDataset.load( | |||
| @@ -530,9 +593,13 @@ class EpochBasedTrainer(BaseTrainer): | |||
| subset_name=data_cfg.subset_name if hasattr( | |||
| data_cfg, 'subset_name') else None, | |||
| hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | |||
| **data_cfg, | |||
| ) | |||
| cfg = ConfigDict(type=self.cfg.model.type, mode=mode) | |||
| torch_dataset = dataset.to_torch_dataset( | |||
| preprocessors=self.preprocessor, ) | |||
| task_data_config=cfg, | |||
| task_name=self.cfg.task, | |||
| preprocessors=self.preprocessor) | |||
| dataset = self.to_task_dataset(torch_dataset, mode) | |||
| return dataset | |||
| @@ -698,6 +765,7 @@ class EpochBasedTrainer(BaseTrainer): | |||
| self.invoke_hook(TrainerStages.before_train_epoch) | |||
| time.sleep(2) # Prevent possible deadlock during epoch transition | |||
| for i, data_batch in enumerate(data_loader): | |||
| data_batch = to_device(data_batch, self.device) | |||
| self.data_batch = data_batch | |||
| self._inner_iter = i | |||
| self.invoke_hook(TrainerStages.before_train_iter) | |||
| @@ -706,6 +774,9 @@ class EpochBasedTrainer(BaseTrainer): | |||
| del self.data_batch | |||
| self._iter += 1 | |||
| if i + 1 >= self.iters_per_epoch: | |||
| break | |||
| self.invoke_hook(TrainerStages.after_train_epoch) | |||
| self._epoch += 1 | |||
| @@ -721,17 +792,21 @@ class EpochBasedTrainer(BaseTrainer): | |||
| metric_values = multi_gpu_test( | |||
| self.model, | |||
| data_loader, | |||
| device=self.device, | |||
| tmpdir=None, | |||
| gpu_collect=False, | |||
| data_collate_fn=self.collate_fn, | |||
| metric_classes=metric_classes) | |||
| metric_classes=metric_classes, | |||
| data_loader_iters_per_gpu=self.iters_per_epoch) | |||
| else: | |||
| from modelscope.trainers.utils.inference import single_gpu_test | |||
| metric_values = single_gpu_test( | |||
| self.model, | |||
| data_loader, | |||
| data_collate_fn=self.collate_fn, | |||
| metric_classes=metric_classes) | |||
| device=self.device, | |||
| metric_classes=metric_classes, | |||
| data_loader_iters=self.iters_per_epoch) | |||
| self._inner_iter = self.iters_per_epoch - 1 # start from index 0 | |||
| return metric_values | |||