diff --git a/.gitattributes b/.gitattributes index 88ef2f44..60ff0dd2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4,3 +4,4 @@ *.wav filter=lfs diff=lfs merge=lfs -text *.JPEG filter=lfs diff=lfs merge=lfs -text *.jpeg filter=lfs diff=lfs merge=lfs -text +*.avi filter=lfs diff=lfs merge=lfs -text diff --git a/data/test/audios/3ch_nihaomiya.wav b/data/test/audios/3ch_nihaomiya.wav new file mode 100644 index 00000000..57d9f061 --- /dev/null +++ b/data/test/audios/3ch_nihaomiya.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad1a268c614076614a2ae6528abc29cc85ae35826d172079d7d9b26a0299559 +size 4325096 diff --git a/data/test/audios/farend_speech.wav b/data/test/audios/farend_speech.wav new file mode 100644 index 00000000..4e96d842 --- /dev/null +++ b/data/test/audios/farend_speech.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3637ee0628d0953f77d5a32327980af542c43230c4127d2a72b4df1ea2ffb0be +size 320042 diff --git a/data/test/audios/nearend_mic.wav b/data/test/audios/nearend_mic.wav new file mode 100644 index 00000000..e055c2e0 --- /dev/null +++ b/data/test/audios/nearend_mic.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc116af609a66f431f94df6b385ff2aa362f8a2d437c2279f5401e47f9178469 +size 320042 diff --git a/data/test/audios/speech_with_noise.wav b/data/test/audios/speech_with_noise.wav new file mode 100644 index 00000000..d57488c9 --- /dev/null +++ b/data/test/audios/speech_with_noise.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9354345a6297f4522e690d337546aa9a686a7e61eefcd935478a2141b924db8f +size 76770 diff --git a/data/test/images/image_salient_detection.jpg b/data/test/images/image_salient_detection.jpg new file mode 100644 index 00000000..9c0632d3 --- /dev/null +++ b/data/test/images/image_salient_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70ea0c06f9cfe3882253f7175221d47e394ab9c469076ab220e880b17dbcdd02 +size 48552 diff --git a/data/test/images/ocr_recognition_document.png b/data/test/images/ocr_recognition_document.png new file mode 100644 index 00000000..d74018bb --- /dev/null +++ b/data/test/images/ocr_recognition_document.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29f2ad929c852f6456367054d13e113078cf06b763fe54d73fd324f789331aa3 +size 61611 diff --git a/data/test/videos/dog.avi b/data/test/videos/dog.avi new file mode 100644 index 00000000..afcda087 --- /dev/null +++ b/data/test/videos/dog.avi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:469090fb217a34a2c096cfd42c251da69dca9fcd1a3c1faae7d29183c1816c14 +size 12834294 diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index d906a80d..09bff2c1 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -362,8 +362,10 @@ class HubApi: dataset_name: str, namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ - f'Revision={revision}&FilePath={file_name}' + if file_name.endswith('.csv'): + file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' 
\ + f'Revision={revision}&FilePath={file_name}' + return file_name def get_dataset_access_config( self, diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index a0aab6d3..0bc16026 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -38,6 +38,7 @@ class Models(object): # audio models sambert_hifigan = 'sambert-hifigan' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' kws_kwsbp = 'kws-kwsbp' generic_asr = 'generic-asr' @@ -86,6 +87,7 @@ class Pipelines(object): body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' human_detection = 'resnet18-human-detection' object_detection = 'vit-object-detection' + salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' live_category = 'live-category' @@ -109,6 +111,7 @@ class Pipelines(object): skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' crowd_counting = 'hrnet-crowd-counting' + video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' # nlp tasks sentence_similarity = 'sentence-similarity' @@ -132,6 +135,7 @@ class Pipelines(object): sambert_hifigan_tts = 'sambert-hifigan-tts' speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' kws_kwsbp = 'kws-kwsbp' asr_inference = 'asr-inference' @@ -215,7 +219,7 @@ class Preprocessors(object): # multi-modal preprocessor ofa_tasks_preprocessor = 'ofa-tasks-preprocessor' - mplug_visual_question_answering = 'mplug-visual-question-answering' + mplug_tasks_preprocessor = 'mplug-tasks-preprocessor' class Metrics(object): diff --git a/modelscope/models/audio/kws/__init__.py b/modelscope/models/audio/kws/__init__.py index f3db5e08..dd183fe5 100644 --- a/modelscope/models/audio/kws/__init__.py +++ b/modelscope/models/audio/kws/__init__.py @@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .generic_key_word_spotting import GenericKeyWordSpotting + from .farfield.model import FSMNSeleNetV2Decorator else: _import_structure = { 'generic_key_word_spotting': ['GenericKeyWordSpotting'], + 'farfield.model': ['FSMNSeleNetV2Decorator'], } import sys diff --git a/modelscope/models/audio/kws/farfield/__init__.py b/modelscope/models/audio/kws/farfield/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/kws/farfield/fsmn.py b/modelscope/models/audio/kws/farfield/fsmn.py new file mode 100644 index 00000000..e88d3976 --- /dev/null +++ b/modelscope/models/audio/kws/farfield/fsmn.py @@ -0,0 +1,495 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .model_def import (HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32, + printNeonMatrix, printNeonVector) + +DEBUG = False + + +def to_kaldi_matrix(np_mat): + """ function that transform as str numpy mat to standard kaldi str matrix + + Args: + np_mat: numpy mat + + Returns: str + """ + np.set_printoptions(threshold=np.inf, linewidth=np.nan) + out_str = str(np_mat) + out_str = out_str.replace('[', '') + out_str = out_str.replace(']', '') + return '[ %s ]\n' % out_str + + +def print_tensor(torch_tensor): + """ print torch tensor for debug + + Args: + torch_tensor: a tensor + """ + re_str = '' + x = torch_tensor.detach().squeeze().numpy() + re_str += to_kaldi_matrix(x) + re_str += 
'\n' + print(re_str) + + +class LinearTransform(nn.Module): + + def __init__(self, input_dim, output_dim): + super(LinearTransform, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.linear = nn.Linear(input_dim, output_dim, bias=False) + + self.debug = False + self.dataout = None + + def forward(self, input): + output = self.linear(input) + + if self.debug: + self.dataout = output + + return output + + def print_model(self): + printNeonMatrix(self.linear.weight) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.output_dim, + self.input_dim) + re_str += ' 1\n' + + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + re_str += '\n' + + return re_str + + +class AffineTransform(nn.Module): + + def __init__(self, input_dim, output_dim): + super(AffineTransform, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + + self.linear = nn.Linear(input_dim, output_dim) + + self.debug = False + self.dataout = None + + def forward(self, input): + output = self.linear(input) + + if self.debug: + self.dataout = output + + return output + + def print_model(self): + printNeonMatrix(self.linear.weight) + printNeonVector(self.linear.bias) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.output_dim, + self.input_dim) + re_str += ' 1 1 0\n' + + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + + linear_bias = self.state_dict()['linear.bias'] + x = linear_bias.squeeze().numpy() + re_str += to_kaldi_matrix(x) + re_str += '\n' + + return re_str + + +class Fsmn(nn.Module): + """ + FSMN implementation. + """ + + def __init__(self, + input_dim, + output_dim, + lorder=None, + rorder=None, + lstride=None, + rstride=None): + super(Fsmn, self).__init__() + + self.dim = input_dim + + if lorder is None: + return + + self.lorder = lorder + self.rorder = rorder + self.lstride = lstride + self.rstride = rstride + + self.conv_left = nn.Conv2d( + self.dim, + self.dim, (lorder, 1), + dilation=(lstride, 1), + groups=self.dim, + bias=False) + + if rorder > 0: + self.conv_right = nn.Conv2d( + self.dim, + self.dim, (rorder, 1), + dilation=(rstride, 1), + groups=self.dim, + bias=False) + else: + self.conv_right = None + + self.debug = False + self.dataout = None + + def forward(self, input): + x = torch.unsqueeze(input, 1) + x_per = x.permute(0, 3, 2, 1) + + y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0]) + + if self.conv_right is not None: + y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride]) + y_right = y_right[:, :, self.rstride:, :] + out = x_per + self.conv_left(y_left) + self.conv_right(y_right) + else: + out = x_per + self.conv_left(y_left) + + out1 = out.permute(0, 3, 2, 1) + output = out1.squeeze(1) + + if self.debug: + self.dataout = output + + return output + + def print_model(self): + tmpw = self.conv_left.weight + tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) + for j in range(tmpw.shape[0]): + tmpwm[:, j] = tmpw[j, 0, :, 0] + + printNeonMatrix(tmpwm) + + if self.conv_right is not None: + tmpw = self.conv_right.weight + tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) + for j in range(tmpw.shape[0]): + tmpwm[:, j] = tmpw[j, 0, :, 0] + + printNeonMatrix(tmpwm) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + re_str += ' %d %d %d %d %d 0\n' % ( + 1, self.lorder, self.rorder, self.lstride, 
self.rstride) + + lfiters = self.state_dict()['conv_left.weight'] + x = np.flipud(lfiters.squeeze().numpy().T) + re_str += to_kaldi_matrix(x) + + if self.conv_right is not None: + rfiters = self.state_dict()['conv_right.weight'] + x = (rfiters.squeeze().numpy().T) + re_str += to_kaldi_matrix(x) + re_str += '\n' + + return re_str + + +class RectifiedLinear(nn.Module): + + def __init__(self, input_dim, output_dim): + super(RectifiedLinear, self).__init__() + self.dim = input_dim + self.relu = nn.ReLU() + + def forward(self, input): + return self.relu(input) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + re_str += '\n' + return re_str + + +class FSMNNet(nn.Module): + """ + FSMN net for keyword spotting + """ + + def __init__(self, + input_dim=200, + linear_dim=128, + proj_dim=128, + lorder=10, + rorder=1, + num_syn=5, + fsmn_layers=4): + """ + Args: + input_dim: input dimension + linear_dim: fsmn input dimension + proj_dim: fsmn projection dimension + lorder: fsmn left order + rorder: fsmn right order + num_syn: output dimension + fsmn_layers: no. of sequential fsmn layers + """ + super(FSMNNet, self).__init__() + + self.input_dim = input_dim + self.linear_dim = linear_dim + self.proj_dim = proj_dim + self.lorder = lorder + self.rorder = rorder + self.num_syn = num_syn + self.fsmn_layers = fsmn_layers + + self.linear1 = AffineTransform(input_dim, linear_dim) + self.relu = RectifiedLinear(linear_dim, linear_dim) + + self.fsmn = self._build_repeats(linear_dim, proj_dim, lorder, rorder, + fsmn_layers) + + self.linear2 = AffineTransform(linear_dim, num_syn) + + @staticmethod + def _build_repeats(linear_dim=136, + proj_dim=68, + lorder=3, + rorder=2, + fsmn_layers=5): + repeats = [ + nn.Sequential( + LinearTransform(linear_dim, proj_dim), + Fsmn(proj_dim, proj_dim, lorder, rorder, 1, 1), + AffineTransform(proj_dim, linear_dim), + RectifiedLinear(linear_dim, linear_dim)) + for i in range(fsmn_layers) + ] + + return nn.Sequential(*repeats) + + def forward(self, input): + x1 = self.linear1(input) + x2 = self.relu(x1) + x3 = self.fsmn(x2) + x4 = self.linear2(x3) + return x4 + + def print_model(self): + self.linear1.print_model() + + for layer in self.fsmn: + layer[0].print_model() + layer[1].print_model() + layer[2].print_model() + + self.linear2.print_model() + + def print_header(self): + # + # write total header + # + header = [0.0] * HEADER_BLOCK_SIZE * 4 + # numins + header[0] = 0.0 + # numouts + header[1] = 0.0 + # dimins + header[2] = self.input_dim + # dimouts + header[3] = self.num_syn + # numlayers + header[4] = 3 + + # + # write each layer's header + # + hidx = 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_DENSE.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = self.input_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = self.linear_dim + header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 + header[HEADER_BLOCK_SIZE * hidx + 5] = float( + ActivationType.ACTIVATION_RELU.value) + hidx += 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_SEQUENTIAL_FSMN.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = self.proj_dim + header[HEADER_BLOCK_SIZE * hidx + 4] = self.lorder + header[HEADER_BLOCK_SIZE * hidx + 5] = self.rorder + header[HEADER_BLOCK_SIZE * hidx + 6] = self.fsmn_layers + header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0 + hidx += 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = 
float( + LayerType.LAYER_DENSE.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = self.num_syn + header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 + header[HEADER_BLOCK_SIZE * hidx + 5] = float( + ActivationType.ACTIVATION_SOFTMAX.value) + + for h in header: + print(f32ToI32(h)) + + def to_kaldi_nnet(self): + re_str = '' + re_str += '\n' + re_str += self.linear1.to_kaldi_nnet() + re_str += self.relu.to_kaldi_nnet() + + for fsmn in self.fsmn: + re_str += fsmn[0].to_kaldi_nnet() + re_str += fsmn[1].to_kaldi_nnet() + re_str += fsmn[2].to_kaldi_nnet() + re_str += fsmn[3].to_kaldi_nnet() + + re_str += self.linear2.to_kaldi_nnet() + re_str += ' %d %d\n' % (self.num_syn, self.num_syn) + re_str += '\n' + re_str += '\n' + + return re_str + + +class DFSMN(nn.Module): + """ + One deep fsmn layer + """ + + def __init__(self, + dimproj=64, + dimlinear=128, + lorder=20, + rorder=1, + lstride=1, + rstride=1): + """ + Args: + dimproj: projection dimension, input and output dimension of memory blocks + dimlinear: dimension of mapping layer + lorder: left order + rorder: right order + lstride: left stride + rstride: right stride + """ + super(DFSMN, self).__init__() + + self.lorder = lorder + self.rorder = rorder + self.lstride = lstride + self.rstride = rstride + + self.expand = AffineTransform(dimproj, dimlinear) + self.shrink = LinearTransform(dimlinear, dimproj) + + self.conv_left = nn.Conv2d( + dimproj, + dimproj, (lorder, 1), + dilation=(lstride, 1), + groups=dimproj, + bias=False) + + if rorder > 0: + self.conv_right = nn.Conv2d( + dimproj, + dimproj, (rorder, 1), + dilation=(rstride, 1), + groups=dimproj, + bias=False) + else: + self.conv_right = None + + def forward(self, input): + f1 = F.relu(self.expand(input)) + p1 = self.shrink(f1) + + x = torch.unsqueeze(p1, 1) + x_per = x.permute(0, 3, 2, 1) + + y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0]) + + if self.conv_right is not None: + y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride]) + y_right = y_right[:, :, self.rstride:, :] + out = x_per + self.conv_left(y_left) + self.conv_right(y_right) + else: + out = x_per + self.conv_left(y_left) + + out1 = out.permute(0, 3, 2, 1) + output = input + out1.squeeze(1) + + return output + + def print_model(self): + self.expand.print_model() + self.shrink.print_model() + + tmpw = self.conv_left.weight + tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) + for j in range(tmpw.shape[0]): + tmpwm[:, j] = tmpw[j, 0, :, 0] + + printNeonMatrix(tmpwm) + + if self.conv_right is not None: + tmpw = self.conv_right.weight + tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) + for j in range(tmpw.shape[0]): + tmpwm[:, j] = tmpw[j, 0, :, 0] + + printNeonMatrix(tmpwm) + + +def build_dfsmn_repeats(linear_dim=128, + proj_dim=64, + lorder=20, + rorder=1, + fsmn_layers=6): + """ + build stacked dfsmn layers + Args: + linear_dim: + proj_dim: + lorder: + rorder: + fsmn_layers: + + Returns: + + """ + repeats = [ + nn.Sequential(DFSMN(proj_dim, linear_dim, lorder, rorder, 1, 1)) + for i in range(fsmn_layers) + ] + + return nn.Sequential(*repeats) diff --git a/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py new file mode 100644 index 00000000..1884e533 --- /dev/null +++ b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py @@ -0,0 +1,236 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .fsmn import AffineTransform, Fsmn, 
LinearTransform, RectifiedLinear
+from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32
+
+
+class FSMNUnit(nn.Module):
+    """ A multi-channel fsmn unit
+
+    """
+
+    def __init__(self, dimlinear=128, dimproj=64, lorder=20, rorder=1):
+        """
+        Args:
+            dimlinear: input / output dimension
+            dimproj: fsmn input / output dimension
+            lorder: left order
+            rorder: right order
+        """
+        super(FSMNUnit, self).__init__()
+
+        self.shrink = LinearTransform(dimlinear, dimproj)
+        self.fsmn = Fsmn(dimproj, dimproj, lorder, rorder, 1, 1)
+        self.expand = AffineTransform(dimproj, dimlinear)
+
+        self.debug = False
+        self.dataout = None
+
+    '''
+    batch, time, channel, feature
+    '''
+
+    def forward(self, input):
+        if torch.cuda.is_available():
+            out = torch.zeros(input.shape).cuda()
+        else:
+            out = torch.zeros(input.shape)
+
+        for n in range(input.shape[2]):
+            out1 = self.shrink(input[:, :, n, :])
+            out2 = self.fsmn(out1)
+            out[:, :, n, :] = F.relu(self.expand(out2))
+
+        if self.debug:
+            self.dataout = out
+
+        return out
+
+    def print_model(self):
+        self.shrink.print_model()
+        self.fsmn.print_model()
+        self.expand.print_model()
+
+    def to_kaldi_nnet(self):
+        re_str = self.shrink.to_kaldi_nnet()
+        re_str += self.fsmn.to_kaldi_nnet()
+        re_str += self.expand.to_kaldi_nnet()
+
+        relu = RectifiedLinear(self.expand.linear.out_features,
+                               self.expand.linear.out_features)
+        re_str += relu.to_kaldi_nnet()
+
+        return re_str
+
+
+class FSMNSeleNetV2(nn.Module):
+    """ FSMN model with channel selection.
+    """
+
+    def __init__(self,
+                 input_dim=120,
+                 linear_dim=128,
+                 proj_dim=64,
+                 lorder=20,
+                 rorder=1,
+                 num_syn=5,
+                 fsmn_layers=5,
+                 sele_layer=0):
+        """
+        Args:
+            input_dim: input dimension
+            linear_dim: fsmn input dimension
+            proj_dim: fsmn projection dimension
+            lorder: fsmn left order
+            rorder: fsmn right order
+            num_syn: output dimension
+            fsmn_layers: no.
of fsmn units + sele_layer: channel selection layer index + """ + super(FSMNSeleNetV2, self).__init__() + + self.sele_layer = sele_layer + + self.featmap = AffineTransform(input_dim, linear_dim) + + self.mem = [] + for i in range(fsmn_layers): + unit = FSMNUnit(linear_dim, proj_dim, lorder, rorder) + self.mem.append(unit) + self.add_module('mem_{:d}'.format(i), unit) + + self.decision = AffineTransform(linear_dim, num_syn) + + def forward(self, input): + # multi-channel feature mapping + if torch.cuda.is_available(): + x = torch.zeros(input.shape[0], input.shape[1], input.shape[2], + self.featmap.linear.out_features).cuda() + else: + x = torch.zeros(input.shape[0], input.shape[1], input.shape[2], + self.featmap.linear.out_features) + + for n in range(input.shape[2]): + x[:, :, n, :] = F.relu(self.featmap(input[:, :, n, :])) + + for i, unit in enumerate(self.mem): + y = unit(x) + + # perform channel selection + if i == self.sele_layer: + pool = nn.MaxPool2d((y.shape[2], 1), stride=(y.shape[2], 1)) + y = pool(y) + + x = y + + # remove channel dimension + y = torch.squeeze(y, -2) + z = self.decision(y) + + return z + + def print_model(self): + self.featmap.print_model() + + for unit in self.mem: + unit.print_model() + + self.decision.print_model() + + def print_header(self): + ''' + get FSMN params + ''' + input_dim = self.featmap.linear.in_features + linear_dim = self.featmap.linear.out_features + proj_dim = self.mem[0].shrink.linear.out_features + lorder = self.mem[0].fsmn.conv_left.kernel_size[0] + rorder = 0 + if self.mem[0].fsmn.conv_right is not None: + rorder = self.mem[0].fsmn.conv_right.kernel_size[0] + + num_syn = self.decision.linear.out_features + fsmn_layers = len(self.mem) + + # no. of output channels, 0.0 means the same as numins + # numouts = 0.0 + numouts = 1.0 + + # + # write total header + # + header = [0.0] * HEADER_BLOCK_SIZE * 4 + # numins + header[0] = 0.0 + # numouts + header[1] = numouts + # dimins + header[2] = input_dim + # dimouts + header[3] = num_syn + # numlayers + header[4] = 3 + + # + # write each layer's header + # + hidx = 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_DENSE.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim + header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 + header[HEADER_BLOCK_SIZE * hidx + 5] = float( + ActivationType.ACTIVATION_RELU.value) + hidx += 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_SEQUENTIAL_FSMN.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = proj_dim + header[HEADER_BLOCK_SIZE * hidx + 4] = lorder + header[HEADER_BLOCK_SIZE * hidx + 5] = rorder + header[HEADER_BLOCK_SIZE * hidx + 6] = fsmn_layers + if numouts == 1.0: + header[HEADER_BLOCK_SIZE * hidx + 7] = float(self.sele_layer) + else: + header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0 + hidx += 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_DENSE.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = numouts + header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn + header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 + header[HEADER_BLOCK_SIZE * hidx + 5] = float( + ActivationType.ACTIVATION_SOFTMAX.value) + + for h in header: + print(f32ToI32(h)) + + def to_kaldi_nnet(self): + re_str = '\n' + + re_str = self.featmap.to_kaldi_nnet() + + relu = 
RectifiedLinear(self.featmap.linear.out_features, + self.featmap.linear.out_features) + re_str += relu.to_kaldi_nnet() + + for unit in self.mem: + re_str += unit.to_kaldi_nnet() + + re_str += self.decision.to_kaldi_nnet() + + re_str += ' %d %d\n' % (self.decision.linear.out_features, + self.decision.linear.out_features) + re_str += '\n' + re_str += '\n' + + return re_str diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py new file mode 100644 index 00000000..81e47350 --- /dev/null +++ b/modelscope/models/audio/kws/farfield/model.py @@ -0,0 +1,74 @@ +import os +from typing import Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .fsmn_sele_v2 import FSMNSeleNetV2 + + +@MODELS.register_module( + Tasks.keyword_spotting, module_name=Models.speech_dfsmn_kws_char_farfield) +class FSMNSeleNetV2Decorator(TorchModel): + r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """ + + MODEL_TXT = 'model.txt' + SC_CONFIG = 'sound_connect.conf' + SC_CONF_ITEM_KWS_MODEL = '${kws_model}' + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the dfsmn model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + sc_config_file = os.path.join(model_dir, self.SC_CONFIG) + model_txt_file = os.path.join(model_dir, self.MODEL_TXT) + model_bin_file = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + self._model = None + if os.path.exists(model_bin_file): + self._model = FSMNSeleNetV2(*args, **kwargs) + checkpoint = torch.load(model_bin_file) + self._model.load_state_dict(checkpoint, strict=False) + + self._sc = None + if os.path.exists(model_txt_file): + with open(sc_config_file) as f: + lines = f.readlines() + with open(sc_config_file, 'w') as f: + for line in lines: + if self.SC_CONF_ITEM_KWS_MODEL in line: + line = line.replace(self.SC_CONF_ITEM_KWS_MODEL, + model_txt_file) + f.write(line) + import py_sound_connect + self._sc = py_sound_connect.SoundConnect(sc_config_file) + self.size_in = self._sc.bytesPerBlockIn() + self.size_out = self._sc.bytesPerBlockOut() + + if self._model is None and self._sc is None: + raise Exception( + f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.' + ) + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + ... 
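+        # NOTE: forward() is left as a stub here; runtime keyword spotting is
+        # driven by forward_decode() below, which streams raw audio blocks
+        # through the py_sound_connect SoundConnect instance configured in
+        # __init__ and returns any spotted keyword in the result dict.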
+ + def forward_decode(self, data: bytes): + result = {'pcm': self._sc.process(data, self.size_out)} + state = self._sc.kwsState() + if state == 2: + result['kws'] = { + 'keyword': + self._sc.kwsKeyword(self._sc.kwsSpottedKeywordIndex()), + 'offset': self._sc.kwsKeywordOffset(), + 'length': self._sc.kwsKeywordLength(), + 'confidence': self._sc.kwsConfidence() + } + return result diff --git a/modelscope/models/audio/kws/farfield/model_def.py b/modelscope/models/audio/kws/farfield/model_def.py new file mode 100644 index 00000000..3f5ba7d7 --- /dev/null +++ b/modelscope/models/audio/kws/farfield/model_def.py @@ -0,0 +1,121 @@ +import math +import struct +from enum import Enum + +HEADER_BLOCK_SIZE = 10 + + +class LayerType(Enum): + LAYER_DENSE = 1 + LAYER_GRU = 2 + LAYER_ATTENTION = 3 + LAYER_FSMN = 4 + LAYER_SEQUENTIAL_FSMN = 5 + LAYER_FSMN_SELE = 6 + LAYER_GRU_ATTENTION = 7 + LAYER_DFSMN = 8 + + +class ActivationType(Enum): + ACTIVATION_NONE = 0 + ACTIVATION_RELU = 1 + ACTIVATION_TANH = 2 + ACTIVATION_SIGMOID = 3 + ACTIVATION_SOFTMAX = 4 + ACTIVATION_LOGSOFTMAX = 5 + + +def f32ToI32(f): + """ + print layer + """ + bs = struct.pack('f', f) + + ba = bytearray() + ba.append(bs[0]) + ba.append(bs[1]) + ba.append(bs[2]) + ba.append(bs[3]) + + return struct.unpack('i', ba)[0] + + +def printNeonMatrix(w): + """ + print matrix with neon padding + """ + numrows, numcols = w.shape + numnecols = math.ceil(numcols / 4) + + for i in range(numrows): + for j in range(numcols): + print(f32ToI32(w[i, j])) + + for j in range(numnecols * 4 - numcols): + print(0) + + +def printNeonVector(b): + """ + print vector with neon padding + """ + size = b.shape[0] + nesize = math.ceil(size / 4) + + for i in range(size): + print(f32ToI32(b[i])) + + for i in range(nesize * 4 - size): + print(0) + + +def printDense(layer): + """ + save dense layer + """ + statedict = layer.state_dict() + printNeonMatrix(statedict['weight']) + printNeonVector(statedict['bias']) + + +def printGRU(layer): + """ + save gru layer + """ + statedict = layer.state_dict() + weight = [statedict['weight_ih_l0'], statedict['weight_hh_l0']] + bias = [statedict['bias_ih_l0'], statedict['bias_hh_l0']] + numins, numouts = weight[0].shape + numins = numins // 3 + + # output input weights + w_rx = weight[0][:numins, :] + w_zx = weight[0][numins:numins * 2, :] + w_x = weight[0][numins * 2:, :] + printNeonMatrix(w_zx) + printNeonMatrix(w_rx) + printNeonMatrix(w_x) + + # output recurrent weights + w_rh = weight[1][:numins, :] + w_zh = weight[1][numins:numins * 2, :] + w_h = weight[1][numins * 2:, :] + printNeonMatrix(w_zh) + printNeonMatrix(w_rh) + printNeonMatrix(w_h) + + # output input bias + b_rx = bias[0][:numins] + b_zx = bias[0][numins:numins * 2] + b_x = bias[0][numins * 2:] + printNeonVector(b_zx) + printNeonVector(b_rx) + printNeonVector(b_x) + + # output recurrent bias + b_rh = bias[1][:numins] + b_zh = bias[1][numins:numins * 2] + b_h = bias[1][numins * 2:] + printNeonVector(b_zh) + printNeonVector(b_rh) + printNeonVector(b_h) diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index a05bc57d..f2ecd08e 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -5,4 +5,5 @@ from . 
import (action_recognition, animal_recognition, body_2d_keypoints, image_colorization, image_denoise, image_instance_segmentation, image_portrait_enhancement, image_to_image_generation, image_to_image_translation, object_detection, - product_retrieval_embedding, super_resolution, virual_tryon) + product_retrieval_embedding, salient_detection, + super_resolution, video_single_object_tracking, virual_tryon) diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index eaf5d0c5..c484b37b 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel): model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) self.model = NAFNet(**self.config.model.network_g) self.loss = PSNRLoss() - - if torch.cuda.is_available(): - self._device = torch.device('cuda') - else: - self._device = torch.device('cpu') - - self.model = self.model.to(self._device) self.model = self._load_pretrained(self.model, model_path) - if self.training: - self.model.train() - else: - self.model.eval() - def _load_pretrained(self, net, load_path, @@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel): Returns: Dict[str, Tensor]: results """ - for key, value in inputs.items(): - inputs[key] = inputs[key].to(self._device) if self.training: return self._train_forward(**inputs) elif 'target' in inputs: diff --git a/modelscope/models/cv/image_instance_segmentation/__init__.py b/modelscope/models/cv/image_instance_segmentation/__init__.py index 4706f8f8..8ccfef4b 100644 --- a/modelscope/models/cv/image_instance_segmentation/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/__init__.py @@ -7,13 +7,11 @@ if TYPE_CHECKING: from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin from .model import CascadeMaskRCNNSwinModel from .postprocess_utils import get_img_ins_seg_result - from .datasets import ImageInstanceSegmentationCocoDataset else: _import_structure = { 'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], 'model': ['CascadeMaskRCNNSwinModel'], 'postprocess_utils': ['get_img_ins_seg_result'], - 'datasets': ['ImageInstanceSegmentationCocoDataset'] } import sys diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py index 93c71b46..cca1432f 100644 --- a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py @@ -1,2 +1 @@ -from .dataset import ImageInstanceSegmentationCocoDataset from .transforms import build_preprocess_transform diff --git a/modelscope/models/cv/object_detection/mmdet_model.py b/modelscope/models/cv/object_detection/mmdet_model.py index 51f05e47..7bf81349 100644 --- a/modelscope/models/cv/object_detection/mmdet_model.py +++ b/modelscope/models/cv/object_detection/mmdet_model.py @@ -38,7 +38,7 @@ class DetectionModel(TorchModel): self.model, model_path, map_location='cpu') self.class_names = checkpoint['meta']['CLASSES'] config.test_pipeline[0].type = 'LoadImageFromWebcam' - self.test_pipeline = Compose( + self.transform_input = Compose( replace_ImageToTensor(config.test_pipeline)) self.model.cfg = config self.model.eval() @@ -56,7 +56,7 @@ class DetectionModel(TorchModel): from mmcv.parallel import collate, scatter data = dict(img=image) - data = self.test_pipeline(data) + data = 
self.transform_input(data) data = collate([data], samples_per_gpu=1) data['img_metas'] = [ img_metas.data[0] for img_metas in data['img_metas'] diff --git a/modelscope/models/cv/salient_detection/__init__.py b/modelscope/models/cv/salient_detection/__init__.py new file mode 100644 index 00000000..b3b5b5fa --- /dev/null +++ b/modelscope/models/cv/salient_detection/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .salient_model import SalientDetection + +else: + _import_structure = { + 'salient_model': ['SalientDetection'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/salient_detection/models/__init__.py b/modelscope/models/cv/salient_detection/models/__init__.py new file mode 100644 index 00000000..0850c33d --- /dev/null +++ b/modelscope/models/cv/salient_detection/models/__init__.py @@ -0,0 +1 @@ +from .u2net import U2NET diff --git a/modelscope/models/cv/salient_detection/models/u2net.py b/modelscope/models/cv/salient_detection/models/u2net.py new file mode 100644 index 00000000..0a0a4511 --- /dev/null +++ b/modelscope/models/cv/salient_detection/models/u2net.py @@ -0,0 +1,300 @@ +# Implementation in this file is modifed from source code avaiable via https://github.com/xuebinqin/U-2-Net +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class REBNCONV(nn.Module): + + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(REBNCONV, self).__init__() + self.conv_s1 = nn.Conv2d( + in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate) + self.bn_s1 = nn.BatchNorm2d(out_ch) + self.relu_s1 = nn.ReLU(inplace=True) + + def forward(self, x): + hx = x + xout = self.relu_s1(self.bn_s1(self.conv_s1(hx))) + return xout + + +def _upsample_like(src, tar): + """upsample tensor 'src' to have the same spatial size with tensor 'tar'.""" + src = F.upsample(src, size=tar.shape[2:], mode='bilinear') + return src + + +class RSU7(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU7, self).__init__() + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + hx3 = self.rebnconv3(hx) + hx = 
self.pool3(hx3) + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + hx5 = self.rebnconv5(hx) + hx = self.pool5(hx5) + hx6 = self.rebnconv6(hx) + hx7 = self.rebnconv7(hx6) + hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1)) + hx6dup = _upsample_like(hx6d, hx5) + hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + return hx1d + hxin + + +class RSU6(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU6, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + hx5 = self.rebnconv5(hx) + hx6 = self.rebnconv6(hx5) + hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + return hx1d + hxin + + +class RSU5(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU5, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + hx4 = 
self.rebnconv4(hx) + hx5 = self.rebnconv5(hx4) + hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + return hx1d + hxin + + +class RSU4(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + hx3 = self.rebnconv3(hx) + hx4 = self.rebnconv4(hx3) + hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + return hx1d + hxin + + +class RSU4F(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4F, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx2 = self.rebnconv2(hx1) + hx3 = self.rebnconv3(hx2) + hx4 = self.rebnconv4(hx3) + hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) + hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1)) + hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1)) + return hx1d + hxin + + +class U2NET(nn.Module): + + def __init__(self, in_ch=3, out_ch=1): + super(U2NET, self).__init__() + + # encoder + self.stage1 = RSU7(in_ch, 32, 64) + self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage2 = RSU6(64, 32, 128) + self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage3 = RSU5(128, 64, 256) + self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage4 = RSU4(256, 128, 512) + self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage5 = RSU4F(512, 256, 512) + self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage6 = RSU4F(512, 256, 512) + # decoder + self.stage5d = RSU4F(1024, 256, 512) + self.stage4d = RSU4(1024, 128, 256) + self.stage3d = RSU5(512, 64, 128) + self.stage2d = RSU6(256, 32, 64) + self.stage1d = RSU7(128, 16, 64) + self.side1 = nn.Conv2d(64, out_ch, 3, padding=1) + self.side2 = nn.Conv2d(64, out_ch, 3, padding=1) + self.side3 = nn.Conv2d(128, out_ch, 3, padding=1) + self.side4 = nn.Conv2d(256, out_ch, 3, padding=1) + self.side5 = nn.Conv2d(512, out_ch, 3, padding=1) + self.side6 = 
nn.Conv2d(512, out_ch, 3, padding=1) + self.outconv = nn.Conv2d(6 * out_ch, out_ch, 1) + + def forward(self, x): + + hx = x + hx1 = self.stage1(hx) + hx = self.pool12(hx1) + hx2 = self.stage2(hx) + hx = self.pool23(hx2) + hx3 = self.stage3(hx) + hx = self.pool34(hx3) + hx4 = self.stage4(hx) + hx = self.pool45(hx4) + hx5 = self.stage5(hx) + hx = self.pool56(hx5) + hx6 = self.stage6(hx) + hx6up = _upsample_like(hx6, hx5) + + hx5d = self.stage5d(torch.cat((hx6up, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1)) + d1 = self.side1(hx1d) + d2 = self.side2(hx2d) + d2 = _upsample_like(d2, d1) + d3 = self.side3(hx3d) + d3 = _upsample_like(d3, d1) + d4 = self.side4(hx4d) + d4 = _upsample_like(d4, d1) + d5 = self.side5(hx5d) + d5 = _upsample_like(d5, d1) + d6 = self.side6(hx6) + d6 = _upsample_like(d6, d1) + d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1)) + return torch.sigmoid(d0), torch.sigmoid(d1), torch.sigmoid( + d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid( + d5), torch.sigmoid(d6) diff --git a/modelscope/models/cv/salient_detection/salient_model.py b/modelscope/models/cv/salient_detection/salient_model.py new file mode 100644 index 00000000..539d1f24 --- /dev/null +++ b/modelscope/models/cv/salient_detection/salient_model.py @@ -0,0 +1,63 @@ +import os.path as osp + +import cv2 +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .models import U2NET + + +@MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection) +class SalientDetection(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, *args, **kwargs) + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + self.model = U2NET(3, 1) + checkpoint = torch.load(model_path, map_location='cpu') + self.transform_input = transforms.Compose([ + transforms.Resize((320, 320)), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + self.model.load_state_dict(checkpoint) + self.model.eval() + + def inference(self, data): + """data is tensor 3 * H * W ---> return tensor H * W .""" + data = data.unsqueeze(0) + if next(self.model.parameters()).is_cuda: + data = data.to( + torch.device([next(self.model.parameters()).device][0])) + + with torch.no_grad(): + results = self.model(data) + + if next(self.model.parameters()).is_cuda: + return results[0][0, 0, :, :].cpu() + return results[0][0, 0, :, :] + + def preprocess(self, image): + """image is numpy.""" + data = self.transform_input(Image.fromarray(image)) + return data.float() + + def postprocess(self, inputs): + """resize .""" + data = inputs['data'] + w = inputs['img_w'] + h = inputs['img_h'] + data_norm = (data - torch.min(data)) / ( + torch.max(data) - torch.min(data)) + data_norm_np = (data_norm.numpy() * 255).astype('uint8') + data_norm_rst = cv2.resize(data_norm_np, (w, h)) + + return data_norm_rst diff --git 
a/modelscope/models/cv/video_single_object_tracking/__init__.py b/modelscope/models/cv/video_single_object_tracking/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/config/__init__.py b/modelscope/models/cv/video_single_object_tracking/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py new file mode 100644 index 00000000..772813cf --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py @@ -0,0 +1,39 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +from easydict import EasyDict as edict + +cfg = edict() + +# MODEL +cfg.MODEL = edict() + +# MODEL.BACKBONE +cfg.MODEL.BACKBONE = edict() +cfg.MODEL.BACKBONE.TYPE = 'vit_base_patch16_224_ce' +cfg.MODEL.BACKBONE.STRIDE = 16 +cfg.MODEL.BACKBONE.CAT_MODE = 'direct' +cfg.MODEL.BACKBONE.DROP_PATH_RATE = 0.1 +cfg.MODEL.BACKBONE.CE_LOC = [3, 6, 9] +cfg.MODEL.BACKBONE.CE_KEEP_RATIO = [0.7, 0.7, 0.7] +cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE = 'CTR_POINT' + +# MODEL.HEAD +cfg.MODEL.HEAD = edict() +cfg.MODEL.HEAD.TYPE = 'CENTER' +cfg.MODEL.HEAD.NUM_CHANNELS = 256 + +# DATA +cfg.DATA = edict() +cfg.DATA.MEAN = [0.485, 0.456, 0.406] +cfg.DATA.STD = [0.229, 0.224, 0.225] +cfg.DATA.SEARCH = edict() +cfg.DATA.SEARCH.SIZE = 384 +cfg.DATA.TEMPLATE = edict() +cfg.DATA.TEMPLATE.SIZE = 192 + +# TEST +cfg.TEST = edict() +cfg.TEST.TEMPLATE_FACTOR = 2.0 +cfg.TEST.TEMPLATE_SIZE = 192 +cfg.TEST.SEARCH_FACTOR = 5.0 +cfg.TEST.SEARCH_SIZE = 384 diff --git a/modelscope/models/cv/video_single_object_tracking/models/__init__.py b/modelscope/models/cv/video_single_object_tracking/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/__init__.py b/modelscope/models/cv/video_single_object_tracking/models/layers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py new file mode 100644 index 00000000..158d88aa --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py @@ -0,0 +1,54 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch.nn as nn + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0., + rpe=False, + z_size=7, + x_size=14): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, mask=None, return_attention=False): + # x: B, N, C + # mask: [B, N, ] torch.bool + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind( + 0) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + + if mask is not None: + attn = attn.masked_fill( + mask.unsqueeze(1).unsqueeze(2), + float('-inf'), + ) + + attn = 
attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + if return_attention: + return x, attn + else: + return x diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py new file mode 100644 index 00000000..45706f71 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py @@ -0,0 +1,129 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import math + +import torch +import torch.nn as nn +from timm.models.layers import DropPath, Mlp + +from .attn import Attention + + +def candidate_elimination(attn: torch.Tensor, tokens: torch.Tensor, + lens_t: int, keep_ratio: float, + global_index: torch.Tensor, + box_mask_z: torch.Tensor): + """ + Eliminate potential background candidates for computation reduction and noise cancellation. + Args: + attn (torch.Tensor): [B, num_heads, L_t + L_s, L_t + L_s], attention weights + tokens (torch.Tensor): [B, L_t + L_s, C], template and search region tokens + lens_t (int): length of template + keep_ratio (float): keep ratio of search region tokens (candidates) + global_index (torch.Tensor): global index of search region tokens + box_mask_z (torch.Tensor): template mask used to accumulate attention weights + + Returns: + tokens_new (torch.Tensor): tokens after candidate elimination + keep_index (torch.Tensor): indices of kept search region tokens + removed_index (torch.Tensor): indices of removed search region tokens + """ + lens_s = attn.shape[-1] - lens_t + bs, hn, _, _ = attn.shape + + lens_keep = math.ceil(keep_ratio * lens_s) + if lens_keep == lens_s: + return tokens, global_index, None + + attn_t = attn[:, :, :lens_t, lens_t:] + + if box_mask_z is not None: + box_mask_z = box_mask_z.unsqueeze(1).unsqueeze(-1).expand( + -1, attn_t.shape[1], -1, attn_t.shape[-1]) + attn_t = attn_t[box_mask_z] + attn_t = attn_t.view(bs, hn, -1, lens_s) + attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s + else: + attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s + + # use sort instead of topk, due to the speed issue + # https://github.com/pytorch/pytorch/issues/22812 + sorted_attn, indices = torch.sort(attn_t, dim=1, descending=True) + + _, topk_idx = sorted_attn[:, :lens_keep], indices[:, :lens_keep] + _, non_topk_idx = sorted_attn[:, lens_keep:], indices[:, lens_keep:] + keep_index = global_index.gather(dim=1, index=topk_idx) + removed_index = global_index.gather(dim=1, index=non_topk_idx) + + # separate template and search tokens + tokens_t = tokens[:, :lens_t] + tokens_s = tokens[:, lens_t:] + + # obtain the attentive and inattentive tokens + B, L, C = tokens_s.shape + attentive_tokens = tokens_s.gather( + dim=1, index=topk_idx.unsqueeze(-1).expand(B, -1, C)) + + # concatenate these tokens + tokens_new = torch.cat([tokens_t, attentive_tokens], dim=1) + + return tokens_new, keep_index, removed_index + + +class CEBlock(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + keep_ratio_search=1.0, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: 
drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.keep_ratio_search = keep_ratio_search + + def forward(self, + x, + global_index_template, + global_index_search, + mask=None, + ce_template_mask=None, + keep_ratio_search=None): + x_attn, attn = self.attn(self.norm1(x), mask, True) + x = x + self.drop_path(x_attn) + lens_t = global_index_template.shape[1] + + removed_index_search = None + if self.keep_ratio_search < 1 and (keep_ratio_search is None + or keep_ratio_search < 1): + keep_ratio_search = self.keep_ratio_search if keep_ratio_search is None else keep_ratio_search + x, global_index_search, removed_index_search = candidate_elimination( + attn, x, lens_t, keep_ratio_search, global_index_search, + ce_template_mask) + + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x, global_index_template, global_index_search, removed_index_search, attn diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py new file mode 100644 index 00000000..e64b68d7 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py @@ -0,0 +1,141 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch +import torch.nn as nn + + +def conv(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=True), nn.BatchNorm2d(out_planes), nn.ReLU(inplace=True)) + + +class CenterPredictor( + nn.Module, ): + + def __init__(self, inplanes=64, channel=256, feat_sz=20, stride=16): + super(CenterPredictor, self).__init__() + self.feat_sz = feat_sz + self.stride = stride + self.img_sz = self.feat_sz * self.stride + + # corner predict + self.conv1_ctr = conv(inplanes, channel) + self.conv2_ctr = conv(channel, channel // 2) + self.conv3_ctr = conv(channel // 2, channel // 4) + self.conv4_ctr = conv(channel // 4, channel // 8) + self.conv5_ctr = nn.Conv2d(channel // 8, 1, kernel_size=1) + + # offset regress + self.conv1_offset = conv(inplanes, channel) + self.conv2_offset = conv(channel, channel // 2) + self.conv3_offset = conv(channel // 2, channel // 4) + self.conv4_offset = conv(channel // 4, channel // 8) + self.conv5_offset = nn.Conv2d(channel // 8, 2, kernel_size=1) + + # size regress + self.conv1_size = conv(inplanes, channel) + self.conv2_size = conv(channel, channel // 2) + self.conv3_size = conv(channel // 2, channel // 4) + self.conv4_size = conv(channel // 4, channel // 8) + self.conv5_size = nn.Conv2d(channel // 8, 2, kernel_size=1) + + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, x, gt_score_map=None): + """ Forward pass with input x. 
""" + score_map_ctr, size_map, offset_map = self.get_score_map(x) + + # assert gt_score_map is None + if gt_score_map is None: + bbox = self.cal_bbox(score_map_ctr, size_map, offset_map) + else: + bbox = self.cal_bbox( + gt_score_map.unsqueeze(1), size_map, offset_map) + + return score_map_ctr, bbox, size_map, offset_map + + def cal_bbox(self, + score_map_ctr, + size_map, + offset_map, + return_score=False): + max_score, idx = torch.max( + score_map_ctr.flatten(1), dim=1, keepdim=True) + idx_y = idx // self.feat_sz + idx_x = idx % self.feat_sz + + idx = idx.unsqueeze(1).expand(idx.shape[0], 2, 1) + size = size_map.flatten(2).gather(dim=2, index=idx) + offset = offset_map.flatten(2).gather(dim=2, index=idx).squeeze(-1) + + # cx, cy, w, h + bbox = torch.cat( + [(idx_x.to(torch.float) + offset[:, :1]) / self.feat_sz, + (idx_y.to(torch.float) + offset[:, 1:]) / self.feat_sz, + size.squeeze(-1)], + dim=1) + + if return_score: + return bbox, max_score + return bbox + + def get_score_map(self, x): + + def _sigmoid(x): + y = torch.clamp(x.sigmoid_(), min=1e-4, max=1 - 1e-4) + return y + + # ctr branch + x_ctr1 = self.conv1_ctr(x) + x_ctr2 = self.conv2_ctr(x_ctr1) + x_ctr3 = self.conv3_ctr(x_ctr2) + x_ctr4 = self.conv4_ctr(x_ctr3) + score_map_ctr = self.conv5_ctr(x_ctr4) + + # offset branch + x_offset1 = self.conv1_offset(x) + x_offset2 = self.conv2_offset(x_offset1) + x_offset3 = self.conv3_offset(x_offset2) + x_offset4 = self.conv4_offset(x_offset3) + score_map_offset = self.conv5_offset(x_offset4) + + # size branch + x_size1 = self.conv1_size(x) + x_size2 = self.conv2_size(x_size1) + x_size3 = self.conv3_size(x_size2) + x_size4 = self.conv4_size(x_size3) + score_map_size = self.conv5_size(x_size4) + return _sigmoid(score_map_ctr), _sigmoid( + score_map_size), score_map_offset + + +def build_box_head(cfg, hidden_dim): + stride = cfg.MODEL.BACKBONE.STRIDE + + if cfg.MODEL.HEAD.TYPE == 'CENTER': + in_channel = hidden_dim + out_channel = cfg.MODEL.HEAD.NUM_CHANNELS + feat_sz = int(cfg.DATA.SEARCH.SIZE / stride) + center_head = CenterPredictor( + inplanes=in_channel, + channel=out_channel, + feat_sz=feat_sz, + stride=stride) + return center_head + else: + raise ValueError('HEAD TYPE %s is not supported.' 
+ % cfg.MODEL.HEAD_TYPE) diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py new file mode 100644 index 00000000..0e623505 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py @@ -0,0 +1,37 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch.nn as nn +from timm.models.layers import to_2tuple + + +class PatchEmbed(nn.Module): + """ 2D Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/__init__.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py new file mode 100644 index 00000000..e2d2f80f --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py @@ -0,0 +1,93 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch.nn as nn +from timm.models.layers import to_2tuple + +from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \ + PatchEmbed + + +class BaseBackbone(nn.Module): + + def __init__(self): + super().__init__() + + # for original ViT + self.pos_embed = None + self.img_size = [224, 224] + self.patch_size = 16 + self.embed_dim = 384 + + self.cat_mode = 'direct' + + self.pos_embed_z = None + self.pos_embed_x = None + + self.template_segment_pos_embed = None + self.search_segment_pos_embed = None + + self.return_stage = [2, 5, 8, 11] + + def finetune_track(self, cfg, patch_start_index=1): + + search_size = to_2tuple(cfg.DATA.SEARCH.SIZE) + template_size = to_2tuple(cfg.DATA.TEMPLATE.SIZE) + new_patch_size = cfg.MODEL.BACKBONE.STRIDE + + self.cat_mode = cfg.MODEL.BACKBONE.CAT_MODE + + # resize patch embedding + if new_patch_size != self.patch_size: + print( + 'Inconsistent Patch Size With The Pretrained Weights, Interpolate The Weight!' 
+ ) + old_patch_embed = {} + for name, param in self.patch_embed.named_parameters(): + if 'weight' in name: + param = nn.functional.interpolate( + param, + size=(new_patch_size, new_patch_size), + mode='bicubic', + align_corners=False) + param = nn.Parameter(param) + old_patch_embed[name] = param + self.patch_embed = PatchEmbed( + img_size=self.img_size, + patch_size=new_patch_size, + in_chans=3, + embed_dim=self.embed_dim) + self.patch_embed.proj.bias = old_patch_embed['proj.bias'] + self.patch_embed.proj.weight = old_patch_embed['proj.weight'] + + # for patch embedding + patch_pos_embed = self.pos_embed[:, patch_start_index:, :] + patch_pos_embed = patch_pos_embed.transpose(1, 2) + B, E, Q = patch_pos_embed.shape + P_H, P_W = self.img_size[0] // self.patch_size, self.img_size[ + 1] // self.patch_size + patch_pos_embed = patch_pos_embed.view(B, E, P_H, P_W) + + # for search region + H, W = search_size + new_P_H, new_P_W = H // new_patch_size, W // new_patch_size + search_patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_P_H, new_P_W), + mode='bicubic', + align_corners=False) + search_patch_pos_embed = search_patch_pos_embed.flatten(2).transpose( + 1, 2) + + # for template region + H, W = template_size + new_P_H, new_P_W = H // new_patch_size, W // new_patch_size + template_patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_P_H, new_P_W), + mode='bicubic', + align_corners=False) + template_patch_pos_embed = template_patch_pos_embed.flatten( + 2).transpose(1, 2) + + self.pos_embed_z = nn.Parameter(template_patch_pos_embed) + self.pos_embed_x = nn.Parameter(search_patch_pos_embed) diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py new file mode 100644 index 00000000..977e936d --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py @@ -0,0 +1,109 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch +from torch import nn + +from modelscope.models.cv.video_single_object_tracking.models.layers.head import \ + build_box_head +from .vit_ce import vit_base_patch16_224_ce + + +class OSTrack(nn.Module): + """ This is the base class for OSTrack """ + + def __init__(self, + transformer, + box_head, + aux_loss=False, + head_type='CORNER'): + """ Initializes the model. + Parameters: + transformer: torch module of the transformer architecture. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
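+            box_head: module that regresses the target box from backbone features
+                (here the CenterPredictor built by build_box_head).
+            head_type: type of the prediction head; only 'CENTER' is handled in forward_head.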
+ """ + super().__init__() + self.backbone = transformer + self.box_head = box_head + + self.aux_loss = aux_loss + self.head_type = head_type + if head_type == 'CORNER' or head_type == 'CENTER': + self.feat_sz_s = int(box_head.feat_sz) + self.feat_len_s = int(box_head.feat_sz**2) + + def forward( + self, + template: torch.Tensor, + search: torch.Tensor, + ce_template_mask=None, + ce_keep_rate=None, + ): + x, aux_dict = self.backbone( + z=template, + x=search, + ce_template_mask=ce_template_mask, + ce_keep_rate=ce_keep_rate, + ) + + # Forward head + feat_last = x + if isinstance(x, list): + feat_last = x[-1] + out = self.forward_head(feat_last, None) + + out.update(aux_dict) + out['backbone_feat'] = x + return out + + def forward_head(self, cat_feature, gt_score_map=None): + """ + cat_feature: output embeddings of the backbone, it can be (HW1+HW2, B, C) or (HW2, B, C) + """ + enc_opt = cat_feature[:, -self. + feat_len_s:] # encoder output for the search region (B, HW, C) + opt = (enc_opt.unsqueeze(-1)).permute((0, 3, 2, 1)).contiguous() + bs, Nq, C, HW = opt.size() + opt_feat = opt.view(-1, C, self.feat_sz_s, self.feat_sz_s) + + if self.head_type == 'CENTER': + # run the center head + score_map_ctr, bbox, size_map, offset_map = self.box_head( + opt_feat, gt_score_map) + outputs_coord = bbox + outputs_coord_new = outputs_coord.view(bs, Nq, 4) + out = { + 'pred_boxes': outputs_coord_new, + 'score_map': score_map_ctr, + 'size_map': size_map, + 'offset_map': offset_map + } + return out + else: + raise NotImplementedError + + +def build_ostrack(cfg): + if cfg.MODEL.BACKBONE.TYPE == 'vit_base_patch16_224_ce': + backbone = vit_base_patch16_224_ce( + False, + drop_path_rate=cfg.MODEL.BACKBONE.DROP_PATH_RATE, + ce_loc=cfg.MODEL.BACKBONE.CE_LOC, + ce_keep_ratio=cfg.MODEL.BACKBONE.CE_KEEP_RATIO, + ) + hidden_dim = backbone.embed_dim + patch_start_index = 1 + else: + raise NotImplementedError + + backbone.finetune_track(cfg=cfg, patch_start_index=patch_start_index) + + box_head = build_box_head(cfg, hidden_dim) + + model = OSTrack( + backbone, + box_head, + aux_loss=False, + head_type=cfg.MODEL.HEAD.TYPE, + ) + + return model diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py new file mode 100644 index 00000000..a49fa50c --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py @@ -0,0 +1,24 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch + + +def combine_tokens(template_tokens, + search_tokens, + mode='direct', + return_res=False): + if mode == 'direct': + merged_feature = torch.cat((template_tokens, search_tokens), dim=1) + else: + raise NotImplementedError + + return merged_feature + + +def recover_tokens(merged_tokens, mode='direct'): + if mode == 'direct': + recovered_tokens = merged_tokens + else: + raise NotImplementedError + + return recovered_tokens diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py new file mode 100644 index 00000000..cd393109 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py @@ -0,0 +1,343 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +from functools import partial + +import 
torch +import torch.nn as nn +from timm.models.layers import DropPath, Mlp, to_2tuple + +from modelscope.models.cv.video_single_object_tracking.models.layers.attn_blocks import \ + CEBlock +from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \ + PatchEmbed +from .base_backbone import BaseBackbone +from .utils import combine_tokens, recover_tokens + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + +class VisionTransformer(BaseBackbone): + """ Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + distilled=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + distilled (bool): model includes a distillation token and head as in DeiT models + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + """ + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 2 if distilled else 1 + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.dist_token = None + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + 
self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential(*[ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer) for i in range(depth) + ]) + self.norm = norm_layer(embed_dim) + + +class VisionTransformerCE(VisionTransformer): + """ Vision Transformer with candidate elimination (CE) module + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + distilled=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + ce_loc=None, + ce_keep_ratio=None): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + distilled (bool): model includes a distillation token and head as in DeiT models + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + """ + super().__init__() + if isinstance(img_size, tuple): + self.img_size = img_size + else: + self.img_size = to_2tuple(img_size) + self.patch_size = patch_size + self.in_chans = in_chans + + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 2 if distilled else 1 + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.dist_token = nn.Parameter(torch.zeros( + 1, 1, embed_dim)) if distilled else None + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + blocks = [] + ce_index = 0 + self.ce_loc = ce_loc + for i in range(depth): + ce_keep_ratio_i = 1.0 + if ce_loc is not None and i in ce_loc: + ce_keep_ratio_i = ce_keep_ratio[ce_index] + ce_index += 1 + + blocks.append( + CEBlock( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + keep_ratio_search=ce_keep_ratio_i)) + + self.blocks = nn.Sequential(*blocks) + self.norm = norm_layer(embed_dim) + + def forward_features( + self, + z, + x, + mask_x=None, + 
ce_template_mask=None, + ce_keep_rate=None, + ): + B = x.shape[0] + + x = self.patch_embed(x) + z = self.patch_embed(z) + + z += self.pos_embed_z + x += self.pos_embed_x + + x = combine_tokens(z, x, mode=self.cat_mode) + + x = self.pos_drop(x) + + lens_z = self.pos_embed_z.shape[1] + lens_x = self.pos_embed_x.shape[1] + + global_index_t = torch.linspace(0, lens_z - 1, lens_z).to(x.device) + global_index_t = global_index_t.repeat(B, 1) + + global_index_s = torch.linspace(0, lens_x - 1, lens_x).to(x.device) + global_index_s = global_index_s.repeat(B, 1) + removed_indexes_s = [] + for i, blk in enumerate(self.blocks): + x, global_index_t, global_index_s, removed_index_s, attn = \ + blk(x, global_index_t, global_index_s, mask_x, ce_template_mask, ce_keep_rate) + + if self.ce_loc is not None and i in self.ce_loc: + removed_indexes_s.append(removed_index_s) + + x = self.norm(x) + lens_x_new = global_index_s.shape[1] + lens_z_new = global_index_t.shape[1] + + z = x[:, :lens_z_new] + x = x[:, lens_z_new:] + + if removed_indexes_s and removed_indexes_s[0] is not None: + removed_indexes_cat = torch.cat(removed_indexes_s, dim=1) + + pruned_lens_x = lens_x - lens_x_new + pad_x = torch.zeros([B, pruned_lens_x, x.shape[2]], + device=x.device) + x = torch.cat([x, pad_x], dim=1) + index_all = torch.cat([global_index_s, removed_indexes_cat], dim=1) + # recover original token order + C = x.shape[-1] + x = torch.zeros_like(x).scatter_( + dim=1, + index=index_all.unsqueeze(-1).expand(B, -1, C).to(torch.int64), + src=x) + + x = recover_tokens(x, mode=self.cat_mode) + + # re-concatenate with the template, which may be further used by other modules + x = torch.cat([z, x], dim=1) + + aux_dict = { + 'attn': attn, + 'removed_indexes_s': removed_indexes_s, # used for visualization + } + + return x, aux_dict + + def forward(self, z, x, ce_template_mask=None, ce_keep_rate=None): + + x, aux_dict = self.forward_features( + z, + x, + ce_template_mask=ce_template_mask, + ce_keep_rate=ce_keep_rate, + ) + + return x, aux_dict + + +def _create_vision_transformer(pretrained=False, **kwargs): + model = VisionTransformerCE(**kwargs) + return model + + +def vit_base_patch16_224_ce(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). 
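+    In this variant every block is a CEBlock, and search-region tokens are actually pruned
+    only at the layer indices given in `ce_loc`, each such layer keeping the fraction of
+    tokens specified by the corresponding entry of `ce_keep_ratio`.
+
+    Minimal construction sketch (the keyword values below are illustrative; the pipeline
+    reads the real ones from cfg.MODEL.BACKBONE in build_ostrack):
+
+        backbone = vit_base_patch16_224_ce(
+            pretrained=False,
+            drop_path_rate=0.1,
+            ce_loc=[3, 6, 9],
+            ce_keep_ratio=[0.7, 0.7, 0.7])
+        backbone.finetune_track(cfg=cfg, patch_start_index=1)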
+ """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer(pretrained=pretrained, **model_kwargs) + return model diff --git a/modelscope/models/cv/video_single_object_tracking/tracker/__init__.py b/modelscope/models/cv/video_single_object_tracking/tracker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py new file mode 100644 index 00000000..3eff252a --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py @@ -0,0 +1,139 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch + +from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ + cfg +from modelscope.models.cv.video_single_object_tracking.models.ostrack.ostrack import \ + build_ostrack +from modelscope.models.cv.video_single_object_tracking.utils.utils import ( + Preprocessor, clip_box, generate_mask_cond, hann2d, sample_target, + transform_image_to_crop) + + +class OSTrack(): + + def __init__(self, ckpt_path, device): + network = build_ostrack(cfg) + network.load_state_dict( + torch.load(ckpt_path, map_location='cpu')['net'], strict=True) + self.cfg = cfg + if device.type == 'cuda': + self.network = network.to(device) + else: + self.network = network + self.network.eval() + self.preprocessor = Preprocessor(device) + self.state = None + + self.feat_sz = self.cfg.TEST.SEARCH_SIZE // self.cfg.MODEL.BACKBONE.STRIDE + # motion constrain + if device.type == 'cuda': + self.output_window = hann2d( + torch.tensor([self.feat_sz, self.feat_sz]).long(), + centered=True).to(device) + else: + self.output_window = hann2d( + torch.tensor([self.feat_sz, self.feat_sz]).long(), + centered=True) + self.frame_id = 0 + # for save boxes from all queries + self.z_dict1 = {} + + def initialize(self, image, info: dict): + # forward the template once + z_patch_arr, resize_factor, z_amask_arr = sample_target( + image, + info['init_bbox'], + self.cfg.TEST.TEMPLATE_FACTOR, + output_sz=self.cfg.TEST.TEMPLATE_SIZE) + self.z_patch_arr = z_patch_arr + template = self.preprocessor.process(z_patch_arr, z_amask_arr) + with torch.no_grad(): + self.z_dict1 = template + + self.box_mask_z = None + if self.cfg.MODEL.BACKBONE.CE_LOC: + template_bbox = self.transform_bbox_to_crop( + info['init_bbox'], resize_factor, + template.tensors.device).squeeze(1) + self.box_mask_z = generate_mask_cond(self.cfg, 1, + template.tensors.device, + template_bbox) + + # save states + self.state = info['init_bbox'] + self.frame_id = 0 + + def track(self, image, info: dict = None): + H, W, _ = image.shape + self.frame_id += 1 + x_patch_arr, resize_factor, x_amask_arr = sample_target( + image, + self.state, + self.cfg.TEST.SEARCH_FACTOR, + output_sz=self.cfg.TEST.SEARCH_SIZE) # (x1, y1, w, h) + search = self.preprocessor.process(x_patch_arr, x_amask_arr) + + with torch.no_grad(): + x_dict = search + # merge the template and the search + # run the transformer + out_dict = self.network.forward( + template=self.z_dict1.tensors, + search=x_dict.tensors, + ce_template_mask=self.box_mask_z) + + # add hann windows + pred_score_map = out_dict['score_map'] + response = self.output_window * pred_score_map + pred_boxes = self.network.box_head.cal_bbox(response, + out_dict['size_map'], + out_dict['offset_map']) + pred_boxes = 
pred_boxes.view(-1, 4) + # Baseline: Take the mean of all pred boxes as the final result + pred_box = (pred_boxes.mean(dim=0) * self.cfg.TEST.SEARCH_SIZE + / resize_factor).tolist() # (cx, cy, w, h) [0,1] + # get the final box result + self.state = clip_box( + self.map_box_back(pred_box, resize_factor), H, W, margin=10) + + x1, y1, w, h = self.state + x2 = x1 + w + y2 = y1 + h + return {'target_bbox': [x1, y1, x2, y2]} + + def map_box_back(self, pred_box: list, resize_factor: float): + cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[ + 1] + 0.5 * self.state[3] + cx, cy, w, h = pred_box + half_side = 0.5 * self.cfg.TEST.SEARCH_SIZE / resize_factor + cx_real = cx + (cx_prev - half_side) + cy_real = cy + (cy_prev - half_side) + return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h] + + def transform_bbox_to_crop(self, + box_in, + resize_factor, + device, + box_extract=None, + crop_type='template'): + if crop_type == 'template': + crop_sz = torch.Tensor( + [self.cfg.TEST.TEMPLATE_SIZE, self.cfg.TEST.TEMPLATE_SIZE]) + elif crop_type == 'search': + crop_sz = torch.Tensor( + [self.cfg.TEST.SEARCH_SIZE, self.cfg.TEST.SEARCH_SIZE]) + else: + raise NotImplementedError + + box_in = torch.tensor(box_in) + if box_extract is None: + box_extract = box_in + else: + box_extract = torch.tensor(box_extract) + template_bbox = transform_image_to_crop( + box_in, box_extract, resize_factor, crop_sz, normalize=True) + template_bbox = template_bbox.view(1, 1, 4).to(device) + + return template_bbox diff --git a/modelscope/models/cv/video_single_object_tracking/utils/__init__.py b/modelscope/models/cv/video_single_object_tracking/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py new file mode 100644 index 00000000..505b2aa9 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py @@ -0,0 +1,261 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import math +from typing import Optional + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor + + +def hann1d(sz: int, centered=True) -> torch.Tensor: + """1D cosine window.""" + if centered: + return 0.5 * (1 - torch.cos( + (2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float())) + w = 0.5 * (1 + torch.cos( + (2 * math.pi / (sz + 2)) * torch.arange(0, sz // 2 + 1).float())) + return torch.cat([w, w[1:sz - sz // 2].flip((0, ))]) + + +def hann2d(sz: torch.Tensor, centered=True) -> torch.Tensor: + """2D cosine window.""" + return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d( + sz[1].item(), centered).reshape(1, 1, 1, -1) + + +class NestedTensor(object): + + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + +class Preprocessor(object): + + def __init__(self, device: str): + self.device = device + self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)) + self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)) + if 'cuda' == self.device.type: + self.mean = self.mean.to(self.device) + self.std = self.std.to(self.device) + + def process(self, img_arr: np.ndarray, amask_arr: np.ndarray): + # Deal with the image patch + if 'cuda' == self.device.type: + img_tensor = torch.tensor(img_arr).to(self.device).float().permute( + (2, 0, 1)).unsqueeze(dim=0) + 
else: + img_tensor = torch.tensor(img_arr).float().permute( + (2, 0, 1)).unsqueeze(dim=0) + img_tensor_norm = ( + (img_tensor / 255.0) - self.mean) / self.std # (1,3,H,W) + + # Deal with the attention mask + if 'cuda' == self.device.type: + amask_tensor = torch.from_numpy(amask_arr).to(torch.bool).to( + self.device).unsqueeze(dim=0) # (1,H,W) + else: + amask_tensor = torch.from_numpy(amask_arr).to( + torch.bool).unsqueeze(dim=0) # (1,H,W) + return NestedTensor(img_tensor_norm, amask_tensor) + + +def clip_box(box: list, H, W, margin=0): + x1, y1, w, h = box + x2, y2 = x1 + w, y1 + h + x1 = min(max(0, x1), W - margin) + x2 = min(max(margin, x2), W) + y1 = min(max(0, y1), H - margin) + y2 = min(max(margin, y2), H) + w = max(margin, x2 - x1) + h = max(margin, y2 - y1) + if isinstance(x1, torch.Tensor): + x1 = x1.item() + y1 = y1.item() + w = w.item() + h = h.item() + return [x1, y1, w, h] + + +def generate_mask_cond(cfg, bs, device, gt_bbox): + template_size = cfg.DATA.TEMPLATE.SIZE + stride = cfg.MODEL.BACKBONE.STRIDE + template_feat_size = template_size // stride + + if cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE == 'CTR_POINT': + if template_feat_size == 8: + index = slice(3, 4) + elif template_feat_size == 12: + index = slice(5, 6) + elif template_feat_size == 7: + index = slice(3, 4) + elif template_feat_size == 14: + index = slice(6, 7) + else: + raise NotImplementedError + box_mask_z = torch.zeros([bs, template_feat_size, template_feat_size], + device=device) + box_mask_z[:, index, index] = 1 + box_mask_z = box_mask_z.flatten(1).to(torch.bool) + else: + raise NotImplementedError + + return box_mask_z + + +def sample_target(im, + target_bb, + search_area_factor, + output_sz=None, + mask=None): + """ Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area + + args: + im - cv image + target_bb - target box [x, y, w, h] + search_area_factor - Ratio of crop size to target size + output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done. 
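+        mask - optional segmentation mask; when given it is cropped (and, if output_sz is set, resized) together with the image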
+ + returns: + cv image - extracted crop + float - the factor by which the crop has been resized to make the crop size equal output_size + """ + if not isinstance(target_bb, list): + x, y, w, h = target_bb.tolist() + else: + x, y, w, h = target_bb + # Crop image + crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor) + + if crop_sz < 1: + raise Exception('Too small bounding box.') + + x1 = round(x + 0.5 * w - crop_sz * 0.5) + x2 = x1 + crop_sz + + y1 = round(y + 0.5 * h - crop_sz * 0.5) + y2 = y1 + crop_sz + + x1_pad = max(0, -x1) + x2_pad = max(x2 - im.shape[1] + 1, 0) + + y1_pad = max(0, -y1) + y2_pad = max(y2 - im.shape[0] + 1, 0) + + # Crop target + im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] + if mask is not None: + mask_crop = mask[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad] + + # Pad + im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, + x2_pad, cv2.BORDER_CONSTANT) + # deal with attention mask + H, W, _ = im_crop_padded.shape + att_mask = np.ones((H, W)) + end_x, end_y = -x2_pad, -y2_pad + if y2_pad == 0: + end_y = None + if x2_pad == 0: + end_x = None + att_mask[y1_pad:end_y, x1_pad:end_x] = 0 + if mask is not None: + mask_crop_padded = F.pad( + mask_crop, + pad=(x1_pad, x2_pad, y1_pad, y2_pad), + mode='constant', + value=0) + + if output_sz is not None: + resize_factor = output_sz / crop_sz + im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz)) + att_mask = cv2.resize(att_mask, + (output_sz, output_sz)).astype(np.bool_) + if mask is None: + return im_crop_padded, resize_factor, att_mask + mask_crop_padded = \ + F.interpolate(mask_crop_padded[None, None], (output_sz, output_sz), + mode='bilinear', align_corners=False)[0, 0] + return im_crop_padded, resize_factor, att_mask, mask_crop_padded + + else: + if mask is None: + return im_crop_padded, att_mask.astype(np.bool_), 1.0 + return im_crop_padded, 1.0, att_mask.astype(np.bool_), mask_crop_padded + + +def transform_image_to_crop(box_in: torch.Tensor, + box_extract: torch.Tensor, + resize_factor: float, + crop_sz: torch.Tensor, + normalize=False) -> torch.Tensor: + """ Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image + args: + box_in - the box for which the co-ordinates are to be transformed + box_extract - the box about which the image crop has been extracted. + resize_factor - the ratio between the original image scale and the scale of the image crop + crop_sz - size of the cropped image + + returns: + torch.Tensor - transformed co-ordinates of box_in + """ + box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4] + + box_in_center = box_in[0:2] + 0.5 * box_in[2:4] + + box_out_center = (crop_sz - 1) / 2 + (box_in_center + - box_extract_center) * resize_factor + box_out_wh = box_in[2:4] * resize_factor + + box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh)) + if normalize: + return box_out / crop_sz[0] + else: + return box_out + + +def check_box(box: list, image_height, image_width) -> bool: + """ To check whether the box is within the image range or not + args: + box - the bounding box in the form of [x1, y1, x2, y2] + image_height - the height of the image + image_width - the width of the image + + returns: + bool - if box is valid, return True. 
Otherwise, return False + """ + assert len(box) == 4, 'box must be in the form of: [x1, y1, x2, y2]' + if box[0] < 0 or box[0] >= image_width: + return False + if box[2] < 0 or box[2] >= image_width: + return False + if box[1] < 0 or box[1] >= image_height: + return False + if box[3] < 0 or box[3] >= image_height: + return False + return True + + +def show_tracking_result(video_in_path, bboxes, video_save_path): + cap = cv2.VideoCapture(video_in_path) + for i in range(len(bboxes)): + box = bboxes[i] + success, frame = cap.read() + if success is False: + raise Exception(video_in_path, + ' can not be correctly decoded by OpenCV.') + if i == 0: + size = (frame.shape[1], frame.shape[0]) + fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') + video_writer = cv2.VideoWriter(video_save_path, fourcc, + cap.get(cv2.CAP_PROP_FPS), size, + True) + cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), + 5) + video_writer.write(frame) + video_writer.release + cap.release() diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py index 9a0636ee..112b3a58 100644 --- a/modelscope/models/multi_modal/__init__.py +++ b/modelscope/models/multi_modal/__init__.py @@ -9,9 +9,10 @@ if TYPE_CHECKING: from .gemm import GEMMForMultiModalEmbedding from .diffusion import DiffusionForTextToImageSynthesis from .mmr import VideoCLIPForMultiModalEmbedding - from .mplug_for_visual_question_answering import \ - MPlugForVisualQuestionAnswering + from .mplug_for_all_tasks import MPlugForAllTasks from .ofa_for_all_tasks import OfaForAllTasks + from .ofa_for_text_to_image_synthesis_model import \ + OfaForTextToImageSynthesis else: _import_structure = { @@ -19,8 +20,7 @@ else: 'diffusion': ['DiffusionForTextToImageSynthesis'], 'gemm': ['GEMMForMultiModalEmbedding'], 'mmr': ['VideoCLIPForMultiModalEmbedding'], - 'mplug_for_visual_question_answering': - ['MPlugForVisualQuestionAnswering'], + 'mplug_for_all_tasks': ['MPlugForAllTasks'], 'ofa_for_all_tasks': ['OfaForAllTasks'], 'ofa_for_text_to_image_synthesis_model': ['OfaForTextToImageSynthesis'] diff --git a/modelscope/models/multi_modal/clip/__init__.py b/modelscope/models/multi_modal/clip/__init__.py index bb2fb3b2..3fd492b9 100644 --- a/modelscope/models/multi_modal/clip/__init__.py +++ b/modelscope/models/multi_modal/clip/__init__.py @@ -1 +1 @@ -from .clip_model import CLIPForMultiModalEmbedding +from .model import CLIPForMultiModalEmbedding diff --git a/modelscope/models/multi_modal/clip/bert_tokenizer.py b/modelscope/models/multi_modal/clip/bert_tokenizer.py new file mode 100644 index 00000000..8d356f42 --- /dev/null +++ b/modelscope/models/multi_modal/clip/bert_tokenizer.py @@ -0,0 +1,422 @@ +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
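+# This file is adapted from Google's BERT tokenization code. Within this directory,
+# model.py uses FullTokenizer to tokenize text for the BERT-based CLIP text encoder.
+# Typical usage (the vocab path below is illustrative):
+#
+#     tokenizer = FullTokenizer(vocab_file='vocab.txt', do_lower_case=True)
+#     tokens = tokenizer.tokenize('a photo of a cat')
+#     ids = tokenizer.convert_tokens_to_ids(tokens)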
+"""Tokenization classes."""
+
+from __future__ import absolute_import, division, print_function
+import collections
+import os
+import re
+import unicodedata
+
+import six
+
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+    """Checks whether the casing config is consistent with the checkpoint name."""
+
+    # The casing has to be passed in by the user and there is no explicit check
+    # as to whether it matches the checkpoint. The casing information probably
+    # should have been stored in the bert_config.json file, but it's not, so
+    # we have to heuristically detect it to validate.
+
+    if not init_checkpoint:
+        return
+
+    m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint)
+    if m is None:
+        return
+
+    model_name = m.group(1)
+
+    lower_models = [
+        'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12',
+        'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12'
+    ]
+
+    cased_models = [
+        'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16',
+        'multi_cased_L-12_H-768_A-12'
+    ]
+
+    is_bad_config = False
+    if model_name in lower_models and not do_lower_case:
+        is_bad_config = True
+        actual_flag = 'False'
+        case_name = 'lowercased'
+        opposite_flag = 'True'
+
+    if model_name in cased_models and do_lower_case:
+        is_bad_config = True
+        actual_flag = 'True'
+        case_name = 'cased'
+        opposite_flag = 'False'
+
+    if is_bad_config:
+        raise ValueError(
+            'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. '
+            'However, `%s` seems to be a %s model, so you '
+            'should pass in `--do_lower_case=%s` so that the fine-tuning matches '
+            'how the model was pre-training. If this error is wrong, please '
+            'just comment out this check.' %
+            (actual_flag, init_checkpoint, model_name, case_name,
+             opposite_flag))
+
+
+def convert_to_unicode(text):
+    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, bytes):
+            return text.decode('utf-8', 'ignore')
+        else:
+            raise ValueError('Unsupported string type: %s' % (type(text)))
+    elif six.PY2:
+        if isinstance(text, str):
+            return text.decode('utf-8', 'ignore')
+        elif isinstance(text, unicode):
+            return text
+        else:
+            raise ValueError('Unsupported string type: %s' % (type(text)))
+    else:
+        raise ValueError('Not running on Python2 or Python 3?')
+
+
+def printable_text(text):
+    """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+    # These functions want `str` for both Python2 and Python3, but in one case
+    # it's a Unicode string and in the other it's a byte string.
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode('utf-8', 'ignore') + else: + raise ValueError('Unsupported string type: %s' % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode('utf-8') + else: + raise ValueError('Unsupported string type: %s' % (type(text))) + else: + raise ValueError('Not running on Python2 or Python 3?') + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, 'r') as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + @staticmethod + def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True): + """ Converts a sequence of tokens (string) in a single string. """ + + def clean_up_tokenization(out_string): + """ Clean up a list of simple English tokenization artifacts + like spaces before punctuations and abreviated forms. + """ + out_string = ( + out_string.replace(' .', '.').replace(' ?', '?').replace( + ' !', '!').replace(' ,', ',').replace(" ' ", "'").replace( + " n't", "n't").replace(" 'm", "'m").replace( + " 's", "'s").replace(" 've", + "'ve").replace(" 're", "'re")) + return out_string + + text = ' '.join(tokens).replace(' ##', '').strip() + if clean_up_tokenization_spaces: + clean_text = clean_up_tokenization(text) + return clean_text + else: + return text + + def vocab_size(self): + return len(self.vocab) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. 
This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(' '.join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(' ') + output.append(char) + output.append(' ') + else: + output.append(char) + return ''.join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F)): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(' ') + else: + output.append(char) + return ''.join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. 
This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = ''.join(chars[start:end]) + if start > 0: + substr = '##' + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == ' ' or char == '\t' or char == '\n' or char == '\r': + return True + cat = unicodedata.category(char) + if cat == 'Zs': + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == '\t' or char == '\n' or char == '\r': + return False + cat = unicodedata.category(char) + if cat in ('Cc', 'Cf'): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
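+    # The four ASCII ranges checked below correspond to the characters
+    # !"#$%&'()*+,-./ , :;<=>?@ , [\]^_` and {|}~ respectively.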
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) + or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith('P'): + return True + return False diff --git a/modelscope/models/multi_modal/clip/clip_bert.py b/modelscope/models/multi_modal/clip/clip_bert.py deleted file mode 100644 index 24ccc1fa..00000000 --- a/modelscope/models/multi_modal/clip/clip_bert.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch.nn as nn -from transformers import BertConfig, BertForMaskedLM - - -class TextTransformer(nn.Module): - - def __init__(self, config_dict, feat_dim=768, use_grad_ckp=True): - super(TextTransformer, self).__init__() - bert_config = BertConfig.from_dict(config_dict) - if use_grad_ckp: - bert_config.gradient_checkpointing = True - - self.bert = BertForMaskedLM(bert_config).bert - - self.projector = nn.Linear( - bert_config.hidden_size, feat_dim, bias=False) - - def forward(self, input_ids, attention_mask): - trans_features = { - 'input_ids': input_ids, - 'attention_mask': attention_mask - } - - output_states = self.bert(**trans_features, return_dict=False) - output_tokens = output_states[0] - - cls_tokens = output_tokens[:, 0, :] - - return self.projector(cls_tokens) diff --git a/modelscope/models/multi_modal/clip/clip_model.py b/modelscope/models/multi_modal/clip/clip_model.py deleted file mode 100644 index e092f4af..00000000 --- a/modelscope/models/multi_modal/clip/clip_model.py +++ /dev/null @@ -1,216 +0,0 @@ -from typing import Any, Dict - -import cv2 -import json -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from PIL import Image -from tokenizers import BertWordPieceTokenizer -from torch.distributed.nn.functional import \ - all_gather as all_gather_with_backprop -from torchvision.transforms import Compose, Normalize, Resize, ToTensor - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.multi_modal.clip.clip_bert import TextTransformer -from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer -from modelscope.utils.constant import ModeKeys, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['CLIPForMultiModalEmbedding'] - - -class CLIPModel(nn.Module): - - def __init__(self, model_dir): - super(CLIPModel, self).__init__() - # including vision config and text config - model_config = json.load( - open('{}/encoder_config.json'.format(model_dir))) - - # vision encoder - vision_config = model_config['vision_config'] - self.img_size = vision_config['input_resolution'] - self.vision_encoder = VisionTransformer( - input_resolution=self.img_size, - patch_size=vision_config['patch_size'], - width=vision_config['width'], - layers=vision_config['layers'], - heads=vision_config['heads'], - output_dim=vision_config['feat_dim'], - use_grad_ckp=True) - - # text encoder - text_config = model_config['text_config'] - self.text_encoder = TextTransformer( - text_config['bert_config'], feat_dim=text_config['feat_dim']) - - self.logit_scale = nn.Parameter(torch.ones([]) * 4.6) - - def contrastive_loss(self, logits, dim): - neg_ce = torch.diag(F.log_softmax(logits, dim=dim)) - return -neg_ce.mean() - - def clip_loss(self, t2i_sim, i2t_sim, img_idx=None, all_img_idx=None): - if img_idx is not None and all_img_idx is not None: - with torch.no_grad(): - false_neg_indicator = ( - img_idx[:, None] == all_img_idx[None, :]) - 
false_neg_indicator.fill_diagonal_(False) - t2i_sim.masked_fill_(false_neg_indicator, float('-inf')) - i2t_sim.masked_fill_(false_neg_indicator, float('-inf')) - caption_loss = self.contrastive_loss(t2i_sim, dim=1) - image_loss = self.contrastive_loss(i2t_sim, dim=1) - else: - caption_loss = self.contrastive_loss(t2i_sim, dim=1) - image_loss = self.contrastive_loss(i2t_sim, dim=1) - return (caption_loss + image_loss) / 2.0 - - def get_loss(self, img_tensor, text_ids_tensor, text_masks_tensor, - img_id_list): - img_feat = self.forward(img_tensor, input_type='img') - text_feat = self.forward((text_ids_tensor, text_masks_tensor), - input_type='text') - - global_img_feat = torch.cat(all_gather_with_backprop(img_feat), dim=0) - global_text_feat = torch.cat( - all_gather_with_backprop(text_feat), dim=0) - global_img_id_list = torch.cat( - all_gather_with_backprop(img_id_list), dim=0) - - t2i_sim_mat = text_feat @ global_img_feat.t() - i2t_sim_mat = img_feat @ global_text_feat.t() - - logit_scale = self.logit_scale.exp().clamp(max=100.0) - t2i_sim_mat_logits = t2i_sim_mat * logit_scale - i2t_sim_mat_logits = i2t_sim_mat * logit_scale - - loss = self.clip_loss( - t2i_sim_mat_logits, - i2t_sim_mat_logits, - img_idx=img_id_list, - all_img_idx=global_img_id_list) - - return loss - - def forward(self, input_data, input_type): - if input_type == 'img': - img_embedding = self.vision_encoder(input_data) - img_embedding = F.normalize(img_embedding, p=2.0, dim=1) - return img_embedding - elif input_type == 'text': - text_ids_tensor, text_mask_tensor = input_data - text_embedding = self.text_encoder(text_ids_tensor, - text_mask_tensor) - text_embedding = F.normalize(text_embedding, p=2.0, dim=1) - return text_embedding - elif input_type == ModeKeys.TRAIN: - return self.get_loss(*input_data) - else: - raise ValueError('Unknown input type') - - -@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) -class CLIPForMultiModalEmbedding(TorchModel): - - def __init__(self, model_dir, device_id=-1): - super().__init__(model_dir=model_dir, device_id=device_id) - self.clip_model = CLIPModel(model_dir=model_dir) - pretrained_params = torch.load( - '{}/pytorch_model.bin'.format(model_dir), 'cpu') - self.clip_model.load_state_dict(pretrained_params) - self.clip_model.eval() - - self.device_id = device_id - if self.device_id >= 0: - self.clip_model.to('cuda:{}'.format(self.device_id)) - logger.info('Use GPU: {}'.format(self.device_id)) - else: - logger.info('Use CPU for inference') - - # image preprocessor - norm_op = Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)) - self.img_preprocessor = Compose([ - Resize((self.clip_model.img_size, self.clip_model.img_size), - interpolation=Image.BICUBIC), - ToTensor(), norm_op - ]) - - # text tokenizer - vocab_path = '{}/vocab.txt'.format(model_dir) - self.text_tokenizer = BertWordPieceTokenizer( - vocab_path, lowercase=False) - self.text_tokenizer.enable_truncation(max_length=30) - - def tokenize_text(self, text_str): - tokens = self.text_tokenizer.encode(text_str) - max_tokens = 30 - text_ids_tensor = torch.zeros((1, max_tokens)).long() - text_mask_tensor = torch.zeros((1, max_tokens)) - - text_ids, text_mask = tokens.ids, tokens.attention_mask - text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids) - text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask) - - return text_ids_tensor, text_mask_tensor - - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - from modelscope.outputs import 
OutputKeys - output = { - OutputKeys.IMG_EMBEDDING: None, - OutputKeys.TEXT_EMBEDDING: None - } - if 'img' in input and input['img'] is not None: - input_img = input['img'] - if isinstance(input_img, Image.Image): - img_tensor = self.img_preprocessor(input_img)[None, ...] - elif isinstance(input_img, np.ndarray): - if len(input_img.shape) == 2: - input_img = cv2.cvtColor(input_img, cv2.COLOR_GRAY2BGR) - input_img = input_img[:, :, ::-1] # in rgb order - input_img = Image.fromarray( - input_img.astype('uint8')).convert('RGB') - img_tensor = self.img_preprocessor(input_img)[None, ...] - else: - raise TypeError( - f'img should be either PIL.Image or np.array, but got {type(input_img)}' - ) - - if self.device_id >= 0: - img_tensor = img_tensor.to('cuda:{}'.format(self.device_id)) - - img_embedding = self.clip_model( - input_data=img_tensor, input_type='img') - from modelscope.outputs import OutputKeys - output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy() - - if 'text' in input and input['text'] is not None: - text_str = input['text'] - if isinstance(text_str, str): - text_ids_tensor, text_mask_tensor = self.tokenize_text( - text_str) - else: - raise TypeError( - f'text should be str, but got {type(text_str)}') - - if self.device_id >= 0: - text_ids_tensor = text_ids_tensor.to('cuda:{}'.format( - self.device_id)) - text_mask_tensor = text_mask_tensor.to('cuda:{}'.format( - self.device_id)) - - text_embedding = self.clip_model( - input_data=(text_ids_tensor, text_mask_tensor), - input_type='text') - output['text_embedding'] = text_embedding.data.cpu().numpy() - - return output - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs diff --git a/modelscope/models/multi_modal/clip/clip_vit.py b/modelscope/models/multi_modal/clip/clip_vit.py deleted file mode 100644 index cfe67426..00000000 --- a/modelscope/models/multi_modal/clip/clip_vit.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2021 The OpenAI CLIP Authors. All rights reserved. 
- -from collections import OrderedDict -from typing import Tuple, Union - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils.checkpoint as checkpoint -from torch import nn - - -class LayerNorm(nn.LayerNorm): - """Subclass torch's LayerNorm to handle fp16.""" - - def forward(self, x: torch.Tensor): - orig_type = x.dtype - ret = super().forward(x.type(torch.float32)) - return ret.type(orig_type) - - -class QuickGELU(nn.Module): - - def forward(self, x: torch.Tensor): - return x * torch.sigmoid(1.702 * x) - - -class ResidualAttentionBlock(nn.Module): - - def __init__(self, - d_model: int, - n_head: int, - attn_mask: torch.Tensor = None): - super().__init__() - - self.attn = nn.MultiheadAttention(d_model, n_head) - self.ln_1 = LayerNorm(d_model) - self.mlp = nn.Sequential( - OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), - ('gelu', QuickGELU()), - ('c_proj', nn.Linear(d_model * 4, d_model))])) - self.ln_2 = LayerNorm(d_model) - self.attn_mask = attn_mask - - def attention(self, x: torch.Tensor): - self.attn_mask = self.attn_mask.to( - dtype=x.dtype, - device=x.device) if self.attn_mask is not None else None - return self.attn( - x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] - - def forward(self, x: torch.Tensor): - x = x + self.attention(self.ln_1(x)) - x = x + self.mlp(self.ln_2(x)) - return x - - -class Transformer(nn.Module): - - def __init__(self, - width: int, - layers: int, - heads: int, - attn_mask: torch.Tensor = None, - use_grad_ckp: bool = True): - super().__init__() - self.width = width - self.layers = layers - self.resblocks = nn.Sequential(*[ - ResidualAttentionBlock(width, heads, attn_mask) - for _ in range(layers) - ]) - - self.use_grad_ckp = use_grad_ckp - - def forward(self, x: torch.Tensor): - if self.use_grad_ckp: - for each_block in self.resblocks: - x = checkpoint.checkpoint(each_block, x) - return x - else: - return self.resblocks(x) - - -class VisionTransformer(nn.Module): - - def __init__(self, input_resolution: int, patch_size: int, width: int, - layers: int, heads: int, output_dim: int, use_grad_ckp: bool): - super().__init__() - self.input_resolution = input_resolution - self.output_dim = output_dim - self.conv1 = nn.Conv2d( - in_channels=3, - out_channels=width, - kernel_size=patch_size, - stride=patch_size, - bias=False) - - scale = width**-0.5 - self.class_embedding = nn.Parameter(scale * torch.randn(width)) - self.positional_embedding = nn.Parameter(scale * torch.randn( - (input_resolution // patch_size)**2 + 1, width)) - self.ln_pre = LayerNorm(width) - - self.transformer = Transformer( - width, layers, heads, use_grad_ckp=use_grad_ckp) - - self.ln_post = LayerNorm(width) - self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) - - def forward(self, x: torch.Tensor): - x = self.conv1(x) # shape = [*, width, grid, grid] - x = x.reshape(x.shape[0], x.shape[1], - -1) # shape = [*, width, grid ** 2] - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] - class_embeddings = self.class_embedding.to(x.dtype) + \ - torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) - x = torch.cat([class_embeddings, x], dim=1) - x = x + self.positional_embedding.to(x.dtype) - x = self.ln_pre(x) - - x = x.permute(1, 0, 2) # NLD -> LND - x = self.transformer(x) - x = x.permute(1, 0, 2) # LND -> NLD - - x = self.ln_post(x[:, 0, :]) - - if self.proj is not None: - x = x @ self.proj - - return x diff --git a/modelscope/models/multi_modal/clip/configuration_bert.py 
b/modelscope/models/multi_modal/clip/configuration_bert.py new file mode 100644 index 00000000..b75f5db8 --- /dev/null +++ b/modelscope/models/multi_modal/clip/configuration_bert.py @@ -0,0 +1,82 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging + +logger = logging.getLogger(__name__) + + +class BertConfig(object): + r""" + :class:`~transformers.BertConfig` is the configuration class to store the configuration of a + `BertModel`. + + + Arguments: + vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm.
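+        output_attentions: Whether the model also returns the attention weights of
+            each layer (consumed by `BertSelfAttention` below).
+        output_hidden_states: Whether the model also returns the hidden states of
+            every layer in addition to the last-layer output.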
+ """ + + def __init__(self, + vocab_size_or_config_json_file=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + output_attentions=False, + output_hidden_states=False): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py new file mode 100644 index 00000000..2fb0d7e3 --- /dev/null +++ b/modelscope/models/multi_modal/clip/model.py @@ -0,0 +1,677 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Tuple, Union + +import json +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from torchvision.transforms import Compose, Normalize, Resize, ToTensor + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer +from modelscope.models.multi_modal.clip.configuration_bert import BertConfig +from modelscope.models.multi_modal.clip.modeling_bert import BertModel +from modelscope.utils.constant import ModeKeys, ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['CLIPForMultiModalEmbedding'] + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.reshape(x.shape[0], x.shape[1], + x.shape[2] * x.shape[3]).permute(2, 0, + 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x, + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + + return x[0] + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim, + heads, + input_resolution=224, + width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, + heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), + (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class 
VisualTransformer(nn.Module): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat( + [ # noqa + self.class_embedding.to(x.dtype) + torch.zeros( # noqa + x.shape[0], + 1, + x.shape[-1], + dtype=x.dtype, + device=x.device), + x # noqa + ], + dim=1) # noqa shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(nn.Module): + + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + vocab_size: int, + text_attention_probs_dropout_prob: float, + text_hidden_act: str, + text_hidden_dropout_prob: float, + text_hidden_size: int, + text_initializer_range: float, + text_intermediate_size: int, + text_max_position_embeddings: int, + text_num_attention_heads: int, + text_num_hidden_layers: int, + text_type_vocab_size: int, + tokenizer: FullTokenizer, + ): + super().__init__() + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width) + else: + vision_heads = vision_width // 64 + self.visual = VisualTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim) + + self.bert_config = BertConfig( + vocab_size_or_config_json_file=vocab_size, + hidden_size=text_hidden_size, + num_hidden_layers=text_num_hidden_layers, + num_attention_heads=text_num_attention_heads, + intermediate_size=text_intermediate_size, + hidden_act=text_hidden_act, + hidden_dropout_prob=text_hidden_dropout_prob, + attention_probs_dropout_prob=text_attention_probs_dropout_prob, + max_position_embeddings=text_max_position_embeddings, + type_vocab_size=text_type_vocab_size, + initializer_range=text_initializer_range, + layer_norm_eps=1e-12, + ) + self.bert = BertModel(self.bert_config) + + self.text_projection = nn.Parameter( + torch.empty(text_hidden_size, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.tokenizer = tokenizer + + self.initialize_parameters() + + def initialize_parameters(self): + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + if isinstance(self.visual, 
ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features**-0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [ + self.visual.layer1, self.visual.layer2, self.visual.layer3, + self.visual.layer4 + ]: + for name, param in resnet_block.named_parameters(): + if name.endswith('bn3.weight'): + nn.init.zeros_(param) + + if self.text_projection is not None: + nn.init.normal_( + self.text_projection, std=self.bert_config.hidden_size**-0.5) + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + pad_index = self.tokenizer.vocab['[PAD]'] + attn_mask = text.ne(pad_index).type(self.dtype) + x = self.bert( + text, attention_mask=attn_mask)[0].type( + self.dtype) # [batch_size, seq_length, hidden_size] + return x[:, 0, :] @ self.text_projection + + def forward(self, image, text): + assert image is not None or text is not None, 'text and image cannot both be None!' + + if image is None: + return self.encode_text(text) + elif text is None: + return self.encode_image(image) + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + image_features = image_features / image_features.norm( + dim=-1, keepdim=True) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + + return image_features, text_features, self.logit_scale.exp() + + def get_similarity(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm( + dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logits_per_image.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_models_to_fp32(model): + for p in model.parameters(): + p.data = p.data.float() + if p.grad: + p.grad.data = p.grad.data.float() + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(module): + if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)): + module.weight.data = module.weight.data.half() + if module.bias is not None: + module.bias.data = module.bias.data.half() + + if isinstance(module, nn.MultiheadAttention): + for attr in [ + *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], + 'in_proj_bias', 'bias_k', 'bias_v' + ]: + tensor = getattr(module, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + if isinstance(module, BertModel): + module.to(torch.half) + + for name in ['text_projection', 'proj']: + if hasattr(module, name): + attr = getattr(module, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def _convert_to_rgb(image): + return image.convert('RGB') + + +def image_transform(image_size=224): + transform = Compose([ + _convert_to_rgb, + Resize((image_size, image_size)), + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), 
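+        # The mean/std values above are the standard image normalization
+        # statistics used by the original OpenAI CLIP preprocessing.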
+ ]) + return transform + + +@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) +class CLIPForMultiModalEmbedding(TorchModel): + + def __init__(self, model_dir, device_id=-1): + super().__init__(model_dir=model_dir, device_id=device_id) + + # Initialize the model. + vision_model_config_file = '{}/vision_model_config.json'.format( + model_dir) + logger.info( + f'Loading vision model config from {vision_model_config_file}') + assert os.path.exists(vision_model_config_file) + + text_model_config_file = '{}/text_model_config.json'.format(model_dir) + logger.info(f'Loading text model config from {text_model_config_file}') + assert os.path.exists(text_model_config_file) + + with open(vision_model_config_file, + 'r') as fv, open(text_model_config_file, 'r') as ft: + model_info = json.load(fv) + for k, v in json.load(ft).items(): + model_info[k] = v + + # image preprocess + self.img_preprocess = image_transform(model_info['image_resolution']) + + # text tokenizer + vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}' + self.tokenizer = FullTokenizer(vocab_file=vocab_file) + + # initialize the model + self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer) + convert_weights(self.clip_model) + + # restore the pretrained weight + checkpoint = torch.load( + f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu') + sd = checkpoint['state_dict'] + if next(iter(sd.items()))[0].startswith('module'): + sd = {k[len('module.'):]: v for k, v in sd.items()} + self.clip_model.load_state_dict(sd) + self.clip_model.eval() + + # place the model + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' + if self.device == 'cuda': + self.clip_model.to(self.device) + logger.info('Use GPU for inference') + else: + self.clip_model.float() + logger.info('Use CPU for inference') + + def tokenize(self, + texts: Union[str, List[str]], + context_length: int = 52) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + context_length : int + The context length to use; all baseline models use 24 as the context length + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + all_tokens = [] + for text in texts: + all_tokens.append( + [self.tokenizer.vocab['[CLS]']] + + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(text))[:context_length - 2] + + [self.tokenizer.vocab['[SEP]']]) + + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + assert len(tokens) <= context_length + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + from modelscope.outputs import OutputKeys + output = { + OutputKeys.IMG_EMBEDDING: None, + OutputKeys.TEXT_EMBEDDING: None + } + if 'img' in input and input['img'] is not None: + image_input = input['img'] + + # single image input + if isinstance(image_input, Image.Image): + image_tensor = self.img_preprocess(image_input).unsqueeze(0) + # multi images input + elif isinstance(image_input, list): + if all([isinstance(elem, Image.Image) + for elem in image_input]): + image_tensor = torch.stack( + [self.img_preprocess(elem) for elem in image_input], + dim=0) + else: + unsupported_elem_type = [ + type(elem) for elem in image_input + if not 
isinstance(elem, Image.Image) + ][0] + raise TypeError( + f'img should be PIL.Image or List[PIL.Image], \ + but got a List containing one {unsupported_elem_type}' + ) + # others + else: + raise TypeError( + f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}' + ) + + image_tensor = image_tensor.to(self.device) + + with torch.no_grad(): + image_features = self.clip_model.encode_image(image_tensor) + image_features /= image_features.norm( + dim=-1, keepdim=True) # l2-normalize + + output[OutputKeys.IMG_EMBEDDING] = image_features + + if 'text' in input and input['text'] is not None: + text_input = input['text'] + + # single text input + if isinstance(text_input, str): + text_tensor = self.tokenize(text_input) + # multi texts input + elif isinstance(text_input, list): + if all([isinstance(elem, str) for elem in text_input]): + text_tensor = self.tokenize(text_input) + else: + unsupported_elem_type = [ + type(elem) for elem in text_input + if not isinstance(elem, str) + ][0] + raise TypeError( + f'text should be str or List[str], but got a List containing one {unsupported_elem_type}' + ) + # others + else: + raise TypeError( + f'text should be str or List[str], but got {type(text_input)}' + ) + + text_tensor = text_tensor.to(self.device) + + with torch.no_grad(): + text_features = self.clip_model.encode_text(text_tensor) + text_features /= text_features.norm( + dim=-1, keepdim=True) # l2-normalize + output[OutputKeys.TEXT_EMBEDDING] = text_features + + return output + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs + + @property + def temperature(self): + return 1.0 / self.clip_model.logit_scale.exp() diff --git a/modelscope/models/multi_modal/clip/modeling_bert.py b/modelscope/models/multi_modal/clip/modeling_bert.py new file mode 100644 index 00000000..b5f104ce --- /dev/null +++ b/modelscope/models/multi_modal/clip/modeling_bert.py @@ -0,0 +1,507 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import math +import os +import sys +from io import open + +import json +import torch +from torch import nn + +from .configuration_bert import BertConfig + +logger = logging.getLogger(__name__) + + +def gelu(x): + """ Original Implementation of the gelu activation function in Google Bert repo when initially created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def gelu_new(x): + """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 
+ Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1 + torch.tanh( + math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = { + 'gelu': gelu, + 'relu': torch.nn.functional.relu, + 'swish': swish, + 'gelu_new': gelu_new +} + +BertLayerNorm = torch.nn.LayerNorm + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange( + seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
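+        # Together with the 1/sqrt(head_size) scaling and the softmax below, this
+        # implements standard scaled dot-product attention,
+        # softmax(Q @ K^T / sqrt(d)) @ V, computed per head on tensors of shape
+        # [batch, num_heads, seq_len, head_size].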
+ attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if self.output_attentions else ( + context_layer, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def forward(self, input_tensor, attention_mask=None, head_mask=None): + self_outputs = self.self(input_tensor, attention_mask, head_mask) + attention_output = self.output(self_outputs[0], input_tensor) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, + str) or (sys.version_info[0] == 2 + and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = 
BertOutput(config) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + attention_outputs = self.attention(hidden_states, attention_mask, + head_mask) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + outputs = (layer_output, ) + attention_outputs[ + 1:] # add attentions if we output them + return outputs + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super(BertEncoder, self).__init__() + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_outputs = layer_module(hidden_states, attention_mask, + head_mask[i]) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1], ) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + outputs = (hidden_states, ) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states, ) + if self.output_attentions: + outputs = outputs + (all_attentions, ) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, + str) or (sys.version_info[0] == 2 + and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
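+        # Note: in this standalone copy the decoder weight is a fresh nn.Linear and
+        # is not automatically tied to the word embedding matrix; the separate
+        # `bias` parameter below is simply added to the logits in forward().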
+ self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(nn.Module): + config_class = BertConfig + base_model_prefix = 'bert' + + def __init__(self, config): + super(BertPreTrainedModel, self).__init__() + self.config = config + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) + of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, + used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + + def __init__(self, config): + super(BertModel, self).__init__(config) + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + + self.apply(self._init_weights) + + def forward(self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
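+        # For example, a padding mask of [1, 1, 0] becomes the additive mask
+        # [0.0, 0.0, -10000.0] after the two lines below.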
+ extended_attention_mask = extended_attention_mask.to( + dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( + -1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, + -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( + -1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters( + )).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings( + input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids) + encoder_outputs = self.encoder( + embedding_output, extended_attention_mask, head_mask=head_mask) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = ( + sequence_output, + pooled_output, + ) + encoder_outputs[ + 1:] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py index 4d61e2d1..8617b8dd 100644 --- a/modelscope/models/multi_modal/diffusion/model.py +++ b/modelscope/models/multi_modal/diffusion/model.py @@ -136,7 +136,7 @@ class DiffusionForTextToImageSynthesis(Model): self.unet_upsampler_1024 = diffusion_model.unet_upsampler_1024 # text tokenizer - vocab_path = '{}/vocab.txt'.format(model_dir) + vocab_path = f'{model_dir}/{ModelFile.VOCAB_FILE}' self.tokenizer = Tokenizer(vocab_file=vocab_path, seq_len=64) # diffusion process diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py index 26eea0d5..db928212 100644 --- a/modelscope/models/multi_modal/gemm/gemm_base.py +++ b/modelscope/models/multi_modal/gemm/gemm_base.py @@ -491,7 +491,9 @@ class GEVL(nn.Module): gen_logits = self.to_logits(out_embs[-1:, ...]) probs = F.softmax(self.gen_logit_scale.exp() * gen_logits, dim=-1) pred = torch.argmax( - probs * (1.0 + torch.rand_like(probs)), axis=-1) + probs * (2.0 + torch.rand_like(probs)), axis=-1) + if int(pred) >= eot_token or int(pred) <= 0: + break pred_tokens.append(pred) text_input = torch.cat( [text_input, pred.permute(1, 0).contiguous()], axis=1) @@ -500,8 +502,6 @@ class GEVL(nn.Module): for out_tokens in pred_text_tokens: tokens = [] for x in out_tokens: - if x >= eot_token or x <= 0: - break tokens.append(int(x)) out_text = self.tokenizer.decode(tokens) out_text = out_text.strip() diff --git a/modelscope/models/multi_modal/mplug/__init__.py b/modelscope/models/multi_modal/mplug/__init__.py index bca5849b..955c87e2 100644 --- a/modelscope/models/multi_modal/mplug/__init__.py +++ b/modelscope/models/multi_modal/mplug/__init__.py @@ -14,5 +14,4 @@ # limitations under the License. 
from .configuration_mplug import MPlugConfig -from .modeling_mplug import (CONFIG_NAME, VOCAB_NAME, - MPlugForVisualQuestionAnswering) +from .modeling_mplug import CONFIG_NAME, MPlug diff --git a/modelscope/models/multi_modal/mplug/clip/clip.py b/modelscope/models/multi_modal/mplug/clip/clip.py index fbdfbd29..aa56e39b 100644 --- a/modelscope/models/multi_modal/mplug/clip/clip.py +++ b/modelscope/models/multi_modal/mplug/clip/clip.py @@ -5,9 +5,69 @@ from typing import Tuple, Union import torch import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint from torch import nn -from modelscope.models.multi_modal.clip.clip_vit import Transformer + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + use_grad_ckp: bool = True): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + self.use_grad_ckp = use_grad_ckp + + def forward(self, x: torch.Tensor): + if self.use_grad_ckp: + for each_block in self.resblocks: + x = checkpoint.checkpoint(each_block, x) + return x + else: + return self.resblocks(x) class Bottleneck(nn.Module): diff --git a/modelscope/models/multi_modal/mplug/configuration_mplug.py b/modelscope/models/multi_modal/mplug/configuration_mplug.py index 6b2914c4..c275ed15 100644 --- a/modelscope/models/multi_modal/mplug/configuration_mplug.py +++ b/modelscope/models/multi_modal/mplug/configuration_mplug.py @@ -15,14 +15,14 @@ # limitations under the License. 
""" MPLUG model configuration """ import os -from collections import OrderedDict -from typing import Any, Dict, Mapping, Union +from typing import Any, Dict, Union import yaml from transformers import PretrainedConfig -from transformers.onnx import OnnxConfig from transformers.utils import logging +from modelscope.utils.constant import Tasks + logger = logging.get_logger(__name__) @@ -32,6 +32,7 @@ class MPlugConfig(PretrainedConfig): def __init__( self, + task=Tasks.visual_question_answering, bert_config='config_bert.json', image_res=504, batch_size_train=128, @@ -64,7 +65,9 @@ class MPlugConfig(PretrainedConfig): clip_transformer_heads=12, clip_transformer_layers=12, **kwargs): + super().__init__(**kwargs) + self.task = task self.bert_config = bert_config self.image_res = image_res self.batch_size_train = batch_size_train @@ -103,23 +106,3 @@ class MPlugConfig(PretrainedConfig): with open(yaml_file, 'r') as reader: config_dict = yaml.load(reader, Loader=yaml.Loader) return cls(**config_dict) - - -class MPlugOnnxConfig(OnnxConfig): - - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict([ - ('input_ids', { - 0: 'batch', - 1: 'sequence' - }), - ('attention_mask', { - 0: 'batch', - 1: 'sequence' - }), - ('token_type_ids', { - 0: 'batch', - 1: 'sequence' - }), - ]) diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py index 0b45ea12..50622cc0 100755 --- a/modelscope/models/multi_modal/mplug/modeling_mplug.py +++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py @@ -42,14 +42,13 @@ from transformers.utils import logging from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig from modelscope.models.multi_modal.mplug.predictor import TextGenerator +from modelscope.utils.constant import ModelFile transformers.logging.set_verbosity_error() logger = logging.get_logger(__name__) CONFIG_NAME = 'config.yaml' -WEIGHTS_NAME = 'pytorch_model.bin' -VOCAB_NAME = 'vocab.txt' _CONFIG_FOR_DOC = 'BertConfig' _TOKENIZER_FOR_DOC = 'BertTokenizer' @@ -1726,32 +1725,145 @@ class BertLMHeadModel(BertPreTrainedModel): return reordered_past -class MPlugForVisualQuestionAnswering(PreTrainedModel): +class BertPrefixModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint='bert-base-uncased', + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=True, + reduction='mean', + soft_labels=None, + alpha=0, + return_logits=False, + ): + return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + if soft_labels is not None: + loss_distill = -torch.sum( + F.log_softmax(shifted_prediction_scores, dim=1) * soft_labels, + dim=-1) + loss_distill = loss_distill[labels != -100].mean() + lm_loss = (1 - alpha) * lm_loss + alpha * loss_distill + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +class MPlug(PreTrainedModel): config_class = MPlugConfig def __init__(self, config): super().__init__(config) self.config = config self.tokenizer = BertTokenizer.from_pretrained( - os.path.join(config.model_dir, VOCAB_NAME)) + os.path.join(config.model_dir, ModelFile.VOCAB_FILE)) self.module_setting(config) self.visual_encoder = self._initialize_clip(config) self.text_encoder = BertModel( self.config_encoder, add_pooling_layer=False) self.fusion_encoder = FusionModel( self.config_fusion, add_pooling_layer=False) - self.text_decoder = BertLMHeadModel(self.config_decoder) - self.init_distill(config) - self.beam_generator = TextGenerator(config, self.text_decoder) @classmethod def from_pretrained(cls, model_dir, load_checkpoint=True): - config = MPlugConfig.from_yaml_file( + from modelscope.utils.constant import Tasks + + task_mapping = { + Tasks.visual_question_answering: MPlugForVisualQuestionAnswering, + Tasks.image_captioning: MPLUGForImageCaption + } + config = cls.config_class.from_yaml_file( os.path.join(model_dir, CONFIG_NAME)) config.model_dir = model_dir - model = cls(config) + model = task_mapping[config.task](config) if load_checkpoint: - checkpoint_path = os.path.join(model_dir, WEIGHTS_NAME) + checkpoint_path = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) checkpoint = torch.load(checkpoint_path, map_location='cpu') if 'model' in checkpoint: state_dict = checkpoint['model'] @@ -1803,6 +1915,161 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel): clip_model.visual.positional_embedding = pos_embed return clip_model + def forward(self, *args, **kwargs): + raise NotImplementedError + + def module_setting(self, config): + bert_config_path = os.path.join(config.model_dir, config.bert_config) + self.config_encoder = 
BertConfig.from_json_file(bert_config_path) + self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers + self.config_fusion = BertConfig.from_json_file(bert_config_path) + self.config_decoder = BertConfig.from_json_file(bert_config_path) + self.config_decoder.add_cross_attention = True + self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers + self.large = False + if self.config_encoder.hidden_size != config.vision_width: + self.visn_fc = nn.Linear(config.vision_width, + self.config_encoder.hidden_size) + self.visn_layer_norm = nn.LayerNorm( + self.config_encoder.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob) + self.large = True + + @torch.no_grad() + def copy_params(self): + for model_pair in self.model_pairs: + for param, param_m in zip(model_pair[0].parameters(), + model_pair[1].parameters()): + param_m.data.copy_(param.data) # initialize + param_m.requires_grad = False # not update by gradient + + @torch.no_grad() + def _momentum_update(self): + for model_pair in self.model_pairs: + for param, param_m in zip(model_pair[0].parameters(), + model_pair[1].parameters()): + param_m.data = param_m.data * self.momentum + param.data * ( + 1. - self.momentum) + + def generation(self, question_states, question_atts, out_size=1): + encoder_inputs = [question_states, question_atts] + topk_ids, topk_scores = self.beam_generator.translate_batch( + encoder_inputs, out_size=out_size) + return topk_ids, topk_scores + + @staticmethod + def _tile(x, dim, n_tile): + import numpy as np + init_dim = x.size(dim) + repeat_idx = [1] * x.dim() + repeat_idx[dim] = n_tile + x = x.repeat(*(repeat_idx)) + order_index = torch.LongTensor( + np.concatenate( + [init_dim * np.arange(n_tile) + i for i in range(init_dim)])) + return torch.index_select(x, dim, order_index.to(x.device)) + + def rank_answer(self, question_states, question_atts, answer_ids, + answer_atts, k): + + num_ques = question_states.size(0) + start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token + + start_output = self.text_decoder( + start_ids, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + return_dict=True, + reduction='none') + logits = start_output.logits[:, 0, :] # first token's logit + + # topk_probs: top-k probability + # topk_ids: [num_question, k] + answer_first_token = answer_ids[:, 1] + prob_first_token = F.softmax( + logits, dim=1).index_select( + dim=1, index=answer_first_token) + topk_probs, topk_ids = prob_first_token.topk(k, dim=1) + + # answer input: [num_question*k, answer_len] + input_ids = [] + input_atts = [] + for b, topk_id in enumerate(topk_ids): + input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) + input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) + input_ids = torch.cat(input_ids, dim=0) + input_atts = torch.cat(input_atts, dim=0) + + targets_ids = input_ids.masked_fill( + input_ids == self.tokenizer.pad_token_id, -100) + + # repeat encoder's output for top-k answers + question_states = self._tile(question_states, 0, k) + question_atts = self._tile(question_atts, 0, k) + + output = self.text_decoder( + input_ids, + attention_mask=input_atts, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + labels=targets_ids, + return_dict=True, + reduction='none') + + answer_loss = output.loss + answer_loss = answer_loss.view(input_ids.size(0), -1) + + # topk_prob: first token probability + topk_probs = topk_probs.view(-1, 1) + log_probs = 
torch.cat([topk_probs.log(), -answer_loss], dim=1) + + # re-calculate log probabilities for the answer sequences using chain rule + log_probs_sum = log_probs.sum(1) + log_probs_sum = log_probs_sum.view(num_ques, k) + + topk_probs = F.softmax(log_probs_sum, dim=-1) + # get top-k after re-ranking + topk_probs, rerank_id = topk_probs.topk(k, dim=1) + topk_ids = torch.gather(topk_ids, 1, rerank_id) + + return topk_ids, topk_probs + + +class MPlugForVisualQuestionAnswering(MPlug): + + def __init__(self, config): + super().__init__(config) + self.text_decoder = BertLMHeadModel(self.config_decoder) + self.beam_generator = TextGenerator(config, self.text_decoder) + self.init_distill(config) + + def init_distill(self, config): + self.distill = config.distill + if self.distill: + self.visual_encoder_m = self._initialize_clip(config) + self.text_encoder_m = BertModel( + self.config_encoder, add_pooling_layer=False) + self.fusion_encoder_m = FusionModel( + self.config_fusion, add_pooling_layer=False) + self.text_decoder_m = BertLMHeadModel(self.config_decoder) + self.model_pairs = [ + [self.visual_encoder, self.visual_encoder_m], + [self.text_encoder, self.text_encoder_m], + [self.text_decoder, self.text_decoder_m], + ] + if self.config_encoder.hidden_size != config.vision_width: + self.visn_fc_m = nn.Linear(config.vision_width, + self.config_encoder.hidden_size) + self.visn_layer_norm_m = nn.LayerNorm( + self.config_encoder.hidden_size, eps=1e-12) + self.dropout_m = nn.Dropout( + self.config_encoder.hidden_dropout_prob) + self.model_pairs.extend( + [[self.visn_fc, self.visn_fc_m], + [self.visn_layer_norm, self.visn_layer_norm_m]]) + self.copy_params() + self.momentum = 0.995 + def forward(self, image, question, @@ -1935,145 +2202,110 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel): merge_text_attention) return topk_ids, topk_probs - def module_setting(self, config): - bert_config_path = os.path.join(config.model_dir, config.bert_config) - self.config_encoder = BertConfig.from_json_file(bert_config_path) - self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers - self.config_fusion = BertConfig.from_json_file(bert_config_path) - self.config_decoder = BertConfig.from_json_file(bert_config_path) - self.config_decoder.add_cross_attention = True - self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers - self.large = False - if self.config_encoder.hidden_size != config.vision_width: - self.visn_fc = nn.Linear(config.vision_width, - self.config_encoder.hidden_size) - self.visn_layer_norm = nn.LayerNorm( - self.config_encoder.hidden_size, eps=1e-12) - self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob) - self.large = True - - def init_distill(self, config): - self.distill = config.distill - if self.distill: - self.visual_encoder_m = self._initialize_clip(config) - self.text_encoder_m = BertModel( - self.config_encoder, add_pooling_layer=False) - self.fusion_encoder_m = FusionModel( - self.config_fusion, add_pooling_layer=False) - self.text_decoder_m = BertLMHeadModel(self.config_decoder) - self.model_pairs = [ - [self.visual_encoder, self.visual_encoder_m], - [self.text_encoder, self.text_encoder_m], - [self.text_decoder, self.text_decoder_m], - ] - if self.config_encoder.hidden_size != config.vision_width: - self.visn_fc_m = nn.Linear(config.vision_width, - self.config_encoder.hidden_size) - self.visn_layer_norm_m = nn.LayerNorm( - self.config_encoder.hidden_size, eps=1e-12) - self.dropout_m = nn.Dropout( - 
self.config_encoder.hidden_dropout_prob) - self.model_pairs.extend( - [[self.visn_fc, self.visn_fc_m], - [self.visn_layer_norm, self.visn_layer_norm_m]]) - self.copy_params() - self.momentum = 0.995 - - @torch.no_grad() - def copy_params(self): - for model_pair in self.model_pairs: - for param, param_m in zip(model_pair[0].parameters(), - model_pair[1].parameters()): - param_m.data.copy_(param.data) # initialize - param_m.requires_grad = False # not update by gradient - - @torch.no_grad() - def _momentum_update(self): - for model_pair in self.model_pairs: - for param, param_m in zip(model_pair[0].parameters(), - model_pair[1].parameters()): - param_m.data = param_m.data * self.momentum + param.data * ( - 1. - self.momentum) - - def generation(self, question_states, question_atts): - encoder_inputs = [question_states, question_atts] - topk_ids, topk_scores = self.beam_generator.translate_batch( - encoder_inputs) - return topk_ids, topk_scores - - @staticmethod - def _tile(x, dim, n_tile): - import numpy as np - init_dim = x.size(dim) - repeat_idx = [1] * x.dim() - repeat_idx[dim] = n_tile - x = x.repeat(*(repeat_idx)) - order_index = torch.LongTensor( - np.concatenate( - [init_dim * np.arange(n_tile) + i for i in range(init_dim)])) - return torch.index_select(x, dim, order_index.to(x.device)) - - def rank_answer(self, question_states, question_atts, answer_ids, - answer_atts, k): - - num_ques = question_states.size(0) - start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token - start_output = self.text_decoder( - start_ids, - encoder_hidden_states=question_states, - encoder_attention_mask=question_atts, - return_dict=True, - reduction='none') - logits = start_output.logits[:, 0, :] # first token's logit +class MPLUGForImageCaption(MPlug): - # topk_probs: top-k probability - # topk_ids: [num_question, k] - answer_first_token = answer_ids[:, 1] - prob_first_token = F.softmax( - logits, dim=1).index_select( - dim=1, index=answer_first_token) - topk_probs, topk_ids = prob_first_token.topk(k, dim=1) - - # answer input: [num_question*k, answer_len] - input_ids = [] - input_atts = [] - for b, topk_id in enumerate(topk_ids): - input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) - input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) - input_ids = torch.cat(input_ids, dim=0) - input_atts = torch.cat(input_atts, dim=0) - - targets_ids = input_ids.masked_fill( - input_ids == self.tokenizer.pad_token_id, -100) - - # repeat encoder's output for top-k answers - question_states = self._tile(question_states, 0, k) - question_atts = self._tile(question_atts, 0, k) + def __init__(self, config): + super().__init__(config) + self.text_decoder = BertPrefixModel(self.config_decoder) + self.beam_generator = TextGenerator(config, self.text_decoder) - output = self.text_decoder( - input_ids, - attention_mask=input_atts, - encoder_hidden_states=question_states, - encoder_attention_mask=question_atts, - labels=targets_ids, - return_dict=True, - reduction='none') + def beam_search(self, + image, + question, + answer=None, + train=True, + out_size=5): + image_embeds = self.visual_encoder.visual(image, skip_last_layer=True) + if self.large: + image_embeds = self.dropout( + self.visn_layer_norm(self.visn_fc(image_embeds))) + image_atts = torch.ones( + image_embeds.size()[:-1], dtype=torch.long).to(image.device) + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = 
self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False) + image_output, question_output = fusion_output + question_output = torch.cat([image_output, question_output], 1) + merge_text_attention = torch.cat([image_atts, question.attention_mask], + 1) + topk_ids, topk_probs = self.generation( + question_output, merge_text_attention, out_size=out_size) + return topk_ids, topk_probs - answer_loss = output.loss - answer_loss = answer_loss.view(input_ids.size(0), -1) + def forward(self, + image, + question, + answer=None, + train=True, + out_size=5, + scst=False): + if (scst): + return self.beam_search( + image, question, answer, train=True, out_size=out_size) + image = image.to(dtype=next(self.parameters()).dtype) + image_embeds = self.visual_encoder.visual(image, skip_last_layer=True) + if self.large: + image_embeds = self.dropout( + self.visn_layer_norm(self.visn_fc(image_embeds))) + image_atts = torch.ones( + image_embeds.size()[:-1], dtype=torch.long).to(image.device) - # topk_prob: first token probability - topk_probs = topk_probs.view(-1, 1) - log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1) + if train: + answer_targets = answer.input_ids.masked_fill( + answer.input_ids == self.tokenizer.pad_token_id, -100) + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False) - # re-calculate log probabilities for the answer sequences using chain rule - log_probs_sum = log_probs.sum(1) - log_probs_sum = log_probs_sum.view(num_ques, k) + image_output, question_output = fusion_output - topk_probs = F.softmax(log_probs_sum, dim=-1) - # get top-k after re-ranking - topk_probs, rerank_id = topk_probs.topk(k, dim=1) - topk_ids = torch.gather(topk_ids, 1, rerank_id) + question_output = torch.cat([image_output, question_output], 1) + merge_text_attention = torch.cat( + [image_atts, question.attention_mask], 1) - return topk_ids, topk_probs + answer_output = self.text_decoder( + answer.input_ids, + attention_mask=answer.attention_mask, + encoder_hidden_states=question_output, + encoder_attention_mask=merge_text_attention, + labels=answer_targets, + return_dict=True, + reduction='none') + loss = answer_output.loss + return loss + else: + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False) + image_output, question_output = fusion_output + question_output = torch.cat([image_output, question_output], 1) + merge_text_attention = torch.cat( + [image_atts, question.attention_mask], 1) + topk_ids, topk_probs = self.generation(question_output, + merge_text_attention) + return topk_ids, topk_probs diff --git a/modelscope/models/multi_modal/mplug_for_visual_question_answering.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py similarity index 60% rename from modelscope/models/multi_modal/mplug_for_visual_question_answering.py rename to 
modelscope/models/multi_modal/mplug_for_all_tasks.py index 88875fda..bb5a9c46 100644 --- a/modelscope/models/multi_modal/mplug_for_visual_question_answering.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -6,12 +6,13 @@ from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.utils.constant import Tasks -__all__ = ['MPlugForVisualQuestionAnswering'] +__all__ = ['MPlugForAllTasks'] @MODELS.register_module( Tasks.visual_question_answering, module_name=Models.mplug) -class MPlugForVisualQuestionAnswering(TorchModel): +@MODELS.register_module(Tasks.image_captioning, module_name=Models.mplug) +class MPlugForAllTasks(TorchModel): def __init__(self, model_dir: str, *args, **kwargs): """initialize the mplug model from the `model_dir` path. @@ -20,8 +21,8 @@ class MPlugForVisualQuestionAnswering(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - from modelscope.models.multi_modal.mplug import MPlugForVisualQuestionAnswering - self.model = MPlugForVisualQuestionAnswering.from_pretrained(model_dir) + from modelscope.models.multi_modal.mplug import MPlug + self.model = MPlug.from_pretrained(model_dir) self.tokenizer = self.model.tokenizer def train(self): @@ -44,4 +45,13 @@ class MPlugForVisualQuestionAnswering(TorchModel): } """ - return self.model(**input)[0] + topk_ids, _ = self.model(**input) + replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), + ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), + ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) + + pred_string = self.tokenizer.decode(topk_ids[0][0]) + for _old, _new in replace_tokens_bert: + pred_string = pred_string.replace(_old, _new) + pred_string = pred_string.strip() + return pred_string diff --git a/modelscope/models/multi_modal/ofa/tokenization_ofa.py b/modelscope/models/multi_modal/ofa/tokenization_ofa.py index 158905eb..fd50505c 100644 --- a/modelscope/models/multi_modal/ofa/tokenization_ofa.py +++ b/modelscope/models/multi_modal/ofa/tokenization_ofa.py @@ -22,6 +22,8 @@ from transformers.models.bert.tokenization_bert import (BasicTokenizer, WordpieceTokenizer) from transformers.utils import logging +from modelscope.utils.constant import ModelFile + logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {'vocab_file': 'vocab.json', 'merges_file': 'merges.txt'} @@ -42,7 +44,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'ofa-base': 1024, } -VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE} PRETRAINED_VOCAB_FILES_MAP_ZH = { 'vocab_file': { diff --git a/modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py b/modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py index 03d2d71e..db11370d 100644 --- a/modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py +++ b/modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py @@ -20,6 +20,7 @@ from transformers import PreTrainedTokenizerFast from transformers.models.bart.tokenization_bart_fast import BartTokenizerFast from transformers.utils import logging +from modelscope.utils.constant import ModelFile from .tokenization_ofa import OFATokenizer, OFATokenizerZH logger = logging.get_logger(__name__) @@ -50,7 +51,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'ofa-base': 1024, } -VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE} PRETRAINED_VOCAB_FILES_MAP_ZH = { 'vocab_file': { diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py 
b/modelscope/models/nlp/structbert/tokenization_sbert.py index cbf98746..3171e31d 100644 --- a/modelscope/models/nlp/structbert/tokenization_sbert.py +++ b/modelscope/models/nlp/structbert/tokenization_sbert.py @@ -23,11 +23,12 @@ from typing import List, Optional, Tuple from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace) +from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger logger = get_logger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE} PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py index 5b8d79cc..a0a81121 100644 --- a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py +++ b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py @@ -22,13 +22,14 @@ import transformers from tokenizers import normalizers from transformers.tokenization_utils_fast import PreTrainedTokenizerFast +from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from .tokenization_sbert import SbertTokenizer logger = get_logger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.txt', + 'vocab_file': ModelFile.VOCAB_FILE, 'tokenizer_file': 'tokenizer.json' } diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 1e84dd8a..6e4486dd 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path, relative_to_absolute_path) from modelscope.msdatasets.config import MS_DATASETS_CACHE +from modelscope.utils.config import ConfigDict from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DatasetFormations, DownloadMode, Hubs) from modelscope.utils.logger import get_logger +from .task_datasets.builder import build_task_dataset +from .utils.dataset_builder import ExternalDataset from .utils.dataset_utils import (get_dataset_files, get_target_dataset_structure, load_dataset_builder) @@ -67,9 +70,16 @@ class MsDataset: def __len__(self): return len(self._hf_ds) + @property + def config_kwargs(self): + if isinstance(self._hf_ds, ExternalDataset): + return self._hf_ds.config_kwargs + else: + return None + @classmethod def from_hf_dataset(cls, - hf_ds: Union[Dataset, DatasetDict], + hf_ds: Union[Dataset, DatasetDict, ExternalDataset], target: str = None) -> Union[dict, 'MsDataset']: if isinstance(hf_ds, Dataset): return cls(hf_ds, target) @@ -77,6 +87,8 @@ class MsDataset: if len(hf_ds.keys()) == 1: return cls(next(iter(hf_ds.values())), target) return {k: cls(v, target) for k, v in hf_ds.items()} + elif isinstance(hf_ds, ExternalDataset): + return cls(hf_ds) else: raise TypeError( f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' @@ -96,7 +108,8 @@ class MsDataset: Mapping[str, Union[str, Sequence[str]]]]] = None, download_mode: Optional[DownloadMode] = DownloadMode. - REUSE_DATASET_IF_EXISTS + REUSE_DATASET_IF_EXISTS, + **config_kwargs, ) -> Union[dict, 'MsDataset']: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. Args: @@ -113,6 +126,7 @@ class MsDataset: hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope download_mode (DownloadMode or str, optional): How to treat existing datasets. 
default DownloadMode.REUSE_DATASET_IF_EXISTS + **config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. @@ -128,7 +142,8 @@ class MsDataset: split=split, data_dir=data_dir, data_files=data_files, - download_mode=download_mode.value) + download_mode=download_mode.value, + **config_kwargs) return MsDataset.from_hf_dataset(dataset, target=target) elif hub == Hubs.modelscope: return MsDataset._load_ms_dataset( @@ -140,22 +155,22 @@ class MsDataset: split=split, data_dir=data_dir, data_files=data_files, - download_mode=download_mode) + download_mode=download_mode, + **config_kwargs) @staticmethod - def _load_ms_dataset( - dataset_name: Union[str, list], - namespace: Optional[str] = None, - target: Optional[str] = None, - version: Optional[str] = DEFAULT_DATASET_REVISION, - subset_name: Optional[str] = None, - split: Optional[str] = None, - data_dir: Optional[str] = None, - data_files: Optional[Union[str, Sequence[str], - Mapping[str, Union[str, - Sequence[str]]]]] = None, - download_mode: Optional[DownloadMode] = None - ) -> Union[dict, 'MsDataset']: + def _load_ms_dataset(dataset_name: Union[str, list], + namespace: Optional[str] = None, + target: Optional[str] = None, + version: Optional[str] = DEFAULT_DATASET_REVISION, + subset_name: Optional[str] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[ + str, Sequence[str], + Mapping[str, Union[str, Sequence[str]]]]] = None, + download_mode: Optional[DownloadMode] = None, + **config_kwargs) -> Union[dict, 'MsDataset']: if isinstance(dataset_name, str): dataset_formation = DatasetFormations.native if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ @@ -184,7 +199,8 @@ class MsDataset: data_dir=data_dir, data_files=data_files, cache_dir=MS_DATASETS_CACHE, - download_mode=download_mode.value) + download_mode=download_mode.value, + **config_kwargs) else: dataset = MsDataset._load_from_ms( dataset_name, @@ -195,7 +211,7 @@ class MsDataset: subset_name=subset_name, split=split, download_mode=download_mode, - ) + **config_kwargs) elif isinstance(dataset_name, list): if target is None: target = 'target' @@ -206,16 +222,15 @@ class MsDataset: return MsDataset.from_hf_dataset(dataset, target=target) @staticmethod - def _load_from_ms( - dataset_name: str, - dataset_files: dict, - download_dir: str, - namespace: Optional[str] = None, - version: Optional[str] = DEFAULT_DATASET_REVISION, - subset_name: Optional[str] = None, - split: Optional[str] = None, - download_mode: Optional[DownloadMode] = None, - ) -> Union[Dataset, DatasetDict]: + def _load_from_ms(dataset_name: str, + dataset_files: dict, + download_dir: str, + namespace: Optional[str] = None, + version: Optional[str] = DEFAULT_DATASET_REVISION, + subset_name: Optional[str] = None, + split: Optional[str] = None, + download_mode: Optional[DownloadMode] = None, + **config_kwargs) -> Union[Dataset, DatasetDict]: for json_path in dataset_files['.json']: if json_path.endswith(f'{dataset_name}.json'): with open(json_path, encoding='utf-8') as dataset_json_file: @@ -226,7 +241,6 @@ class MsDataset: meta_map, file_map = get_dataset_files(target_dataset_structure, dataset_name, namespace, version) - builder = load_dataset_builder( dataset_name, subset_name, @@ -235,7 +249,8 @@ class MsDataset: zip_data_files=file_map, cache_dir=MS_DATASETS_CACHE, version=version, - split=list(target_dataset_structure.keys())) + 
split=list(target_dataset_structure.keys()), + **config_kwargs) download_config = DownloadConfig( cache_dir=download_dir, @@ -253,7 +268,6 @@ class MsDataset: data_dir=download_dir, ) builder.download_and_prepare( - download_config=download_config, dl_manager=dl_manager, download_mode=download_mode.value, try_from_hf_gcs=False) @@ -338,6 +352,8 @@ class MsDataset: self, columns: Union[str, List[str]] = None, preprocessors: Union[Callable, List[Callable]] = None, + task_name: str = None, + task_data_config: ConfigDict = None, **format_kwargs, ): """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to @@ -350,6 +366,8 @@ class MsDataset: columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of processors will also be added. + task_name (str, default None): task name, refer to :obj:`Tasks` for more details + task_data_config (ConfigDict, default None): config dict for model object. format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. Returns: @@ -360,6 +378,10 @@ class MsDataset: raise ImportError( 'The function to_torch_dataset requires pytorch to be installed' ) + if isinstance(self._hf_ds, ExternalDataset): + task_data_config.update({'preprocessor': preprocessors}) + return build_task_dataset(task_data_config, task_name, + self._hf_ds.config_kwargs) if preprocessors is not None: return self.to_torch_dataset_with_processors( preprocessors, columns=columns) diff --git a/modelscope/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py similarity index 80% rename from modelscope/task_datasets/__init__.py rename to modelscope/msdatasets/task_datasets/__init__.py index 93e01cb5..c80f8cd5 100644 --- a/modelscope/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: from .builder import TASK_DATASETS, build_task_dataset from .torch_base_dataset import TorchTaskDataset from .veco_dataset import VecoDataset + from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset else: _import_structure = { @@ -15,6 +16,8 @@ else: 'builder': ['TASK_DATASETS', 'build_task_dataset'], 'torch_base_dataset': ['TorchTaskDataset'], 'veco_dataset': ['VecoDataset'], + 'image_instance_segmentation_coco_dataset': + ['ImageInstanceSegmentationCocoDataset'] } import sys diff --git a/modelscope/task_datasets/base.py b/modelscope/msdatasets/task_datasets/base.py similarity index 100% rename from modelscope/task_datasets/base.py rename to modelscope/msdatasets/task_datasets/base.py diff --git a/modelscope/task_datasets/builder.py b/modelscope/msdatasets/task_datasets/builder.py similarity index 100% rename from modelscope/task_datasets/builder.py rename to modelscope/msdatasets/task_datasets/builder.py diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py similarity index 90% rename from modelscope/models/cv/image_instance_segmentation/datasets/dataset.py rename to modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py index d9e1b348..04c8e142 100644 --- a/modelscope/models/cv/image_instance_segmentation/datasets/dataset.py +++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py @@ -2,14 +2,32 @@ import os.path as osp 
import numpy as np from pycocotools.coco import COCO -from torch.utils.data import Dataset - -class ImageInstanceSegmentationCocoDataset(Dataset): +from modelscope.metainfo import Models +from modelscope.utils.constant import Tasks +from .builder import TASK_DATASETS +from .torch_base_dataset import TorchTaskDataset + +DATASET_STRUCTURE = { + 'train': { + 'annotation': 'annotations/instances_train.json', + 'images': 'images/train' + }, + 'validation': { + 'annotation': 'annotations/instances_val.json', + 'images': 'images/val' + } +} + + +@TASK_DATASETS.register_module( + module_name=Models.cascade_mask_rcnn_swin, + group_key=Tasks.image_segmentation) +class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): """Coco-style dataset for image instance segmentation. Args: - ann_file (str): Annotation file path. + split_config (dict): Annotation file path. {"train":"xxxxx"} classes (Sequence[str], optional): Specify classes to load. If is None, ``cls.CLASSES`` will be used. Default: None. data_root (str, optional): Data root for ``ann_file``, @@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset): 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') def __init__(self, - ann_file, + split_config: dict, + preprocessor=None, classes=None, - data_root=None, - img_prefix='', seg_prefix=None, test_mode=False, - filter_empty_gt=True): - self.ann_file = ann_file - self.data_root = data_root - self.img_prefix = img_prefix + filter_empty_gt=True, + **kwargs): + self.data_root = next(iter(split_config.values())) + self.split = next(iter(split_config.keys())) + self.preprocessor = preprocessor + + self.ann_file = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['annotation']) + + self.img_prefix = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['images']) self.seg_prefix = seg_prefix self.test_mode = test_mode self.filter_empty_gt = filter_empty_gt self.CLASSES = self.get_classes(classes) - # join paths if data_root is specified - if self.data_root is not None: - if not osp.isabs(self.ann_file): - self.ann_file = osp.join(self.data_root, self.ann_file) - if not (self.img_prefix is None or osp.isabs(self.img_prefix)): - self.img_prefix = osp.join(self.data_root, self.img_prefix) - if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): - self.seg_prefix = osp.join(self.data_root, self.seg_prefix) - # load annotations self.data_infos = self.load_annotations(self.ann_file) @@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset): # set group flag for the sampler self._set_group_flag() - self.preprocessor = None - def __len__(self): """Total number of samples of data.""" return len(self.data_infos) @@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset): raise ValueError(f'Unsupported type {type(classes)} of classes.') return class_names - - def to_torch_dataset(self, preprocessors=None): - self.preprocessor = preprocessors - return self diff --git a/modelscope/task_datasets/torch_base_dataset.py b/modelscope/msdatasets/task_datasets/torch_base_dataset.py similarity index 100% rename from modelscope/task_datasets/torch_base_dataset.py rename to modelscope/msdatasets/task_datasets/torch_base_dataset.py diff --git a/modelscope/task_datasets/veco_dataset.py b/modelscope/msdatasets/task_datasets/veco_dataset.py similarity index 100% rename from modelscope/task_datasets/veco_dataset.py rename to modelscope/msdatasets/task_datasets/veco_dataset.py diff --git a/modelscope/msdatasets/utils/dataset_builder.py 
b/modelscope/msdatasets/utils/dataset_builder.py index 2b4bad07..85489c58 100644 --- a/modelscope/msdatasets/utils/dataset_builder.py +++ b/modelscope/msdatasets/utils/dataset_builder.py @@ -8,6 +8,7 @@ from datasets.info import DatasetInfo from datasets.packaged_modules import csv from datasets.utils.filelock import FileLock +from modelscope.utils.constant import DownloadMode from modelscope.utils.logger import get_logger logger = get_logger() @@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv): zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, **config_kwargs, ): + self.namespace = namespace super().__init__( cache_dir=cache_dir, name=subset_name, hash=hash, - namespace=namespace, data_files=meta_data_files, **config_kwargs) @@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv): os.rmdir(self._cache_dir) self.zip_data_files = zip_data_files + def _relative_data_dir(self, with_version=True, with_hash=True) -> str: + """Relative path of this dataset in cache_dir: + Will be: + self.name/self.config.version/self.hash/ + or if a namespace has been specified: + self.namespace___self.name/self.config.version/self.hash/ + """ + builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}' + builder_config = self.config + hash = self.hash + if builder_config: + builder_data_dir = os.path.join(builder_data_dir, self.config_id) + if with_version: + builder_data_dir = os.path.join(builder_data_dir, + str(self.config.version)) + if with_hash and hash and isinstance(hash, str): + builder_data_dir = os.path.join(builder_data_dir, hash) + return builder_data_dir + def _build_cache_dir(self): builder_data_dir = os.path.join( self._cache_dir_root, @@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv): datasets.SplitGenerator( name=split_name, gen_kwargs={ - 'files': dl_manager.iter_files(files), - 'base_dir': zip_data_files.get(split_name) + 'files': + dl_manager.iter_files(files), + 'base_dir': + os.path.join( + zip_data_files.get(split_name), + os.path.splitext( + self.zip_data_files.get(split_name))[0]) + if self.zip_data_files.get(split_name) else + zip_data_files.get(split_name) })) return splits @@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv): logger.error( f"Failed to read file '{file}' with error {type(e)}: {e}") raise + + +class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): + + def __init__( + self, + dataset_name: str, + cache_dir: str, + namespace: str, + subset_name: str, + hash: str, + meta_data_files: Mapping[str, Union[str, Sequence[str]]], + zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, + **config_kwargs, + ): + self.name = dataset_name + self.subset_name = subset_name + self.namespace = namespace + self.hash = hash + self.data_files = meta_data_files + self.zip_data_files = zip_data_files + self.split_path_dict = None + self.config = None + self._cache_dir_root = os.path.expanduser(cache_dir) + self._cache_dir = self._build_cache_dir() + self._config_kwargs = config_kwargs + + def download_and_prepare(self, download_mode, dl_manager, + **download_kwargs): + # Prevent parallel disk operations + lock_path = os.path.join( + self._cache_dir_root, + self._cache_dir.replace(os.sep, '_') + '.lock') + with FileLock(lock_path): + data_exists = os.path.exists(self._cache_dir) + if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: + logger.warning( + f'Reusing dataset {self.name} ({self._cache_dir})') + return + logger.info(f'Generating dataset {self.name} ({self._cache_dir})') + 
self._download_and_prepare(dl_manager=dl_manager) + + def _download_and_prepare(self, dl_manager): + split_path_dict = dl_manager.download_and_extract(self.zip_data_files) + self.split_path_dict = { + k: os.path.join(v, + os.path.splitext(self.zip_data_files[k])[0]) + for k, v in split_path_dict.items() + } + + def as_dataset(self): + return ExternalDataset(self.split_path_dict, self._config_kwargs) + + +class ExternalDataset(object): + + def __init__(self, split_path_dict, config_kwargs): + config_kwargs.update({'split_config': split_path_dict}) + self.config_kwargs = config_kwargs + + def __len__(self): + return len(self.config_kwargs['split_config']) diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index ff7cd8b1..09556d84 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder from modelscope.utils.constant import DEFAULT_DATASET_REVISION from modelscope.utils.logger import get_logger -from .dataset_builder import MsCsvDatasetBuilder +from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder logger = get_logger() @@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict, modelscope_api = HubApi() for split, info in subset_split_into.items(): meta_map[split] = modelscope_api.get_dataset_file_url( - info['meta'], dataset_name, namespace, revision) + info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] return meta_map, file_map @@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, zip_data_files: Mapping[str, Union[str, Sequence[str]]], cache_dir: str, version: Optional[Union[str]], - split: Sequence[str]) -> DatasetBuilder: + split: Sequence[str], + **config_kwargs) -> DatasetBuilder: sub_dir = os.path.join(version, '_'.join(split)) - builder_instance = MsCsvDatasetBuilder( - dataset_name=dataset_name, - namespace=namespace, - cache_dir=cache_dir, - subset_name=subset_name, - meta_data_files=meta_data_files, - zip_data_files=zip_data_files, - hash=sub_dir) + meta_data_file = next(iter(meta_data_files.values())) + if not meta_data_file: + builder_instance = TaskSpecificDatasetBuilder( + dataset_name=dataset_name, + namespace=namespace, + cache_dir=cache_dir, + subset_name=subset_name, + meta_data_files=meta_data_files, + zip_data_files=zip_data_files, + hash=sub_dir, + **config_kwargs) + elif meta_data_file.endswith('.csv'): + builder_instance = MsCsvDatasetBuilder( + dataset_name=dataset_name, + namespace=namespace, + cache_dir=cache_dir, + subset_name=subset_name, + meta_data_files=meta_data_files, + zip_data_files=zip_data_files, + hash=sub_dir) + else: + raise NotImplementedError( + f'Dataset mete file extensions "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet' + ) return builder_instance diff --git a/modelscope/outputs.py b/modelscope/outputs.py index f279f311..200a03cd 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -188,6 +188,16 @@ TASK_OUTPUTS = { Tasks.body_2d_keypoints: [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES], + # video single object tracking result for single video + # { + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # } + Tasks.video_single_object_tracking: [OutputKeys.BOXES], + # live category recognition result for single video # { # "scores": [0.885272, 0.014790631, 0.014558001], @@ -405,7 
+415,7 @@ TASK_OUTPUTS = { # audio processed for single file in PCM format # { - # "output_pcm": np.array with shape(samples,) and dtype float32 + # "output_pcm": pcm encoded audio bytes # } Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM], Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM], @@ -417,6 +427,19 @@ TASK_OUTPUTS = { # } Tasks.text_to_speech: [OutputKeys.OUTPUT_PCM], + # { + # "kws_list": [ + # { + # 'keyword': '', # the keyword spotted + # 'offset': 19.4, # the keyword start time in second + # 'length': 0.68, # the keyword length in second + # 'confidence': 0.85 # the possibility if it is the keyword + # }, + # ... + # ] + # } + Tasks.keyword_spotting: [OutputKeys.KWS_LIST], + # ============ multi-modal tasks =================== # image caption result for single sample diff --git a/modelscope/pipelines/audio/__init__.py b/modelscope/pipelines/audio/__init__.py index 562125b4..b46ca87e 100644 --- a/modelscope/pipelines/audio/__init__.py +++ b/modelscope/pipelines/audio/__init__.py @@ -6,6 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .ans_pipeline import ANSPipeline from .asr_inference_pipeline import AutomaticSpeechRecognitionPipeline + from .kws_farfield_pipeline import KWSFarfieldPipeline from .kws_kwsbp_pipeline import KeyWordSpottingKwsbpPipeline from .linear_aec_pipeline import LinearAECPipeline from .text_to_speech_pipeline import TextToSpeechSambertHifiganPipeline @@ -14,6 +15,7 @@ else: _import_structure = { 'ans_pipeline': ['ANSPipeline'], 'asr_inference_pipeline': ['AutomaticSpeechRecognitionPipeline'], + 'kws_farfield_pipeline': ['KWSFarfieldPipeline'], 'kws_kwsbp_pipeline': ['KeyWordSpottingKwsbpPipeline'], 'linear_aec_pipeline': ['LinearAECPipeline'], 'text_to_speech_pipeline': ['TextToSpeechSambertHifiganPipeline'], diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py b/modelscope/pipelines/audio/kws_farfield_pipeline.py new file mode 100644 index 00000000..a114e7fb --- /dev/null +++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py @@ -0,0 +1,81 @@ +import io +import wave +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.keyword_spotting, + module_name=Pipelines.speech_dfsmn_kws_char_farfield) +class KWSFarfieldPipeline(Pipeline): + r"""A Keyword Spotting Inference Pipeline . + + When invoke the class with pipeline.__call__(), it accept only one parameter: + inputs(str): the path of wav file + """ + SAMPLE_RATE = 16000 + SAMPLE_WIDTH = 2 + INPUT_CHANNELS = 3 + OUTPUT_CHANNELS = 2 + + def __init__(self, model, **kwargs): + """ + use `model` to create a kws far field pipeline for prediction + Args: + model: model id on modelscope hub. 
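A minimal usage sketch for this far-field KWS pipeline, assuming the standard pipeline factory; the model id and wav path below are placeholders for illustration, not values defined in this patch:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Hypothetical model id; the patch registers the pipeline but does not pin a default model for it.
kws = pipeline(
    task=Tasks.keyword_spotting,
    model='damo/speech_dfsmn_kws_char_farfield')

# preprocess() accepts raw wav bytes (or a dict, optionally carrying an 'output_file' path).
with open('multichannel_16k.wav', 'rb') as f:  # hypothetical 3-channel, 16 kHz recording
    result = kws(f.read())

# Each entry in kws_list holds the spotted keyword plus its offset/length in seconds and a confidence score.
print(result['kws_list'])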
+ """ + super().__init__(model=model, **kwargs) + self.model = self.model.to(self.device) + self.model.eval() + frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH + self._nframe = self.model.size_in // frame_size + self.frame_count = 0 + + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + if isinstance(inputs, bytes): + return dict(input_file=inputs) + elif isinstance(inputs, Dict): + return inputs + else: + raise ValueError(f'Not supported input type: {type(inputs)}') + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + input_file = inputs['input_file'] + if isinstance(input_file, bytes): + input_file = io.BytesIO(input_file) + self.frame_count = 0 + kws_list = [] + with wave.open(input_file, 'rb') as fin: + if 'output_file' in inputs: + with wave.open(inputs['output_file'], 'wb') as fout: + fout.setframerate(self.SAMPLE_RATE) + fout.setnchannels(self.OUTPUT_CHANNELS) + fout.setsampwidth(self.SAMPLE_WIDTH) + self._process(fin, kws_list, fout) + else: + self._process(fin, kws_list) + return {OutputKeys.KWS_LIST: kws_list} + + def _process(self, + fin: wave.Wave_read, + kws_list, + fout: wave.Wave_write = None): + data = fin.readframes(self._nframe) + while len(data) >= self.model.size_in: + self.frame_count += self._nframe + result = self.model.forward_decode(data) + if fout: + fout.writeframes(result['pcm']) + if 'kws' in result: + result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE + kws_list.append(result['kws']) + data = fin.readframes(self._nframe) + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index b1d82557..041dfb34 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -255,7 +255,7 @@ class Pipeline(ABC): return self._collate_fn(torch.from_numpy(data)) elif isinstance(data, torch.Tensor): return data.to(self.device) - elif isinstance(data, (str, int, float, bool, type(None))): + elif isinstance(data, (bytes, str, int, float, bool, type(None))): return data elif isinstance(data, InputFeatures): return data diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 1066fa8d..4105e28b 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -124,12 +124,16 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_classification: (Pipelines.daily_image_classification, 'damo/cv_vit-base_image-classification_Dailylife-labels'), - Tasks.ocr_recognition: (Pipelines.ocr_recognition, - 'damo/cv_convnextTiny_ocr-recognition_damo'), + Tasks.ocr_recognition: + (Pipelines.ocr_recognition, + 'damo/cv_convnextTiny_ocr-recognition-general_damo'), Tasks.skin_retouching: (Pipelines.skin_retouching, 'damo/cv_unet_skin-retouching'), Tasks.crowd_counting: (Pipelines.crowd_counting, 'damo/cv_hrnet_crowd-counting_dcanet'), + Tasks.video_single_object_tracking: + (Pipelines.video_single_object_tracking, + 'damo/cv_vitb_video-single-object-tracking_ostrack'), } diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 91a2f1e0..cee91c8e 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline from .crowd_counting_pipeline import CrowdCountingPipeline from .image_detection_pipeline import ImageDetectionPipeline + from .image_salient_detection_pipeline import 
ImageSalientDetectionPipeline from .face_detection_pipeline import FaceDetectionPipeline from .face_image_generation_pipeline import FaceImageGenerationPipeline from .face_recognition_pipeline import FaceRecognitionPipeline @@ -43,6 +44,7 @@ else: 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], 'crowd_counting_pipeline': ['CrowdCountingPipeline'], 'image_detection_pipeline': ['ImageDetectionPipeline'], + 'image_salient_detection_pipeline': ['ImageSalientDetectionPipeline'], 'face_detection_pipeline': ['FaceDetectionPipeline'], 'face_image_generation_pipeline': ['FaceImageGenerationPipeline'], 'face_recognition_pipeline': ['FaceRecognitionPipeline'], diff --git a/modelscope/pipelines/cv/image_salient_detection_pipeline.py b/modelscope/pipelines/cv/image_salient_detection_pipeline.py new file mode 100644 index 00000000..433275ba --- /dev/null +++ b/modelscope/pipelines/cv/image_salient_detection_pipeline.py @@ -0,0 +1,47 @@ +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.image_segmentation, module_name=Pipelines.salient_detection) +class ImageSalientDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. + """ + super().__init__(model=model, auto_collate=False, **kwargs) + + def preprocess(self, input: Input) -> Dict[str, Any]: + + img = LoadImage.convert_to_ndarray(input) + img_h, img_w, _ = img.shape + img = self.model.preprocess(img) + result = {'img': img, 'img_w': img_w, 'img_h': img_h} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + outputs = self.model.inference(input['img']) + result = { + 'data': outputs, + 'img_w': input['img_w'], + 'img_h': input['img_h'] + } + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + + data = self.model.postprocess(inputs) + outputs = { + OutputKeys.SCORES: None, + OutputKeys.LABELS: None, + OutputKeys.MASKS: data + } + return outputs diff --git a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py new file mode 100644 index 00000000..f4ba4d0b --- /dev/null +++ b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py @@ -0,0 +1,80 @@ +import os.path as osp +from typing import Any, Dict + +import cv2 + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ + cfg +from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \ + OSTrack +from modelscope.models.cv.video_single_object_tracking.utils.utils import \ + check_box +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_single_object_tracking, + module_name=Pipelines.video_single_object_tracking) +class VideoSingleObjectTrackingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a single object tracking pipeline + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + self.cfg = cfg + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE) + logger.info(f'loading model from {ckpt_path}') + self.tracker = OSTrack(ckpt_path, self.device) + logger.info('init tracker done') + + def preprocess(self, input) -> Input: + self.video_path = input[0] + self.init_bbox = input[1] + return input + + def forward(self, input: Input) -> Dict[str, Any]: + output_boxes = [] + cap = cv2.VideoCapture(self.video_path) + success, frame = cap.read() + if success is False: + raise Exception( + 'modelscope error: %s can not be decoded by OpenCV.' % + (self.video_path)) + + init_box = self.init_bbox + frame_h, frame_w = frame.shape[0:2] + if not check_box(init_box, frame_h, frame_w): + raise Exception('modelscope error: init_box out of image range ', + init_box) + output_boxes.append(init_box.copy()) + init_box[2] = init_box[2] - init_box[0] + init_box[3] = init_box[3] - init_box[1] + self.tracker.initialize(frame, {'init_bbox': init_box}) + logger.info('init bbox done') + + while True: + ret, frame = cap.read() + if frame is None: + break + out = self.tracker.track(frame) + state = [int(s) for s in out['target_bbox']] + output_boxes.append(state) + cap.release() + logger.info('tracking process done') + + return { + OutputKeys.BOXES: output_boxes, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py index 2028e7dc..99cccee1 100644 --- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py +++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py @@ -1,11 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
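A minimal usage sketch for the VideoSingleObjectTrackingPipeline added above, assuming the default model id wired up in builder.py earlier in this patch; the video path and initial box are invented for illustration:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

tracker = pipeline(
    task=Tasks.video_single_object_tracking,
    model='damo/cv_vitb_video-single-object-tracking_ostrack')

video_path = 'test_video.avi'    # hypothetical local file
init_bbox = [100, 80, 260, 300]  # [x1, y1, x2, y2] of the target in the first frame
result = tracker((video_path, init_bbox))

# One [x1, y1, x2, y2] box per decoded frame, starting with the initial box.
print(result['boxes'])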
from typing import Any, Dict, Optional, Union +import torch + from modelscope.metainfo import Pipelines -from modelscope.models.multi_modal import OfaForAllTasks +from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import OfaPreprocessor, Preprocessor +from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, + Preprocessor) from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -35,9 +39,19 @@ class ImageCaptioningPipeline(Pipeline): else: raise NotImplementedError pipe_model.model.eval() - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) + if preprocessor is None: + if isinstance(pipe_model, OfaForAllTasks): + preprocessor = OfaPreprocessor(pipe_model.model_dir) + elif isinstance(pipe_model, MPlugForAllTasks): + preprocessor = MPlugPreprocessor(pipe_model.model_dir) super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs + if isinstance(self.model, OfaForAllTasks): + return inputs + return {OutputKeys.CAPTION: inputs} diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py index 9c694500..b2442a3e 100644 --- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py @@ -5,13 +5,12 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.multi_modal import (MPlugForVisualQuestionAnswering, - OfaForAllTasks) +from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (MPlugVisualQuestionAnsweringPreprocessor, - OfaPreprocessor) +from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, + Preprocessor) from modelscope.utils.constant import Tasks __all__ = ['VisualQuestionAnsweringPipeline'] @@ -23,9 +22,8 @@ __all__ = ['VisualQuestionAnsweringPipeline'] class VisualQuestionAnsweringPipeline(Pipeline): def __init__(self, - model: Union[MPlugForVisualQuestionAnswering, str], - preprocessor: Optional[ - MPlugVisualQuestionAnsweringPreprocessor] = None, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, **kwargs): """use `model` and `preprocessor` to create a visual question answering pipeline for prediction @@ -35,18 +33,12 @@ class VisualQuestionAnsweringPipeline(Pipeline): """ model = model if isinstance(model, Model) else Model.from_pretrained(model) - self.tokenizer = None if preprocessor is None: if isinstance(model, OfaForAllTasks): preprocessor = OfaPreprocessor(model.model_dir) - elif isinstance(model, MPlugForVisualQuestionAnswering): - preprocessor = MPlugVisualQuestionAnsweringPreprocessor( - model.model_dir) - if isinstance(model, MPlugForVisualQuestionAnswering): - model.eval() - self.tokenizer = 
model.tokenizer - else: - model.model.eval() + elif isinstance(model, MPlugForAllTasks): + preprocessor = MPlugPreprocessor(model.model_dir) + model.model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], @@ -64,14 +56,6 @@ class VisualQuestionAnsweringPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ - if self.tokenizer is None: + if isinstance(self.model, OfaForAllTasks): return inputs - replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), - ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), - ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) - - pred_string = self.tokenizer.decode(inputs[0][0]) - for _old, _new in replace_tokens_bert: - pred_string = pred_string.replace(_old, _new) - pred_string.strip() - return {OutputKeys.TEXT: pred_string} + return {OutputKeys.TEXT: inputs} diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 9a2adb04..0328b91a 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .base import Preprocessor from .builder import PREPROCESSORS, build_preprocessor - from .common import Compose + from .common import Compose, ToTensor, Filter from .asr import WavToScp from .audio import LinearAECAndFbank from .image import (LoadImage, load_image, @@ -14,8 +14,7 @@ if TYPE_CHECKING: ImageInstanceSegmentationPreprocessor, ImageDenoisePreprocessor) from .kws import WavToLists - from .multi_modal import (OfaPreprocessor, - MPlugVisualQuestionAnsweringPreprocessor) + from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import (Tokenize, SequenceClassificationPreprocessor, TextGenerationPreprocessor, TokenClassificationPreprocessor, @@ -33,7 +32,7 @@ else: _import_structure = { 'base': ['Preprocessor'], 'builder': ['PREPROCESSORS', 'build_preprocessor'], - 'common': ['Compose'], + 'common': ['Compose', 'ToTensor', 'Filter'], 'audio': ['LinearAECAndFbank'], 'asr': ['WavToScp'], 'video': ['ReadVideoData'], @@ -42,8 +41,7 @@ else: 'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor' ], 'kws': ['WavToLists'], - 'multi_modal': - ['OfaPreprocessor', 'MPlugVisualQuestionAnsweringPreprocessor'], + 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', diff --git a/modelscope/preprocessors/common.py b/modelscope/preprocessors/common.py index 89fa859d..aa1db84c 100644 --- a/modelscope/preprocessors/common.py +++ b/modelscope/preprocessors/common.py @@ -2,6 +2,10 @@ import time from collections.abc import Sequence +from typing import Mapping + +import numpy as np +import torch from .builder import PREPROCESSORS, build_preprocessor @@ -25,12 +29,18 @@ class Compose(object): if isinstance(transform, dict): if self.field_name is None: transform = build_preprocessor(transform, field_name) - self.transforms.append(transform) + else: + # if not found key in field_name, try field_name=None(default_group) + try: + transform = build_preprocessor(transform, field_name) + except KeyError: + transform = build_preprocessor(transform, None) elif callable(transform): - self.transforms.append(transform) + pass else: raise TypeError('transform must be callable or a dict, but got' f' {type(transform)}') + self.transforms.append(transform) def __call__(self, data): for t in 
self.transforms: @@ -52,3 +62,82 @@ class Compose(object): format_string += f'\n {t}' format_string += '\n)' return format_string + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not isinstance(data, str): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PREPROCESSORS.register_module() +class ToTensor(object): + """Convert target object to tensor. + + Args: + keys (Sequence[str]): Key of data to be converted to Tensor. + Only valid when data is type of `Mapping`. If `keys` is None, + all values of keys ​​will be converted to tensor by default. + """ + + def __init__(self, keys=None): + self.keys = keys + + def __call__(self, data): + if isinstance(data, Mapping): + if self.keys is None: + self.keys = list(data.keys()) + + for key in self.keys: + data[key] = to_tensor(data[key]) + else: + data = to_tensor(data) + + return data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PREPROCESSORS.register_module() +class Filter(object): + """This is usually the last stage of the dataloader transform. + Only data of reserved keys will be kept and passed directly to the model, others will be removed. + + Args: + keys (Sequence[str]): Keys of data to be reserved, others will be removed. 
+ """ + + def __init__(self, reserved_keys): + self.reserved_keys = reserved_keys + + def __call__(self, data): + assert isinstance(data, Mapping) + + reserved_data = {} + for key in self.reserved_keys: + reserved_data[key] = data[key] + + return reserved_data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.reserved_keys})' diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py index 775514a2..6932371d 100644 --- a/modelscope/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor): super().__init__(*args, **kwargs) self.model_dir: str = model_dir + from .common import Filter + + # TODO: `Filter` should be moved to configurarion file of each model + self._transforms = [Filter(reserved_keys=['input', 'target'])] + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data @@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor): Returns: Dict[str, Any]: the preprocessed data """ + for t in self._transforms: + data = t(data) + return data diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 46648832..5046e166 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -19,7 +19,7 @@ from .ofa.utils.collate import collate_fn __all__ = [ 'OfaPreprocessor', - 'MPlugVisualQuestionAnsweringPreprocessor', + 'MPlugPreprocessor', ] @@ -28,7 +28,7 @@ __all__ = [ class OfaPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path @@ -102,39 +102,55 @@ class OfaPreprocessor(Preprocessor): @PREPROCESSORS.register_module( - Fields.multi_modal, - module_name=Preprocessors.mplug_visual_question_answering) -class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor): + Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor) +class MPlugPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via 'bert-base-uncased' tokenizer and configuration - - """ - from transformers import BertTokenizer - from modelscope.models.multi_modal.mplug import CONFIG_NAME, VOCAB_NAME, MPlugConfig - super().__init__(*args, **kwargs) + self.model_dir = model_dir - # tokenizer - self.tokenizer = BertTokenizer.from_pretrained( - osp.join(model_dir, VOCAB_NAME)) + self._tokenizer = None + self._patch_resize_transform = None - # load configuration - config = MPlugConfig.from_yaml_file(osp.join(model_dir, CONFIG_NAME)) + @property + def tokenizer(self): + from transformers import BertTokenizer - # Initialize transform - from torchvision import transforms - mean = (0.48145466, 0.4578275, 0.40821073) - std = (0.26862954, 0.26130258, 0.27577711) + if self._tokenizer is None: + self._tokenizer = BertTokenizer.from_pretrained(self.model_dir) + return self._tokenizer + + @property + def patch_resize_transform(self): + if self._patch_resize_transform is None: + from torchvision import transforms + from modelscope.models.multi_modal.mplug import CONFIG_NAME, MPlugConfig + + config = MPlugConfig.from_yaml_file( + osp.join(self.model_dir, CONFIG_NAME)) + + mean = (0.48145466, 0.4578275, 0.40821073) + std = (0.26862954, 0.26130258, 0.27577711) + + self._patch_resize_transform = transforms.Compose([ + transforms.Resize((config.image_res, config.image_res), + 
interpolation=Image.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ]) + return self._patch_resize_transform + + def __call__(self, *args, **kwargs): + call_mapping = { + Tasks.visual_question_answering: self.vqa_call, + Tasks.image_captioning: self.caption_call + } - self.patch_resize_transform = transforms.Compose([ - transforms.Resize((config.image_res, config.image_res), - interpolation=Image.BICUBIC), - transforms.ToTensor(), - transforms.Normalize(mean=mean, std=std), - ]) + self.cfg = Config.from_file( + osp.join(self.model_dir, ModelFile.CONFIGURATION)) + return call_mapping[self.cfg.task](*args, **kwargs) - def __call__(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]: + def vqa_call(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]: image: Image.Image = data[0] if isinstance(data, tuple) else data['image'] question: str = data[1] if isinstance(data, @@ -147,3 +163,19 @@ class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor): return_tensors='pt') return {'image': image, 'question': question, 'train': False} + + def caption_call( + self, data: Union[Image.Image, tuple, + Dict[str, Any]]) -> Dict[str, Any]: + if isinstance(data, Image.Image): + image = data + elif isinstance(data, tuple): + image = data[0] + else: + image = data['image'] + image = image.convert('RGB') + image = self.patch_resize_transform(image) + image = torch.stack([image], dim=0) + question = self.tokenizer('', return_tensors='pt') + + return {'image': image, 'question': question, 'train': False} diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index f231df9a..25576667 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -4,6 +4,7 @@ import os.path as osp import uuid from typing import Any, Dict, Iterable, Optional, Tuple, Union +import numpy as np from transformers import AutoTokenizer from modelscope.metainfo import Models, Preprocessors @@ -43,7 +44,7 @@ class Tokenize(Preprocessor): class SequenceClassificationPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path @@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor): text_b, return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, **self.tokenize_kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } self.labels_to_id(labels, output) return output @@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor): if labels is not None: if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ and self.label2id is not None: - output[OutputKeys.LABEL] = [ + output[OutputKeys.LABELS] = [ self.label2id[str(label)] for label in labels ] elif label_can_be_mapped(labels) and self.label2id is not None: - output[OutputKeys.LABEL] = self.label2id[str(labels)] + output[OutputKeys.LABELS] = self.label2id[str(labels)] else: - output[OutputKeys.LABEL] = labels + output[OutputKeys.LABELS] = labels @PREPROCESSORS.register_module( @@ -286,7 +291,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): """ def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path @@ -517,7 +522,7 @@ class NERPreprocessor(Preprocessor): """ def 
__init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path @@ -609,7 +614,7 @@ class TextErrorCorrectionPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): from fairseq.data import Dictionary - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data via the vocab file from the `model_dir` path Args: model_dir (str): model path diff --git a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py b/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py index c7339538..e2602eaa 100644 --- a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py @@ -22,7 +22,7 @@ __all__ = ['DialogIntentPredictionPreprocessor'] class DialogIntentPredictionPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py index 8ed97452..a2157c2b 100644 --- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py @@ -20,7 +20,7 @@ __all__ = ['DialogModelingPreprocessor'] class DialogModelingPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path diff --git a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py index 038ab09b..6eb17288 100644 --- a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py @@ -17,7 +17,7 @@ __all__ = ['DialogStateTrackingPreprocessor'] class DialogStateTrackingPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/space/fields/gen_field.py index f924588c..5bff360f 100644 --- a/modelscope/preprocessors/space/fields/gen_field.py +++ b/modelscope/preprocessors/space/fields/gen_field.py @@ -8,6 +8,7 @@ from itertools import chain import numpy as np from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.nlp.space import ontology, utils from modelscope.utils.nlp.space.db_ops import MultiWozDB @@ -343,7 +344,7 @@ class MultiWOZBPETextField(BPETextField): ] special_tokens.extend(self.add_sepcial_tokens()) self.tokenizer = Tokenizer( - vocab_path=os.path.join(model_dir, 'vocab.txt'), + vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE), special_tokens=special_tokens, tokenizer_type=config.BPETextField.tokenizer_type) self.understand_ids = self.tokenizer.convert_tokens_to_ids( diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/space/fields/intent_field.py index 4ed7ab6c..dc00e677 100644 --- 
a/modelscope/preprocessors/space/fields/intent_field.py +++ b/modelscope/preprocessors/space/fields/intent_field.py @@ -14,6 +14,7 @@ import numpy as np from tqdm import tqdm from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.utils.constant import ModelFile from modelscope.utils.nlp.space import ontology from modelscope.utils.nlp.space.scores import hierarchical_set_score from modelscope.utils.nlp.space.utils import list2np @@ -50,7 +51,7 @@ class BPETextField(object): ] special_tokens.extend(self.add_sepcial_tokens()) self.tokenizer = Tokenizer( - vocab_path=os.path.join(model_dir, 'vocab.txt'), + vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE), special_tokens=special_tokens, tokenizer_type=config.BPETextField.tokenizer_type) self.understand_ids = self.numericalize(self.understand_tokens) diff --git a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py b/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py index 2032dcf7..b5dd73a9 100644 --- a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py +++ b/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py @@ -28,7 +28,7 @@ __all__ = ['ConversationalTextToSqlPreprocessor'] class ConversationalTextToSqlPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path diff --git a/modelscope/preprocessors/star/fields/common_utils.py b/modelscope/preprocessors/star/fields/common_utils.py index 2d33b7ab..431e66b6 100644 --- a/modelscope/preprocessors/star/fields/common_utils.py +++ b/modelscope/preprocessors/star/fields/common_utils.py @@ -193,6 +193,15 @@ class SubPreprocessor(): from nltk import data data.path.append(os.path.join(self.model_dir, 'nltk_data')) + + zippath = os.path.join(self.model_dir, 'nltk_data/tokenizers/punkt') + if os.path.exists(zippath): + print('punkt already exists!') + else: + import zipfile + with zipfile.ZipFile(zippath + '.zip') as zf: + zf.extractall( + os.path.join(self.model_dir, 'nltk_data/tokenizers/')) question = nltk.word_tokenize(question) question = mwtokenizer.tokenize(question) diff --git a/modelscope/trainers/cv/image_instance_segmentation_trainer.py b/modelscope/trainers/cv/image_instance_segmentation_trainer.py index e7632147..2e2415dc 100644 --- a/modelscope/trainers/cv/image_instance_segmentation_trainer.py +++ b/modelscope/trainers/cv/image_instance_segmentation_trainer.py @@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer): def prediction_step(self, model, inputs): pass - - def to_task_dataset(self, datasets, mode, preprocessor=None): - # wait for dataset interface to become stable... 
- return datasets.to_torch_dataset(preprocessor) diff --git a/modelscope/trainers/cv/image_portrait_enhancement_trainer.py b/modelscope/trainers/cv/image_portrait_enhancement_trainer.py index 7ef0de79..0941d1cd 100644 --- a/modelscope/trainers/cv/image_portrait_enhancement_trainer.py +++ b/modelscope/trainers/cv/image_portrait_enhancement_trainer.py @@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer): train_outputs = dict() self._mode = ModeKeys.TRAIN - inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess if isinstance(inputs, Mapping): d_loss = model._train_forward_d(**inputs) diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py index 3a58557b..75cc226c 100644 --- a/modelscope/trainers/hooks/hook.py +++ b/modelscope/trainers/hooks/hook.py @@ -192,7 +192,7 @@ class Hook: Whether to reach the end of every epoch Returns: bool """ - return trainer.inner_iter + 1 == len(trainer.data_loader) + return trainer.inner_iter + 1 == trainer.iters_per_epoch def is_last_epoch(self, trainer): """ diff --git a/modelscope/trainers/hooks/logger/text_logger_hook.py b/modelscope/trainers/hooks/logger/text_logger_hook.py index a204284c..6629a0c9 100644 --- a/modelscope/trainers/hooks/logger/text_logger_hook.py +++ b/modelscope/trainers/hooks/logger/text_logger_hook.py @@ -93,7 +93,7 @@ class TextLoggerHook(LoggerHook): lr_str = f'{lr_key}: {log_dict[lr_key]:.3e}' if self.by_epoch: - log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{len(trainer.data_loader)}]\t' + log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{trainer.iters_per_epoch}]\t' else: log_str = f'{iter_key} [{log_dict[iter_key]}/{trainer.max_iters}]\t' log_str += f'{lr_str}, ' diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 322070a1..3692b486 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.train_keys = build_dataset_keys( self.cfg.dataset.train if hasattr(self.cfg, 'dataset') and hasattr(self.cfg.dataset, 'train') else None) - # TODO eval may has special keys, which is now not supported. - # because there is only one preprocessor in the trainer, and it only supports one group of keys. - self.eval_keys = self.train_keys + self.eval_keys = build_dataset_keys( + self.cfg.dataset.val if hasattr(self.cfg, 'dataset') + and hasattr(self.cfg.dataset, 'val') else None) + if len(self.eval_keys) == 0: + self.eval_keys = self.train_keys super().__init__( model=model_dir, @@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): elif isinstance(model, nn.Module): return model - def build_preprocessor(self) -> Preprocessor: + def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: """Build the preprocessor. User can override this method to implement custom logits. 
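The hunks on either side of this point rewrite `NlpEpochBasedTrainer.build_preprocessor` to return a (train, eval) preprocessor pair driven by the `preprocessor` section of the model configuration. A minimal sketch of the two layouts the new code distinguishes follows; the `type` value and field names are hypothetical placeholders, not values taken from this patch, and only the presence of `train`/`val` groups versus a top-level `type` key is what the code actually checks:

# Illustrative sketch only (placeholder names).
# 1) A single preprocessor config without 'train'/'val': reused for both modes.
shared_preprocessor_cfg = {
    'type': 'sen-sim-tokenizer',       # hypothetical preprocessor type
    'first_sequence': 'sentence1',     # hypothetical dataset field
    'second_sequence': 'sentence2',
}

# 2) A split config with one group per mode; the trainer later injects
#    'model_dir', the dataset keys and 'mode' before building each group.
split_preprocessor_cfg = {
    'train': {'type': 'sen-sim-tokenizer', 'first_sequence': 'sentence1'},
    'val': {'type': 'sen-sim-tokenizer', 'first_sequence': 'sentence1'},
}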
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): model_args = {} if self.label2id is None else { 'label2id': self.label2id } - cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), - 'model_dir': - self.model_dir, - **model_args, - 'mode': - ModeKeys.TRAIN, - **self.train_keys, - }) - return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + + field_name = Tasks.find_field_by_task(self.cfg.task) + train_preprocessor, eval_preprocessor = None, None + _train_cfg, _eval_cfg = {}, {} + + if 'type' not in self.cfg.preprocessor and ( + 'train' in self.cfg.preprocessor + or 'val' in self.cfg.preprocessor): + if 'train' in self.cfg.preprocessor: + _train_cfg = self.cfg.preprocessor.train + if 'val' in self.cfg.preprocessor: + _eval_cfg = self.cfg.preprocessor.val + else: + _train_cfg = self.cfg.preprocessor + _eval_cfg = self.cfg.preprocessor + + if len(_train_cfg): + _train_cfg.update({ + 'model_dir': self.model_dir, + **model_args, + **self.train_keys, 'mode': ModeKeys.TRAIN + }) + train_preprocessor = build_preprocessor(_train_cfg, field_name) + if len(_eval_cfg): + _eval_cfg.update({ + 'model_dir': self.model_dir, + **model_args, + **self.eval_keys, 'mode': ModeKeys.EVAL + }) + eval_preprocessor = build_preprocessor(_eval_cfg, field_name) + + return train_preprocessor, eval_preprocessor @TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer) @@ -178,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer): """Veco evaluates the datasets one by one. """ - from modelscope.task_datasets import VecoDataset + from modelscope.msdatasets.task_datasets import VecoDataset self.model.eval() self._mode = ModeKeys.EVAL metric_values = {} diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index a96c186c..0916495c 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -5,15 +5,15 @@ import time from collections.abc import Mapping from distutils.version import LooseVersion from functools import partial -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union import json import numpy as np import torch -from addict import Dict from torch import distributed as dist from torch import nn from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler from modelscope.hub.snapshot_download import snapshot_download @@ -21,23 +21,26 @@ from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset -from modelscope.preprocessors import build_preprocessor +from modelscope.msdatasets.task_datasets.builder import build_task_dataset +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset from modelscope.preprocessors.base import Preprocessor -from modelscope.task_datasets.builder import build_task_dataset -from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset +from modelscope.preprocessors.builder import build_preprocessor +from modelscope.preprocessors.common import Compose from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.trainers.optimizer.builder import build_optimizer from 
modelscope.utils.config import Config, ConfigDict -from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, - ModelFile, Tasks, TrainerStages) +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, + ConfigKeys, Hubs, ModeKeys, ModelFile, + Tasks, TrainerStages) +from modelscope.utils.data_utils import to_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg -from modelscope.utils.tensor_utils import torch_default_data_collator -from modelscope.utils.torch_utils import (broadcast, create_device, - get_dist_info, init_dist) +from modelscope.utils.torch_utils import (create_device, get_dist_info, + init_dist) from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -83,7 +86,8 @@ class EpochBasedTrainer(BaseTrainer): data_collator: Optional[Callable] = None, train_dataset: Optional[Union[MsDataset, Dataset]] = None, eval_dataset: Optional[Union[MsDataset, Dataset]] = None, - preprocessor: Optional[Preprocessor] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler] = (None, None), @@ -120,24 +124,46 @@ class EpochBasedTrainer(BaseTrainer): else: self.work_dir = self.cfg.train.get('work_dir', './work_dir') - self.preprocessor = None + self.train_preprocessor, self.eval_preprocessor = None, None if isinstance(preprocessor, Preprocessor): - self.preprocessor = preprocessor - elif hasattr(self.cfg, 'preprocessor'): - self.preprocessor = self.build_preprocessor() - if self.preprocessor is not None: - self.preprocessor.mode = ModeKeys.TRAIN + self.train_preprocessor = preprocessor + self.eval_preprocessor = preprocessor + elif isinstance(preprocessor, Mapping): + if not (ConfigKeys.train in preprocessor + or ConfigKeys.val in preprocessor): + raise ValueError( + f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' + ) + if ConfigKeys.train in preprocessor: + assert isinstance(preprocessor[ConfigKeys.train], Preprocessor) + self.train_preprocessor = preprocessor[ConfigKeys.train] + if ConfigKeys.val in preprocessor: + assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) + self.eval_preprocessor = preprocessor[ConfigKeys.val] + elif hasattr(self.cfg, ConfigFields.preprocessor): + self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( + ) + + if self.train_preprocessor is not None: + self.train_preprocessor.mode = ModeKeys.TRAIN + if self.eval_preprocessor is not None: + self.eval_preprocessor.mode = ModeKeys.EVAL + device_name = kwargs.get('device', 'gpu') assert device_name in ['gpu', 'cpu'], 'device should be either cpu or gpu.' 
self.device = create_device(device_name == 'cpu') self.train_dataset = self.to_task_dataset( - train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) + train_dataset, + mode=ModeKeys.TRAIN, + preprocessor=self.train_preprocessor) self.eval_dataset = self.to_task_dataset( - eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) + eval_dataset, + mode=ModeKeys.EVAL, + preprocessor=self.eval_preprocessor) - self.data_collator = data_collator if data_collator is not None else torch_default_data_collator + self.data_collator = data_collator if data_collator is not None else default_collate self.metrics = self.get_metrics() self._metric_values = None self.optimizers = optimizers @@ -155,6 +181,16 @@ class EpochBasedTrainer(BaseTrainer): else: self._max_epochs = kwargs['max_epochs'] + self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) + self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) + if self._train_iters_per_epoch is None and hasattr( + self.cfg.train, 'train_iters_per_epoch'): + self._train_iters_per_epoch = self.cfg.train.train_iters_per_epoch + if self._eval_iters_per_epoch is None and hasattr( + self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, + 'val_iters_per_epoch'): + self._eval_iters_per_epoch = self.cfg.evaluation.val_iters_per_epoch + self.use_fp16 = kwargs.get('use_fp16', False) # TODO @wenmeng.zwm add seed init fn @@ -211,7 +247,32 @@ class EpochBasedTrainer(BaseTrainer): @property def max_iters(self): """int: Maximum training iterations.""" - return self._max_epochs * len(self.data_loader) + return self._max_epochs * self.iters_per_epoch + + @property + def iters_per_epoch(self): + """int: Total iterations of one epoch""" + + def _get_data_len(data_loader): + try: + return len(data_loader) + except Exception as e: + self.logger.error(e) + raise ValueError( + 'Please implement ``__len__`` method for your dataset, ' + 'or add `train_iters_per_epoch` and `train_iters_per_epoch` ' + 'to your configuration file or kwargs') + + if self.mode == ModeKeys.TRAIN: + if self._train_iters_per_epoch is not None: + return self._train_iters_per_epoch + else: + return _get_data_len(self.train_dataloader) + elif self.mode == ModeKeys.EVAL: + if self._eval_iters_per_epoch is not None: + return self._eval_iters_per_epoch + else: + return _get_data_len(self.eval_dataloader) def to_task_dataset(self, datasets: Union[Dataset, List[Dataset]], @@ -228,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer): if isinstance(datasets, TorchTaskDataset): return datasets elif isinstance(datasets, MsDataset): - datasets = datasets.to_torch_dataset( - preprocessors=self.preprocessor) - return datasets + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ + else ConfigDict(type=None, mode=mode) + return datasets.to_torch_dataset( + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=preprocessor) elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ + else ConfigDict(type=None, mode=mode) datasets = [ - d.to_torch_dataset(preprocessor=self.preprocessor) - for d in datasets + d.to_torch_dataset( + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=preprocessor) for d in datasets ] cfg = ConfigDict( type=self.cfg.task, mode=mode, datasets=datasets) @@ -258,24 +326,44 @@ class EpochBasedTrainer(BaseTrainer): else: return datasets - def build_preprocessor(self) -> 
Preprocessor: - """Build the preprocessor. + def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: + """Build train and eval preprocessor. User can override this method to implement custom logits. - Returns: The preprocessor instance. + Returns: The train preprocessor and eval preprocessor instance. """ - # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor - # when they are different ones in training and evaluation - cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), - 'model_dir': - self.model_dir, - 'mode': - ModeKeys.TRAIN, - }) - return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + field_name = Tasks.find_field_by_task(self.cfg.task) + train_preprocessor, eval_preprocessor = None, None + _train_cfg, _eval_cfg = {}, {} + _dafault_args = {'model_dir': self.model_dir} + + if 'type' not in self.cfg.preprocessor and ( + 'train' in self.cfg.preprocessor + or 'val' in self.cfg.preprocessor): + if 'train' in self.cfg.preprocessor: + _train_cfg = self.cfg.preprocessor.train + if 'val' in self.cfg.preprocessor: + _eval_cfg = self.cfg.preprocessor.val + else: + _train_cfg = self.cfg.preprocessor + _eval_cfg = self.cfg.preprocessor + + if len(_train_cfg): + if isinstance(_train_cfg, Sequence): + # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, + # and add mode for Compose or other plans + raise NotImplementedError('Not supported yet!') + _train_cfg.update(_dafault_args) + train_preprocessor = build_preprocessor(_train_cfg, field_name) + if len(_eval_cfg): + if isinstance(_eval_cfg, Sequence): + raise NotImplementedError('Not supported yet!') + _eval_cfg.update(_dafault_args) + eval_preprocessor = build_preprocessor(_eval_cfg, field_name) + + return train_preprocessor, eval_preprocessor def get_metrics(self) -> List[str]: """Get the metric class types. @@ -373,34 +461,6 @@ class EpochBasedTrainer(BaseTrainer): return build_parallel(dp_cfg) - def collate_fn(self, data): - """Prepare the input just before the forward function. - This method will move the tensors to the right device. - Usually this method does not need to be overridden. - - Args: - data: The data out of the dataloader. - - Returns: The processed data. - - """ - from torch.utils.data.dataloader import default_collate - if isinstance(data, dict) or isinstance(data, Mapping): - return type(data)({k: self.collate_fn(v) for k, v in data.items()}) - elif isinstance(data, (tuple, list)): - if isinstance(data[0], (int, float)): - return default_collate(data).to(self.device) - else: - return type(data)(self.collate_fn(v) for v in data) - elif isinstance(data, np.ndarray): - return self.collate_fn(torch.from_numpy(data)) - elif isinstance(data, torch.Tensor): - return data.to(self.device) - elif isinstance(data, (str, int, float, bool)): - return data - else: - raise ValueError(f'Unsupported data type {type(data)}') - def train_step(self, model, inputs): """ Perform a training step on a batch of inputs. 
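With the trainer's hand-rolled `collate_fn` removed above, batching falls back to torch's `default_collate` and device placement moves into the training/eval loops via the new `modelscope.utils.data_utils.to_device` helper added later in this patch. A minimal sketch of that replacement path, with made-up sample dicts:

import torch
from torch.utils.data.dataloader import default_collate

from modelscope.utils.data_utils import to_device  # helper introduced by this patch

# Two made-up samples, shaped like what a task dataset might yield.
samples = [
    {'input_ids': torch.tensor([1, 2, 3]), 'labels': torch.tensor(0)},
    {'input_ids': torch.tensor([4, 5, 6]), 'labels': torch.tensor(1)},
]

# default_collate stacks tensors key by key: input_ids -> (2, 3), labels -> (2,)
batch = default_collate(samples)

# to_device walks dicts/lists/tuples recursively and moves only the tensors.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
batch = to_device(batch, device)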
@@ -421,7 +481,6 @@ class EpochBasedTrainer(BaseTrainer): # TODO: find more pretty way to change mode model.train() self._mode = ModeKeys.TRAIN - inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess if isinstance(inputs, Mapping) and not func_receive_dict_inputs(model.forward): @@ -486,7 +545,9 @@ class EpochBasedTrainer(BaseTrainer): if self.train_dataset is None: train_data = self.cfg.dataset.train self.train_dataset = self.build_dataset( - train_data, mode=ModeKeys.TRAIN) + train_data, + mode=ModeKeys.TRAIN, + preprocessor=self.train_preprocessor) data_loader = self._build_dataloader_with_dataset( self.train_dataset, @@ -505,7 +566,9 @@ class EpochBasedTrainer(BaseTrainer): if self.eval_dataset is None: val_data = self.cfg.dataset.val self.eval_dataset = self.build_dataset( - val_data, mode=ModeKeys.EVAL) + val_data, + mode=ModeKeys.EVAL, + preprocessor=self.eval_preprocessor) batch_size = self.cfg.evaluation.batch_size workers = self.cfg.evaluation.workers @@ -521,7 +584,7 @@ class EpochBasedTrainer(BaseTrainer): ) return data_loader - def build_dataset(self, data_cfg, mode): + def build_dataset(self, data_cfg, mode, preprocessor=None): """ Build torch dataset object using data config """ dataset = MsDataset.load( @@ -530,9 +593,13 @@ class EpochBasedTrainer(BaseTrainer): subset_name=data_cfg.subset_name if hasattr( data_cfg, 'subset_name') else None, hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, + **data_cfg, ) + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) torch_dataset = dataset.to_torch_dataset( - preprocessors=self.preprocessor, ) + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=self.preprocessor) dataset = self.to_task_dataset(torch_dataset, mode) return dataset @@ -698,6 +765,7 @@ class EpochBasedTrainer(BaseTrainer): self.invoke_hook(TrainerStages.before_train_epoch) time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(data_loader): + data_batch = to_device(data_batch, self.device) self.data_batch = data_batch self._inner_iter = i self.invoke_hook(TrainerStages.before_train_iter) @@ -706,6 +774,9 @@ class EpochBasedTrainer(BaseTrainer): del self.data_batch self._iter += 1 + if i + 1 >= self.iters_per_epoch: + break + self.invoke_hook(TrainerStages.after_train_epoch) self._epoch += 1 @@ -721,17 +792,21 @@ class EpochBasedTrainer(BaseTrainer): metric_values = multi_gpu_test( self.model, data_loader, + device=self.device, tmpdir=None, gpu_collect=False, - data_collate_fn=self.collate_fn, - metric_classes=metric_classes) + metric_classes=metric_classes, + data_loader_iters_per_gpu=self.iters_per_epoch) else: from modelscope.trainers.utils.inference import single_gpu_test metric_values = single_gpu_test( self.model, data_loader, - data_collate_fn=self.collate_fn, - metric_classes=metric_classes) + device=self.device, + metric_classes=metric_classes, + data_loader_iters=self.iters_per_epoch) + + self._inner_iter = self.iters_per_epoch - 1 # start from index 0 return metric_values diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index a90a58b6..d368c340 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) Alibaba, Inc. and its affiliates. 
+import logging import os import pickle import shutil @@ -10,6 +11,7 @@ import torch from torch import distributed as dist from tqdm import tqdm +from modelscope.utils.data_utils import to_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, make_tmp_dir) @@ -17,25 +19,41 @@ from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, def single_gpu_test(model, data_loader, - data_collate_fn=None, - metric_classes=None): + device, + metric_classes=None, + data_loader_iters=None): """Test model with a single gpu. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. - data_collate_fn: An optional data_collate_fn before fed into the model - metric_classes(List): List of Metric class that uses to collect metrics + device (str | torch.device): The target device for the data. + metric_classes (List): List of Metric class that uses to collect metrics + data_loader_iters (int): Used when dataset has no attribute __len__ or only load part of dataset. Returns: list: The prediction results. """ model.eval() dataset = data_loader.dataset - with tqdm(total=len(dataset), desc='test samples') as pbar: - for data in data_loader: - if data_collate_fn is not None: - data = data_collate_fn(data) + progress_with_iters = False + if data_loader_iters is None: + try: + data_len = len(dataset) + except Exception as e: + logging.error(e) + raise ValueError( + 'Please implement ``__len__`` method for your dataset, or provide ``data_loader_iters``' + ) + desc = 'Total test samples' + else: + progress_with_iters = True + data_len = data_loader_iters + desc = 'Test iterations' + + with tqdm(total=data_len, desc=desc) as pbar: + for i, data in enumerate(data_loader): + data = to_device(data, device) with torch.no_grad(): if isinstance(data, Mapping) and not func_receive_dict_inputs( model.forward): @@ -46,13 +64,19 @@ def single_gpu_test(model, for metric_cls in metric_classes: metric_cls.add(result, data) - if isinstance(data, dict): - batch_size = len(next(iter(data.values()))) + if progress_with_iters: + batch_size = 1 # iteration count else: - batch_size = len(data) + if isinstance(data, dict): + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) for _ in range(batch_size): pbar.update() + if progress_with_iters and (i + 1) >= data_len: + break + metric_values = {} for metric_cls in metric_classes: metric_values.update(metric_cls.evaluate()) @@ -62,10 +86,11 @@ def single_gpu_test(model, def multi_gpu_test(model, data_loader, + device, tmpdir=None, gpu_collect=False, - data_collate_fn=None, - metric_classes=None): + metric_classes=None, + data_loader_iters_per_gpu=None): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results @@ -77,12 +102,12 @@ def multi_gpu_test(model, Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. + device: (str | torch.device): The target device for the data. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. - data_collate_fn: An optional data_collate_fn before fed into the model metric_classes(List): List of Metric class that uses to collect metrics - + data_loader_iters_per_gpu (int): Used when dataset has no attribute __len__ or only load part of dataset. Returns: list: The prediction results. 
""" @@ -90,16 +115,31 @@ def multi_gpu_test(model, results = [] data_list = [] dataset = data_loader.dataset + rank, world_size = get_dist_info() - time.sleep(2) # This line can prevent deadlock problem in some cases. + progress_with_iters = False + if data_loader_iters_per_gpu is None: + try: + data_len = len(dataset) + total_samples = data_len + except Exception as e: + logging.error(e) + raise ValueError( + 'Please implement ``__len__`` method for your dataset, or provide ``data_loader_iters_per_gpu``' + ) + desc = 'Total test samples with multi gpus' + else: + total_samples = 0 + progress_with_iters = True + data_len = data_loader_iters_per_gpu * world_size + desc = 'Total test iterations with multi gpus' - rank, world_size = get_dist_info() + time.sleep(2) # This line can prevent deadlock problem in some cases. count = 0 - with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: - for _, data in enumerate(data_loader): - if data_collate_fn is not None: - data = data_collate_fn(data) + with tqdm(total=data_len, desc=desc) as pbar: + for i, data in enumerate(data_loader): + data = to_device(data, device) data_list.append(data) with torch.no_grad(): if isinstance(data, Mapping) and not func_receive_dict_inputs( @@ -114,24 +154,32 @@ def multi_gpu_test(model, batch_size = len(next(iter(data.values()))) else: batch_size = len(data) + + if progress_with_iters: + total_samples += batch_size * world_size + batch_size = 1 # iteration count + batch_size_all = batch_size * world_size count += batch_size_all - if count > len(dataset): - batch_size_all = len(dataset) - (count - batch_size_all) + if count > data_len: + batch_size_all = data_len - (count - batch_size_all) for _ in range(batch_size_all): pbar.update() + if progress_with_iters and (i + 1) >= data_len: + break + # TODO: allgather data list may cost a lot of memory and needs to be redesigned # collect results and data from all ranks if gpu_collect: - results = collect_results_gpu(results, len(dataset)) - data_list = collect_results_gpu(data_list, len(dataset)) + results = collect_results_gpu(results, total_samples) + data_list = collect_results_gpu(data_list, total_samples) else: if tmpdir is None: tmpdir = make_tmp_dir() - results = collect_results_cpu(results, len(dataset), + results = collect_results_cpu(results, total_samples, os.path.join(tmpdir, 'predict')) - data_list = collect_results_cpu(data_list, len(dataset), + data_list = collect_results_cpu(data_list, total_samples, os.path.join(tmpdir, 'groundtruth')) if is_master(): diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index a86dfbc2..759bd447 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -30,8 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1]) REGISTER_MODULE = 'register_module' IGNORED_PACKAGES = ['modelscope', '.'] SCAN_SUB_FOLDERS = [ - 'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets', - 'trainers' + 'models', 'metrics', 'pipelines', 'preprocessors', + 'msdatasets/task_datasets', 'trainers' ] INDEXER_FILE = 'ast_indexer' DECORATOR_KEY = 'decorators' @@ -43,6 +43,7 @@ MD5_KEY = 'md5' INDEX_KEY = 'index' REQUIREMENT_KEY = 'requirements' MODULE_KEY = 'module' +CLASS_NAME = 'class_name' class AstScaning(object): @@ -237,6 +238,8 @@ class AstScaning(object): ['name']] = final_dict if 'decorator_list' == field and attr != []: + for item in attr: + setattr(item, CLASS_NAME, node.name) self.result_decorator.extend(attr) out += 
f'{indentstr()}{field}={representation},\n' @@ -294,7 +297,7 @@ class AstScaning(object): else: return getattr(eval(split_list[0]), split_list[1]) - def _registry_indexer(self, parsed_input: tuple) -> tuple: + def _registry_indexer(self, parsed_input: tuple, class_name: str) -> tuple: """format registry information to a tuple indexer Return: @@ -310,7 +313,7 @@ class AstScaning(object): if len(args_list) == 0 and len(keyword_list) == 0: args_list.append(default_group) if len(keyword_list) == 0 and len(args_list) == 1: - args_list.append(None) + args_list.append(class_name) if len(keyword_list) == 1 and len(args_list) == 0: args_list.append(default_group) @@ -344,7 +347,8 @@ class AstScaning(object): if type(node).__name__ != 'Call': continue parse_output = self._parse_decorator(node) - index = self._registry_indexer(parse_output) + index = self._registry_indexer(parse_output, + getattr(node, CLASS_NAME)) if None is not index: results.append(index) return results diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 927eafbd..1a3fb7c3 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -62,6 +62,9 @@ class CVTasks(object): virtual_try_on = 'virtual-try-on' crowd_counting = 'crowd-counting' + # video related + video_single_object_tracking = 'video-single-object-tracking' + class NLPTasks(object): # nlp tasks @@ -203,6 +206,8 @@ class ModelFile(object): TF_CKPT_PREFIX = 'ckpt-' TORCH_MODEL_FILE = 'pytorch_model.pt' TORCH_MODEL_BIN_FILE = 'pytorch_model.bin' + VOCAB_FILE = 'vocab.txt' + ONNX_MODEL_FILE = 'model.onnx' LABEL_MAPPING = 'label_mapping.json' @@ -219,6 +224,12 @@ class ConfigFields(object): evaluation = 'evaluation' +class ConfigKeys(object): + """Fixed keywords in configuration file""" + train = 'train' + val = 'val' + + class Requirements(object): """Requirement names for each module """ diff --git a/modelscope/utils/data_utils.py b/modelscope/utils/data_utils.py new file mode 100644 index 00000000..2bc88e19 --- /dev/null +++ b/modelscope/utils/data_utils.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from collections.abc import Mapping + +import torch + + +def to_device(batch, device, non_blocking=False): + """Put the data to the target cuda device just before the forward function. + Args: + batch: The batch data out of the dataloader. + device: (str | torch.device): The target device for the data. + + Returns: The data to the target device. + + """ + if isinstance(batch, dict) or isinstance(batch, Mapping): + return type(batch)({k: to_device(v, device) for k, v in batch.items()}) + elif isinstance(batch, (tuple, list)): + return type(batch)(to_device(v, device) for v in batch) + elif isinstance(batch, torch.Tensor): + return batch.to(device, non_blocking=non_blocking) + else: + return batch diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index aca103d2..7889d944 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -24,65 +24,3 @@ def torch_nested_detach(tensors): if isinstance(tensors, torch.Tensor): return tensors.detach() return tensors - - -def torch_default_data_collator(features): - # TODO @jiangnana.jnn refine this default data collator - import torch - first = features[0] - - if isinstance(first, Mapping): - batch = {} - # Special handling for labels. - # Ensure that tensor is created with the correct type - # (it should be automatically the case, but let's make sure of it.) 
- if 'label' in first and first['label'] is not None: - label = first['label'].item() if isinstance( - first['label'], torch.Tensor) else first['label'] - # the msdataset return a 0-dimension np.array with a single value, the following part handle this. - if isinstance(label, np.ndarray): - src_dtype = label[()].dtype - dtype = torch.long if label[( - )].dtype == np.int64 else torch.float - else: - src_dtype = type(label) - dtype = torch.long if isinstance(label, int) else torch.float - # add dtype to np.array to fix "TypeError: can't convert np.ndarray of type numpy.object_" - batch['labels'] = torch.tensor( - np.array([f['label'] for f in features], dtype=src_dtype), - dtype=dtype) - elif 'label_ids' in first and first['label_ids'] is not None: - if isinstance(first['label_ids'], torch.Tensor): - batch['labels'] = torch.stack( - [f['label_ids'] for f in features]) - else: - dtype = torch.long if type( - first['label_ids'][0]) is int else torch.float - batch['labels'] = torch.tensor( - [f['label_ids'] for f in features], dtype=dtype) - - # Handling of all other possible keys. - # Again, we will use the first element to figure out which key/values are not None for this model. - for k, v in first.items(): - if k not in ('label', 'label_ids' - ) and v is not None and not isinstance(v, str): - if isinstance(v, torch.Tensor): - batch[k] = torch.stack([f[k] for f in features]) - elif isinstance(v, list) and isinstance(v[0], torch.Tensor): - batch[k] = torch.stack([d for f in features for d in f[k]]) - else: - batch[k] = torch.tensor(np.array([f[k] for f in features])) - elif isinstance(first, tuple): - batch = [] - for idx in range(len(first)): - if isinstance(first[idx], torch.Tensor): - batch.append(torch.stack([f[idx] for f in features])) - else: - batch.append(torch.tensor([f[idx] for f in features])) - else: - if isinstance(first, torch.Tensor): - batch = torch.stack(features) - else: - batch = torch.tensor(features) - - return batch diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py index 5a606f9c..7adba982 100644 --- a/modelscope/utils/test_utils.py +++ b/modelscope/utils/test_utils.py @@ -50,7 +50,7 @@ def set_test_level(level: int): def create_dummy_test_dataset(feat, label, num): return MsDataset.from_hf_dataset( - Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num))) + Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num))) def download_and_untar(fpath, furl, dst) -> str: diff --git a/modelscope/version.py b/modelscope/version.py index bfeb9e74..40ed83d9 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.3.4' +__version__ = '0.3.5' diff --git a/requirements/audio.txt b/requirements/audio.txt index 81d288bd..5e4bc104 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -16,6 +16,7 @@ numpy<=1.18 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. 
protobuf>3,<3.21.0 ptflops +py_sound_connect pytorch_wavelets PyWavelets>=1.0.0 scikit-learn diff --git a/requirements/runtime.txt b/requirements/runtime.txt index ce18dcea..e2b78f06 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,6 +1,5 @@ addict -#version above 2.1.0 introduces backward-compatability issue which is being resolved -datasets==2.1.0 +datasets easydict einops filelock>=3.3.0 diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 0894ce3d..f9118353 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -4,6 +4,7 @@ from modelscope.models import Model from modelscope.msdatasets import MsDataset from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.preprocessors.base import Preprocessor +from modelscope.utils.constant import DownloadMode from modelscope.utils.test_utils import require_tf, require_torch, test_level @@ -30,6 +31,16 @@ class ImgPreprocessor(Preprocessor): class MsDatasetTest(unittest.TestCase): + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_coco(self): + ms_ds_train = MsDataset.load( + 'pets_small', + namespace='modelscope', + split='train', + download_mode=DownloadMode.FORCE_REDOWNLOAD, + classes=('1', '2')) + print(ms_ds_train._hf_ds.config_kwargs) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ms_csv_basic(self): ms_ds_train = MsDataset.load( diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py new file mode 100644 index 00000000..e7967edc --- /dev/null +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -0,0 +1,43 @@ +import os.path +import unittest + +from modelscope.fileio import File +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + +TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav' + + +class KWSFarfieldTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_normal(self): + kws = pipeline(Tasks.keyword_spotting, model=self.model_id) + inputs = {'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE)} + result = kws(inputs) + self.assertEqual(len(result['kws_list']), 5) + print(result['kws_list'][-1]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_output(self): + kws = pipeline(Tasks.keyword_spotting, model=self.model_id) + inputs = { + 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE), + 'output_file': 'output.wav' + } + result = kws(inputs) + self.assertEqual(len(result['kws_list']), 5) + print(result['kws_list'][-1]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_input_bytes(self): + with open(os.path.join(os.getcwd(), TEST_SPEECH_FILE), 'rb') as f: + data = f.read() + kws = pipeline(Tasks.keyword_spotting, model=self.model_id) + result = kws(data) + self.assertEqual(len(result['kws_list']), 5) + print(result['kws_list'][-1]) diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py new file mode 100644 index 00000000..4b8a813a --- /dev/null +++ b/tests/pipelines/test_mplug_tasks.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from PIL import Image + +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class MplugTasksTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_image_captioning_with_model(self): + model = Model.from_pretrained( + 'damo/mplug_image-captioning_coco_base_en') + pipeline_caption = pipeline( + task=Tasks.image_captioning, + model=model, + ) + image = Image.open('data/test/images/image_mplug_vqa.jpg') + result = pipeline_caption({'image': image}) + print(result[OutputKeys.CAPTION]) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_image_captioning_with_name(self): + pipeline_caption = pipeline( + Tasks.image_captioning, + model='damo/mplug_image-captioning_coco_base_en') + image = Image.open('data/test/images/image_mplug_vqa.jpg') + result = pipeline_caption({'image': image}) + print(result[OutputKeys.CAPTION]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_visual_question_answering_with_model(self): + model = Model.from_pretrained( + 'damo/mplug_visual-question-answering_coco_large_en') + pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model) + image = Image.open('data/test/images/image_mplug_vqa.jpg') + question = 'What is the woman doing?' + input = {'image': image, 'question': question} + result = pipeline_vqa(input) + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_visual_question_answering_with_name(self): + model = 'damo/mplug_visual-question-answering_coco_large_en' + pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model) + image = Image.open('data/test/images/image_mplug_vqa.jpg') + question = 'What is the woman doing?' 
+ input = {'image': image, 'question': question} + result = pipeline_vqa(input) + print(result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py index 3bf3af87..6152f279 100644 --- a/tests/pipelines/test_multi_modal_embedding.py +++ b/tests/pipelines/test_multi_modal_embedding.py @@ -2,50 +2,58 @@ import unittest -import numpy as np +import torch from modelscope.models import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level class MultiModalEmbeddingTest(unittest.TestCase): - model_id = 'damo/multi-modal_clip-vit-large-patch14_zh' - test_text = {'text': '一张风景图'} + model_id = 'damo/multi-modal_clip-vit-base-patch16_zh' + test_input = {'text': '皮卡丘'} + model_version = 'dev' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run(self): - pipe_line_multi_modal_embedding = pipeline( - Tasks.multi_modal_embedding, model=self.model_id) - test_str_embedding = pipe_line_multi_modal_embedding( - self.test_text)['text_embedding'] - print(np.sum(np.abs(test_str_embedding))) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + pipeline_multi_modal_embedding = pipeline( + Tasks.multi_modal_embedding, + model=self.model_id, + model_revision=self.model_version) + text_embedding = pipeline_multi_modal_embedding( + self.test_input)[OutputKeys.TEXT_EMBEDDING] + print('l1-norm: {}'.format( + torch.norm(text_embedding, p=1, dim=-1).item())) + print('l2-norm: {}'.format(torch.norm(text_embedding, + dim=-1).item())) # should be 1.0 + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - pipe_line_multi_modal_embedding = pipeline( - task=Tasks.multi_modal_embedding, model=model) - test_str_embedding = pipe_line_multi_modal_embedding( - self.test_text)['text_embedding'] - print(np.sum(np.abs(test_str_embedding))) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run_with_model_name(self): - pipe_line_multi_modal_embedding = pipeline( - task=Tasks.multi_modal_embedding, model=self.model_id) - test_str_embedding = pipe_line_multi_modal_embedding( - self.test_text)['text_embedding'] - print(np.sum(np.abs(test_str_embedding))) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + pipeline_multi_modal_embedding = pipeline( + task=Tasks.multi_modal_embedding, + model=model, + model_revision=self.model_version) + text_embedding = pipeline_multi_modal_embedding( + self.test_input)[OutputKeys.TEXT_EMBEDDING] + print('l1-norm: {}'.format( + torch.norm(text_embedding, p=1, dim=-1).item())) + print('l2-norm: {}'.format(torch.norm(text_embedding, + dim=-1).item())) # should be 1.0 + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_default_model(self): - pipe_line_multi_modal_embedding = pipeline( - task=Tasks.multi_modal_embedding) - test_str_embedding = pipe_line_multi_modal_embedding( - self.test_text)['text_embedding'] - print(np.sum(np.abs(test_str_embedding))) + pipeline_multi_modal_embedding = pipeline( + task=Tasks.multi_modal_embedding, + model_revision=self.model_version) + text_embedding = 
pipeline_multi_modal_embedding( + self.test_input)[OutputKeys.TEXT_EMBEDDING] + print('l1-norm: {}'.format( + torch.norm(text_embedding, p=1, dim=-1).item())) + print('l2-norm: {}'.format(torch.norm(text_embedding, + dim=-1).item())) # should be 1.0 if __name__ == '__main__': diff --git a/tests/pipelines/test_ocr_recognition.py b/tests/pipelines/test_ocr_recognition.py index d86c2266..a2e5ba8e 100644 --- a/tests/pipelines/test_ocr_recognition.py +++ b/tests/pipelines/test_ocr_recognition.py @@ -19,7 +19,7 @@ from modelscope.utils.test_utils import test_level class OCRRecognitionTest(unittest.TestCase): def setUp(self) -> None: - self.model_id = 'damo/cv_convnextTiny_ocr-recognition_damo' + self.model_id = 'damo/cv_convnextTiny_ocr-recognition-general_damo' self.test_image = 'data/test/images/ocr_recognition.jpg' def pipeline_inference(self, pipeline: Pipeline, input_location: str): diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py new file mode 100644 index 00000000..ec010b17 --- /dev/null +++ b/tests/pipelines/test_salient_detection.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class SalientDetectionTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_salient_detection(self): + input_location = 'data/test/images/image_salient_detection.jpg' + model_id = 'damo/cv_u2net_salient-detection' + salient_detect = pipeline(Tasks.image_segmentation, model=model_id) + result = salient_detect(input_location) + import cv2 + # result[OutputKeys.MASKS] is salient map result,other keys are not used + cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py index e8b4a551..007e6c73 100644 --- a/tests/pipelines/test_speech_signal_process.py +++ b/tests/pipelines/test_speech_signal_process.py @@ -8,22 +8,10 @@ from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level -NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav' -FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav' -NEAREND_MIC_FILE = 'nearend_mic.wav' -FAREND_SPEECH_FILE = 'farend_speech.wav' +NEAREND_MIC_FILE = 'data/test/audios/nearend_mic.wav' +FAREND_SPEECH_FILE = 'data/test/audios/farend_speech.wav' -NOISE_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ANS/sample_audio/speech_with_noise.wav' -NOISE_SPEECH_FILE = 'speech_with_noise.wav' - - -def download(remote_path, local_path): - local_dir = os.path.dirname(local_path) - if len(local_dir) > 0: - if not os.path.exists(local_dir): - os.makedirs(local_dir) - with open(local_path, 'wb') as ofile: - ofile.write(File.read(remote_path)) +NOISE_SPEECH_FILE = 'data/test/audios/speech_with_noise.wav' class SpeechSignalProcessTest(unittest.TestCase): @@ -33,13 +21,10 @@ class SpeechSignalProcessTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_aec(self): - # Download audio files - download(NEAREND_MIC_URL, NEAREND_MIC_FILE) - 
download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) model_id = 'damo/speech_dfsmn_aec_psm_16k' input = { - 'nearend_mic': NEAREND_MIC_FILE, - 'farend_speech': FAREND_SPEECH_FILE + 'nearend_mic': os.path.join(os.getcwd(), NEAREND_MIC_FILE), + 'farend_speech': os.path.join(os.getcwd(), FAREND_SPEECH_FILE) } aec = pipeline(Tasks.acoustic_echo_cancellation, model=model_id) output_path = os.path.abspath('output.wav') @@ -48,14 +33,11 @@ class SpeechSignalProcessTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_aec_bytes(self): - # Download audio files - download(NEAREND_MIC_URL, NEAREND_MIC_FILE) - download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) model_id = 'damo/speech_dfsmn_aec_psm_16k' input = {} - with open(NEAREND_MIC_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), NEAREND_MIC_FILE), 'rb') as f: input['nearend_mic'] = f.read() - with open(FAREND_SPEECH_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), FAREND_SPEECH_FILE), 'rb') as f: input['farend_speech'] = f.read() aec = pipeline( Tasks.acoustic_echo_cancellation, @@ -67,13 +49,10 @@ class SpeechSignalProcessTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_aec_tuple_bytes(self): - # Download audio files - download(NEAREND_MIC_URL, NEAREND_MIC_FILE) - download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) model_id = 'damo/speech_dfsmn_aec_psm_16k' - with open(NEAREND_MIC_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), NEAREND_MIC_FILE), 'rb') as f: nearend_bytes = f.read() - with open(FAREND_SPEECH_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), FAREND_SPEECH_FILE), 'rb') as f: farend_bytes = f.read() inputs = (nearend_bytes, farend_bytes) aec = pipeline( @@ -86,25 +65,22 @@ class SpeechSignalProcessTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_ans(self): - # Download audio files - download(NOISE_SPEECH_URL, NOISE_SPEECH_FILE) model_id = 'damo/speech_frcrn_ans_cirm_16k' ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id) output_path = os.path.abspath('output.wav') - ans(NOISE_SPEECH_FILE, output_path=output_path) + ans(os.path.join(os.getcwd(), NOISE_SPEECH_FILE), + output_path=output_path) print(f'Processed audio saved to {output_path}') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ans_bytes(self): - # Download audio files - download(NOISE_SPEECH_URL, NOISE_SPEECH_FILE) model_id = 'damo/speech_frcrn_ans_cirm_16k' ans = pipeline( Tasks.acoustic_noise_suppression, model=model_id, pipeline_name=Pipelines.speech_frcrn_ans_cirm_16k) output_path = os.path.abspath('output.wav') - with open(NOISE_SPEECH_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), NOISE_SPEECH_FILE), 'rb') as f: data = f.read() ans(data, output_path=output_path) print(f'Processed audio saved to {output_path}') diff --git a/tests/pipelines/test_video_single_object_tracking.py b/tests/pipelines/test_video_single_object_tracking.py new file mode 100644 index 00000000..f5d4714c --- /dev/null +++ b/tests/pipelines/test_video_single_object_tracking.py @@ -0,0 +1,39 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest
+
+from modelscope.models.cv.video_single_object_tracking.utils.utils import \
+    show_tracking_result
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class SingleObjectTracking(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_vitb_video-single-object-tracking_ostrack'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_end2end(self):
+        video_single_object_tracking = pipeline(
+            Tasks.video_single_object_tracking, model=self.model_id)
+        video_path = 'data/test/videos/dog.avi'
+        init_bbox = [414, 343, 514, 449]  # [x1, y1, x2, y2]
+        result = video_single_object_tracking((video_path, init_bbox))
+        print('result is : ', result[OutputKeys.BOXES])
+        show_tracking_result(video_path, result[OutputKeys.BOXES],
+                             './tracking_result.avi')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        video_single_object_tracking = pipeline(
+            Tasks.video_single_object_tracking)
+        video_path = 'data/test/videos/dog.avi'
+        init_bbox = [414, 343, 514, 449]  # [x1, y1, x2, y2]
+        result = video_single_object_tracking((video_path, init_bbox))
+        print('result is : ', result[OutputKeys.BOXES])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_visual_question_answering.py b/tests/pipelines/test_visual_question_answering.py
deleted file mode 100644
index 748a86b9..00000000
--- a/tests/pipelines/test_visual_question_answering.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import unittest
-
-from PIL import Image
-
-from modelscope.hub.snapshot_download import snapshot_download
-from modelscope.models import Model
-from modelscope.models.multi_modal import MPlugForVisualQuestionAnswering
-from modelscope.pipelines import pipeline
-from modelscope.pipelines.multi_modal import VisualQuestionAnsweringPipeline
-from modelscope.preprocessors import MPlugVisualQuestionAnsweringPreprocessor
-from modelscope.utils.constant import Tasks
-from modelscope.utils.test_utils import test_level
-
-
-class VisualQuestionAnsweringTest(unittest.TestCase):
-
-    def setUp(self):
-        self.model_id = 'damo/mplug_visual-question-answering_coco_large_en'
-        self.input_vqa = {
-            'image': Image.open('data/test/images/image_mplug_vqa.jpg'),
-            'question': 'What is the woman doing?',
-        }
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run(self):
-        cache_path = snapshot_download(self.model_id)
-        preprocessor = MPlugVisualQuestionAnsweringPreprocessor(cache_path)
-        model = MPlugForVisualQuestionAnswering(cache_path)
-        pipeline1 = VisualQuestionAnsweringPipeline(
-            model, preprocessor=preprocessor)
-        pipeline2 = pipeline(
-            Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
-        print(f"question: {self.input_vqa['question']}")
-        print(f'pipeline1: {pipeline1(self.input_vqa)}')
-        print(f'pipeline2: {pipeline2(self.input_vqa)}')
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
-            model.model_dir)
-        pipeline_vqa = pipeline(
-            task=Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
-        print(pipeline_vqa(self.input_vqa))
-
-
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_model_name(self): - pipeline_vqa = pipeline( - Tasks.visual_question_answering, model=self.model_id) - print(pipeline_vqa(self.input_vqa)) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run_with_default_model(self): - pipeline_vqa = pipeline(task=Tasks.visual_question_answering) - print(pipeline_vqa(self.input_vqa)) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/preprocessors/test_common.py b/tests/preprocessors/test_common.py index 1ee13589..714b8588 100644 --- a/tests/preprocessors/test_common.py +++ b/tests/preprocessors/test_common.py @@ -2,7 +2,10 @@ import unittest -from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor +import torch + +from modelscope.preprocessors import (PREPROCESSORS, Compose, Filter, + Preprocessor, ToTensor) class ComposeTest(unittest.TestCase): @@ -35,5 +38,27 @@ class ComposeTest(unittest.TestCase): self.assertEqual(output['tmp2'], 'tmp2') +class ToTensorTest(unittest.TestCase): + + def test_totensor(self): + to_tensor_op = ToTensor(keys=['img']) + inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'} + inputs = to_tensor_op(inputs) + self.assertIsInstance(inputs['img'], torch.Tensor) + self.assertEqual(inputs['label'], 1) + self.assertEqual(inputs['path'], 'test.jpg') + + +class FilterTest(unittest.TestCase): + + def test_filter(self): + filter_op = Filter(reserved_keys=['img', 'label']) + inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'} + inputs = filter_op(inputs) + self.assertIn('img', inputs) + self.assertIn('label', inputs) + self.assertNotIn('path', inputs) + + if __name__ == '__main__': unittest.main() diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py index fca01597..4271e201 100644 --- a/tests/preprocessors/test_nlp.py +++ b/tests/preprocessors/test_nlp.py @@ -2,7 +2,7 @@ import unittest -from modelscope.preprocessors import build_preprocessor +from modelscope.preprocessors import build_preprocessor, nlp from modelscope.utils.constant import Fields, InputFields from modelscope.utils.logger import get_logger diff --git a/tests/taskdataset/test_veco_dataset.py b/tests/taskdataset/test_veco_dataset.py index fc59750d..76da1681 100644 --- a/tests/taskdataset/test_veco_dataset.py +++ b/tests/taskdataset/test_veco_dataset.py @@ -2,7 +2,7 @@ import unittest -from modelscope.task_datasets.veco_dataset import VecoDataset +from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset from modelscope.utils.test_utils import test_level diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py index 9e65f127..1338bb2c 100644 --- a/tests/trainers/hooks/test_evaluation_hook.py +++ b/tests/trainers/hooks/test_evaluation_hook.py @@ -12,7 +12,7 @@ from torch import nn from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.trainers import build_trainer -from modelscope.utils.constant import LogKeys, ModelFile +from modelscope.utils.constant import ModelFile from modelscope.utils.registry import default_group from modelscope.utils.test_utils import create_dummy_test_dataset diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py index eb30fb52..86d53ecc 100644 --- a/tests/trainers/hooks/test_lr_scheduler_hook.py +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -9,7 +9,7 @@ 
import numpy as np import torch from torch import nn from torch.optim import SGD -from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau +from torch.optim.lr_scheduler import MultiStepLR from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys @@ -96,7 +96,8 @@ class LrSchedulerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( @@ -160,15 +161,13 @@ class LrSchedulerHookTest(unittest.TestCase): json.dump(json_cfg, f) model = DummyModel() - # optimmizer = SGD(model.parameters(), lr=0.01) - # lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4]) trainer_name = Trainers.default kwargs = dict( cfg_file=config_path, model=model, train_dataset=dummy_dataset, - # optimizers=(optimmizer, lr_scheduler), - max_epochs=7) + max_epochs=7, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( @@ -266,7 +265,8 @@ class PlateauLrSchedulerHookTest(unittest.TestCase): train_dataset=dummy_dataset, eval_dataset=dummy_dataset, optimizers=(optimizer, None), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( diff --git a/tests/trainers/hooks/test_optimizer_hook.py b/tests/trainers/hooks/test_optimizer_hook.py index 62c70632..25457c1c 100644 --- a/tests/trainers/hooks/test_optimizer_hook.py +++ b/tests/trainers/hooks/test_optimizer_hook.py @@ -17,7 +17,7 @@ from modelscope.utils.constant import ModelFile, TrainerStages from modelscope.utils.test_utils import create_dummy_test_dataset dummy_dataset = create_dummy_test_dataset( - np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10) + np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10) class DummyModel(nn.Module): @@ -71,7 +71,8 @@ class OptimizerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=2) + max_epochs=2, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( diff --git a/tests/trainers/hooks/test_timer_hook.py b/tests/trainers/hooks/test_timer_hook.py index 6f24809b..614f7688 100644 --- a/tests/trainers/hooks/test_timer_hook.py +++ b/tests/trainers/hooks/test_timer_hook.py @@ -75,7 +75,8 @@ class IterTimerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( @@ -83,6 +84,7 @@ class IterTimerHookTest(unittest.TestCase): trainer.register_optimizers_hook() trainer.register_hook_from_cfg(trainer.cfg.train.hooks) trainer.data_loader = train_dataloader + trainer.train_dataloader = train_dataloader trainer.invoke_hook(TrainerStages.before_run) for i in range(trainer._epoch, trainer._max_epochs): trainer.invoke_hook(TrainerStages.before_train_epoch) diff --git a/tests/trainers/test_clip_multi_modal_embedding_trainer.py b/tests/trainers/test_clip_multi_modal_embedding_trainer.py deleted file mode 100644 index 03f82854..00000000 --- a/tests/trainers/test_clip_multi_modal_embedding_trainer.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import tempfile -import unittest - 
-import requests -import torch -import torch.distributed as dist -import torch.multiprocessing as mp - -from modelscope.hub.snapshot_download import snapshot_download -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from modelscope.utils.constant import ModelFile -from modelscope.utils.logger import get_logger -from modelscope.utils.test_utils import test_level - -logger = get_logger() - - -def clip_train_worker(local_rank, ngpus, node_size, node_rank): - global_rank = local_rank + node_rank * ngpus - dist_world_size = node_size * ngpus - - dist.init_process_group( - backend='nccl', world_size=dist_world_size, rank=global_rank) - - model_id = 'damo/multi-modal_clip-vit-large-patch14_zh' - local_model_dir = snapshot_download(model_id) - - default_args = dict( - cfg_file='{}/{}'.format(local_model_dir, ModelFile.CONFIGURATION), - model=model_id, - device_id=local_rank) - trainer = build_trainer( - name=Trainers.clip_multi_modal_embedding, default_args=default_args) - - trainer.train() - trainer.evaluate() - - -class CLIPMultiModalEmbeddingTrainerTest(unittest.TestCase): - - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_trainer(self): - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = '2001' - NODE_SIZE, NODE_RANK = 1, 0 - logger.info('Train clip with {} machines'.format(NODE_SIZE)) - ngpus = torch.cuda.device_count() - logger.info('Machine: {} has {} GPUs'.format(NODE_RANK, ngpus)) - mp.spawn( - clip_train_worker, - nprocs=ngpus, - args=(ngpus, NODE_SIZE, NODE_RANK)) - logger.info('Training done') - - -if __name__ == '__main__': - unittest.main() - ... diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py index 35d0378f..c8557ff5 100644 --- a/tests/trainers/test_image_instance_segmentation_trainer.py +++ b/tests/trainers/test_image_instance_segmentation_trainer.py @@ -8,10 +8,13 @@ from functools import partial from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers -from modelscope.models.cv.image_instance_segmentation import ( - CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset) +from modelscope.models.cv.image_instance_segmentation import \ + CascadeMaskRCNNSwinModel +from modelscope.msdatasets import MsDataset +from modelscope.msdatasets.task_datasets import \ + ImageInstanceSegmentationCocoDataset from modelscope.trainers import build_trainer -from modelscope.utils.config import Config +from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import ModelFile from modelscope.utils.test_utils import test_level @@ -27,34 +30,47 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) cfg = Config.from_file(config_path) - data_root = cfg.dataset.data_root - classes = tuple(cfg.dataset.classes) max_epochs = cfg.train.max_epochs samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu - - if data_root is None: + try: + train_data_cfg = cfg.dataset.train + val_data_cfg = cfg.dataset.val + except Exception: + train_data_cfg = None + val_data_cfg = None + if train_data_cfg is None: # use default toy data - dataset_path = os.path.join(cache_path, 'toydata.zip') - with zipfile.ZipFile(dataset_path, 'r') as zipf: - zipf.extractall(cache_path) - data_root = cache_path + '/toydata/' - classes = ('Cat', 'Dog') - - self.train_dataset = 
ImageInstanceSegmentationCocoDataset( - data_root + 'annotations/instances_train.json', - classes=classes, - data_root=data_root, - img_prefix=data_root + 'images/train/', - seg_prefix=None, - test_mode=False) - - self.eval_dataset = ImageInstanceSegmentationCocoDataset( - data_root + 'annotations/instances_val.json', - classes=classes, - data_root=data_root, - img_prefix=data_root + 'images/val/', - seg_prefix=None, - test_mode=True) + train_data_cfg = ConfigDict( + name='pets_small', + split='train', + classes=('Cat', 'Dog'), + test_mode=False) + if val_data_cfg is None: + val_data_cfg = ConfigDict( + name='pets_small', + split='validation', + classes=('Cat', 'Dog'), + test_mode=True) + + self.train_dataset = MsDataset.load( + dataset_name=train_data_cfg.name, + split=train_data_cfg.split, + classes=train_data_cfg.classes, + test_mode=train_data_cfg.test_mode) + assert self.train_dataset.config_kwargs[ + 'classes'] == train_data_cfg.classes + assert next( + iter(self.train_dataset.config_kwargs['split_config'].values())) + + self.eval_dataset = MsDataset.load( + dataset_name=val_data_cfg.name, + split=val_data_cfg.split, + classes=val_data_cfg.classes, + test_mode=val_data_cfg.test_mode) + assert self.eval_dataset.config_kwargs[ + 'classes'] == val_data_cfg.classes + assert next( + iter(self.eval_dataset.config_kwargs['split_config'].values())) from mmcv.parallel import collate diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index b7639024..0259f804 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -3,23 +3,31 @@ import os import shutil import tempfile import unittest -from abc import ABCMeta import json import numpy as np import torch -from datasets import Dataset from torch import nn from torch.optim import SGD from torch.optim.lr_scheduler import StepLR +from torch.utils.data import IterableDataset from modelscope.metainfo import Metrics, Trainers from modelscope.metrics.builder import MetricKeys -from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile from modelscope.utils.test_utils import create_dummy_test_dataset, test_level + +class DummyIterableDataset(IterableDataset): + + def __iter__(self): + feat = np.random.random(size=(5, )).astype(np.float32) + labels = np.random.randint(0, 4, (1, )) + iterations = [{'feat': feat, 'labels': labels}] * 500 + return iter(iterations) + + dummy_dataset_small = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) @@ -116,7 +124,8 @@ class TrainerTest(unittest.TestCase): data_collator=None, train_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small, - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() @@ -175,7 +184,8 @@ class TrainerTest(unittest.TestCase): train_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small, optimizers=(optimmizer, lr_scheduler), - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() @@ -225,7 +235,8 @@ class TrainerTest(unittest.TestCase): train_dataset=dummy_dataset_big, eval_dataset=dummy_dataset_small, optimizers=(optimmizer, lr_scheduler), - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() @@ -303,6 +314,124 @@ class TrainerTest(unittest.TestCase): for i in [2, 5, 8]: self.assertIn(MetricKeys.ACCURACY, lines[i]) + 
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_train_with_iters_per_epoch(self): + json_cfg = { + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'hooks': [{ + 'type': 'EvaluationHook', + 'interval': 1 + }] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1, + 'shuffle': False + }, + 'metrics': [Metrics.seq_cls_metric] + } + } + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimmizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = StepLR(optimmizer, 2) + trainer_name = Trainers.default + kwargs = dict( + cfg_file=config_path, + model=model, + data_collator=None, + optimizers=(optimmizer, lr_scheduler), + train_dataset=DummyIterableDataset(), + eval_dataset=DummyIterableDataset(), + train_iters_per_epoch=20, + val_iters_per_epoch=10, + max_epochs=3, + device='cpu') + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json') + with open(json_file, 'r') as f: + lines = [i.strip() for i in f.readlines()] + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 1, + LogKeys.ITER: 10, + LogKeys.LR: 0.01 + }, json.loads(lines[0])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 1, + LogKeys.ITER: 20, + LogKeys.LR: 0.01 + }, json.loads(lines[1])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 1, + LogKeys.ITER: 10 + }, json.loads(lines[2])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 2, + LogKeys.ITER: 10, + LogKeys.LR: 0.01 + }, json.loads(lines[3])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 2, + LogKeys.ITER: 20, + LogKeys.LR: 0.01 + }, json.loads(lines[4])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 2, + LogKeys.ITER: 10 + }, json.loads(lines[5])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 3, + LogKeys.ITER: 10, + LogKeys.LR: 0.001 + }, json.loads(lines[6])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 3, + LogKeys.ITER: 20, + LogKeys.LR: 0.001 + }, json.loads(lines[7])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 3, + LogKeys.ITER: 10 + }, json.loads(lines[8])) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) + for i in [0, 1, 3, 4, 6, 7]: + self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i]) + self.assertIn(LogKeys.ITER_TIME, lines[i]) + for i in [2, 5, 8]: + self.assertIn(MetricKeys.ACCURACY, lines[i]) + class DummyTrainerTest(unittest.TestCase): diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py index 30390a68..9781816d 100644 --- a/tests/trainers/test_trainer_gpu.py +++ b/tests/trainers/test_trainer_gpu.py @@ -11,6 +11,7 @@ import torch from torch import nn from torch.optim import SGD from torch.optim.lr_scheduler import StepLR +from torch.utils.data import IterableDataset from modelscope.metainfo import Metrics, Trainers from modelscope.metrics.builder import MetricKeys @@ -19,6 +20,16 @@ from 
modelscope.utils.constant import LogKeys, ModeKeys, ModelFile from modelscope.utils.test_utils import (DistributedTestCase, create_dummy_test_dataset, test_level) + +class DummyIterableDataset(IterableDataset): + + def __iter__(self): + feat = np.random.random(size=(5, )).astype(np.float32) + labels = np.random.randint(0, 4, (1, )) + iterations = [{'feat': feat, 'labels': labels}] * 500 + return iter(iterations) + + dummy_dataset_small = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) @@ -41,7 +52,7 @@ class DummyModel(nn.Module): return dict(logits=x, loss=loss) -def train_func(work_dir, dist=False): +def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs): json_cfg = { 'train': { 'work_dir': work_dir, @@ -72,18 +83,25 @@ def train_func(work_dir, dist=False): optimmizer = SGD(model.parameters(), lr=0.01) lr_scheduler = StepLR(optimmizer, 2) trainer_name = Trainers.default - kwargs = dict( + if iterable_dataset: + train_dataset = DummyIterableDataset() + eval_dataset = DummyIterableDataset() + else: + train_dataset = dummy_dataset_big + eval_dataset = dummy_dataset_small + _kwargs = dict( cfg_file=config_path, model=model, data_collator=None, - train_dataset=dummy_dataset_big, - eval_dataset=dummy_dataset_small, + train_dataset=train_dataset, + eval_dataset=eval_dataset, optimizers=(optimmizer, lr_scheduler), max_epochs=3, device='gpu', - launcher='pytorch' if dist else None) + launcher='pytorch' if dist else None, + **kwargs) - trainer = build_trainer(trainer_name, kwargs) + trainer = build_trainer(trainer_name, _kwargs) trainer.train() @@ -253,6 +271,28 @@ class TrainerTestMultiGpus(DistributedTestCase): for i in [1, 3, 5]: self.assertIn(MetricKeys.ACCURACY, lines[i]) + # TODO: support iters_per_epoch for dist mode + @unittest.skipIf(True, 'need to adapt to DistributedSampler') + def test_multi_gpus_with_iters_per_epoch(self): + self.start( + train_func, + num_gpus=2, + work_dir=self.tmp_dir, + dist=True, + iterable_dataset=True, + train_iters_per_epoch=20, + val_iters_per_epoch=10, + ) + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + + with open(json_files[0], 'r') as f: + lines = [i.strip() for i in f.readlines()] + + print(results_files, lines) + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 7e488c6b..213b6b4f 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -37,7 +37,8 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + model_revision='beta') trainer = build_trainer(default_args=kwargs) trainer.train() @@ -53,7 +54,8 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + model_revision='beta') trainer = build_trainer(default_args=kwargs) trainer.train() @@ -69,7 +71,7 @@ class TestTrainerWithNlp(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_with_user_defined_config(self): model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' - cfg = read_config(model_id) + cfg = read_config(model_id, revision='beta') cfg.train.max_epochs = 20 cfg.train.work_dir = self.tmp_dir 
        cfg_file = os.path.join(self.tmp_dir, 'config.json')
@@ -78,7 +80,8 @@ class TestTrainerWithNlp(unittest.TestCase):
             model=model_id,
             train_dataset=self.dataset,
             eval_dataset=self.dataset,
-            cfg_file=cfg_file)
+            cfg_file=cfg_file,
+            model_revision='beta')
         trainer = build_trainer(default_args=kwargs)
 
         trainer.train()
@@ -98,7 +101,7 @@ class TestTrainerWithNlp(unittest.TestCase):
             os.makedirs(tmp_dir)
 
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
-        cache_path = snapshot_download(model_id)
+        cache_path = snapshot_download(model_id, revision='beta')
         model = SbertForSequenceClassification.from_pretrained(cache_path)
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
diff --git a/tests/trainers/utils/__init__.py b/tests/trainers/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/trainers/utils/test_inference.py b/tests/trainers/utils/test_inference.py
new file mode 100644
index 00000000..87e5320e
--- /dev/null
+++ b/tests/trainers/utils/test_inference.py
@@ -0,0 +1,116 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+
+from modelscope.metrics.builder import MetricKeys
+from modelscope.metrics.sequence_classification_metric import \
+    SequenceClassificationMetric
+from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
+from modelscope.utils.test_utils import (DistributedTestCase,
+                                         create_dummy_test_dataset, test_level)
+from modelscope.utils.torch_utils import get_dist_info, init_dist
+
+dummy_dataset = create_dummy_test_dataset(
+    torch.rand((5, )), torch.randint(0, 4, (1, )), 20)
+
+
+class DummyModel(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(5, 4)
+        self.bn = nn.BatchNorm1d(4)
+
+    def forward(self, feat, labels):
+        x = self.linear(feat)
+
+        x = self.bn(x)
+        loss = torch.sum(x)
+        return dict(logits=x, loss=loss)
+
+
+def test_func(dist=False):
+    dummy_model = DummyModel()
+    dataset = dummy_dataset.to_torch_dataset()
+
+    dummy_loader = DataLoader(
+        dataset,
+        batch_size=2,
+    )
+
+    metric_class = SequenceClassificationMetric()
+
+    if dist:
+        init_dist(launcher='pytorch')
+
+    rank, world_size = get_dist_info()
+    device = torch.device(f'cuda:{rank}')
+    dummy_model.cuda()
+
+    if world_size > 1:
+        from torch.nn.parallel.distributed import DistributedDataParallel
+        dummy_model = DistributedDataParallel(
+            dummy_model, device_ids=[torch.cuda.current_device()])
+        test_func = multi_gpu_test
+    else:
+        test_func = single_gpu_test
+
+    metric_results = test_func(
+        dummy_model,
+        dummy_loader,
+        device=device,
+        metric_classes=[metric_class])
+
+    return metric_results
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class SingleGpuTestTest(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_single_gpu_test(self):
+        metric_results = test_func()
+        self.assertIn(MetricKeys.ACCURACY, metric_results)
+
+
+@unittest.skipIf(not torch.cuda.is_available()
+                 or torch.cuda.device_count() <= 1, 'distributed unittest')
+class MultiGpuTestTest(DistributedTestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_multi_gpu_test(self):
+        self.start(
+            test_func,
+            num_gpus=2,
+            assert_callback=lambda x: self.assertIn(MetricKeys.ACCURACY, x),
+            dist=True)
+
+
+if __name__ == '__main__':
+    unittest.main()