
Merge branch 'master' into ofa/finetune

# Conflicts:
#	modelscope/preprocessors/ofa/base.py
#	modelscope/preprocessors/ofa/image_captioning.py
#	modelscope/preprocessors/ofa/image_classification.py
#	modelscope/preprocessors/ofa/summarization.py
#	modelscope/preprocessors/ofa/text_classification.py
#	modelscope/preprocessors/ofa/text_to_image_synthesis.py
#	modelscope/preprocessors/ofa/visual_entailment.py
#	modelscope/preprocessors/ofa/visual_grounding.py
#	modelscope/preprocessors/ofa/visual_question_answering.py
Branch: master
Author: 行嗔, 3 years ago
Commit: 279f64b334
100 changed files with 5750 additions and 840 deletions
1. .gitattributes (+1, -0)
2. data/test/audios/3ch_nihaomiya.wav (+3, -0)
3. data/test/audios/farend_speech.wav (+3, -0)
4. data/test/audios/nearend_mic.wav (+3, -0)
5. data/test/audios/speech_with_noise.wav (+3, -0)
6. data/test/images/image_salient_detection.jpg (+3, -0)
7. data/test/images/ocr_recognition_document.png (+3, -0)
8. data/test/videos/dog.avi (+3, -0)
9. modelscope/hub/api.py (+4, -2)
10. modelscope/metainfo.py (+5, -1)
11. modelscope/models/audio/kws/__init__.py (+2, -0)
12. modelscope/models/audio/kws/farfield/__init__.py (+0, -0)
13. modelscope/models/audio/kws/farfield/fsmn.py (+495, -0)
14. modelscope/models/audio/kws/farfield/fsmn_sele_v2.py (+236, -0)
15. modelscope/models/audio/kws/farfield/model.py (+74, -0)
16. modelscope/models/audio/kws/farfield/model_def.py (+121, -0)
17. modelscope/models/cv/__init__.py (+2, -1)
18. modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py (+0, -14)
19. modelscope/models/cv/image_instance_segmentation/__init__.py (+0, -2)
20. modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+0, -1)
21. modelscope/models/cv/object_detection/mmdet_model.py (+2, -2)
22. modelscope/models/cv/salient_detection/__init__.py (+22, -0)
23. modelscope/models/cv/salient_detection/models/__init__.py (+1, -0)
24. modelscope/models/cv/salient_detection/models/u2net.py (+300, -0)
25. modelscope/models/cv/salient_detection/salient_model.py (+63, -0)
26. modelscope/models/cv/video_single_object_tracking/__init__.py (+0, -0)
27. modelscope/models/cv/video_single_object_tracking/config/__init__.py (+0, -0)
28. modelscope/models/cv/video_single_object_tracking/config/ostrack.py (+39, -0)
29. modelscope/models/cv/video_single_object_tracking/models/__init__.py (+0, -0)
30. modelscope/models/cv/video_single_object_tracking/models/layers/__init__.py (+0, -0)
31. modelscope/models/cv/video_single_object_tracking/models/layers/attn.py (+54, -0)
32. modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py (+129, -0)
33. modelscope/models/cv/video_single_object_tracking/models/layers/head.py (+141, -0)
34. modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py (+37, -0)
35. modelscope/models/cv/video_single_object_tracking/models/ostrack/__init__.py (+0, -0)
36. modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py (+93, -0)
37. modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py (+109, -0)
38. modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py (+24, -0)
39. modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py (+343, -0)
40. modelscope/models/cv/video_single_object_tracking/tracker/__init__.py (+0, -0)
41. modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py (+139, -0)
42. modelscope/models/cv/video_single_object_tracking/utils/__init__.py (+0, -0)
43. modelscope/models/cv/video_single_object_tracking/utils/utils.py (+261, -0)
44. modelscope/models/multi_modal/__init__.py (+4, -4)
45. modelscope/models/multi_modal/clip/__init__.py (+1, -1)
46. modelscope/models/multi_modal/clip/bert_tokenizer.py (+422, -0)
47. modelscope/models/multi_modal/clip/clip_bert.py (+0, -29)
48. modelscope/models/multi_modal/clip/clip_model.py (+0, -216)
49. modelscope/models/multi_modal/clip/clip_vit.py (+0, -131)
50. modelscope/models/multi_modal/clip/configuration_bert.py (+82, -0)
51. modelscope/models/multi_modal/clip/model.py (+677, -0)
52. modelscope/models/multi_modal/clip/modeling_bert.py (+507, -0)
53. modelscope/models/multi_modal/diffusion/model.py (+1, -1)
54. modelscope/models/multi_modal/gemm/gemm_base.py (+3, -3)
55. modelscope/models/multi_modal/mplug/__init__.py (+1, -2)
56. modelscope/models/multi_modal/mplug/clip/clip.py (+61, -1)
57. modelscope/models/multi_modal/mplug/configuration_mplug.py (+6, -23)
58. modelscope/models/multi_modal/mplug/modeling_mplug.py (+376, -144)
59. modelscope/models/multi_modal/mplug_for_all_tasks.py (+15, -5)
60. modelscope/models/multi_modal/ofa/tokenization_ofa.py (+3, -1)
61. modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py (+2, -1)
62. modelscope/models/nlp/structbert/tokenization_sbert.py (+2, -1)
63. modelscope/models/nlp/structbert/tokenization_sbert_fast.py (+2, -1)
64. modelscope/msdatasets/ms_dataset.py (+54, -32)
65. modelscope/msdatasets/task_datasets/__init__.py (+3, -0)
66. modelscope/msdatasets/task_datasets/base.py (+0, -0)
67. modelscope/msdatasets/task_datasets/builder.py (+0, -0)
68. modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py (+35, -26)
69. modelscope/msdatasets/task_datasets/torch_base_dataset.py (+0, -0)
70. modelscope/msdatasets/task_datasets/veco_dataset.py (+0, -0)
71. modelscope/msdatasets/utils/dataset_builder.py (+92, -3)
72. modelscope/msdatasets/utils/dataset_utils.py (+28, -11)
73. modelscope/outputs.py (+24, -1)
74. modelscope/pipelines/audio/__init__.py (+2, -0)
75. modelscope/pipelines/audio/kws_farfield_pipeline.py (+81, -0)
76. modelscope/pipelines/base.py (+1, -1)
77. modelscope/pipelines/builder.py (+6, -2)
78. modelscope/pipelines/cv/__init__.py (+2, -0)
79. modelscope/pipelines/cv/image_salient_detection_pipeline.py (+47, -0)
80. modelscope/pipelines/cv/video_single_object_tracking_pipeline.py (+80, -0)
81. modelscope/pipelines/multi_modal/image_captioning_pipeline.py (+19, -5)
82. modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py (+10, -26)
83. modelscope/preprocessors/__init__.py (+4, -6)
84. modelscope/preprocessors/common.py (+91, -2)
85. modelscope/preprocessors/image.py (+8, -0)
86. modelscope/preprocessors/multi_modal.py (+59, -27)
87. modelscope/preprocessors/nlp.py (+12, -7)
88. modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py (+1, -1)
89. modelscope/preprocessors/space/dialog_modeling_preprocessor.py (+1, -1)
90. modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py (+1, -1)
91. modelscope/preprocessors/space/fields/gen_field.py (+2, -1)
92. modelscope/preprocessors/space/fields/intent_field.py (+2, -1)
93. modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py (+1, -1)
94. modelscope/preprocessors/star/fields/common_utils.py (+9, -0)
95. modelscope/trainers/cv/image_instance_segmentation_trainer.py (+0, -4)
96. modelscope/trainers/cv/image_portrait_enhancement_trainer.py (+0, -1)
97. modelscope/trainers/hooks/hook.py (+1, -1)
98. modelscope/trainers/hooks/logger/text_logger_hook.py (+1, -1)
99. modelscope/trainers/nlp_trainer.py (+39, -15)
100. modelscope/trainers/trainer.py (+151, -76)

.gitattributes (+1, -0)

@@ -4,3 +4,4 @@
*.wav filter=lfs diff=lfs merge=lfs -text
*.JPEG filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.avi filter=lfs diff=lfs merge=lfs -text

data/test/audios/3ch_nihaomiya.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ad1a268c614076614a2ae6528abc29cc85ae35826d172079d7d9b26a0299559
size 4325096

data/test/audios/farend_speech.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3637ee0628d0953f77d5a32327980af542c43230c4127d2a72b4df1ea2ffb0be
size 320042

data/test/audios/nearend_mic.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cc116af609a66f431f94df6b385ff2aa362f8a2d437c2279f5401e47f9178469
size 320042

data/test/audios/speech_with_noise.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9354345a6297f4522e690d337546aa9a686a7e61eefcd935478a2141b924db8f
size 76770

data/test/images/image_salient_detection.jpg (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70ea0c06f9cfe3882253f7175221d47e394ab9c469076ab220e880b17dbcdd02
size 48552

data/test/images/ocr_recognition_document.png (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:29f2ad929c852f6456367054d13e113078cf06b763fe54d73fd324f789331aa3
size 61611

data/test/videos/dog.avi (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:469090fb217a34a2c096cfd42c251da69dca9fcd1a3c1faae7d29183c1816c14
size 12834294

modelscope/hub/api.py (+4, -2)

@@ -362,8 +362,10 @@ class HubApi:
dataset_name: str,
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):
return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
if file_name.endswith('.csv'):
file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
return file_name

def get_dataset_access_config(
self,


modelscope/metainfo.py (+5, -1)

@@ -38,6 +38,7 @@ class Models(object):
# audio models
sambert_hifigan = 'sambert-hifigan'
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
kws_kwsbp = 'kws-kwsbp'
generic_asr = 'generic-asr'

@@ -86,6 +87,7 @@ class Pipelines(object):
body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
human_detection = 'resnet18-human-detection'
object_detection = 'vit-object-detection'
salient_detection = 'u2net-salient-detection'
image_classification = 'image-classification'
face_detection = 'resnet-face-detection-scrfd10gkps'
live_category = 'live-category'
@@ -109,6 +111,7 @@ class Pipelines(object):
skin_retouching = 'unet-skin-retouching'
tinynas_classification = 'tinynas-classification'
crowd_counting = 'hrnet-crowd-counting'
video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'

# nlp tasks
sentence_similarity = 'sentence-similarity'
@@ -132,6 +135,7 @@ class Pipelines(object):
sambert_hifigan_tts = 'sambert-hifigan-tts'
speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k'
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
kws_kwsbp = 'kws-kwsbp'
asr_inference = 'asr-inference'

@@ -215,7 +219,7 @@ class Preprocessors(object):

# multi-modal preprocessor
ofa_tasks_preprocessor = 'ofa-tasks-preprocessor'
mplug_visual_question_answering = 'mplug-visual-question-answering'
mplug_tasks_preprocessor = 'mplug-tasks-preprocessor'


class Metrics(object):


modelscope/models/audio/kws/__init__.py (+2, -0)

@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .generic_key_word_spotting import GenericKeyWordSpotting
from .farfield.model import FSMNSeleNetV2Decorator

else:
_import_structure = {
'generic_key_word_spotting': ['GenericKeyWordSpotting'],
'farfield.model': ['FSMNSeleNetV2Decorator'],
}

import sys


modelscope/models/audio/kws/farfield/__init__.py (+0, -0)


modelscope/models/audio/kws/farfield/fsmn.py (+495, -0)

@@ -0,0 +1,495 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from .model_def import (HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32,
printNeonMatrix, printNeonVector)

DEBUG = False


def to_kaldi_matrix(np_mat):
""" function that transform as str numpy mat to standard kaldi str matrix

Args:
np_mat: numpy mat

Returns: str
"""
np.set_printoptions(threshold=np.inf, linewidth=np.nan)
out_str = str(np_mat)
out_str = out_str.replace('[', '')
out_str = out_str.replace(']', '')
return '[ %s ]\n' % out_str


def print_tensor(torch_tensor):
""" print torch tensor for debug

Args:
torch_tensor: a tensor
"""
re_str = ''
x = torch_tensor.detach().squeeze().numpy()
re_str += to_kaldi_matrix(x)
re_str += '<!EndOfComponent>\n'
print(re_str)


class LinearTransform(nn.Module):

def __init__(self, input_dim, output_dim):
super(LinearTransform, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.linear = nn.Linear(input_dim, output_dim, bias=False)

self.debug = False
self.dataout = None

def forward(self, input):
output = self.linear(input)

if self.debug:
self.dataout = output

return output

def print_model(self):
printNeonMatrix(self.linear.weight)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<LinearTransform> %d %d\n' % (self.output_dim,
self.input_dim)
re_str += '<LearnRateCoef> 1\n'

linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
re_str += '<!EndOfComponent>\n'

return re_str


class AffineTransform(nn.Module):

def __init__(self, input_dim, output_dim):
super(AffineTransform, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim

self.linear = nn.Linear(input_dim, output_dim)

self.debug = False
self.dataout = None

def forward(self, input):
output = self.linear(input)

if self.debug:
self.dataout = output

return output

def print_model(self):
printNeonMatrix(self.linear.weight)
printNeonVector(self.linear.bias)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
self.input_dim)
re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'

linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
re_str += '<!EndOfComponent>\n'

return re_str


class Fsmn(nn.Module):
"""
FSMN implementation.
"""

def __init__(self,
input_dim,
output_dim,
lorder=None,
rorder=None,
lstride=None,
rstride=None):
super(Fsmn, self).__init__()

self.dim = input_dim

if lorder is None:
return

self.lorder = lorder
self.rorder = rorder
self.lstride = lstride
self.rstride = rstride

self.conv_left = nn.Conv2d(
self.dim,
self.dim, (lorder, 1),
dilation=(lstride, 1),
groups=self.dim,
bias=False)

if rorder > 0:
self.conv_right = nn.Conv2d(
self.dim,
self.dim, (rorder, 1),
dilation=(rstride, 1),
groups=self.dim,
bias=False)
else:
self.conv_right = None

self.debug = False
self.dataout = None

def forward(self, input):
x = torch.unsqueeze(input, 1)
x_per = x.permute(0, 3, 2, 1)

y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0])

if self.conv_right is not None:
y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride])
y_right = y_right[:, :, self.rstride:, :]
out = x_per + self.conv_left(y_left) + self.conv_right(y_right)
else:
out = x_per + self.conv_left(y_left)

out1 = out.permute(0, 3, 2, 1)
output = out1.squeeze(1)

if self.debug:
self.dataout = output

return output

def print_model(self):
tmpw = self.conv_left.weight
tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0])
for j in range(tmpw.shape[0]):
tmpwm[:, j] = tmpw[j, 0, :, 0]

printNeonMatrix(tmpwm)

if self.conv_right is not None:
tmpw = self.conv_right.weight
tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0])
for j in range(tmpw.shape[0]):
tmpwm[:, j] = tmpw[j, 0, :, 0]

printNeonMatrix(tmpwm)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Fsmn> %d %d\n' % (self.dim, self.dim)
re_str += '<LearnRateCoef> %d <LOrder> %d <ROrder> %d <LStride> %d <RStride> %d <MaxNorm> 0\n' % (
1, self.lorder, self.rorder, self.lstride, self.rstride)

lfiters = self.state_dict()['conv_left.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)

if self.conv_right is not None:
rfiters = self.state_dict()['conv_right.weight']
x = (rfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
re_str += '<!EndOfComponent>\n'

return re_str


class RectifiedLinear(nn.Module):

def __init__(self, input_dim, output_dim):
super(RectifiedLinear, self).__init__()
self.dim = input_dim
self.relu = nn.ReLU()

def forward(self, input):
return self.relu(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
re_str += '<!EndOfComponent>\n'
return re_str


class FSMNNet(nn.Module):
"""
FSMN net for keyword spotting
"""

def __init__(self,
input_dim=200,
linear_dim=128,
proj_dim=128,
lorder=10,
rorder=1,
num_syn=5,
fsmn_layers=4):
"""
Args:
input_dim: input dimension
linear_dim: fsmn input dimension
proj_dim: fsmn projection dimension
lorder: fsmn left order
rorder: fsmn right order
num_syn: output dimension
fsmn_layers: no. of sequential fsmn layers
"""
super(FSMNNet, self).__init__()

self.input_dim = input_dim
self.linear_dim = linear_dim
self.proj_dim = proj_dim
self.lorder = lorder
self.rorder = rorder
self.num_syn = num_syn
self.fsmn_layers = fsmn_layers

self.linear1 = AffineTransform(input_dim, linear_dim)
self.relu = RectifiedLinear(linear_dim, linear_dim)

self.fsmn = self._build_repeats(linear_dim, proj_dim, lorder, rorder,
fsmn_layers)

self.linear2 = AffineTransform(linear_dim, num_syn)

@staticmethod
def _build_repeats(linear_dim=136,
proj_dim=68,
lorder=3,
rorder=2,
fsmn_layers=5):
repeats = [
nn.Sequential(
LinearTransform(linear_dim, proj_dim),
Fsmn(proj_dim, proj_dim, lorder, rorder, 1, 1),
AffineTransform(proj_dim, linear_dim),
RectifiedLinear(linear_dim, linear_dim))
for i in range(fsmn_layers)
]

return nn.Sequential(*repeats)

def forward(self, input):
x1 = self.linear1(input)
x2 = self.relu(x1)
x3 = self.fsmn(x2)
x4 = self.linear2(x3)
return x4

def print_model(self):
self.linear1.print_model()

for layer in self.fsmn:
layer[0].print_model()
layer[1].print_model()
layer[2].print_model()

self.linear2.print_model()

def print_header(self):
#
# write total header
#
header = [0.0] * HEADER_BLOCK_SIZE * 4
# numins
header[0] = 0.0
# numouts
header[1] = 0.0
# dimins
header[2] = self.input_dim
# dimouts
header[3] = self.num_syn
# numlayers
header[4] = 3

#
# write each layer's header
#
hidx = 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = self.input_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = self.linear_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_RELU.value)
hidx += 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_SEQUENTIAL_FSMN.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = self.proj_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = self.lorder
header[HEADER_BLOCK_SIZE * hidx + 5] = self.rorder
header[HEADER_BLOCK_SIZE * hidx + 6] = self.fsmn_layers
header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0
hidx += 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = self.num_syn
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_SOFTMAX.value)

for h in header:
print(f32ToI32(h))

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Nnet>\n'
re_str += self.linear1.to_kaldi_nnet()
re_str += self.relu.to_kaldi_nnet()

for fsmn in self.fsmn:
re_str += fsmn[0].to_kaldi_nnet()
re_str += fsmn[1].to_kaldi_nnet()
re_str += fsmn[2].to_kaldi_nnet()
re_str += fsmn[3].to_kaldi_nnet()

re_str += self.linear2.to_kaldi_nnet()
re_str += '<Softmax> %d %d\n' % (self.num_syn, self.num_syn)
re_str += '<!EndOfComponent>\n'
re_str += '</Nnet>\n'

return re_str


class DFSMN(nn.Module):
"""
One deep fsmn layer
"""

def __init__(self,
dimproj=64,
dimlinear=128,
lorder=20,
rorder=1,
lstride=1,
rstride=1):
"""
Args:
dimproj: projection dimension, input and output dimension of memory blocks
dimlinear: dimension of mapping layer
lorder: left order
rorder: right order
lstride: left stride
rstride: right stride
"""
super(DFSMN, self).__init__()

self.lorder = lorder
self.rorder = rorder
self.lstride = lstride
self.rstride = rstride

self.expand = AffineTransform(dimproj, dimlinear)
self.shrink = LinearTransform(dimlinear, dimproj)

self.conv_left = nn.Conv2d(
dimproj,
dimproj, (lorder, 1),
dilation=(lstride, 1),
groups=dimproj,
bias=False)

if rorder > 0:
self.conv_right = nn.Conv2d(
dimproj,
dimproj, (rorder, 1),
dilation=(rstride, 1),
groups=dimproj,
bias=False)
else:
self.conv_right = None

def forward(self, input):
f1 = F.relu(self.expand(input))
p1 = self.shrink(f1)

x = torch.unsqueeze(p1, 1)
x_per = x.permute(0, 3, 2, 1)

y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0])

if self.conv_right is not None:
y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride])
y_right = y_right[:, :, self.rstride:, :]
out = x_per + self.conv_left(y_left) + self.conv_right(y_right)
else:
out = x_per + self.conv_left(y_left)

out1 = out.permute(0, 3, 2, 1)
output = input + out1.squeeze(1)

return output

def print_model(self):
self.expand.print_model()
self.shrink.print_model()

tmpw = self.conv_left.weight
tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0])
for j in range(tmpw.shape[0]):
tmpwm[:, j] = tmpw[j, 0, :, 0]

printNeonMatrix(tmpwm)

if self.conv_right is not None:
tmpw = self.conv_right.weight
tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0])
for j in range(tmpw.shape[0]):
tmpwm[:, j] = tmpw[j, 0, :, 0]

printNeonMatrix(tmpwm)


def build_dfsmn_repeats(linear_dim=128,
proj_dim=64,
lorder=20,
rorder=1,
fsmn_layers=6):
"""
build stacked dfsmn layers
Args:
linear_dim:
proj_dim:
lorder:
rorder:
fsmn_layers:

Returns:

"""
repeats = [
nn.Sequential(DFSMN(proj_dim, linear_dim, lorder, rorder, 1, 1))
for i in range(fsmn_layers)
]

return nn.Sequential(*repeats)
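
A minimal usage sketch of the FSMN keyword-spotting net above, assuming the module path introduced by this commit; the shapes follow directly from the constructor arguments:

import torch
from modelscope.models.audio.kws.farfield.fsmn import FSMNNet, build_dfsmn_repeats

net = FSMNNet(input_dim=200, linear_dim=128, proj_dim=128,
              lorder=10, rorder=1, num_syn=5, fsmn_layers=4)
feats = torch.randn(2, 50, 200)        # (batch, frames, feature_dim)
logits = net(feats)                    # (2, 50, 5) frame-level outputs, pre-softmax
dfsmn_stack = build_dfsmn_repeats(128, 64, 20, 1, 6)   # 6 stacked DFSMN layers, 64-dim in/out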

modelscope/models/audio/kws/farfield/fsmn_sele_v2.py (+236, -0)

@@ -0,0 +1,236 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .fsmn import AffineTransform, Fsmn, LinearTransform, RectifiedLinear
from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32


class FSMNUnit(nn.Module):
""" A multi-channel fsmn unit

"""

def __init__(self, dimlinear=128, dimproj=64, lorder=20, rorder=1):
"""
Args:
dimlinear: input / output dimension
dimproj: fsmn input / output dimension
lorder: left order
rorder: right order
"""
super(FSMNUnit, self).__init__()

self.shrink = LinearTransform(dimlinear, dimproj)
self.fsmn = Fsmn(dimproj, dimproj, lorder, rorder, 1, 1)
self.expand = AffineTransform(dimproj, dimlinear)

self.debug = False
self.dataout = None

'''
batch, time, channel, feature
'''

def forward(self, input):
if torch.cuda.is_available():
out = torch.zeros(input.shape).cuda()
else:
out = torch.zeros(input.shape)

for n in range(input.shape[2]):
out1 = self.shrink(input[:, :, n, :])
out2 = self.fsmn(out1)
out[:, :, n, :] = F.relu(self.expand(out2))

if self.debug:
self.dataout = out

return out

def print_model(self):
self.shrink.print_model()
self.fsmn.print_model()
self.expand.print_model()

def to_kaldi_nnet(self):
re_str = self.shrink.to_kaldi_nnet()
re_str += self.fsmn.to_kaldi_nnet()
re_str += self.expand.to_kaldi_nnet()

relu = RectifiedLinear(self.expand.linear.out_features,
self.expand.linear.out_features)
re_str += relu.to_kaldi_nnet()

return re_str


class FSMNSeleNetV2(nn.Module):
""" FSMN model with channel selection.
"""

def __init__(self,
input_dim=120,
linear_dim=128,
proj_dim=64,
lorder=20,
rorder=1,
num_syn=5,
fsmn_layers=5,
sele_layer=0):
"""
Args:
input_dim: input dimension
linear_dim: fsmn input dimension
proj_dim: fsmn projection dimension
lorder: fsmn left order
rorder: fsmn right order
num_syn: output dimension
fsmn_layers: no. of fsmn units
sele_layer: channel selection layer index
"""
super(FSMNSeleNetV2, self).__init__()

self.sele_layer = sele_layer

self.featmap = AffineTransform(input_dim, linear_dim)

self.mem = []
for i in range(fsmn_layers):
unit = FSMNUnit(linear_dim, proj_dim, lorder, rorder)
self.mem.append(unit)
self.add_module('mem_{:d}'.format(i), unit)

self.decision = AffineTransform(linear_dim, num_syn)

def forward(self, input):
# multi-channel feature mapping
if torch.cuda.is_available():
x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
self.featmap.linear.out_features).cuda()
else:
x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
self.featmap.linear.out_features)

for n in range(input.shape[2]):
x[:, :, n, :] = F.relu(self.featmap(input[:, :, n, :]))

for i, unit in enumerate(self.mem):
y = unit(x)

# perform channel selection
if i == self.sele_layer:
pool = nn.MaxPool2d((y.shape[2], 1), stride=(y.shape[2], 1))
y = pool(y)

x = y

# remove channel dimension
y = torch.squeeze(y, -2)
z = self.decision(y)

return z

def print_model(self):
self.featmap.print_model()

for unit in self.mem:
unit.print_model()

self.decision.print_model()

def print_header(self):
'''
get FSMN params
'''
input_dim = self.featmap.linear.in_features
linear_dim = self.featmap.linear.out_features
proj_dim = self.mem[0].shrink.linear.out_features
lorder = self.mem[0].fsmn.conv_left.kernel_size[0]
rorder = 0
if self.mem[0].fsmn.conv_right is not None:
rorder = self.mem[0].fsmn.conv_right.kernel_size[0]

num_syn = self.decision.linear.out_features
fsmn_layers = len(self.mem)

# no. of output channels, 0.0 means the same as numins
# numouts = 0.0
numouts = 1.0

#
# write total header
#
header = [0.0] * HEADER_BLOCK_SIZE * 4
# numins
header[0] = 0.0
# numouts
header[1] = numouts
# dimins
header[2] = input_dim
# dimouts
header[3] = num_syn
# numlayers
header[4] = 3

#
# write each layer's header
#
hidx = 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_RELU.value)
hidx += 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_SEQUENTIAL_FSMN.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = proj_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = lorder
header[HEADER_BLOCK_SIZE * hidx + 5] = rorder
header[HEADER_BLOCK_SIZE * hidx + 6] = fsmn_layers
if numouts == 1.0:
header[HEADER_BLOCK_SIZE * hidx + 7] = float(self.sele_layer)
else:
header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0
hidx += 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = numouts
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_SOFTMAX.value)

for h in header:
print(f32ToI32(h))

def to_kaldi_nnet(self):
re_str = '<Nnet>\n'

re_str += self.featmap.to_kaldi_nnet()

relu = RectifiedLinear(self.featmap.linear.out_features,
self.featmap.linear.out_features)
re_str += relu.to_kaldi_nnet()

for unit in self.mem:
re_str += unit.to_kaldi_nnet()

re_str += self.decision.to_kaldi_nnet()

re_str += '<Softmax> %d %d\n' % (self.decision.linear.out_features,
self.decision.linear.out_features)
re_str += '<!EndOfComponent>\n'
re_str += '</Nnet>\n'

return re_str
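
A minimal sketch of the multi-channel model above; per the layout comment in FSMNUnit, the expected input is (batch, time, channel, feature), and with sele_layer=0 the channel dimension is max-pooled away after the first FSMN unit:

import torch
from modelscope.models.audio.kws.farfield.fsmn_sele_v2 import FSMNSeleNetV2

net = FSMNSeleNetV2(input_dim=120, linear_dim=128, proj_dim=64,
                    lorder=20, rorder=1, num_syn=5, fsmn_layers=5, sele_layer=0)
x = torch.randn(2, 100, 3, 120)   # (batch, frames, microphone channels, feature_dim)
out = net(x)                      # (2, 100, 5) frame-level outputs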

modelscope/models/audio/kws/farfield/model.py (+74, -0)

@@ -0,0 +1,74 @@
import os
from typing import Dict

import torch

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .fsmn_sele_v2 import FSMNSeleNetV2


@MODELS.register_module(
Tasks.keyword_spotting, module_name=Models.speech_dfsmn_kws_char_farfield)
class FSMNSeleNetV2Decorator(TorchModel):
r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """

MODEL_TXT = 'model.txt'
SC_CONFIG = 'sound_connect.conf'
SC_CONF_ITEM_KWS_MODEL = '${kws_model}'

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the dfsmn model from the `model_dir` path.

Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, *args, **kwargs)
sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
model_bin_file = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
self._model = None
if os.path.exists(model_bin_file):
self._model = FSMNSeleNetV2(*args, **kwargs)
checkpoint = torch.load(model_bin_file)
self._model.load_state_dict(checkpoint, strict=False)

self._sc = None
if os.path.exists(model_txt_file):
with open(sc_config_file) as f:
lines = f.readlines()
with open(sc_config_file, 'w') as f:
for line in lines:
if self.SC_CONF_ITEM_KWS_MODEL in line:
line = line.replace(self.SC_CONF_ITEM_KWS_MODEL,
model_txt_file)
f.write(line)
import py_sound_connect
self._sc = py_sound_connect.SoundConnect(sc_config_file)
self.size_in = self._sc.bytesPerBlockIn()
self.size_out = self._sc.bytesPerBlockOut()

if self._model is None and self._sc is None:
raise Exception(
f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.'
)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
...

def forward_decode(self, data: bytes):
result = {'pcm': self._sc.process(data, self.size_out)}
state = self._sc.kwsState()
if state == 2:
result['kws'] = {
'keyword':
self._sc.kwsKeyword(self._sc.kwsSpottedKeywordIndex()),
'offset': self._sc.kwsKeywordOffset(),
'length': self._sc.kwsKeywordLength(),
'confidence': self._sc.kwsConfidence()
}
return result
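
A rough streaming sketch, assuming a model directory that contains sound_connect.conf and model.txt (so the py_sound_connect branch is taken); the directory path and the 44-byte WAV-header skip are illustrative assumptions, not part of the diff:

from modelscope.models.audio.kws.farfield.model import FSMNSeleNetV2Decorator

kws = FSMNSeleNetV2Decorator('/path/to/kws_model_dir')   # hypothetical local directory
with open('data/test/audios/3ch_nihaomiya.wav', 'rb') as f:
    f.read(44)                         # skip a canonical 44-byte WAV header (assumption)
    while True:
        block = f.read(kws.size_in)    # one fixed-size PCM block per call
        if len(block) < kws.size_in:
            break
        result = kws.forward_decode(block)
        if 'kws' in result:            # a keyword was spotted in this block
            print(result['kws'])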

modelscope/models/audio/kws/farfield/model_def.py (+121, -0)

@@ -0,0 +1,121 @@
import math
import struct
from enum import Enum

HEADER_BLOCK_SIZE = 10


class LayerType(Enum):
LAYER_DENSE = 1
LAYER_GRU = 2
LAYER_ATTENTION = 3
LAYER_FSMN = 4
LAYER_SEQUENTIAL_FSMN = 5
LAYER_FSMN_SELE = 6
LAYER_GRU_ATTENTION = 7
LAYER_DFSMN = 8


class ActivationType(Enum):
ACTIVATION_NONE = 0
ACTIVATION_RELU = 1
ACTIVATION_TANH = 2
ACTIVATION_SIGMOID = 3
ACTIVATION_SOFTMAX = 4
ACTIVATION_LOGSOFTMAX = 5


def f32ToI32(f):
"""
Reinterpret the bit pattern of a float32 value as a signed int32.
"""
bs = struct.pack('f', f)

ba = bytearray()
ba.append(bs[0])
ba.append(bs[1])
ba.append(bs[2])
ba.append(bs[3])

return struct.unpack('i', ba)[0]


def printNeonMatrix(w):
"""
print matrix with neon padding
"""
numrows, numcols = w.shape
numnecols = math.ceil(numcols / 4)

for i in range(numrows):
for j in range(numcols):
print(f32ToI32(w[i, j]))

for j in range(numnecols * 4 - numcols):
print(0)


def printNeonVector(b):
"""
print vector with neon padding
"""
size = b.shape[0]
nesize = math.ceil(size / 4)

for i in range(size):
print(f32ToI32(b[i]))

for i in range(nesize * 4 - size):
print(0)


def printDense(layer):
"""
save dense layer
"""
statedict = layer.state_dict()
printNeonMatrix(statedict['weight'])
printNeonVector(statedict['bias'])


def printGRU(layer):
"""
save gru layer
"""
statedict = layer.state_dict()
weight = [statedict['weight_ih_l0'], statedict['weight_hh_l0']]
bias = [statedict['bias_ih_l0'], statedict['bias_hh_l0']]
numins, numouts = weight[0].shape
numins = numins // 3

# output input weights
w_rx = weight[0][:numins, :]
w_zx = weight[0][numins:numins * 2, :]
w_x = weight[0][numins * 2:, :]
printNeonMatrix(w_zx)
printNeonMatrix(w_rx)
printNeonMatrix(w_x)

# output recurrent weights
w_rh = weight[1][:numins, :]
w_zh = weight[1][numins:numins * 2, :]
w_h = weight[1][numins * 2:, :]
printNeonMatrix(w_zh)
printNeonMatrix(w_rh)
printNeonMatrix(w_h)

# output input bias
b_rx = bias[0][:numins]
b_zx = bias[0][numins:numins * 2]
b_x = bias[0][numins * 2:]
printNeonVector(b_zx)
printNeonVector(b_rx)
printNeonVector(b_x)

# output recurrent bias
b_rh = bias[1][:numins]
b_zh = bias[1][numins:numins * 2]
b_h = bias[1][numins * 2:]
printNeonVector(b_zh)
printNeonVector(b_rh)
printNeonVector(b_h)
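
For reference, f32ToI32 reinterprets the IEEE-754 bit pattern of a float32 as a signed int32 (no rounding or truncation), which is what lets the exporters above dump weights as plain integers. A quick check:

f32ToI32(1.0)    # -> 1065353216 == 0x3F800000, the bit pattern of 1.0
f32ToI32(0.0)    # -> 0
f32ToI32(-2.0)   # -> -1073741824, i.e. 0xC0000000 read as a signed int32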

modelscope/models/cv/__init__.py (+2, -1)

@@ -5,4 +5,5 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
image_colorization, image_denoise, image_instance_segmentation,
image_portrait_enhancement, image_to_image_generation,
image_to_image_translation, object_detection,
product_retrieval_embedding, super_resolution, virual_tryon)
product_retrieval_embedding, salient_detection,
super_resolution, video_single_object_tracking, virual_tryon)

modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py (+0, -14)

@@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel):
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.model = NAFNet(**self.config.model.network_g)
self.loss = PSNRLoss()

if torch.cuda.is_available():
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')

self.model = self.model.to(self._device)
self.model = self._load_pretrained(self.model, model_path)

if self.training:
self.model.train()
else:
self.model.eval()

def _load_pretrained(self,
net,
load_path,
@@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel):
Returns:
Dict[str, Tensor]: results
"""
for key, value in inputs.items():
inputs[key] = inputs[key].to(self._device)
if self.training:
return self._train_forward(**inputs)
elif 'target' in inputs:


modelscope/models/cv/image_instance_segmentation/__init__.py (+0, -2)

@@ -7,13 +7,11 @@ if TYPE_CHECKING:
from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin
from .model import CascadeMaskRCNNSwinModel
from .postprocess_utils import get_img_ins_seg_result
from .datasets import ImageInstanceSegmentationCocoDataset
else:
_import_structure = {
'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
'model': ['CascadeMaskRCNNSwinModel'],
'postprocess_utils': ['get_img_ins_seg_result'],
'datasets': ['ImageInstanceSegmentationCocoDataset']
}

import sys


modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+0, -1)

@@ -1,2 +1 @@
from .dataset import ImageInstanceSegmentationCocoDataset
from .transforms import build_preprocess_transform

modelscope/models/cv/object_detection/mmdet_model.py (+2, -2)

@@ -38,7 +38,7 @@ class DetectionModel(TorchModel):
self.model, model_path, map_location='cpu')
self.class_names = checkpoint['meta']['CLASSES']
config.test_pipeline[0].type = 'LoadImageFromWebcam'
self.test_pipeline = Compose(
self.transform_input = Compose(
replace_ImageToTensor(config.test_pipeline))
self.model.cfg = config
self.model.eval()
@@ -56,7 +56,7 @@ class DetectionModel(TorchModel):

from mmcv.parallel import collate, scatter
data = dict(img=image)
data = self.test_pipeline(data)
data = self.transform_input(data)
data = collate([data], samples_per_gpu=1)
data['img_metas'] = [
img_metas.data[0] for img_metas in data['img_metas']


modelscope/models/cv/salient_detection/__init__.py (+22, -0)

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .salient_model import SalientDetection

else:
_import_structure = {
'salient_model': ['SalientDetection'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

modelscope/models/cv/salient_detection/models/__init__.py (+1, -0)

@@ -0,0 +1 @@
from .u2net import U2NET

modelscope/models/cv/salient_detection/models/u2net.py (+300, -0)

@@ -0,0 +1,300 @@
# Implementation in this file is modified from source code available via https://github.com/xuebinqin/U-2-Net
import torch
import torch.nn as nn
import torch.nn.functional as F


class REBNCONV(nn.Module):

def __init__(self, in_ch=3, out_ch=3, dirate=1):
super(REBNCONV, self).__init__()
self.conv_s1 = nn.Conv2d(
in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate)
self.bn_s1 = nn.BatchNorm2d(out_ch)
self.relu_s1 = nn.ReLU(inplace=True)

def forward(self, x):
hx = x
xout = self.relu_s1(self.bn_s1(self.conv_s1(hx)))
return xout


def _upsample_like(src, tar):
"""upsample tensor 'src' to have the same spatial size with tensor 'tar'."""
src = F.upsample(src, size=tar.shape[2:], mode='bilinear')
return src


class RSU7(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU7, self).__init__()
self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):
hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx = self.pool1(hx1)
hx2 = self.rebnconv2(hx)
hx = self.pool2(hx2)
hx3 = self.rebnconv3(hx)
hx = self.pool3(hx3)
hx4 = self.rebnconv4(hx)
hx = self.pool4(hx4)
hx5 = self.rebnconv5(hx)
hx = self.pool5(hx5)
hx6 = self.rebnconv6(hx)
hx7 = self.rebnconv7(hx6)
hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1))
hx6dup = _upsample_like(hx6d, hx5)
hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1))
hx5dup = _upsample_like(hx5d, hx4)
hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
hx4dup = _upsample_like(hx4d, hx3)
hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
return hx1d + hxin


class RSU6(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU6, self).__init__()

self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):
hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx = self.pool1(hx1)
hx2 = self.rebnconv2(hx)
hx = self.pool2(hx2)
hx3 = self.rebnconv3(hx)
hx = self.pool3(hx3)
hx4 = self.rebnconv4(hx)
hx = self.pool4(hx4)
hx5 = self.rebnconv5(hx)
hx6 = self.rebnconv6(hx5)
hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1))
hx5dup = _upsample_like(hx5d, hx4)
hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
hx4dup = _upsample_like(hx4d, hx3)
hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
return hx1d + hxin


class RSU5(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU5, self).__init__()

self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):
hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx = self.pool1(hx1)
hx2 = self.rebnconv2(hx)
hx = self.pool2(hx2)
hx3 = self.rebnconv3(hx)
hx = self.pool3(hx3)
hx4 = self.rebnconv4(hx)
hx5 = self.rebnconv5(hx4)
hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1))
hx4dup = _upsample_like(hx4d, hx3)
hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
return hx1d + hxin


class RSU4(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU4, self).__init__()

self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):

hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx = self.pool1(hx1)
hx2 = self.rebnconv2(hx)
hx = self.pool2(hx2)
hx3 = self.rebnconv3(hx)
hx4 = self.rebnconv4(hx3)
hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
return hx1d + hxin


class RSU4F(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU4F, self).__init__()

self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):

hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx2 = self.rebnconv2(hx1)
hx3 = self.rebnconv3(hx2)
hx4 = self.rebnconv4(hx3)
hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1))
hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1))
return hx1d + hxin


class U2NET(nn.Module):

def __init__(self, in_ch=3, out_ch=1):
super(U2NET, self).__init__()

# encoder
self.stage1 = RSU7(in_ch, 32, 64)
self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage2 = RSU6(64, 32, 128)
self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage3 = RSU5(128, 64, 256)
self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage4 = RSU4(256, 128, 512)
self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage5 = RSU4F(512, 256, 512)
self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage6 = RSU4F(512, 256, 512)
# decoder
self.stage5d = RSU4F(1024, 256, 512)
self.stage4d = RSU4(1024, 128, 256)
self.stage3d = RSU5(512, 64, 128)
self.stage2d = RSU6(256, 32, 64)
self.stage1d = RSU7(128, 16, 64)
self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)
self.outconv = nn.Conv2d(6 * out_ch, out_ch, 1)

def forward(self, x):

hx = x
hx1 = self.stage1(hx)
hx = self.pool12(hx1)
hx2 = self.stage2(hx)
hx = self.pool23(hx2)
hx3 = self.stage3(hx)
hx = self.pool34(hx3)
hx4 = self.stage4(hx)
hx = self.pool45(hx4)
hx5 = self.stage5(hx)
hx = self.pool56(hx5)
hx6 = self.stage6(hx)
hx6up = _upsample_like(hx6, hx5)

hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
hx5dup = _upsample_like(hx5d, hx4)
hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
hx4dup = _upsample_like(hx4d, hx3)
hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))
d1 = self.side1(hx1d)
d2 = self.side2(hx2d)
d2 = _upsample_like(d2, d1)
d3 = self.side3(hx3d)
d3 = _upsample_like(d3, d1)
d4 = self.side4(hx4d)
d4 = _upsample_like(d4, d1)
d5 = self.side5(hx5d)
d5 = _upsample_like(d5, d1)
d6 = self.side6(hx6)
d6 = _upsample_like(d6, d1)
d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1))
return torch.sigmoid(d0), torch.sigmoid(d1), torch.sigmoid(
d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid(
d5), torch.sigmoid(d6)
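
A minimal sketch of running the network: it returns seven sigmoid maps, the fused prediction d0 followed by the six side outputs, all upsampled to the input resolution:

import torch

net = U2NET(in_ch=3, out_ch=1)
x = torch.randn(1, 3, 320, 320)        # matches the 320x320 resize used by SalientDetection below
d0, d1, d2, d3, d4, d5, d6 = net(x)    # each map is (1, 1, 320, 320)
mask = d0[0, 0]                        # fused saliency prediction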

modelscope/models/cv/salient_detection/salient_model.py (+63, -0)

@@ -0,0 +1,63 @@
import os.path as osp

import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import transforms

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .models import U2NET


@MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection)
class SalientDetection(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, *args, **kwargs)
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.model = U2NET(3, 1)
checkpoint = torch.load(model_path, map_location='cpu')
self.transform_input = transforms.Compose([
transforms.Resize((320, 320)),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
self.model.load_state_dict(checkpoint)
self.model.eval()

def inference(self, data):
"""data is tensor 3 * H * W ---> return tensor H * W ."""
data = data.unsqueeze(0)
if next(self.model.parameters()).is_cuda:
data = data.to(
torch.device([next(self.model.parameters()).device][0]))

with torch.no_grad():
results = self.model(data)

if next(self.model.parameters()).is_cuda:
return results[0][0, 0, :, :].cpu()
return results[0][0, 0, :, :]

def preprocess(self, image):
"""image is numpy."""
data = self.transform_input(Image.fromarray(image))
return data.float()

def postprocess(self, inputs):
"""resize ."""
data = inputs['data']
w = inputs['img_w']
h = inputs['img_h']
data_norm = (data - torch.min(data)) / (
torch.max(data) - torch.min(data))
data_norm_np = (data_norm.numpy() * 255).astype('uint8')
data_norm_rst = cv2.resize(data_norm_np, (w, h))

return data_norm_rst
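
A minimal end-to-end sketch, assuming a local model directory containing the checkpoint named by ModelFile.TORCH_MODEL_FILE (the directory path is hypothetical):

import cv2
from modelscope.models.cv.salient_detection import SalientDetection

model = SalientDetection('/path/to/salient_model_dir')   # hypothetical local directory
img = cv2.cvtColor(cv2.imread('data/test/images/image_salient_detection.jpg'), cv2.COLOR_BGR2RGB)
data = model.preprocess(img)                             # 3 x 320 x 320 normalized tensor
pred = model.inference(data)                             # 320 x 320 saliency map
mask = model.postprocess({'data': pred, 'img_w': img.shape[1], 'img_h': img.shape[0]})  # uint8, original size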

modelscope/models/cv/video_single_object_tracking/__init__.py (+0, -0)


modelscope/models/cv/video_single_object_tracking/config/__init__.py (+0, -0)


modelscope/models/cv/video_single_object_tracking/config/ostrack.py (+39, -0)

@@ -0,0 +1,39 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
from easydict import EasyDict as edict

cfg = edict()

# MODEL
cfg.MODEL = edict()

# MODEL.BACKBONE
cfg.MODEL.BACKBONE = edict()
cfg.MODEL.BACKBONE.TYPE = 'vit_base_patch16_224_ce'
cfg.MODEL.BACKBONE.STRIDE = 16
cfg.MODEL.BACKBONE.CAT_MODE = 'direct'
cfg.MODEL.BACKBONE.DROP_PATH_RATE = 0.1
cfg.MODEL.BACKBONE.CE_LOC = [3, 6, 9]
cfg.MODEL.BACKBONE.CE_KEEP_RATIO = [0.7, 0.7, 0.7]
cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE = 'CTR_POINT'

# MODEL.HEAD
cfg.MODEL.HEAD = edict()
cfg.MODEL.HEAD.TYPE = 'CENTER'
cfg.MODEL.HEAD.NUM_CHANNELS = 256

# DATA
cfg.DATA = edict()
cfg.DATA.MEAN = [0.485, 0.456, 0.406]
cfg.DATA.STD = [0.229, 0.224, 0.225]
cfg.DATA.SEARCH = edict()
cfg.DATA.SEARCH.SIZE = 384
cfg.DATA.TEMPLATE = edict()
cfg.DATA.TEMPLATE.SIZE = 192

# TEST
cfg.TEST = edict()
cfg.TEST.TEMPLATE_FACTOR = 2.0
cfg.TEST.TEMPLATE_SIZE = 192
cfg.TEST.SEARCH_FACTOR = 5.0
cfg.TEST.SEARCH_SIZE = 384
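
These values are consumed downstream, e.g. build_box_head in head.py derives the head's feature-map size from the search size and backbone stride; a quick check of the derived numbers:

from modelscope.models.cv.video_single_object_tracking.config.ostrack import cfg

feat_sz = cfg.DATA.SEARCH.SIZE // cfg.MODEL.BACKBONE.STRIDE        # 384 // 16 = 24
template_sz = cfg.DATA.TEMPLATE.SIZE // cfg.MODEL.BACKBONE.STRIDE  # 192 // 16 = 12
print(feat_sz, template_sz)                                        # 24 12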

modelscope/models/cv/video_single_object_tracking/models/__init__.py (+0, -0)


modelscope/models/cv/video_single_object_tracking/models/layers/__init__.py (+0, -0)


modelscope/models/cv/video_single_object_tracking/models/layers/attn.py (+54, -0)

@@ -0,0 +1,54 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch.nn as nn


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
attn_drop=0.,
proj_drop=0.,
rpe=False,
z_size=7,
x_size=14):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim**-0.5

self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

def forward(self, x, mask=None, return_attention=False):
# x: B, N, C
# mask: [B, N, ] torch.bool
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(
0) # make torchscript happy (cannot use tensor as tuple)

attn = (q @ k.transpose(-2, -1)) * self.scale

if mask is not None:
attn = attn.masked_fill(
mask.unsqueeze(1).unsqueeze(2),
float('-inf'),
)

attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)

x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)

if return_attention:
return x, attn
else:
return x
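
A minimal sketch of the attention module above; with return_attention=True it also yields the per-head attention weights that are later reused for candidate elimination:

import torch

attn = Attention(dim=768, num_heads=12, qkv_bias=True)
tokens = torch.randn(2, 16, 768)                      # (batch, tokens, channels)
out = attn(tokens)                                    # (2, 16, 768)
out, weights = attn(tokens, return_attention=True)    # weights: (2, 12, 16, 16)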

modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py (+129, -0)

@@ -0,0 +1,129 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import math

import torch
import torch.nn as nn
from timm.models.layers import DropPath, Mlp

from .attn import Attention


def candidate_elimination(attn: torch.Tensor, tokens: torch.Tensor,
lens_t: int, keep_ratio: float,
global_index: torch.Tensor,
box_mask_z: torch.Tensor):
"""
Eliminate potential background candidates for computation reduction and noise cancellation.
Args:
attn (torch.Tensor): [B, num_heads, L_t + L_s, L_t + L_s], attention weights
tokens (torch.Tensor): [B, L_t + L_s, C], template and search region tokens
lens_t (int): length of template
keep_ratio (float): keep ratio of search region tokens (candidates)
global_index (torch.Tensor): global index of search region tokens
box_mask_z (torch.Tensor): template mask used to accumulate attention weights

Returns:
tokens_new (torch.Tensor): tokens after candidate elimination
keep_index (torch.Tensor): indices of kept search region tokens
removed_index (torch.Tensor): indices of removed search region tokens
"""
lens_s = attn.shape[-1] - lens_t
bs, hn, _, _ = attn.shape

lens_keep = math.ceil(keep_ratio * lens_s)
if lens_keep == lens_s:
return tokens, global_index, None

attn_t = attn[:, :, :lens_t, lens_t:]

if box_mask_z is not None:
box_mask_z = box_mask_z.unsqueeze(1).unsqueeze(-1).expand(
-1, attn_t.shape[1], -1, attn_t.shape[-1])
attn_t = attn_t[box_mask_z]
attn_t = attn_t.view(bs, hn, -1, lens_s)
attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s
else:
attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s

# use sort instead of topk, due to the speed issue
# https://github.com/pytorch/pytorch/issues/22812
sorted_attn, indices = torch.sort(attn_t, dim=1, descending=True)

_, topk_idx = sorted_attn[:, :lens_keep], indices[:, :lens_keep]
_, non_topk_idx = sorted_attn[:, lens_keep:], indices[:, lens_keep:]
keep_index = global_index.gather(dim=1, index=topk_idx)
removed_index = global_index.gather(dim=1, index=non_topk_idx)

# separate template and search tokens
tokens_t = tokens[:, :lens_t]
tokens_s = tokens[:, lens_t:]

# obtain the attentive and inattentive tokens
B, L, C = tokens_s.shape
attentive_tokens = tokens_s.gather(
dim=1, index=topk_idx.unsqueeze(-1).expand(B, -1, C))

# concatenate these tokens
tokens_new = torch.cat([tokens_t, attentive_tokens], dim=1)

return tokens_new, keep_index, removed_index


class CEBlock(nn.Module):

def __init__(
self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
keep_ratio_search=1.0,
):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)

self.keep_ratio_search = keep_ratio_search

def forward(self,
x,
global_index_template,
global_index_search,
mask=None,
ce_template_mask=None,
keep_ratio_search=None):
x_attn, attn = self.attn(self.norm1(x), mask, True)
x = x + self.drop_path(x_attn)
lens_t = global_index_template.shape[1]

removed_index_search = None
if self.keep_ratio_search < 1 and (keep_ratio_search is None
or keep_ratio_search < 1):
keep_ratio_search = self.keep_ratio_search if keep_ratio_search is None else keep_ratio_search
x, global_index_search, removed_index_search = candidate_elimination(
attn, x, lens_t, keep_ratio_search, global_index_search,
ce_template_mask)

x = x + self.drop_path(self.mlp(self.norm2(x)))
return x, global_index_template, global_index_search, removed_index_search, attn
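
A shape-level sketch of candidate_elimination with illustrative sizes: 64 template tokens, 100 search tokens, and keep_ratio 0.5, so 50 search candidates survive:

import torch

B, heads, lens_t, lens_s, C = 2, 12, 64, 100, 768
attn = torch.rand(B, heads, lens_t + lens_s, lens_t + lens_s)
tokens = torch.randn(B, lens_t + lens_s, C)
global_index = torch.arange(lens_s).unsqueeze(0).repeat(B, 1)   # original positions of the search tokens

new_tokens, kept, removed = candidate_elimination(attn, tokens, lens_t, 0.5, global_index, None)
# new_tokens: (2, 64 + 50, 768); kept / removed hold the surviving / dropped search-token indices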

modelscope/models/cv/video_single_object_tracking/models/layers/head.py (+141, -0)

@@ -0,0 +1,141 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch
import torch.nn as nn


def conv(in_planes,
out_planes,
kernel_size=3,
stride=1,
padding=1,
dilation=1):
return nn.Sequential(
nn.Conv2d(
in_planes,
out_planes,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=True), nn.BatchNorm2d(out_planes), nn.ReLU(inplace=True))


class CenterPredictor(
nn.Module, ):

def __init__(self, inplanes=64, channel=256, feat_sz=20, stride=16):
super(CenterPredictor, self).__init__()
self.feat_sz = feat_sz
self.stride = stride
self.img_sz = self.feat_sz * self.stride

# corner predict
self.conv1_ctr = conv(inplanes, channel)
self.conv2_ctr = conv(channel, channel // 2)
self.conv3_ctr = conv(channel // 2, channel // 4)
self.conv4_ctr = conv(channel // 4, channel // 8)
self.conv5_ctr = nn.Conv2d(channel // 8, 1, kernel_size=1)

# offset regress
self.conv1_offset = conv(inplanes, channel)
self.conv2_offset = conv(channel, channel // 2)
self.conv3_offset = conv(channel // 2, channel // 4)
self.conv4_offset = conv(channel // 4, channel // 8)
self.conv5_offset = nn.Conv2d(channel // 8, 2, kernel_size=1)

# size regress
self.conv1_size = conv(inplanes, channel)
self.conv2_size = conv(channel, channel // 2)
self.conv3_size = conv(channel // 2, channel // 4)
self.conv4_size = conv(channel // 4, channel // 8)
self.conv5_size = nn.Conv2d(channel // 8, 2, kernel_size=1)

for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)

def forward(self, x, gt_score_map=None):
""" Forward pass with input x. """
score_map_ctr, size_map, offset_map = self.get_score_map(x)

# assert gt_score_map is None
if gt_score_map is None:
bbox = self.cal_bbox(score_map_ctr, size_map, offset_map)
else:
bbox = self.cal_bbox(
gt_score_map.unsqueeze(1), size_map, offset_map)

return score_map_ctr, bbox, size_map, offset_map

def cal_bbox(self,
score_map_ctr,
size_map,
offset_map,
return_score=False):
max_score, idx = torch.max(
score_map_ctr.flatten(1), dim=1, keepdim=True)
idx_y = idx // self.feat_sz
idx_x = idx % self.feat_sz

idx = idx.unsqueeze(1).expand(idx.shape[0], 2, 1)
size = size_map.flatten(2).gather(dim=2, index=idx)
offset = offset_map.flatten(2).gather(dim=2, index=idx).squeeze(-1)

# cx, cy, w, h
bbox = torch.cat(
[(idx_x.to(torch.float) + offset[:, :1]) / self.feat_sz,
(idx_y.to(torch.float) + offset[:, 1:]) / self.feat_sz,
size.squeeze(-1)],
dim=1)

if return_score:
return bbox, max_score
return bbox

def get_score_map(self, x):

def _sigmoid(x):
y = torch.clamp(x.sigmoid_(), min=1e-4, max=1 - 1e-4)
return y

# ctr branch
x_ctr1 = self.conv1_ctr(x)
x_ctr2 = self.conv2_ctr(x_ctr1)
x_ctr3 = self.conv3_ctr(x_ctr2)
x_ctr4 = self.conv4_ctr(x_ctr3)
score_map_ctr = self.conv5_ctr(x_ctr4)

# offset branch
x_offset1 = self.conv1_offset(x)
x_offset2 = self.conv2_offset(x_offset1)
x_offset3 = self.conv3_offset(x_offset2)
x_offset4 = self.conv4_offset(x_offset3)
score_map_offset = self.conv5_offset(x_offset4)

# size branch
x_size1 = self.conv1_size(x)
x_size2 = self.conv2_size(x_size1)
x_size3 = self.conv3_size(x_size2)
x_size4 = self.conv4_size(x_size3)
score_map_size = self.conv5_size(x_size4)
return _sigmoid(score_map_ctr), _sigmoid(
score_map_size), score_map_offset


def build_box_head(cfg, hidden_dim):
stride = cfg.MODEL.BACKBONE.STRIDE

if cfg.MODEL.HEAD.TYPE == 'CENTER':
in_channel = hidden_dim
out_channel = cfg.MODEL.HEAD.NUM_CHANNELS
feat_sz = int(cfg.DATA.SEARCH.SIZE / stride)
center_head = CenterPredictor(
inplanes=in_channel,
channel=out_channel,
feat_sz=feat_sz,
stride=stride)
return center_head
    else:
        raise ValueError('HEAD TYPE %s is not supported.'
                         % cfg.MODEL.HEAD.TYPE)
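
A quick shape check for the CENTER head defined above (sizes are illustrative; feat_sz=20 with stride=16 corresponds to a 320-pixel search region):

import torch

from modelscope.models.cv.video_single_object_tracking.models.layers.head import \
    CenterPredictor

head = CenterPredictor(inplanes=64, channel=256, feat_sz=20, stride=16)
feat = torch.randn(1, 64, 20, 20)        # (B, inplanes, feat_sz, feat_sz)
score_map_ctr, bbox, size_map, offset_map = head(feat)
print(score_map_ctr.shape, bbox.shape)   # torch.Size([1, 1, 20, 20]) torch.Size([1, 4])
print(size_map.shape, offset_map.shape)  # torch.Size([1, 2, 20, 20]) torch.Size([1, 2, 20, 20])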

+ 37
- 0
modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py View File

@@ -0,0 +1,37 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch.nn as nn
from timm.models.layers import to_2tuple


class PatchEmbed(nn.Module):
""" 2D Image to Patch Embedding
"""

def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0],
img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten

self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

def forward(self, x):
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x
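
For reference, with the defaults above a 224x224 image becomes 14x14 = 196 patch tokens:

import torch

from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \
    PatchEmbed

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
img = torch.randn(1, 3, 224, 224)
tokens = embed(img)   # BCHW -> BNC
print(tokens.shape)   # torch.Size([1, 196, 768])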

+ 0
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/__init__.py View File


+ 93
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py View File

@@ -0,0 +1,93 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch.nn as nn
from timm.models.layers import to_2tuple

from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \
PatchEmbed


class BaseBackbone(nn.Module):

def __init__(self):
super().__init__()

# for original ViT
self.pos_embed = None
self.img_size = [224, 224]
self.patch_size = 16
self.embed_dim = 384

self.cat_mode = 'direct'

self.pos_embed_z = None
self.pos_embed_x = None

self.template_segment_pos_embed = None
self.search_segment_pos_embed = None

self.return_stage = [2, 5, 8, 11]

def finetune_track(self, cfg, patch_start_index=1):

search_size = to_2tuple(cfg.DATA.SEARCH.SIZE)
template_size = to_2tuple(cfg.DATA.TEMPLATE.SIZE)
new_patch_size = cfg.MODEL.BACKBONE.STRIDE

self.cat_mode = cfg.MODEL.BACKBONE.CAT_MODE

# resize patch embedding
if new_patch_size != self.patch_size:
print(
'Inconsistent Patch Size With The Pretrained Weights, Interpolate The Weight!'
)
old_patch_embed = {}
for name, param in self.patch_embed.named_parameters():
if 'weight' in name:
param = nn.functional.interpolate(
param,
size=(new_patch_size, new_patch_size),
mode='bicubic',
align_corners=False)
param = nn.Parameter(param)
old_patch_embed[name] = param
self.patch_embed = PatchEmbed(
img_size=self.img_size,
patch_size=new_patch_size,
in_chans=3,
embed_dim=self.embed_dim)
self.patch_embed.proj.bias = old_patch_embed['proj.bias']
self.patch_embed.proj.weight = old_patch_embed['proj.weight']

# for patch embedding
patch_pos_embed = self.pos_embed[:, patch_start_index:, :]
patch_pos_embed = patch_pos_embed.transpose(1, 2)
B, E, Q = patch_pos_embed.shape
        P_H = self.img_size[0] // self.patch_size
        P_W = self.img_size[1] // self.patch_size
patch_pos_embed = patch_pos_embed.view(B, E, P_H, P_W)

# for search region
H, W = search_size
new_P_H, new_P_W = H // new_patch_size, W // new_patch_size
search_patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_P_H, new_P_W),
mode='bicubic',
align_corners=False)
search_patch_pos_embed = search_patch_pos_embed.flatten(2).transpose(
1, 2)

# for template region
H, W = template_size
new_P_H, new_P_W = H // new_patch_size, W // new_patch_size
template_patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_P_H, new_P_W),
mode='bicubic',
align_corners=False)
template_patch_pos_embed = template_patch_pos_embed.flatten(
2).transpose(1, 2)

self.pos_embed_z = nn.Parameter(template_patch_pos_embed)
self.pos_embed_x = nn.Parameter(search_patch_pos_embed)
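
The resizing above boils down to a bicubic interpolation of the patch-position grid; a minimal sketch with assumed sizes (a 14x14 ViT grid interpolated to an 8x8 template grid):

import torch
import torch.nn as nn

pos = torch.randn(1, 196, 768)                  # (1, P_H * P_W, E) patch position embeddings
pos = pos.transpose(1, 2).view(1, 768, 14, 14)  # -> (1, E, P_H, P_W)
pos_z = nn.functional.interpolate(
    pos, size=(8, 8), mode='bicubic', align_corners=False)
pos_z = pos_z.flatten(2).transpose(1, 2)        # -> (1, 64, 768), one row per template patch
print(pos_z.shape)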

+ 109
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py View File

@@ -0,0 +1,109 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch
from torch import nn

from modelscope.models.cv.video_single_object_tracking.models.layers.head import \
build_box_head
from .vit_ce import vit_base_patch16_224_ce


class OSTrack(nn.Module):
""" This is the base class for OSTrack """

def __init__(self,
transformer,
box_head,
aux_loss=False,
head_type='CORNER'):
""" Initializes the model.
Parameters:
transformer: torch module of the transformer architecture.
aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
"""
super().__init__()
self.backbone = transformer
self.box_head = box_head

self.aux_loss = aux_loss
self.head_type = head_type
if head_type == 'CORNER' or head_type == 'CENTER':
self.feat_sz_s = int(box_head.feat_sz)
self.feat_len_s = int(box_head.feat_sz**2)

def forward(
self,
template: torch.Tensor,
search: torch.Tensor,
ce_template_mask=None,
ce_keep_rate=None,
):
x, aux_dict = self.backbone(
z=template,
x=search,
ce_template_mask=ce_template_mask,
ce_keep_rate=ce_keep_rate,
)

# Forward head
feat_last = x
if isinstance(x, list):
feat_last = x[-1]
out = self.forward_head(feat_last, None)

out.update(aux_dict)
out['backbone_feat'] = x
return out

def forward_head(self, cat_feature, gt_score_map=None):
"""
        cat_feature: output embeddings of the backbone, which can be (B, HW1+HW2, C) or (B, HW2, C)
        """
        enc_opt = cat_feature[:, -self.feat_len_s:]  # encoder output for the search region (B, HW, C)
opt = (enc_opt.unsqueeze(-1)).permute((0, 3, 2, 1)).contiguous()
bs, Nq, C, HW = opt.size()
opt_feat = opt.view(-1, C, self.feat_sz_s, self.feat_sz_s)

if self.head_type == 'CENTER':
# run the center head
score_map_ctr, bbox, size_map, offset_map = self.box_head(
opt_feat, gt_score_map)
outputs_coord = bbox
outputs_coord_new = outputs_coord.view(bs, Nq, 4)
out = {
'pred_boxes': outputs_coord_new,
'score_map': score_map_ctr,
'size_map': size_map,
'offset_map': offset_map
}
return out
else:
raise NotImplementedError


def build_ostrack(cfg):
if cfg.MODEL.BACKBONE.TYPE == 'vit_base_patch16_224_ce':
backbone = vit_base_patch16_224_ce(
False,
drop_path_rate=cfg.MODEL.BACKBONE.DROP_PATH_RATE,
ce_loc=cfg.MODEL.BACKBONE.CE_LOC,
ce_keep_ratio=cfg.MODEL.BACKBONE.CE_KEEP_RATIO,
)
hidden_dim = backbone.embed_dim
patch_start_index = 1
else:
raise NotImplementedError

backbone.finetune_track(cfg=cfg, patch_start_index=patch_start_index)

box_head = build_box_head(cfg, hidden_dim)

model = OSTrack(
backbone,
box_head,
aux_loss=False,
head_type=cfg.MODEL.HEAD.TYPE,
)

return model
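
A hedged usage sketch of build_ostrack; it assumes the default cfg from config/ostrack.py (a vit_base_patch16_224_ce backbone with a CENTER head) and random inputs:

import torch

from modelscope.models.cv.video_single_object_tracking.config.ostrack import cfg
from modelscope.models.cv.video_single_object_tracking.models.ostrack.ostrack import \
    build_ostrack

model = build_ostrack(cfg).eval()
template = torch.randn(1, 3, cfg.DATA.TEMPLATE.SIZE, cfg.DATA.TEMPLATE.SIZE)
search = torch.randn(1, 3, cfg.DATA.SEARCH.SIZE, cfg.DATA.SEARCH.SIZE)
with torch.no_grad():
    out = model(template=template, search=search)
print(out['pred_boxes'].shape)   # torch.Size([1, 1, 4]), normalized (cx, cy, w, h)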

+ 24
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py View File

@@ -0,0 +1,24 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch


def combine_tokens(template_tokens,
search_tokens,
mode='direct',
return_res=False):
if mode == 'direct':
merged_feature = torch.cat((template_tokens, search_tokens), dim=1)
else:
raise NotImplementedError

return merged_feature


def recover_tokens(merged_tokens, mode='direct'):
if mode == 'direct':
recovered_tokens = merged_tokens
else:
raise NotImplementedError

return recovered_tokens

+ 343
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py View File

@@ -0,0 +1,343 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
from functools import partial

import torch
import torch.nn as nn
from timm.models.layers import DropPath, Mlp, to_2tuple

from modelscope.models.cv.video_single_object_tracking.models.layers.attn_blocks import \
CEBlock
from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \
PatchEmbed
from .base_backbone import BaseBackbone
from .utils import combine_tokens, recover_tokens


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim**-0.5

self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
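        # NOTE: this local Attention (and the Block below) only defines parameters;
        # VisionTransformerCE overrides self.blocks with CEBlock instances, so the
        # attention forward pass actually used comes from layers/attn_blocks.py.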


class Block(nn.Module):

def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)


class VisionTransformer(BaseBackbone):
""" Vision Transformer
A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
- https://arxiv.org/abs/2010.11929
Includes distillation token & head support for `DeiT: Data-efficient Image Transformers`
- https://arxiv.org/abs/2012.12877
"""

def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
num_classes=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.,
qkv_bias=True,
distilled=False,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
embed_layer=PatchEmbed,
norm_layer=None,
act_layer=None):
"""
Args:
img_size (int, tuple): input image size
patch_size (int, tuple): patch size
in_chans (int): number of input channels
num_classes (int): number of classes for classification head
embed_dim (int): embedding dimension
depth (int): depth of transformer
num_heads (int): number of attention heads
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
qkv_bias (bool): enable bias for qkv if True
distilled (bool): model includes a distillation token and head as in DeiT models
drop_rate (float): dropout rate
attn_drop_rate (float): attention dropout rate
drop_path_rate (float): stochastic depth rate
embed_layer (nn.Module): patch embedding layer
norm_layer: (nn.Module): normalization layer
"""
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.num_tokens = 2 if distilled else 1
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
act_layer = act_layer or nn.GELU

self.patch_embed = embed_layer(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches

self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.dist_token = None
self.pos_embed = nn.Parameter(
torch.zeros(1, num_patches + self.num_tokens, embed_dim))
self.pos_drop = nn.Dropout(p=drop_rate)

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
self.blocks = nn.Sequential(*[
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
act_layer=act_layer) for i in range(depth)
])
self.norm = norm_layer(embed_dim)


class VisionTransformerCE(VisionTransformer):
""" Vision Transformer with candidate elimination (CE) module

A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
- https://arxiv.org/abs/2010.11929

Includes distillation token & head support for `DeiT: Data-efficient Image Transformers`
- https://arxiv.org/abs/2012.12877
"""

def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
num_classes=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.,
qkv_bias=True,
distilled=False,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
embed_layer=PatchEmbed,
norm_layer=None,
act_layer=None,
ce_loc=None,
ce_keep_ratio=None):
"""
Args:
img_size (int, tuple): input image size
patch_size (int, tuple): patch size
in_chans (int): number of input channels
num_classes (int): number of classes for classification head
embed_dim (int): embedding dimension
depth (int): depth of transformer
num_heads (int): number of attention heads
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
qkv_bias (bool): enable bias for qkv if True
distilled (bool): model includes a distillation token and head as in DeiT models
drop_rate (float): dropout rate
attn_drop_rate (float): attention dropout rate
drop_path_rate (float): stochastic depth rate
embed_layer (nn.Module): patch embedding layer
norm_layer: (nn.Module): normalization layer
"""
super().__init__()
if isinstance(img_size, tuple):
self.img_size = img_size
else:
self.img_size = to_2tuple(img_size)
self.patch_size = patch_size
self.in_chans = in_chans

self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.num_tokens = 2 if distilled else 1
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
act_layer = act_layer or nn.GELU

self.patch_embed = embed_layer(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches

self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.dist_token = nn.Parameter(torch.zeros(
1, 1, embed_dim)) if distilled else None
self.pos_embed = nn.Parameter(
torch.zeros(1, num_patches + self.num_tokens, embed_dim))
self.pos_drop = nn.Dropout(p=drop_rate)

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
blocks = []
ce_index = 0
self.ce_loc = ce_loc
for i in range(depth):
ce_keep_ratio_i = 1.0
if ce_loc is not None and i in ce_loc:
ce_keep_ratio_i = ce_keep_ratio[ce_index]
ce_index += 1

blocks.append(
CEBlock(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
act_layer=act_layer,
keep_ratio_search=ce_keep_ratio_i))

self.blocks = nn.Sequential(*blocks)
self.norm = norm_layer(embed_dim)

def forward_features(
self,
z,
x,
mask_x=None,
ce_template_mask=None,
ce_keep_rate=None,
):
B = x.shape[0]

x = self.patch_embed(x)
z = self.patch_embed(z)

z += self.pos_embed_z
x += self.pos_embed_x

x = combine_tokens(z, x, mode=self.cat_mode)

x = self.pos_drop(x)

lens_z = self.pos_embed_z.shape[1]
lens_x = self.pos_embed_x.shape[1]

global_index_t = torch.linspace(0, lens_z - 1, lens_z).to(x.device)
global_index_t = global_index_t.repeat(B, 1)

global_index_s = torch.linspace(0, lens_x - 1, lens_x).to(x.device)
global_index_s = global_index_s.repeat(B, 1)
removed_indexes_s = []
for i, blk in enumerate(self.blocks):
x, global_index_t, global_index_s, removed_index_s, attn = \
blk(x, global_index_t, global_index_s, mask_x, ce_template_mask, ce_keep_rate)

if self.ce_loc is not None and i in self.ce_loc:
removed_indexes_s.append(removed_index_s)

x = self.norm(x)
lens_x_new = global_index_s.shape[1]
lens_z_new = global_index_t.shape[1]

z = x[:, :lens_z_new]
x = x[:, lens_z_new:]

if removed_indexes_s and removed_indexes_s[0] is not None:
removed_indexes_cat = torch.cat(removed_indexes_s, dim=1)

pruned_lens_x = lens_x - lens_x_new
pad_x = torch.zeros([B, pruned_lens_x, x.shape[2]],
device=x.device)
x = torch.cat([x, pad_x], dim=1)
index_all = torch.cat([global_index_s, removed_indexes_cat], dim=1)
# recover original token order
C = x.shape[-1]
x = torch.zeros_like(x).scatter_(
dim=1,
index=index_all.unsqueeze(-1).expand(B, -1, C).to(torch.int64),
src=x)

x = recover_tokens(x, mode=self.cat_mode)

# re-concatenate with the template, which may be further used by other modules
x = torch.cat([z, x], dim=1)

aux_dict = {
'attn': attn,
'removed_indexes_s': removed_indexes_s, # used for visualization
}

return x, aux_dict

def forward(self, z, x, ce_template_mask=None, ce_keep_rate=None):

x, aux_dict = self.forward_features(
z,
x,
ce_template_mask=ce_template_mask,
ce_keep_rate=ce_keep_rate,
)

return x, aux_dict


def _create_vision_transformer(pretrained=False, **kwargs):
model = VisionTransformerCE(**kwargs)
return model


def vit_base_patch16_224_ce(pretrained=False, **kwargs):
""" ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
"""
model_kwargs = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
model = _create_vision_transformer(pretrained=pretrained, **model_kwargs)
return model

+ 0
- 0
modelscope/models/cv/video_single_object_tracking/tracker/__init__.py View File


+ 139
- 0
modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py View File

@@ -0,0 +1,139 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch

from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
cfg
from modelscope.models.cv.video_single_object_tracking.models.ostrack.ostrack import \
build_ostrack
from modelscope.models.cv.video_single_object_tracking.utils.utils import (
Preprocessor, clip_box, generate_mask_cond, hann2d, sample_target,
transform_image_to_crop)


class OSTrack():

def __init__(self, ckpt_path, device):
network = build_ostrack(cfg)
network.load_state_dict(
torch.load(ckpt_path, map_location='cpu')['net'], strict=True)
self.cfg = cfg
if device.type == 'cuda':
self.network = network.to(device)
else:
self.network = network
self.network.eval()
self.preprocessor = Preprocessor(device)
self.state = None

self.feat_sz = self.cfg.TEST.SEARCH_SIZE // self.cfg.MODEL.BACKBONE.STRIDE
# motion constrain
if device.type == 'cuda':
self.output_window = hann2d(
torch.tensor([self.feat_sz, self.feat_sz]).long(),
centered=True).to(device)
else:
self.output_window = hann2d(
torch.tensor([self.feat_sz, self.feat_sz]).long(),
centered=True)
self.frame_id = 0
# for save boxes from all queries
self.z_dict1 = {}

def initialize(self, image, info: dict):
# forward the template once
z_patch_arr, resize_factor, z_amask_arr = sample_target(
image,
info['init_bbox'],
self.cfg.TEST.TEMPLATE_FACTOR,
output_sz=self.cfg.TEST.TEMPLATE_SIZE)
self.z_patch_arr = z_patch_arr
template = self.preprocessor.process(z_patch_arr, z_amask_arr)
with torch.no_grad():
self.z_dict1 = template

self.box_mask_z = None
if self.cfg.MODEL.BACKBONE.CE_LOC:
template_bbox = self.transform_bbox_to_crop(
info['init_bbox'], resize_factor,
template.tensors.device).squeeze(1)
self.box_mask_z = generate_mask_cond(self.cfg, 1,
template.tensors.device,
template_bbox)

# save states
self.state = info['init_bbox']
self.frame_id = 0

def track(self, image, info: dict = None):
H, W, _ = image.shape
self.frame_id += 1
x_patch_arr, resize_factor, x_amask_arr = sample_target(
image,
self.state,
self.cfg.TEST.SEARCH_FACTOR,
output_sz=self.cfg.TEST.SEARCH_SIZE) # (x1, y1, w, h)
search = self.preprocessor.process(x_patch_arr, x_amask_arr)

with torch.no_grad():
x_dict = search
# merge the template and the search
# run the transformer
out_dict = self.network.forward(
template=self.z_dict1.tensors,
search=x_dict.tensors,
ce_template_mask=self.box_mask_z)

# add hann windows
pred_score_map = out_dict['score_map']
response = self.output_window * pred_score_map
pred_boxes = self.network.box_head.cal_bbox(response,
out_dict['size_map'],
out_dict['offset_map'])
pred_boxes = pred_boxes.view(-1, 4)
# Baseline: Take the mean of all pred boxes as the final result
        pred_box = (pred_boxes.mean(dim=0) * self.cfg.TEST.SEARCH_SIZE
                    / resize_factor).tolist()  # (cx, cy, w, h) in original-image pixels, relative to the search crop
# get the final box result
self.state = clip_box(
self.map_box_back(pred_box, resize_factor), H, W, margin=10)

x1, y1, w, h = self.state
x2 = x1 + w
y2 = y1 + h
return {'target_bbox': [x1, y1, x2, y2]}

def map_box_back(self, pred_box: list, resize_factor: float):
        cx_prev = self.state[0] + 0.5 * self.state[2]
        cy_prev = self.state[1] + 0.5 * self.state[3]
cx, cy, w, h = pred_box
half_side = 0.5 * self.cfg.TEST.SEARCH_SIZE / resize_factor
cx_real = cx + (cx_prev - half_side)
cy_real = cy + (cy_prev - half_side)
return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h]

def transform_bbox_to_crop(self,
box_in,
resize_factor,
device,
box_extract=None,
crop_type='template'):
if crop_type == 'template':
crop_sz = torch.Tensor(
[self.cfg.TEST.TEMPLATE_SIZE, self.cfg.TEST.TEMPLATE_SIZE])
elif crop_type == 'search':
crop_sz = torch.Tensor(
[self.cfg.TEST.SEARCH_SIZE, self.cfg.TEST.SEARCH_SIZE])
else:
raise NotImplementedError

box_in = torch.tensor(box_in)
if box_extract is None:
box_extract = box_in
else:
box_extract = torch.tensor(box_extract)
template_bbox = transform_image_to_crop(
box_in, box_extract, resize_factor, crop_sz, normalize=True)
template_bbox = template_bbox.view(1, 1, 4).to(device)

return template_bbox
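
A hedged end-to-end sketch of the tracker API defined above; the checkpoint path is a placeholder and the frames are dummy arrays:

import numpy as np
import torch

frames = [np.zeros((360, 640, 3), dtype=np.uint8) for _ in range(3)]  # stand-in video frames (H, W, 3)
tracker = OSTrack(
    ckpt_path='/path/to/ostrack_checkpoint.pth',  # placeholder path to a trained checkpoint
    device=torch.device('cpu'))
tracker.initialize(frames[0], {'init_bbox': [100, 80, 64, 48]})       # (x1, y1, w, h)
for frame in frames[1:]:
    x1, y1, x2, y2 = tracker.track(frame)['target_bbox']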

+ 0
- 0
modelscope/models/cv/video_single_object_tracking/utils/__init__.py View File


+ 261
- 0
modelscope/models/cv/video_single_object_tracking/utils/utils.py View File

@@ -0,0 +1,261 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import math
from typing import Optional

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor


def hann1d(sz: int, centered=True) -> torch.Tensor:
"""1D cosine window."""
if centered:
return 0.5 * (1 - torch.cos(
(2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float()))
w = 0.5 * (1 + torch.cos(
(2 * math.pi / (sz + 2)) * torch.arange(0, sz // 2 + 1).float()))
return torch.cat([w, w[1:sz - sz // 2].flip((0, ))])


def hann2d(sz: torch.Tensor, centered=True) -> torch.Tensor:
"""2D cosine window."""
return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d(
sz[1].item(), centered).reshape(1, 1, 1, -1)
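
For example, the motion-constraint window used on a 16x16 score map:

import torch

win = hann2d(torch.tensor([16, 16]).long(), centered=True)
print(win.shape)   # torch.Size([1, 1, 16, 16]); the values peak near the center of the map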


class NestedTensor(object):

def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask


class Preprocessor(object):

    def __init__(self, device: torch.device):
self.device = device
self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1))
self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1))
if 'cuda' == self.device.type:
self.mean = self.mean.to(self.device)
self.std = self.std.to(self.device)

def process(self, img_arr: np.ndarray, amask_arr: np.ndarray):
# Deal with the image patch
if 'cuda' == self.device.type:
img_tensor = torch.tensor(img_arr).to(self.device).float().permute(
(2, 0, 1)).unsqueeze(dim=0)
else:
img_tensor = torch.tensor(img_arr).float().permute(
(2, 0, 1)).unsqueeze(dim=0)
img_tensor_norm = (
(img_tensor / 255.0) - self.mean) / self.std # (1,3,H,W)

# Deal with the attention mask
if 'cuda' == self.device.type:
amask_tensor = torch.from_numpy(amask_arr).to(torch.bool).to(
self.device).unsqueeze(dim=0) # (1,H,W)
else:
amask_tensor = torch.from_numpy(amask_arr).to(
torch.bool).unsqueeze(dim=0) # (1,H,W)
return NestedTensor(img_tensor_norm, amask_tensor)


def clip_box(box: list, H, W, margin=0):
x1, y1, w, h = box
x2, y2 = x1 + w, y1 + h
x1 = min(max(0, x1), W - margin)
x2 = min(max(margin, x2), W)
y1 = min(max(0, y1), H - margin)
y2 = min(max(margin, y2), H)
w = max(margin, x2 - x1)
h = max(margin, y2 - y1)
if isinstance(x1, torch.Tensor):
x1 = x1.item()
y1 = y1.item()
w = w.item()
h = h.item()
return [x1, y1, w, h]
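
For instance, a box hanging off the left and bottom edges of a 640x480 frame is pulled back inside:

print(clip_box([-5.0, 10.0, 50.0, 700.0], H=480, W=640, margin=10))   # [0, 10.0, 45.0, 470.0]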


def generate_mask_cond(cfg, bs, device, gt_bbox):
template_size = cfg.DATA.TEMPLATE.SIZE
stride = cfg.MODEL.BACKBONE.STRIDE
template_feat_size = template_size // stride

if cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE == 'CTR_POINT':
if template_feat_size == 8:
index = slice(3, 4)
elif template_feat_size == 12:
index = slice(5, 6)
elif template_feat_size == 7:
index = slice(3, 4)
elif template_feat_size == 14:
index = slice(6, 7)
else:
raise NotImplementedError
box_mask_z = torch.zeros([bs, template_feat_size, template_feat_size],
device=device)
box_mask_z[:, index, index] = 1
box_mask_z = box_mask_z.flatten(1).to(torch.bool)
else:
raise NotImplementedError

return box_mask_z


def sample_target(im,
target_bb,
search_area_factor,
output_sz=None,
mask=None):
""" Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area

args:
im - cv image
target_bb - target box [x, y, w, h]
search_area_factor - Ratio of crop size to target size
output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done.

returns:
cv image - extracted crop
        float - the factor by which the crop has been resized to make the crop size equal output_sz
        numpy array - attention mask marking the padded area of the crop (1 = padded, 0 = valid)
"""
if not isinstance(target_bb, list):
x, y, w, h = target_bb.tolist()
else:
x, y, w, h = target_bb
# Crop image
crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)

if crop_sz < 1:
raise Exception('Too small bounding box.')

x1 = round(x + 0.5 * w - crop_sz * 0.5)
x2 = x1 + crop_sz

y1 = round(y + 0.5 * h - crop_sz * 0.5)
y2 = y1 + crop_sz

x1_pad = max(0, -x1)
x2_pad = max(x2 - im.shape[1] + 1, 0)

y1_pad = max(0, -y1)
y2_pad = max(y2 - im.shape[0] + 1, 0)

# Crop target
im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]
if mask is not None:
mask_crop = mask[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad]

# Pad
im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad,
x2_pad, cv2.BORDER_CONSTANT)
# deal with attention mask
H, W, _ = im_crop_padded.shape
att_mask = np.ones((H, W))
end_x, end_y = -x2_pad, -y2_pad
if y2_pad == 0:
end_y = None
if x2_pad == 0:
end_x = None
att_mask[y1_pad:end_y, x1_pad:end_x] = 0
if mask is not None:
mask_crop_padded = F.pad(
mask_crop,
pad=(x1_pad, x2_pad, y1_pad, y2_pad),
mode='constant',
value=0)

if output_sz is not None:
resize_factor = output_sz / crop_sz
im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz))
att_mask = cv2.resize(att_mask,
(output_sz, output_sz)).astype(np.bool_)
if mask is None:
return im_crop_padded, resize_factor, att_mask
mask_crop_padded = \
F.interpolate(mask_crop_padded[None, None], (output_sz, output_sz),
mode='bilinear', align_corners=False)[0, 0]
return im_crop_padded, resize_factor, att_mask, mask_crop_padded

else:
if mask is None:
            return im_crop_padded, 1.0, att_mask.astype(np.bool_)
return im_crop_padded, 1.0, att_mask.astype(np.bool_), mask_crop_padded
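
A quick numeric check of the crop geometry (dummy image, illustrative box):

import numpy as np

im = np.zeros((480, 640, 3), dtype=np.uint8)
crop, resize_factor, att_mask = sample_target(
    im, [200, 150, 60, 40], search_area_factor=4.0, output_sz=256)
# crop_sz = ceil(sqrt(60 * 40) * 4) = 196, so resize_factor = 256 / 196 ≈ 1.306
print(crop.shape, round(resize_factor, 3), att_mask.shape)   # (256, 256, 3) 1.306 (256, 256)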


def transform_image_to_crop(box_in: torch.Tensor,
box_extract: torch.Tensor,
resize_factor: float,
crop_sz: torch.Tensor,
normalize=False) -> torch.Tensor:
""" Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image
args:
box_in - the box for which the co-ordinates are to be transformed
box_extract - the box about which the image crop has been extracted.
resize_factor - the ratio between the original image scale and the scale of the image crop
crop_sz - size of the cropped image

returns:
torch.Tensor - transformed co-ordinates of box_in
"""
box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4]

box_in_center = box_in[0:2] + 0.5 * box_in[2:4]

box_out_center = (crop_sz - 1) / 2 + (box_in_center
- box_extract_center) * resize_factor
box_out_wh = box_in[2:4] * resize_factor

box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh))
if normalize:
return box_out / crop_sz[0]
else:
return box_out
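
Worked example: when box_in equals box_extract, the box lands at the center of the crop and is only scaled:

import torch

box = torch.tensor([10., 20., 40., 30.])
out = transform_image_to_crop(
    box, box, resize_factor=2.0, crop_sz=torch.tensor([128., 128.]), normalize=True)
print(out)   # tensor([0.1836, 0.2617, 0.6250, 0.4688]): an 80x60 box centered in the 128x128 crop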


def check_box(box: list, image_height, image_width) -> bool:
""" To check whether the box is within the image range or not
args:
box - the bounding box in the form of [x1, y1, x2, y2]
image_height - the height of the image
image_width - the width of the image

returns:
bool - if box is valid, return True. Otherwise, return False
"""
assert len(box) == 4, 'box must be in the form of: [x1, y1, x2, y2]'
if box[0] < 0 or box[0] >= image_width:
return False
if box[2] < 0 or box[2] >= image_width:
return False
if box[1] < 0 or box[1] >= image_height:
return False
if box[3] < 0 or box[3] >= image_height:
return False
return True


def show_tracking_result(video_in_path, bboxes, video_save_path):
cap = cv2.VideoCapture(video_in_path)
for i in range(len(bboxes)):
box = bboxes[i]
success, frame = cap.read()
        if not success:
            raise Exception(video_in_path
                            + ' cannot be correctly decoded by OpenCV.')
if i == 0:
size = (frame.shape[1], frame.shape[0])
fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
video_writer = cv2.VideoWriter(video_save_path, fourcc,
cap.get(cv2.CAP_PROP_FPS), size,
True)
cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0),
5)
video_writer.write(frame)
    video_writer.release()
cap.release()

+ 4
- 4
modelscope/models/multi_modal/__init__.py View File

@@ -9,9 +9,10 @@ if TYPE_CHECKING:
from .gemm import GEMMForMultiModalEmbedding
from .diffusion import DiffusionForTextToImageSynthesis
from .mmr import VideoCLIPForMultiModalEmbedding
from .mplug_for_visual_question_answering import \
MPlugForVisualQuestionAnswering
from .mplug_for_all_tasks import MPlugForAllTasks
from .ofa_for_all_tasks import OfaForAllTasks
from .ofa_for_text_to_image_synthesis_model import \
OfaForTextToImageSynthesis

else:
_import_structure = {
@@ -19,8 +20,7 @@ else:
'diffusion': ['DiffusionForTextToImageSynthesis'],
'gemm': ['GEMMForMultiModalEmbedding'],
'mmr': ['VideoCLIPForMultiModalEmbedding'],
'mplug_for_visual_question_answering':
['MPlugForVisualQuestionAnswering'],
'mplug_for_all_tasks': ['MPlugForAllTasks'],
'ofa_for_all_tasks': ['OfaForAllTasks'],
'ofa_for_text_to_image_synthesis_model':
['OfaForTextToImageSynthesis']


+ 1
- 1
modelscope/models/multi_modal/clip/__init__.py View File

@@ -1 +1 @@
from .clip_model import CLIPForMultiModalEmbedding
from .model import CLIPForMultiModalEmbedding

+ 422
- 0
modelscope/models/multi_modal/clip/bert_tokenizer.py View File

@@ -0,0 +1,422 @@
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

from __future__ import absolute_import, division, print_function
import collections
import os
import re
import unicodedata

import six


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
"""Checks whether the casing config is consistent with the checkpoint name."""

# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.

if not init_checkpoint:
return

m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint)
if m is None:
return

model_name = m.group(1)

lower_models = [
'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12',
'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12'
]

cased_models = [
'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16',
'multi_cased_L-12_H-768_A-12'
]

is_bad_config = False
if model_name in lower_models and not do_lower_case:
is_bad_config = True
actual_flag = 'False'
case_name = 'lowercased'
opposite_flag = 'True'

if model_name in cased_models and do_lower_case:
is_bad_config = True
actual_flag = 'True'
case_name = 'cased'
opposite_flag = 'False'

if is_bad_config:
raise ValueError(
'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. '
'However, `%s` seems to be a %s model, so you '
'should pass in `--do_lower_case=%s` so that the fine-tuning matches '
            'how the model was pre-trained. If this error is wrong, please '
'just comment out this check.' %
(actual_flag, init_checkpoint, model_name, case_name,
opposite_flag))


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode('utf-8', 'ignore')
elif isinstance(text, unicode):
return text
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
else:
raise ValueError('Not running on Python2 or Python 3?')


def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""

# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode('utf-8')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
else:
raise ValueError('Not running on Python2 or Python 3?')


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, 'r') as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab


def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output


def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens


class FullTokenizer(object):
"""Runs end-to-end tokenziation."""

def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)

return split_tokens

def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)

def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)

@staticmethod
def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
""" Converts a sequence of tokens (string) in a single string. """

def clean_up_tokenization(out_string):
""" Clean up a list of simple English tokenization artifacts
                like spaces before punctuation and abbreviated forms.
"""
out_string = (
out_string.replace(' .', '.').replace(' ?', '?').replace(
' !', '!').replace(' ,', ',').replace(" ' ", "'").replace(
" n't", "n't").replace(" 'm", "'m").replace(
" 's", "'s").replace(" 've",
"'ve").replace(" 're", "'re"))
return out_string

text = ' '.join(tokens).replace(' ##', '').strip()
if clean_up_tokenization_spaces:
clean_text = clean_up_tokenization(text)
return clean_text
else:
return text

def vocab_size(self):
return len(self.vocab)


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.

Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)

# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia).
text = self._tokenize_chinese_chars(text)

orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))

output_tokens = whitespace_tokenize(' '.join(split_tokens))
return output_tokens

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return [''.join(x) for x in output]

def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(' ')
output.append(char)
output.append(' ')
else:
output.append(char)
return ''.join(output)

def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)):
return True

return False

def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(' ')
else:
output.append(char)
return ''.join(output)


class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""

def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word

def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.

This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.

For example:
input = "unaffable"
output = ["un", "##aff", "##able"]

Args:
text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
"""

text = convert_to_unicode(text)

output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue

is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = ''.join(chars[start:end])
if start > 0:
substr = '##' + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
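
A toy run of the greedy longest-match-first algorithm with a hand-made vocabulary (the vocab entries are illustrative):

from modelscope.models.multi_modal.clip.bert_tokenizer import WordpieceTokenizer

vocab = {'un': 0, '##aff': 1, '##able': 2, '[UNK]': 3}
wp = WordpieceTokenizer(vocab=vocab)
print(wp.tokenize('unaffable'))      # ['un', '##aff', '##able']
print(wp.tokenize('unaffable xyz'))  # ['un', '##aff', '##able', '[UNK]']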


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat in ('Cc', 'Cf'):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False

+ 0
- 29
modelscope/models/multi_modal/clip/clip_bert.py View File

@@ -1,29 +0,0 @@
import torch.nn as nn
from transformers import BertConfig, BertForMaskedLM


class TextTransformer(nn.Module):

def __init__(self, config_dict, feat_dim=768, use_grad_ckp=True):
super(TextTransformer, self).__init__()
bert_config = BertConfig.from_dict(config_dict)
if use_grad_ckp:
bert_config.gradient_checkpointing = True

self.bert = BertForMaskedLM(bert_config).bert

self.projector = nn.Linear(
bert_config.hidden_size, feat_dim, bias=False)

def forward(self, input_ids, attention_mask):
trans_features = {
'input_ids': input_ids,
'attention_mask': attention_mask
}

output_states = self.bert(**trans_features, return_dict=False)
output_tokens = output_states[0]

cls_tokens = output_tokens[:, 0, :]

return self.projector(cls_tokens)

+ 0
- 216
modelscope/models/multi_modal/clip/clip_model.py View File

@@ -1,216 +0,0 @@
from typing import Any, Dict

import cv2
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from tokenizers import BertWordPieceTokenizer
from torch.distributed.nn.functional import \
all_gather as all_gather_with_backprop
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.multi_modal.clip.clip_bert import TextTransformer
from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer
from modelscope.utils.constant import ModeKeys, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['CLIPForMultiModalEmbedding']


class CLIPModel(nn.Module):

def __init__(self, model_dir):
super(CLIPModel, self).__init__()
# including vision config and text config
model_config = json.load(
open('{}/encoder_config.json'.format(model_dir)))

# vision encoder
vision_config = model_config['vision_config']
self.img_size = vision_config['input_resolution']
self.vision_encoder = VisionTransformer(
input_resolution=self.img_size,
patch_size=vision_config['patch_size'],
width=vision_config['width'],
layers=vision_config['layers'],
heads=vision_config['heads'],
output_dim=vision_config['feat_dim'],
use_grad_ckp=True)

# text encoder
text_config = model_config['text_config']
self.text_encoder = TextTransformer(
text_config['bert_config'], feat_dim=text_config['feat_dim'])

self.logit_scale = nn.Parameter(torch.ones([]) * 4.6)

def contrastive_loss(self, logits, dim):
neg_ce = torch.diag(F.log_softmax(logits, dim=dim))
return -neg_ce.mean()

def clip_loss(self, t2i_sim, i2t_sim, img_idx=None, all_img_idx=None):
if img_idx is not None and all_img_idx is not None:
with torch.no_grad():
false_neg_indicator = (
img_idx[:, None] == all_img_idx[None, :])
false_neg_indicator.fill_diagonal_(False)
t2i_sim.masked_fill_(false_neg_indicator, float('-inf'))
i2t_sim.masked_fill_(false_neg_indicator, float('-inf'))
caption_loss = self.contrastive_loss(t2i_sim, dim=1)
image_loss = self.contrastive_loss(i2t_sim, dim=1)
else:
caption_loss = self.contrastive_loss(t2i_sim, dim=1)
image_loss = self.contrastive_loss(i2t_sim, dim=1)
return (caption_loss + image_loss) / 2.0

def get_loss(self, img_tensor, text_ids_tensor, text_masks_tensor,
img_id_list):
img_feat = self.forward(img_tensor, input_type='img')
text_feat = self.forward((text_ids_tensor, text_masks_tensor),
input_type='text')

global_img_feat = torch.cat(all_gather_with_backprop(img_feat), dim=0)
global_text_feat = torch.cat(
all_gather_with_backprop(text_feat), dim=0)
global_img_id_list = torch.cat(
all_gather_with_backprop(img_id_list), dim=0)

t2i_sim_mat = text_feat @ global_img_feat.t()
i2t_sim_mat = img_feat @ global_text_feat.t()

logit_scale = self.logit_scale.exp().clamp(max=100.0)
t2i_sim_mat_logits = t2i_sim_mat * logit_scale
i2t_sim_mat_logits = i2t_sim_mat * logit_scale

loss = self.clip_loss(
t2i_sim_mat_logits,
i2t_sim_mat_logits,
img_idx=img_id_list,
all_img_idx=global_img_id_list)

return loss

def forward(self, input_data, input_type):
if input_type == 'img':
img_embedding = self.vision_encoder(input_data)
img_embedding = F.normalize(img_embedding, p=2.0, dim=1)
return img_embedding
elif input_type == 'text':
text_ids_tensor, text_mask_tensor = input_data
text_embedding = self.text_encoder(text_ids_tensor,
text_mask_tensor)
text_embedding = F.normalize(text_embedding, p=2.0, dim=1)
return text_embedding
elif input_type == ModeKeys.TRAIN:
return self.get_loss(*input_data)
else:
raise ValueError('Unknown input type')


@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
class CLIPForMultiModalEmbedding(TorchModel):

def __init__(self, model_dir, device_id=-1):
super().__init__(model_dir=model_dir, device_id=device_id)
self.clip_model = CLIPModel(model_dir=model_dir)
pretrained_params = torch.load(
'{}/pytorch_model.bin'.format(model_dir), 'cpu')
self.clip_model.load_state_dict(pretrained_params)
self.clip_model.eval()

self.device_id = device_id
if self.device_id >= 0:
self.clip_model.to('cuda:{}'.format(self.device_id))
logger.info('Use GPU: {}'.format(self.device_id))
else:
logger.info('Use CPU for inference')

# image preprocessor
norm_op = Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711))
self.img_preprocessor = Compose([
Resize((self.clip_model.img_size, self.clip_model.img_size),
interpolation=Image.BICUBIC),
ToTensor(), norm_op
])

# text tokenizer
vocab_path = '{}/vocab.txt'.format(model_dir)
self.text_tokenizer = BertWordPieceTokenizer(
vocab_path, lowercase=False)
self.text_tokenizer.enable_truncation(max_length=30)

def tokenize_text(self, text_str):
tokens = self.text_tokenizer.encode(text_str)
max_tokens = 30
text_ids_tensor = torch.zeros((1, max_tokens)).long()
text_mask_tensor = torch.zeros((1, max_tokens))

text_ids, text_mask = tokens.ids, tokens.attention_mask
text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids)
text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask)

return text_ids_tensor, text_mask_tensor

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from modelscope.outputs import OutputKeys
output = {
OutputKeys.IMG_EMBEDDING: None,
OutputKeys.TEXT_EMBEDDING: None
}
if 'img' in input and input['img'] is not None:
input_img = input['img']
if isinstance(input_img, Image.Image):
img_tensor = self.img_preprocessor(input_img)[None, ...]
elif isinstance(input_img, np.ndarray):
if len(input_img.shape) == 2:
input_img = cv2.cvtColor(input_img, cv2.COLOR_GRAY2BGR)
input_img = input_img[:, :, ::-1] # in rgb order
input_img = Image.fromarray(
input_img.astype('uint8')).convert('RGB')
img_tensor = self.img_preprocessor(input_img)[None, ...]
else:
raise TypeError(
f'img should be either PIL.Image or np.array, but got {type(input_img)}'
)

if self.device_id >= 0:
img_tensor = img_tensor.to('cuda:{}'.format(self.device_id))

img_embedding = self.clip_model(
input_data=img_tensor, input_type='img')
from modelscope.outputs import OutputKeys
output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy()

if 'text' in input and input['text'] is not None:
text_str = input['text']
if isinstance(text_str, str):
text_ids_tensor, text_mask_tensor = self.tokenize_text(
text_str)
else:
raise TypeError(
f'text should be str, but got {type(text_str)}')

if self.device_id >= 0:
text_ids_tensor = text_ids_tensor.to('cuda:{}'.format(
self.device_id))
text_mask_tensor = text_mask_tensor.to('cuda:{}'.format(
self.device_id))

text_embedding = self.clip_model(
input_data=(text_ids_tensor, text_mask_tensor),
input_type='text')
output['text_embedding'] = text_embedding.data.cpu().numpy()

return output

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

+ 0
- 131
modelscope/models/multi_modal/clip/clip_vit.py View File

@@ -1,131 +0,0 @@
# Copyright 2021 The OpenAI CLIP Authors. All rights reserved.

from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_grad_ckp: bool = True):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

self.use_grad_ckp = use_grad_ckp

def forward(self, x: torch.Tensor):
if self.use_grad_ckp:
for each_block in self.resblocks:
x = checkpoint.checkpoint(each_block, x)
return x
else:
return self.resblocks(x)


class VisionTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int, use_grad_ckp: bool):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(
width, layers, heads, use_grad_ckp=use_grad_ckp)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
class_embeddings = self.class_embedding.to(x.dtype) + \
torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
x = torch.cat([class_embeddings, x], dim=1)
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x

+ 82
- 0
modelscope/models/multi_modal/clip/configuration_bert.py View File

@@ -0,0 +1,82 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """

from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging

logger = logging.getLogger(__name__)


class BertConfig(object):
r"""
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
`BertModel`.


Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
        initializer_range: The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
"""

def __init__(self,
vocab_size_or_config_json_file=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act='gelu',
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
output_attentions=False,
output_hidden_states=False):
self.vocab_size = vocab_size_or_config_json_file
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.output_attentions = output_attentions
self.output_hidden_states = output_hidden_states
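
For reference, a hedged usage sketch of the configuration class added above; the concrete values are illustrative and would normally come from the model's text_model_config.json:

    # Illustrative sketch -- not part of the diff.
    from modelscope.models.multi_modal.clip.configuration_bert import BertConfig

    text_cfg = BertConfig(
        vocab_size_or_config_json_file=21128,  # assumed vocabulary size, for illustration only
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12)
    print(text_cfg.vocab_size, text_cfg.layer_norm_eps)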

+ 677
- 0
modelscope/models/multi_modal/clip/model.py View File

@@ -0,0 +1,677 @@
import os
from collections import OrderedDict
from typing import Any, Dict, Iterable, List, Tuple, Union

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer
from modelscope.models.multi_modal.clip.configuration_bert import BertConfig
from modelscope.models.multi_modal.clip.modeling_bert import BertModel
from modelscope.utils.constant import ModeKeys, ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['CLIPForMultiModalEmbedding']


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)

self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)

self.relu = nn.ReLU(inplace=True)
self.downsample = None
self.stride = stride

if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict([('-1', nn.AvgPool2d(stride)),
('0',
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False)),
('1', nn.BatchNorm2d(planes * self.expansion))]))

def forward(self, x: torch.Tensor):
identity = x

out = self.relu(self.bn1(self.conv1(x)))
out = self.relu(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)
return out


class AttentionPool2d(nn.Module):

def __init__(self,
spacial_dim: int,
embed_dim: int,
num_heads: int,
output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(
torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads

def forward(self, x):
x = x.reshape(x.shape[0], x.shape[1],
x.shape[2] * x.shape[3]).permute(2, 0,
1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
x, _ = F.multi_head_attention_forward(
query=x,
key=x,
value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat(
[self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False)

return x[0]


class ModifiedResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim,
heads,
input_resolution=224,
width=64):
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(2)
self.relu = nn.ReLU(inplace=True)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

embed_dim = width * 32 # the ResNet feature dimension
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
(self.conv3, self.bn3)]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.attnpool(x)

return x


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

def forward(self, x: torch.Tensor):
return self.resblocks(x)


class VisualTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(width, layers, heads)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
        class_embeddings = self.class_embedding.to(x.dtype) + \
            torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([class_embeddings, x],
                      dim=1)  # shape = [*, grid ** 2 + 1, width]
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x


class CLIP(nn.Module):

def __init__(
self,
embed_dim: int,
# vision
image_resolution: int,
vision_layers: Union[Tuple[int, int, int, int], int],
vision_width: int,
vision_patch_size: int,
# text
vocab_size: int,
text_attention_probs_dropout_prob: float,
text_hidden_act: str,
text_hidden_dropout_prob: float,
text_hidden_size: int,
text_initializer_range: float,
text_intermediate_size: int,
text_max_position_embeddings: int,
text_num_attention_heads: int,
text_num_hidden_layers: int,
text_type_vocab_size: int,
tokenizer: FullTokenizer,
):
super().__init__()

if isinstance(vision_layers, (tuple, list)):
vision_heads = vision_width * 32 // 64
self.visual = ModifiedResNet(
layers=vision_layers,
output_dim=embed_dim,
heads=vision_heads,
input_resolution=image_resolution,
width=vision_width)
else:
vision_heads = vision_width // 64
self.visual = VisualTransformer(
input_resolution=image_resolution,
patch_size=vision_patch_size,
width=vision_width,
layers=vision_layers,
heads=vision_heads,
output_dim=embed_dim)

self.bert_config = BertConfig(
vocab_size_or_config_json_file=vocab_size,
hidden_size=text_hidden_size,
num_hidden_layers=text_num_hidden_layers,
num_attention_heads=text_num_attention_heads,
intermediate_size=text_intermediate_size,
hidden_act=text_hidden_act,
hidden_dropout_prob=text_hidden_dropout_prob,
attention_probs_dropout_prob=text_attention_probs_dropout_prob,
max_position_embeddings=text_max_position_embeddings,
type_vocab_size=text_type_vocab_size,
initializer_range=text_initializer_range,
layer_norm_eps=1e-12,
)
self.bert = BertModel(self.bert_config)

self.text_projection = nn.Parameter(
torch.empty(text_hidden_size, embed_dim))
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

self.tokenizer = tokenizer

self.initialize_parameters()

def initialize_parameters(self):
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

if isinstance(self.visual, ModifiedResNet):
if self.visual.attnpool is not None:
std = self.visual.attnpool.c_proj.in_features**-0.5
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

for resnet_block in [
self.visual.layer1, self.visual.layer2, self.visual.layer3,
self.visual.layer4
]:
for name, param in resnet_block.named_parameters():
if name.endswith('bn3.weight'):
nn.init.zeros_(param)

if self.text_projection is not None:
nn.init.normal_(
self.text_projection, std=self.bert_config.hidden_size**-0.5)

@property
def dtype(self):
return self.visual.conv1.weight.dtype

def encode_image(self, image):
return self.visual(image.type(self.dtype))

def encode_text(self, text):
pad_index = self.tokenizer.vocab['[PAD]']
attn_mask = text.ne(pad_index).type(self.dtype)
x = self.bert(
text, attention_mask=attn_mask)[0].type(
self.dtype) # [batch_size, seq_length, hidden_size]
return x[:, 0, :] @ self.text_projection

def forward(self, image, text):
assert image is not None or text is not None, 'text and image cannot both be None!'

if image is None:
return self.encode_text(text)
elif text is None:
return self.encode_image(image)
image_features = self.encode_image(image)
text_features = self.encode_text(text)

image_features = image_features / image_features.norm(
dim=-1, keepdim=True)
text_features = text_features / text_features.norm(
dim=-1, keepdim=True)

return image_features, text_features, self.logit_scale.exp()

def get_similarity(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)

# normalized features
image_features = image_features / image_features.norm(
dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)

# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()

# shape = [global_batch_size, global_batch_size]
return logits_per_image, logits_per_text


def convert_models_to_fp32(model):
for p in model.parameters():
p.data = p.data.float()
        if p.grad is not None:
p.grad.data = p.grad.data.float()


def convert_weights(model: nn.Module):
"""Convert applicable model parameters to fp16"""

def _convert_weights_to_fp16(module):
if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)):
module.weight.data = module.weight.data.half()
if module.bias is not None:
module.bias.data = module.bias.data.half()

if isinstance(module, nn.MultiheadAttention):
for attr in [
*[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
'in_proj_bias', 'bias_k', 'bias_v'
]:
tensor = getattr(module, attr)
if tensor is not None:
tensor.data = tensor.data.half()

if isinstance(module, BertModel):
module.to(torch.half)

for name in ['text_projection', 'proj']:
if hasattr(module, name):
attr = getattr(module, name)
if attr is not None:
attr.data = attr.data.half()

model.apply(_convert_weights_to_fp16)


def _convert_to_rgb(image):
return image.convert('RGB')


def image_transform(image_size=224):
transform = Compose([
_convert_to_rgb,
Resize((image_size, image_size)),
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711)),
])
return transform


@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
class CLIPForMultiModalEmbedding(TorchModel):

def __init__(self, model_dir, device_id=-1):
super().__init__(model_dir=model_dir, device_id=device_id)

# Initialize the model.
vision_model_config_file = '{}/vision_model_config.json'.format(
model_dir)
logger.info(
f'Loading vision model config from {vision_model_config_file}')
assert os.path.exists(vision_model_config_file)

text_model_config_file = '{}/text_model_config.json'.format(model_dir)
logger.info(f'Loading text model config from {text_model_config_file}')
assert os.path.exists(text_model_config_file)

with open(vision_model_config_file,
'r') as fv, open(text_model_config_file, 'r') as ft:
model_info = json.load(fv)
for k, v in json.load(ft).items():
model_info[k] = v

# image preprocess
self.img_preprocess = image_transform(model_info['image_resolution'])

# text tokenizer
vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}'
self.tokenizer = FullTokenizer(vocab_file=vocab_file)

# initialize the model
self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer)
convert_weights(self.clip_model)

# restore the pretrained weight
checkpoint = torch.load(
f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu')
sd = checkpoint['state_dict']
if next(iter(sd.items()))[0].startswith('module'):
sd = {k[len('module.'):]: v for k, v in sd.items()}
self.clip_model.load_state_dict(sd)
self.clip_model.eval()

# place the model
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
if self.device == 'cuda':
self.clip_model.to(self.device)
logger.info('Use GPU for inference')
else:
self.clip_model.float()
logger.info('Use CPU for inference')

def tokenize(self,
texts: Union[str, List[str]],
context_length: int = 52) -> torch.LongTensor:
"""
Returns the tokenized representation of given input string(s)
Parameters
----------
texts : Union[str, List[str]]
An input string or a list of input strings to tokenize
        context_length : int
            The context length to use (the default here is 52)
Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
"""
if isinstance(texts, str):
texts = [texts]

all_tokens = []
for text in texts:
all_tokens.append(
[self.tokenizer.vocab['[CLS]']]
+ self.tokenizer.convert_tokens_to_ids(
self.tokenizer.tokenize(text))[:context_length - 2]
+ [self.tokenizer.vocab['[SEP]']])

result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

for i, tokens in enumerate(all_tokens):
assert len(tokens) <= context_length
result[i, :len(tokens)] = torch.tensor(tokens)

return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from modelscope.outputs import OutputKeys
output = {
OutputKeys.IMG_EMBEDDING: None,
OutputKeys.TEXT_EMBEDDING: None
}
if 'img' in input and input['img'] is not None:
image_input = input['img']

# single image input
if isinstance(image_input, Image.Image):
image_tensor = self.img_preprocess(image_input).unsqueeze(0)
# multi images input
elif isinstance(image_input, list):
if all([isinstance(elem, Image.Image)
for elem in image_input]):
image_tensor = torch.stack(
[self.img_preprocess(elem) for elem in image_input],
dim=0)
else:
unsupported_elem_type = [
type(elem) for elem in image_input
if not isinstance(elem, Image.Image)
][0]
                    raise TypeError(
                        f'img should be PIL.Image or List[PIL.Image], but got '
                        f'a List containing one {unsupported_elem_type}')
# others
else:
raise TypeError(
f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}'
)

image_tensor = image_tensor.to(self.device)

with torch.no_grad():
image_features = self.clip_model.encode_image(image_tensor)
image_features /= image_features.norm(
dim=-1, keepdim=True) # l2-normalize

output[OutputKeys.IMG_EMBEDDING] = image_features

if 'text' in input and input['text'] is not None:
text_input = input['text']

# single text input
if isinstance(text_input, str):
text_tensor = self.tokenize(text_input)
# multi texts input
elif isinstance(text_input, list):
if all([isinstance(elem, str) for elem in text_input]):
text_tensor = self.tokenize(text_input)
else:
unsupported_elem_type = [
type(elem) for elem in text_input
if not isinstance(elem, str)
][0]
raise TypeError(
f'text should be str or List[str], but got a List containing one {unsupported_elem_type}'
)
# others
else:
raise TypeError(
f'text should be str or List[str], but got {type(text_input)}'
)

text_tensor = text_tensor.to(self.device)

with torch.no_grad():
text_features = self.clip_model.encode_text(text_tensor)
text_features /= text_features.norm(
dim=-1, keepdim=True) # l2-normalize
output[OutputKeys.TEXT_EMBEDDING] = text_features

return output

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

@property
def temperature(self):
return 1.0 / self.clip_model.logit_scale.exp()
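
A hedged end-to-end sketch of how the registered model above is intended to be called; the model directory and image path are placeholders and must point at a checkpoint that ships the vision/text configs, vocab file and weights the constructor expects:

    # Illustrative sketch -- not part of the diff.
    from PIL import Image
    from modelscope.outputs import OutputKeys
    from modelscope.models.multi_modal.clip.model import CLIPForMultiModalEmbedding

    model = CLIPForMultiModalEmbedding(model_dir='/path/to/clip_model_dir')  # placeholder path
    output = model({'img': Image.open('demo.jpg'),                           # placeholder image
                    'text': ['a photo of a cat', 'a photo of a dog']})

    img_emb = output[OutputKeys.IMG_EMBEDDING]   # [1, embed_dim], l2-normalized
    txt_emb = output[OutputKeys.TEXT_EMBEDDING]  # [2, embed_dim], l2-normalized
    scores = img_emb @ txt_emb.t()               # cosine similarity of the image to each text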

+ 507
- 0
modelscope/models/multi_modal/clip/modeling_bert.py View File

@@ -0,0 +1,507 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model. """

from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import math
import os
import sys
from io import open

import json
import torch
from torch import nn

from .configuration_bert import BertConfig

logger = logging.getLogger(__name__)


def gelu(x):
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1 + torch.tanh(
math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {
'gelu': gelu,
'relu': torch.nn.functional.relu,
'swish': swish,
'gelu_new': gelu_new
}

BertLayerNorm = torch.nn.LayerNorm


class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""

def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(
config.vocab_size, config.hidden_size, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings,
config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
config.hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None, position_ids=None):
seq_length = input_ids.size(1)
if position_ids is None:
position_ids = torch.arange(
seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):

def __init__(self, config):
super(BertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
'The hidden size (%d) is not a multiple of the number of attention '
'heads (%d)' %
(config.hidden_size, config.num_attention_heads))
self.output_attentions = config.output_attentions

self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size
/ config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)

self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads,
self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask=None, head_mask=None):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer,
key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(
self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask

context_layer = torch.matmul(attention_probs, value_layer)

context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (
self.all_head_size, )
context_layer = context_layer.view(*new_context_layer_shape)

outputs = (context_layer,
attention_probs) if self.output_attentions else (
context_layer, )
return outputs


class BertSelfOutput(nn.Module):

def __init__(self, config):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):

def __init__(self, config):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(config)
self.output = BertSelfOutput(config)
self.pruned_heads = set()

def forward(self, input_tensor, attention_mask=None, head_mask=None):
self_outputs = self.self(input_tensor, attention_mask, head_mask)
attention_output = self.output(self_outputs[0], input_tensor)
outputs = (attention_output,
) + self_outputs[1:] # add attentions if we output them
return outputs


class BertIntermediate(nn.Module):

def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act,
str) or (sys.version_info[0] == 2
and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):

def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):

def __init__(self, config):
super(BertLayer, self).__init__()
self.attention = BertAttention(config)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)

def forward(self, hidden_states, attention_mask=None, head_mask=None):
attention_outputs = self.attention(hidden_states, attention_mask,
head_mask)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
outputs = (layer_output, ) + attention_outputs[
1:] # add attentions if we output them
return outputs


class BertEncoder(nn.Module):

def __init__(self, config):
super(BertEncoder, self).__init__()
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.layer = nn.ModuleList(
[BertLayer(config) for _ in range(config.num_hidden_layers)])

def forward(self, hidden_states, attention_mask=None, head_mask=None):
all_hidden_states = ()
all_attentions = ()
for i, layer_module in enumerate(self.layer):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )

layer_outputs = layer_module(hidden_states, attention_mask,
head_mask[i])
hidden_states = layer_outputs[0]

if self.output_attentions:
all_attentions = all_attentions + (layer_outputs[1], )

# Add last layer
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )

outputs = (hidden_states, )
if self.output_hidden_states:
outputs = outputs + (all_hidden_states, )
if self.output_attentions:
outputs = outputs + (all_attentions, )
return outputs # last-layer hidden state, (all hidden states), (all attentions)


class BertPooler(nn.Module):

def __init__(self, config):
super(BertPooler, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertPredictionHeadTransform(nn.Module):

def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act,
str) or (sys.version_info[0] == 2
and isinstance(config.hidden_act, unicode)):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states


class BertLMPredictionHead(nn.Module):

def __init__(self, config):
super(BertLMPredictionHead, self).__init__()
self.transform = BertPredictionHeadTransform(config)

# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(
config.hidden_size, config.vocab_size, bias=False)

self.bias = nn.Parameter(torch.zeros(config.vocab_size))

def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
return hidden_states


class BertOnlyMLMHead(nn.Module):

def __init__(self, config):
super(BertOnlyMLMHead, self).__init__()
self.predictions = BertLMPredictionHead(config)

def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores


class BertOnlyNSPHead(nn.Module):

def __init__(self, config):
super(BertOnlyNSPHead, self).__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score


class BertPreTrainingHeads(nn.Module):

def __init__(self, config):
super(BertPreTrainingHeads, self).__init__()
self.predictions = BertLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score


class BertPreTrainedModel(nn.Module):
config_class = BertConfig
base_model_prefix = 'bert'

def __init__(self, config):
super(BertPreTrainedModel, self).__init__()
self.config = config

def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(
mean=0.0, std=self.config.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()


class BertModel(BertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input; you're often better off averaging or pooling
            the sequence of hidden-states for the whole input sequence.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer)
of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax,
used to compute the weighted average in the self-attention heads.

Examples::

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple

"""

def __init__(self, config):
super(BertModel, self).__init__(config)

self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config)

self.apply(self._init_weights)

def forward(self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(
dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1,
-1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(
-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters(
                )).dtype)  # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers

embedding_output = self.embeddings(
input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids)
encoder_outputs = self.encoder(
embedding_output, extended_attention_mask, head_mask=head_mask)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)

outputs = (
sequence_output,
pooled_output,
) + encoder_outputs[
1:] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
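
Unlike the Hugging Face original it was copied from, this trimmed-down BertModel has no from_pretrained helper; it is constructed directly from the BertConfig defined alongside it. A minimal sketch with toy sizes and random weights (illustrative only):

    # Illustrative sketch -- not part of the diff.
    import torch
    from modelscope.models.multi_modal.clip.configuration_bert import BertConfig
    from modelscope.models.multi_modal.clip.modeling_bert import BertModel

    config = BertConfig(vocab_size_or_config_json_file=21128)  # assumed vocabulary size
    model = BertModel(config)

    input_ids = torch.randint(0, config.vocab_size, (2, 16))
    attention_mask = (input_ids != 0).long()
    sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)[:2]
    print(sequence_output.shape, pooled_output.shape)  # [2, 16, 768] and [2, 768]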

+ 1
- 1
modelscope/models/multi_modal/diffusion/model.py View File

@@ -136,7 +136,7 @@ class DiffusionForTextToImageSynthesis(Model):
self.unet_upsampler_1024 = diffusion_model.unet_upsampler_1024

# text tokenizer
vocab_path = '{}/vocab.txt'.format(model_dir)
vocab_path = f'{model_dir}/{ModelFile.VOCAB_FILE}'
self.tokenizer = Tokenizer(vocab_file=vocab_path, seq_len=64)

# diffusion process


+ 3
- 3
modelscope/models/multi_modal/gemm/gemm_base.py View File

@@ -491,7 +491,9 @@ class GEVL(nn.Module):
gen_logits = self.to_logits(out_embs[-1:, ...])
probs = F.softmax(self.gen_logit_scale.exp() * gen_logits, dim=-1)
pred = torch.argmax(
probs * (1.0 + torch.rand_like(probs)), axis=-1)
probs * (2.0 + torch.rand_like(probs)), axis=-1)
if int(pred) >= eot_token or int(pred) <= 0:
break
pred_tokens.append(pred)
text_input = torch.cat(
[text_input, pred.permute(1, 0).contiguous()], axis=1)
@@ -500,8 +502,6 @@ class GEVL(nn.Module):
for out_tokens in pred_text_tokens:
tokens = []
for x in out_tokens:
if x >= eot_token or x <= 0:
break
tokens.append(int(x))
out_text = self.tokenizer.decode(tokens)
out_text = out_text.strip()


+ 1
- 2
modelscope/models/multi_modal/mplug/__init__.py View File

@@ -14,5 +14,4 @@
# limitations under the License.

from .configuration_mplug import MPlugConfig
from .modeling_mplug import (CONFIG_NAME, VOCAB_NAME,
MPlugForVisualQuestionAnswering)
from .modeling_mplug import CONFIG_NAME, MPlug

+ 61
- 1
modelscope/models/multi_modal/mplug/clip/clip.py View File

@@ -5,9 +5,69 @@ from typing import Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn

from modelscope.models.multi_modal.clip.clip_vit import Transformer

class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()
self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_grad_ckp: bool = True):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])
self.use_grad_ckp = use_grad_ckp

def forward(self, x: torch.Tensor):
if self.use_grad_ckp:
for each_block in self.resblocks:
x = checkpoint.checkpoint(each_block, x)
return x
else:
return self.resblocks(x)
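
Both Transformer copies in this patch accept an additive attn_mask; for an autoregressive text tower that argument is typically a causal mask. A hedged sketch of how such a mask is commonly built (the helper name is illustrative, not part of the patch):

    # Illustrative sketch -- not part of the diff.
    import torch

    def build_causal_attn_mask(seq_len: int) -> torch.Tensor:
        # additive mask: 0 on and below the diagonal, -inf above it,
        # so each position attends only to itself and earlier positions
        mask = torch.full((seq_len, seq_len), float('-inf'))
        return torch.triu(mask, diagonal=1)

    print(build_causal_attn_mask(4))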


class Bottleneck(nn.Module):


+ 6
- 23
modelscope/models/multi_modal/mplug/configuration_mplug.py View File

@@ -15,14 +15,14 @@
# limitations under the License.
""" MPLUG model configuration """
import os
from collections import OrderedDict
from typing import Any, Dict, Mapping, Union
from typing import Any, Dict, Union

import yaml
from transformers import PretrainedConfig
from transformers.onnx import OnnxConfig
from transformers.utils import logging

from modelscope.utils.constant import Tasks

logger = logging.get_logger(__name__)


@@ -32,6 +32,7 @@ class MPlugConfig(PretrainedConfig):

def __init__(
self,
task=Tasks.visual_question_answering,
bert_config='config_bert.json',
image_res=504,
batch_size_train=128,
@@ -64,7 +65,9 @@ class MPlugConfig(PretrainedConfig):
clip_transformer_heads=12,
clip_transformer_layers=12,
**kwargs):

super().__init__(**kwargs)
self.task = task
self.bert_config = bert_config
self.image_res = image_res
self.batch_size_train = batch_size_train
@@ -103,23 +106,3 @@ class MPlugConfig(PretrainedConfig):
with open(yaml_file, 'r') as reader:
config_dict = yaml.load(reader, Loader=yaml.Loader)
return cls(**config_dict)


class MPlugOnnxConfig(OnnxConfig):

@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict([
('input_ids', {
0: 'batch',
1: 'sequence'
}),
('attention_mask', {
0: 'batch',
1: 'sequence'
}),
('token_type_ids', {
0: 'batch',
1: 'sequence'
}),
])
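
A hedged sketch of the from_yaml_file entry point kept above; the path is a placeholder for a model directory that ships a config.yaml containing the fields listed in MPlugConfig.__init__:

    # Illustrative sketch -- not part of the diff.
    from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig

    cfg = MPlugConfig.from_yaml_file('/path/to/mplug_model_dir/config.yaml')  # placeholder path
    print(cfg.task, cfg.image_res, cfg.clip_transformer_layers)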

+ 376
- 144
modelscope/models/multi_modal/mplug/modeling_mplug.py View File

@@ -42,14 +42,13 @@ from transformers.utils import logging

from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig
from modelscope.models.multi_modal.mplug.predictor import TextGenerator
from modelscope.utils.constant import ModelFile

transformers.logging.set_verbosity_error()

logger = logging.get_logger(__name__)

CONFIG_NAME = 'config.yaml'
WEIGHTS_NAME = 'pytorch_model.bin'
VOCAB_NAME = 'vocab.txt'

_CONFIG_FOR_DOC = 'BertConfig'
_TOKENIZER_FOR_DOC = 'BertTokenizer'
@@ -1726,32 +1725,145 @@ class BertLMHeadModel(BertPreTrainedModel):
return reordered_past


class MPlugForVisualQuestionAnswering(PreTrainedModel):
class BertPrefixModel(BertPreTrainedModel):

_keys_to_ignore_on_load_unexpected = [r'pooler']
_keys_to_ignore_on_load_missing = [
r'position_ids', r'predictions.decoder.bias'
]

def __init__(self, config):
super().__init__(config)

self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)

self.init_weights()

def get_output_embeddings(self):
return self.cls.predictions.decoder

def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings

@add_start_docstrings_to_model_forward(
BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint='bert-base-uncased',
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
labels=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
is_decoder=True,
reduction='mean',
soft_labels=None,
alpha=0,
return_logits=False,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False

outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
is_decoder=is_decoder,
)

sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)

if return_logits:
return prediction_scores[:, :-1, :].contiguous()

lm_loss = None
if labels is not None:
# we are doing next-token prediction; shift prediction scores and input ids by one
shifted_prediction_scores = prediction_scores[:, :
-1, :].contiguous()
labels = labels[:, 1:].contiguous()
loss_fct = CrossEntropyLoss()
lm_loss = loss_fct(
shifted_prediction_scores.view(-1, self.config.vocab_size),
labels.view(-1))
if soft_labels is not None:
loss_distill = -torch.sum(
F.log_softmax(shifted_prediction_scores, dim=1) * soft_labels,
dim=-1)
loss_distill = loss_distill[labels != -100].mean()
lm_loss = (1 - alpha) * lm_loss + alpha * loss_distill

if not return_dict:
output = (prediction_scores, ) + outputs[2:]
return ((lm_loss, ) + output) if lm_loss is not None else output

return CausalLMOutputWithCrossAttentions(
loss=lm_loss,
logits=prediction_scores,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)


class MPlug(PreTrainedModel):
config_class = MPlugConfig

def __init__(self, config):
super().__init__(config)
self.config = config
self.tokenizer = BertTokenizer.from_pretrained(
os.path.join(config.model_dir, VOCAB_NAME))
os.path.join(config.model_dir, ModelFile.VOCAB_FILE))
self.module_setting(config)
self.visual_encoder = self._initialize_clip(config)
self.text_encoder = BertModel(
self.config_encoder, add_pooling_layer=False)
self.fusion_encoder = FusionModel(
self.config_fusion, add_pooling_layer=False)
self.text_decoder = BertLMHeadModel(self.config_decoder)
self.init_distill(config)
self.beam_generator = TextGenerator(config, self.text_decoder)

@classmethod
def from_pretrained(cls, model_dir, load_checkpoint=True):
config = MPlugConfig.from_yaml_file(
from modelscope.utils.constant import Tasks

task_mapping = {
Tasks.visual_question_answering: MPlugForVisualQuestionAnswering,
Tasks.image_captioning: MPLUGForImageCaption
}
config = cls.config_class.from_yaml_file(
os.path.join(model_dir, CONFIG_NAME))
config.model_dir = model_dir
model = cls(config)
model = task_mapping[config.task](config)
if load_checkpoint:
checkpoint_path = os.path.join(model_dir, WEIGHTS_NAME)
checkpoint_path = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
checkpoint = torch.load(checkpoint_path, map_location='cpu')
if 'model' in checkpoint:
state_dict = checkpoint['model']
@@ -1803,6 +1915,161 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel):
clip_model.visual.positional_embedding = pos_embed
return clip_model

def forward(self, *args, **kwargs):
raise NotImplementedError

def module_setting(self, config):
bert_config_path = os.path.join(config.model_dir, config.bert_config)
self.config_encoder = BertConfig.from_json_file(bert_config_path)
self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers
self.config_fusion = BertConfig.from_json_file(bert_config_path)
self.config_decoder = BertConfig.from_json_file(bert_config_path)
self.config_decoder.add_cross_attention = True
self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers
self.large = False
if self.config_encoder.hidden_size != config.vision_width:
self.visn_fc = nn.Linear(config.vision_width,
self.config_encoder.hidden_size)
self.visn_layer_norm = nn.LayerNorm(
self.config_encoder.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob)
self.large = True

@torch.no_grad()
def copy_params(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(),
model_pair[1].parameters()):
param_m.data.copy_(param.data) # initialize
param_m.requires_grad = False # not update by gradient

@torch.no_grad()
def _momentum_update(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(),
model_pair[1].parameters()):
param_m.data = param_m.data * self.momentum + param.data * (
1. - self.momentum)

def generation(self, question_states, question_atts, out_size=1):
encoder_inputs = [question_states, question_atts]
topk_ids, topk_scores = self.beam_generator.translate_batch(
encoder_inputs, out_size=out_size)
return topk_ids, topk_scores

@staticmethod
def _tile(x, dim, n_tile):
import numpy as np
init_dim = x.size(dim)
repeat_idx = [1] * x.dim()
repeat_idx[dim] = n_tile
x = x.repeat(*(repeat_idx))
order_index = torch.LongTensor(
np.concatenate(
[init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
return torch.index_select(x, dim, order_index.to(x.device))

def rank_answer(self, question_states, question_atts, answer_ids,
answer_atts, k):

num_ques = question_states.size(0)
start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token

start_output = self.text_decoder(
start_ids,
encoder_hidden_states=question_states,
encoder_attention_mask=question_atts,
return_dict=True,
reduction='none')
logits = start_output.logits[:, 0, :] # first token's logit

# topk_probs: top-k probability
# topk_ids: [num_question, k]
answer_first_token = answer_ids[:, 1]
prob_first_token = F.softmax(
logits, dim=1).index_select(
dim=1, index=answer_first_token)
topk_probs, topk_ids = prob_first_token.topk(k, dim=1)

# answer input: [num_question*k, answer_len]
input_ids = []
input_atts = []
for b, topk_id in enumerate(topk_ids):
input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
input_ids = torch.cat(input_ids, dim=0)
input_atts = torch.cat(input_atts, dim=0)

targets_ids = input_ids.masked_fill(
input_ids == self.tokenizer.pad_token_id, -100)

# repeat encoder's output for top-k answers
question_states = self._tile(question_states, 0, k)
question_atts = self._tile(question_atts, 0, k)

output = self.text_decoder(
input_ids,
attention_mask=input_atts,
encoder_hidden_states=question_states,
encoder_attention_mask=question_atts,
labels=targets_ids,
return_dict=True,
reduction='none')

answer_loss = output.loss
answer_loss = answer_loss.view(input_ids.size(0), -1)

# topk_prob: first token probability
topk_probs = topk_probs.view(-1, 1)
log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1)

# re-calculate log probabilities for the answer sequences using chain rule
log_probs_sum = log_probs.sum(1)
log_probs_sum = log_probs_sum.view(num_ques, k)

topk_probs = F.softmax(log_probs_sum, dim=-1)
# get top-k after re-ranking
topk_probs, rerank_id = topk_probs.topk(k, dim=1)
topk_ids = torch.gather(topk_ids, 1, rerank_id)

return topk_ids, topk_probs


class MPlugForVisualQuestionAnswering(MPlug):

def __init__(self, config):
super().__init__(config)
self.text_decoder = BertLMHeadModel(self.config_decoder)
self.beam_generator = TextGenerator(config, self.text_decoder)
self.init_distill(config)

def init_distill(self, config):
self.distill = config.distill
if self.distill:
self.visual_encoder_m = self._initialize_clip(config)
self.text_encoder_m = BertModel(
self.config_encoder, add_pooling_layer=False)
self.fusion_encoder_m = FusionModel(
self.config_fusion, add_pooling_layer=False)
self.text_decoder_m = BertLMHeadModel(self.config_decoder)
self.model_pairs = [
[self.visual_encoder, self.visual_encoder_m],
[self.text_encoder, self.text_encoder_m],
[self.text_decoder, self.text_decoder_m],
]
if self.config_encoder.hidden_size != config.vision_width:
self.visn_fc_m = nn.Linear(config.vision_width,
self.config_encoder.hidden_size)
self.visn_layer_norm_m = nn.LayerNorm(
self.config_encoder.hidden_size, eps=1e-12)
self.dropout_m = nn.Dropout(
self.config_encoder.hidden_dropout_prob)
self.model_pairs.extend(
[[self.visn_fc, self.visn_fc_m],
[self.visn_layer_norm, self.visn_layer_norm_m]])
self.copy_params()
self.momentum = 0.995

def forward(self,
image,
question,
@@ -1935,145 +2202,110 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel):
merge_text_attention)
return topk_ids, topk_probs

def module_setting(self, config):
bert_config_path = os.path.join(config.model_dir, config.bert_config)
self.config_encoder = BertConfig.from_json_file(bert_config_path)
self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers
self.config_fusion = BertConfig.from_json_file(bert_config_path)
self.config_decoder = BertConfig.from_json_file(bert_config_path)
self.config_decoder.add_cross_attention = True
self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers
self.large = False
if self.config_encoder.hidden_size != config.vision_width:
self.visn_fc = nn.Linear(config.vision_width,
self.config_encoder.hidden_size)
self.visn_layer_norm = nn.LayerNorm(
self.config_encoder.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob)
self.large = True

def init_distill(self, config):
self.distill = config.distill
if self.distill:
self.visual_encoder_m = self._initialize_clip(config)
self.text_encoder_m = BertModel(
self.config_encoder, add_pooling_layer=False)
self.fusion_encoder_m = FusionModel(
self.config_fusion, add_pooling_layer=False)
self.text_decoder_m = BertLMHeadModel(self.config_decoder)
self.model_pairs = [
[self.visual_encoder, self.visual_encoder_m],
[self.text_encoder, self.text_encoder_m],
[self.text_decoder, self.text_decoder_m],
]
if self.config_encoder.hidden_size != config.vision_width:
self.visn_fc_m = nn.Linear(config.vision_width,
self.config_encoder.hidden_size)
self.visn_layer_norm_m = nn.LayerNorm(
self.config_encoder.hidden_size, eps=1e-12)
self.dropout_m = nn.Dropout(
self.config_encoder.hidden_dropout_prob)
self.model_pairs.extend(
[[self.visn_fc, self.visn_fc_m],
[self.visn_layer_norm, self.visn_layer_norm_m]])
self.copy_params()
self.momentum = 0.995

@torch.no_grad()
def copy_params(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(),
model_pair[1].parameters()):
param_m.data.copy_(param.data) # initialize
param_m.requires_grad = False # not update by gradient

@torch.no_grad()
def _momentum_update(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(),
model_pair[1].parameters()):
param_m.data = param_m.data * self.momentum + param.data * (
1. - self.momentum)

def generation(self, question_states, question_atts):
encoder_inputs = [question_states, question_atts]
topk_ids, topk_scores = self.beam_generator.translate_batch(
encoder_inputs)
return topk_ids, topk_scores

@staticmethod
def _tile(x, dim, n_tile):
import numpy as np
init_dim = x.size(dim)
repeat_idx = [1] * x.dim()
repeat_idx[dim] = n_tile
x = x.repeat(*(repeat_idx))
order_index = torch.LongTensor(
np.concatenate(
[init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
return torch.index_select(x, dim, order_index.to(x.device))

    def rank_answer(self, question_states, question_atts, answer_ids,
                    answer_atts, k):

        num_ques = question_states.size(0)
        start_ids = answer_ids[0, 0].repeat(num_ques, 1)  # bos token

        start_output = self.text_decoder(
            start_ids,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            return_dict=True,
            reduction='none')
        logits = start_output.logits[:, 0, :]  # first token's logit

        # topk_probs: top-k probability
        # topk_ids: [num_question, k]
        answer_first_token = answer_ids[:, 1]
        prob_first_token = F.softmax(
            logits, dim=1).index_select(
                dim=1, index=answer_first_token)
        topk_probs, topk_ids = prob_first_token.topk(k, dim=1)

        # answer input: [num_question*k, answer_len]
        input_ids = []
        input_atts = []
        for b, topk_id in enumerate(topk_ids):
            input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
            input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
        input_ids = torch.cat(input_ids, dim=0)
        input_atts = torch.cat(input_atts, dim=0)

        targets_ids = input_ids.masked_fill(
            input_ids == self.tokenizer.pad_token_id, -100)

        # repeat encoder's output for top-k answers
        question_states = self._tile(question_states, 0, k)
        question_atts = self._tile(question_atts, 0, k)

        output = self.text_decoder(
            input_ids,
            attention_mask=input_atts,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            labels=targets_ids,
            return_dict=True,
            reduction='none')

        answer_loss = output.loss
        answer_loss = answer_loss.view(input_ids.size(0), -1)

        # topk_prob: first token probability
        topk_probs = topk_probs.view(-1, 1)
        log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1)

        # re-calculate log probabilities for the answer sequences using chain rule
        log_probs_sum = log_probs.sum(1)
        log_probs_sum = log_probs_sum.view(num_ques, k)

        topk_probs = F.softmax(log_probs_sum, dim=-1)
        # get top-k after re-ranking
        topk_probs, rerank_id = topk_probs.topk(k, dim=1)
        topk_ids = torch.gather(topk_ids, 1, rerank_id)

        return topk_ids, topk_probs


class MPLUGForImageCaption(MPlug):

    def __init__(self, config):
        super().__init__(config)
        self.text_decoder = BertPrefixModel(self.config_decoder)
        self.beam_generator = TextGenerator(config, self.text_decoder)

    def beam_search(self,
                    image,
                    question,
                    answer=None,
                    train=True,
                    out_size=5):
        image_embeds = self.visual_encoder.visual(image, skip_last_layer=True)
        if self.large:
            image_embeds = self.dropout(
                self.visn_layer_norm(self.visn_fc(image_embeds)))
        image_atts = torch.ones(
            image_embeds.size()[:-1], dtype=torch.long).to(image.device)
        text_output = self.text_encoder(
            question.input_ids,
            attention_mask=question.attention_mask,
            return_dict=True)
        text_embeds = text_output.last_hidden_state
        fusion_output = self.fusion_encoder(
            encoder_embeds=text_embeds,
            attention_mask=question.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=False)
        image_output, question_output = fusion_output
        question_output = torch.cat([image_output, question_output], 1)
        merge_text_attention = torch.cat(
            [image_atts, question.attention_mask], 1)
        topk_ids, topk_probs = self.generation(
            question_output, merge_text_attention, out_size=out_size)
        return topk_ids, topk_probs

    def forward(self,
                image,
                question,
                answer=None,
                train=True,
                out_size=5,
                scst=False):
        if scst:
            return self.beam_search(
                image, question, answer, train=True, out_size=out_size)
        image = image.to(dtype=next(self.parameters()).dtype)
        image_embeds = self.visual_encoder.visual(image, skip_last_layer=True)
        if self.large:
            image_embeds = self.dropout(
                self.visn_layer_norm(self.visn_fc(image_embeds)))
        image_atts = torch.ones(
            image_embeds.size()[:-1], dtype=torch.long).to(image.device)

        if train:
            answer_targets = answer.input_ids.masked_fill(
                answer.input_ids == self.tokenizer.pad_token_id, -100)
            text_output = self.text_encoder(
                question.input_ids,
                attention_mask=question.attention_mask,
                return_dict=True)
            text_embeds = text_output.last_hidden_state
            fusion_output = self.fusion_encoder(
                encoder_embeds=text_embeds,
                attention_mask=question.attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=False)
            image_output, question_output = fusion_output
            question_output = torch.cat([image_output, question_output], 1)
            merge_text_attention = torch.cat(
                [image_atts, question.attention_mask], 1)
            answer_output = self.text_decoder(
                answer.input_ids,
                attention_mask=answer.attention_mask,
                encoder_hidden_states=question_output,
                encoder_attention_mask=merge_text_attention,
                labels=answer_targets,
                return_dict=True,
                reduction='none')
            loss = answer_output.loss
            return loss
        else:
            text_output = self.text_encoder(
                question.input_ids,
                attention_mask=question.attention_mask,
                return_dict=True)
            text_embeds = text_output.last_hidden_state
            fusion_output = self.fusion_encoder(
                encoder_embeds=text_embeds,
                attention_mask=question.attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=False)
            image_output, question_output = fusion_output
            question_output = torch.cat([image_output, question_output], 1)
            merge_text_attention = torch.cat(
                [image_atts, question.attention_mask], 1)
            topk_ids, topk_probs = self.generation(question_output,
                                                   merge_text_attention)
            return topk_ids, topk_probs
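A note on the re-ranking step above (an editorial sketch, not part of the diff): rank_answer keeps the k candidate answers with the highest first-token probability and then re-scores each candidate as

    \mathrm{score}(a) \;=\; \log p_{\text{first}}(a_1) \;-\; \sum_{t} \mathrm{CE}_t(a)

which approximates the chain-rule sequence log-likelihood \log P(a \mid q) = \sum_t \log P(a_t \mid q, a_{<t}), because the per-token cross-entropy returned with reduction='none' is the negative log-probability of each answer token. The final softmax over log_probs_sum converts these scores back into normalized top-k probabilities.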

modelscope/models/multi_modal/mplug_for_visual_question_answering.py → modelscope/models/multi_modal/mplug_for_all_tasks.py View File

@@ -6,12 +6,13 @@ from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks

__all__ = ['MPlugForVisualQuestionAnswering']
__all__ = ['MPlugForAllTasks']


@MODELS.register_module(
Tasks.visual_question_answering, module_name=Models.mplug)
class MPlugForVisualQuestionAnswering(TorchModel):
@MODELS.register_module(Tasks.image_captioning, module_name=Models.mplug)
class MPlugForAllTasks(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the mplug model from the `model_dir` path.
@@ -20,8 +21,8 @@ class MPlugForVisualQuestionAnswering(TorchModel):
"""

super().__init__(model_dir, *args, **kwargs)
from modelscope.models.multi_modal.mplug import MPlugForVisualQuestionAnswering
self.model = MPlugForVisualQuestionAnswering.from_pretrained(model_dir)
from modelscope.models.multi_modal.mplug import MPlug
self.model = MPlug.from_pretrained(model_dir)
self.tokenizer = self.model.tokenizer

def train(self):
@@ -44,4 +45,13 @@ class MPlugForVisualQuestionAnswering(TorchModel):
}
"""

return self.model(**input)[0]
topk_ids, _ = self.model(**input)
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))

pred_string = self.tokenizer.decode(topk_ids[0][0])
for _old, _new in replace_tokens_bert:
pred_string = pred_string.replace(_old, _new)
pred_string = pred_string.strip()
return pred_string

+ 3
- 1
modelscope/models/multi_modal/ofa/tokenization_ofa.py View File

@@ -22,6 +22,8 @@ from transformers.models.bert.tokenization_bert import (BasicTokenizer,
WordpieceTokenizer)
from transformers.utils import logging

from modelscope.utils.constant import ModelFile

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.json', 'merges_file': 'merges.txt'}
@@ -42,7 +44,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ofa-base': 1024,
}

VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE}

PRETRAINED_VOCAB_FILES_MAP_ZH = {
'vocab_file': {


+ 2
- 1
modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py View File

@@ -20,6 +20,7 @@ from transformers import PreTrainedTokenizerFast
from transformers.models.bart.tokenization_bart_fast import BartTokenizerFast
from transformers.utils import logging

from modelscope.utils.constant import ModelFile
from .tokenization_ofa import OFATokenizer, OFATokenizerZH

logger = logging.get_logger(__name__)
@@ -50,7 +51,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ofa-base': 1024,
}

VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE}

PRETRAINED_VOCAB_FILES_MAP_ZH = {
'vocab_file': {


+ 2
- 1
modelscope/models/nlp/structbert/tokenization_sbert.py View File

@@ -23,11 +23,12 @@ from typing import List, Optional, Tuple
from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control,
_is_punctuation, _is_whitespace)

from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE}

PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}



+ 2
- 1
modelscope/models/nlp/structbert/tokenization_sbert_fast.py View File

@@ -22,13 +22,14 @@ import transformers
from tokenizers import normalizers
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger
from .tokenization_sbert import SbertTokenizer

logger = get_logger(__name__)

VOCAB_FILES_NAMES = {
'vocab_file': 'vocab.txt',
'vocab_file': ModelFile.VOCAB_FILE,
'tokenizer_file': 'tokenizer.json'
}



+ 54
- 32
modelscope/msdatasets/ms_dataset.py View File

@@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path,
relative_to_absolute_path)

from modelscope.msdatasets.config import MS_DATASETS_CACHE
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DatasetFormations, DownloadMode, Hubs)
from modelscope.utils.logger import get_logger
from .task_datasets.builder import build_task_dataset
from .utils.dataset_builder import ExternalDataset
from .utils.dataset_utils import (get_dataset_files,
get_target_dataset_structure,
load_dataset_builder)
@@ -67,9 +70,16 @@ class MsDataset:
def __len__(self):
return len(self._hf_ds)

@property
def config_kwargs(self):
if isinstance(self._hf_ds, ExternalDataset):
return self._hf_ds.config_kwargs
else:
return None

@classmethod
def from_hf_dataset(cls,
hf_ds: Union[Dataset, DatasetDict],
hf_ds: Union[Dataset, DatasetDict, ExternalDataset],
target: str = None) -> Union[dict, 'MsDataset']:
if isinstance(hf_ds, Dataset):
return cls(hf_ds, target)
@@ -77,6 +87,8 @@ class MsDataset:
if len(hf_ds.keys()) == 1:
return cls(next(iter(hf_ds.values())), target)
return {k: cls(v, target) for k, v in hf_ds.items()}
elif isinstance(hf_ds, ExternalDataset):
return cls(hf_ds)
else:
raise TypeError(
f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}'
@@ -96,7 +108,8 @@ class MsDataset:
Mapping[str, Union[str,
Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = DownloadMode.
REUSE_DATASET_IF_EXISTS
REUSE_DATASET_IF_EXISTS,
**config_kwargs,
) -> Union[dict, 'MsDataset']:
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
Args:
@@ -113,6 +126,7 @@ class MsDataset:
hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
download_mode (DownloadMode or str, optional): How to treat existing datasets. default
DownloadMode.REUSE_DATASET_IF_EXISTS
**config_kwargs (additional keyword arguments): Keyword arguments to be passed
to the underlying dataset builder.

Returns:
MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
@@ -128,7 +142,8 @@ class MsDataset:
split=split,
data_dir=data_dir,
data_files=data_files,
download_mode=download_mode.value)
download_mode=download_mode.value,
**config_kwargs)
return MsDataset.from_hf_dataset(dataset, target=target)
elif hub == Hubs.modelscope:
return MsDataset._load_ms_dataset(
@@ -140,22 +155,22 @@ class MsDataset:
split=split,
data_dir=data_dir,
data_files=data_files,
download_mode=download_mode)
download_mode=download_mode,
**config_kwargs)

@staticmethod
def _load_ms_dataset(
dataset_name: Union[str, list],
namespace: Optional[str] = None,
target: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str, Union[str,
Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = None
) -> Union[dict, 'MsDataset']:
def _load_ms_dataset(dataset_name: Union[str, list],
namespace: Optional[str] = None,
target: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[
str, Sequence[str],
Mapping[str, Union[str, Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = None,
**config_kwargs) -> Union[dict, 'MsDataset']:
if isinstance(dataset_name, str):
dataset_formation = DatasetFormations.native
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -184,7 +199,8 @@ class MsDataset:
data_dir=data_dir,
data_files=data_files,
cache_dir=MS_DATASETS_CACHE,
download_mode=download_mode.value)
download_mode=download_mode.value,
**config_kwargs)
else:
dataset = MsDataset._load_from_ms(
dataset_name,
@@ -195,7 +211,7 @@ class MsDataset:
subset_name=subset_name,
split=split,
download_mode=download_mode,
)
**config_kwargs)
elif isinstance(dataset_name, list):
if target is None:
target = 'target'
@@ -206,16 +222,15 @@ class MsDataset:
return MsDataset.from_hf_dataset(dataset, target=target)

@staticmethod
def _load_from_ms(
dataset_name: str,
dataset_files: dict,
download_dir: str,
namespace: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
download_mode: Optional[DownloadMode] = None,
) -> Union[Dataset, DatasetDict]:
def _load_from_ms(dataset_name: str,
dataset_files: dict,
download_dir: str,
namespace: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
download_mode: Optional[DownloadMode] = None,
**config_kwargs) -> Union[Dataset, DatasetDict]:
for json_path in dataset_files['.json']:
if json_path.endswith(f'{dataset_name}.json'):
with open(json_path, encoding='utf-8') as dataset_json_file:
@@ -226,7 +241,6 @@ class MsDataset:
meta_map, file_map = get_dataset_files(target_dataset_structure,
dataset_name, namespace,
version)

builder = load_dataset_builder(
dataset_name,
subset_name,
@@ -235,7 +249,8 @@ class MsDataset:
zip_data_files=file_map,
cache_dir=MS_DATASETS_CACHE,
version=version,
split=list(target_dataset_structure.keys()))
split=list(target_dataset_structure.keys()),
**config_kwargs)

download_config = DownloadConfig(
cache_dir=download_dir,
@@ -253,7 +268,6 @@ class MsDataset:
data_dir=download_dir,
)
builder.download_and_prepare(
download_config=download_config,
dl_manager=dl_manager,
download_mode=download_mode.value,
try_from_hf_gcs=False)
@@ -338,6 +352,8 @@ class MsDataset:
self,
columns: Union[str, List[str]] = None,
preprocessors: Union[Callable, List[Callable]] = None,
task_name: str = None,
task_data_config: ConfigDict = None,
**format_kwargs,
):
"""Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
@@ -350,6 +366,8 @@ class MsDataset:
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
the output fields of processors will also be added.
task_name (str, default None): task name, refer to :obj:`Tasks` for more details
task_data_config (ConfigDict, default None): config dict for model object.
format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

Returns:
@@ -360,6 +378,10 @@ class MsDataset:
raise ImportError(
'The function to_torch_dataset requires pytorch to be installed'
)
if isinstance(self._hf_ds, ExternalDataset):
task_data_config.update({'preprocessor': preprocessors})
return build_task_dataset(task_data_config, task_name,
self._hf_ds.config_kwargs)
if preprocessors is not None:
return self.to_torch_dataset_with_processors(
preprocessors, columns=columns)
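To illustrate the new **config_kwargs / ExternalDataset path end to end, a hedged usage sketch (the dataset id, namespace, extra keyword arguments, and task-dataset type are placeholders, not taken from this diff):

from modelscope.msdatasets import MsDataset
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import Tasks

# Extra keyword arguments are forwarded through load() -> _load_ms_dataset()
# -> load_dataset_builder() and end up in ExternalDataset.config_kwargs.
ds = MsDataset.load(
    'some_zip_only_dataset',      # hypothetical dataset without a csv meta file
    namespace='some_namespace',   # hypothetical namespace
    split='train',
    classes=('person', 'car'))    # forwarded via **config_kwargs

# For an ExternalDataset, to_torch_dataset() builds a registered task dataset
# from task_data_config plus the forwarded config_kwargs.
torch_ds = ds.to_torch_dataset(
    task_name=Tasks.image_segmentation,
    task_data_config=ConfigDict(type='SomeCocoStyleDataset'),  # hypothetical type
    preprocessors=None)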


modelscope/task_datasets/__init__.py → modelscope/msdatasets/task_datasets/__init__.py View File

@@ -8,6 +8,7 @@ if TYPE_CHECKING:
from .builder import TASK_DATASETS, build_task_dataset
from .torch_base_dataset import TorchTaskDataset
from .veco_dataset import VecoDataset
from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset

else:
_import_structure = {
@@ -15,6 +16,8 @@ else:
'builder': ['TASK_DATASETS', 'build_task_dataset'],
'torch_base_dataset': ['TorchTaskDataset'],
'veco_dataset': ['VecoDataset'],
'image_instance_segmentation_coco_dataset':
['ImageInstanceSegmentationCocoDataset']
}
import sys


modelscope/task_datasets/base.py → modelscope/msdatasets/task_datasets/base.py View File


modelscope/task_datasets/builder.py → modelscope/msdatasets/task_datasets/builder.py View File


modelscope/models/cv/image_instance_segmentation/datasets/dataset.py → modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py View File

@@ -2,14 +2,32 @@ import os.path as osp

import numpy as np
from pycocotools.coco import COCO
from torch.utils.data import Dataset


class ImageInstanceSegmentationCocoDataset(Dataset):
from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from .builder import TASK_DATASETS
from .torch_base_dataset import TorchTaskDataset

DATASET_STRUCTURE = {
'train': {
'annotation': 'annotations/instances_train.json',
'images': 'images/train'
},
'validation': {
'annotation': 'annotations/instances_val.json',
'images': 'images/val'
}
}


@TASK_DATASETS.register_module(
module_name=Models.cascade_mask_rcnn_swin,
group_key=Tasks.image_segmentation)
class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
"""Coco-style dataset for image instance segmentation.

Args:
ann_file (str): Annotation file path.
split_config (dict): Annotation file path. {"train":"xxxxx"}
classes (Sequence[str], optional): Specify classes to load.
If is None, ``cls.CLASSES`` will be used. Default: None.
data_root (str, optional): Data root for ``ann_file``,
@@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')

def __init__(self,
ann_file,
split_config: dict,
preprocessor=None,
classes=None,
data_root=None,
img_prefix='',
seg_prefix=None,
test_mode=False,
filter_empty_gt=True):
self.ann_file = ann_file
self.data_root = data_root
self.img_prefix = img_prefix
filter_empty_gt=True,
**kwargs):
self.data_root = next(iter(split_config.values()))
self.split = next(iter(split_config.keys()))
self.preprocessor = preprocessor

self.ann_file = osp.join(self.data_root,
DATASET_STRUCTURE[self.split]['annotation'])

self.img_prefix = osp.join(self.data_root,
DATASET_STRUCTURE[self.split]['images'])
self.seg_prefix = seg_prefix
self.test_mode = test_mode
self.filter_empty_gt = filter_empty_gt
self.CLASSES = self.get_classes(classes)

# join paths if data_root is specified
if self.data_root is not None:
if not osp.isabs(self.ann_file):
self.ann_file = osp.join(self.data_root, self.ann_file)
if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
self.img_prefix = osp.join(self.data_root, self.img_prefix)
if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
self.seg_prefix = osp.join(self.data_root, self.seg_prefix)

# load annotations
self.data_infos = self.load_annotations(self.ann_file)

@@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
# set group flag for the sampler
self._set_group_flag()

self.preprocessor = None

def __len__(self):
"""Total number of samples of data."""
return len(self.data_infos)
@@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
raise ValueError(f'Unsupported type {type(classes)} of classes.')

return class_names

def to_torch_dataset(self, preprocessors=None):
self.preprocessor = preprocessors
return self
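For reference, a minimal hedged sketch of constructing the refactored dataset directly; the local path is a placeholder and must contain the annotations/instances_train.json and images/train layout declared in DATASET_STRUCTURE above:

from modelscope.msdatasets.task_datasets import ImageInstanceSegmentationCocoDataset

train_ds = ImageInstanceSegmentationCocoDataset(
    split_config={'train': '/path/to/coco_style_root'},  # placeholder root dir
    preprocessor=None,   # a preprocessor can be injected later by the trainer
    classes=None)        # falls back to cls.CLASSES
print(len(train_ds))     # number of (filtered) images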

modelscope/task_datasets/torch_base_dataset.py → modelscope/msdatasets/task_datasets/torch_base_dataset.py View File


modelscope/task_datasets/veco_dataset.py → modelscope/msdatasets/task_datasets/veco_dataset.py View File


+ 92
- 3
modelscope/msdatasets/utils/dataset_builder.py View File

@@ -8,6 +8,7 @@ from datasets.info import DatasetInfo
from datasets.packaged_modules import csv
from datasets.utils.filelock import FileLock

from modelscope.utils.constant import DownloadMode
from modelscope.utils.logger import get_logger

logger = get_logger()
@@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv):
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
**config_kwargs,
):
self.namespace = namespace
super().__init__(
cache_dir=cache_dir,
name=subset_name,
hash=hash,
namespace=namespace,
data_files=meta_data_files,
**config_kwargs)

@@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv):
os.rmdir(self._cache_dir)
self.zip_data_files = zip_data_files

def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
"""Relative path of this dataset in cache_dir:
Will be:
self.name/self.config.version/self.hash/
or if a namespace has been specified:
self.namespace___self.name/self.config.version/self.hash/
"""
builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
builder_config = self.config
hash = self.hash
if builder_config:
builder_data_dir = os.path.join(builder_data_dir, self.config_id)
if with_version:
builder_data_dir = os.path.join(builder_data_dir,
str(self.config.version))
if with_hash and hash and isinstance(hash, str):
builder_data_dir = os.path.join(builder_data_dir, hash)
return builder_data_dir

def _build_cache_dir(self):
builder_data_dir = os.path.join(
self._cache_dir_root,
@@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv):
datasets.SplitGenerator(
name=split_name,
gen_kwargs={
'files': dl_manager.iter_files(files),
'base_dir': zip_data_files.get(split_name)
'files':
dl_manager.iter_files(files),
'base_dir':
os.path.join(
zip_data_files.get(split_name),
os.path.splitext(
self.zip_data_files.get(split_name))[0])
if self.zip_data_files.get(split_name) else
zip_data_files.get(split_name)
}))
return splits

@@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv):
logger.error(
f"Failed to read file '{file}' with error {type(e)}: {e}")
raise


class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):

def __init__(
self,
dataset_name: str,
cache_dir: str,
namespace: str,
subset_name: str,
hash: str,
meta_data_files: Mapping[str, Union[str, Sequence[str]]],
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
**config_kwargs,
):
self.name = dataset_name
self.subset_name = subset_name
self.namespace = namespace
self.hash = hash
self.data_files = meta_data_files
self.zip_data_files = zip_data_files
self.split_path_dict = None
self.config = None
self._cache_dir_root = os.path.expanduser(cache_dir)
self._cache_dir = self._build_cache_dir()
self._config_kwargs = config_kwargs

def download_and_prepare(self, download_mode, dl_manager,
**download_kwargs):
# Prevent parallel disk operations
lock_path = os.path.join(
self._cache_dir_root,
self._cache_dir.replace(os.sep, '_') + '.lock')
with FileLock(lock_path):
data_exists = os.path.exists(self._cache_dir)
if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
logger.warning(
f'Reusing dataset {self.name} ({self._cache_dir})')
return
logger.info(f'Generating dataset {self.name} ({self._cache_dir})')
self._download_and_prepare(dl_manager=dl_manager)

def _download_and_prepare(self, dl_manager):
split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
self.split_path_dict = {
k: os.path.join(v,
os.path.splitext(self.zip_data_files[k])[0])
for k, v in split_path_dict.items()
}

def as_dataset(self):
return ExternalDataset(self.split_path_dict, self._config_kwargs)


class ExternalDataset(object):

def __init__(self, split_path_dict, config_kwargs):
config_kwargs.update({'split_config': split_path_dict})
self.config_kwargs = config_kwargs

def __len__(self):
return len(self.config_kwargs['split_config'])
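A small illustration of what the new ExternalDataset wrapper carries (paths and kwargs are placeholders):

from modelscope.msdatasets.utils.dataset_builder import ExternalDataset

ext = ExternalDataset(
    split_path_dict={'train': '/cache/ds/train', 'validation': '/cache/ds/val'},
    config_kwargs={'classes': ('person', 'car')})
print(len(ext))                           # 2 -> number of splits
print(ext.config_kwargs['split_config'])  # the extracted split directories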

+ 28
- 11
modelscope/msdatasets/utils/dataset_utils.py View File

@@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder

from modelscope.utils.constant import DEFAULT_DATASET_REVISION
from modelscope.utils.logger import get_logger
from .dataset_builder import MsCsvDatasetBuilder
from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder

logger = get_logger()

@@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict,
modelscope_api = HubApi()
for split, info in subset_split_into.items():
meta_map[split] = modelscope_api.get_dataset_file_url(
info['meta'], dataset_name, namespace, revision)
info.get('meta', ''), dataset_name, namespace, revision)
if info.get('file'):
file_map[split] = info['file']
return meta_map, file_map
@@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
zip_data_files: Mapping[str, Union[str,
Sequence[str]]],
cache_dir: str, version: Optional[Union[str]],
split: Sequence[str]) -> DatasetBuilder:
split: Sequence[str],
**config_kwargs) -> DatasetBuilder:
sub_dir = os.path.join(version, '_'.join(split))
builder_instance = MsCsvDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir)
meta_data_file = next(iter(meta_data_files.values()))
if not meta_data_file:
builder_instance = TaskSpecificDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir,
**config_kwargs)
elif meta_data_file.endswith('.csv'):
builder_instance = MsCsvDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir)
else:
raise NotImplementedError(
f'Dataset meta file extension "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet'
)

return builder_instance

+ 24
- 1
modelscope/outputs.py View File

@@ -188,6 +188,16 @@ TASK_OUTPUTS = {
Tasks.body_2d_keypoints:
[OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES],

# video single object tracking result for single video
# {
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ]
# }
Tasks.video_single_object_tracking: [OutputKeys.BOXES],

# live category recognition result for single video
# {
# "scores": [0.885272, 0.014790631, 0.014558001],
@@ -405,7 +415,7 @@ TASK_OUTPUTS = {

# audio processed for single file in PCM format
# {
# "output_pcm": np.array with shape(samples,) and dtype float32
# "output_pcm": pcm encoded audio bytes
# }
Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM],
Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM],
@@ -417,6 +427,19 @@ TASK_OUTPUTS = {
# }
Tasks.text_to_speech: [OutputKeys.OUTPUT_PCM],

# {
# "kws_list": [
# {
# 'keyword': '', # the keyword spotted
# 'offset': 19.4, # the keyword start time in seconds
# 'length': 0.68, # the keyword length in seconds
# 'confidence': 0.85 # the confidence that it is the keyword
# },
# ...
# ]
# }
Tasks.keyword_spotting: [OutputKeys.KWS_LIST],

# ============ multi-modal tasks ===================

# image caption result for single sample


+ 2
- 0
modelscope/pipelines/audio/__init__.py View File

@@ -6,6 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .ans_pipeline import ANSPipeline
from .asr_inference_pipeline import AutomaticSpeechRecognitionPipeline
from .kws_farfield_pipeline import KWSFarfieldPipeline
from .kws_kwsbp_pipeline import KeyWordSpottingKwsbpPipeline
from .linear_aec_pipeline import LinearAECPipeline
from .text_to_speech_pipeline import TextToSpeechSambertHifiganPipeline
@@ -14,6 +15,7 @@ else:
_import_structure = {
'ans_pipeline': ['ANSPipeline'],
'asr_inference_pipeline': ['AutomaticSpeechRecognitionPipeline'],
'kws_farfield_pipeline': ['KWSFarfieldPipeline'],
'kws_kwsbp_pipeline': ['KeyWordSpottingKwsbpPipeline'],
'linear_aec_pipeline': ['LinearAECPipeline'],
'text_to_speech_pipeline': ['TextToSpeechSambertHifiganPipeline'],


+ 81
- 0
modelscope/pipelines/audio/kws_farfield_pipeline.py View File

@@ -0,0 +1,81 @@
import io
import wave
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.constant import Tasks


@PIPELINES.register_module(
Tasks.keyword_spotting,
module_name=Pipelines.speech_dfsmn_kws_char_farfield)
class KWSFarfieldPipeline(Pipeline):
r"""A Keyword Spotting Inference Pipeline .

When the class is invoked with pipeline.__call__(), it accepts only one parameter:
inputs(str): the path of the wav file
"""
SAMPLE_RATE = 16000
SAMPLE_WIDTH = 2
INPUT_CHANNELS = 3
OUTPUT_CHANNELS = 2

def __init__(self, model, **kwargs):
"""
use `model` to create a kws far field pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
self.model = self.model.to(self.device)
self.model.eval()
frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH
self._nframe = self.model.size_in // frame_size
self.frame_count = 0

def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
if isinstance(inputs, bytes):
return dict(input_file=inputs)
elif isinstance(inputs, Dict):
return inputs
else:
raise ValueError(f'Not supported input type: {type(inputs)}')

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
input_file = inputs['input_file']
if isinstance(input_file, bytes):
input_file = io.BytesIO(input_file)
self.frame_count = 0
kws_list = []
with wave.open(input_file, 'rb') as fin:
if 'output_file' in inputs:
with wave.open(inputs['output_file'], 'wb') as fout:
fout.setframerate(self.SAMPLE_RATE)
fout.setnchannels(self.OUTPUT_CHANNELS)
fout.setsampwidth(self.SAMPLE_WIDTH)
self._process(fin, kws_list, fout)
else:
self._process(fin, kws_list)
return {OutputKeys.KWS_LIST: kws_list}

def _process(self,
fin: wave.Wave_read,
kws_list,
fout: wave.Wave_write = None):
data = fin.readframes(self._nframe)
while len(data) >= self.model.size_in:
self.frame_count += self._nframe
result = self.model.forward_decode(data)
if fout:
fout.writeframes(result['pcm'])
if 'kws' in result:
result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE
kws_list.append(result['kws'])
data = fin.readframes(self._nframe)

def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
return inputs
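A hedged usage sketch for the new pipeline; the model id is a placeholder (not taken from this diff), and the input wav is assumed to be 3-channel, 16 kHz, 16-bit PCM as implied by INPUT_CHANNELS/SAMPLE_RATE/SAMPLE_WIDTH above:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

kws = pipeline(Tasks.keyword_spotting, model='<dfsmn-kws-char-farfield-model-id>')
with open('/path/to/3ch_16k.wav', 'rb') as f:   # placeholder path
    result = kws(f.read())                      # raw wav bytes are accepted
for hit in result[OutputKeys.KWS_LIST]:
    print(hit['keyword'], hit['offset'], hit['length'], hit['confidence'])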

+ 1
- 1
modelscope/pipelines/base.py View File

@@ -255,7 +255,7 @@ class Pipeline(ABC):
return self._collate_fn(torch.from_numpy(data))
elif isinstance(data, torch.Tensor):
return data.to(self.device)
elif isinstance(data, (str, int, float, bool, type(None))):
elif isinstance(data, (bytes, str, int, float, bool, type(None))):
return data
elif isinstance(data, InputFeatures):
return data


+ 6
- 2
modelscope/pipelines/builder.py View File

@@ -124,12 +124,16 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_classification:
(Pipelines.daily_image_classification,
'damo/cv_vit-base_image-classification_Dailylife-labels'),
Tasks.ocr_recognition: (Pipelines.ocr_recognition,
'damo/cv_convnextTiny_ocr-recognition_damo'),
Tasks.ocr_recognition:
(Pipelines.ocr_recognition,
'damo/cv_convnextTiny_ocr-recognition-general_damo'),
Tasks.skin_retouching: (Pipelines.skin_retouching,
'damo/cv_unet_skin-retouching'),
Tasks.crowd_counting: (Pipelines.crowd_counting,
'damo/cv_hrnet_crowd-counting_dcanet'),
Tasks.video_single_object_tracking:
(Pipelines.video_single_object_tracking,
'damo/cv_vitb_video-single-object-tracking_ostrack'),
}




+ 2
- 0
modelscope/pipelines/cv/__init__.py View File

@@ -10,6 +10,7 @@ if TYPE_CHECKING:
from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
from .crowd_counting_pipeline import CrowdCountingPipeline
from .image_detection_pipeline import ImageDetectionPipeline
from .image_salient_detection_pipeline import ImageSalientDetectionPipeline
from .face_detection_pipeline import FaceDetectionPipeline
from .face_image_generation_pipeline import FaceImageGenerationPipeline
from .face_recognition_pipeline import FaceRecognitionPipeline
@@ -43,6 +44,7 @@ else:
'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
'crowd_counting_pipeline': ['CrowdCountingPipeline'],
'image_detection_pipeline': ['ImageDetectionPipeline'],
'image_salient_detection_pipeline': ['ImageSalientDetectionPipeline'],
'face_detection_pipeline': ['FaceDetectionPipeline'],
'face_image_generation_pipeline': ['FaceImageGenerationPipeline'],
'face_recognition_pipeline': ['FaceRecognitionPipeline'],


+ 47
- 0
modelscope/pipelines/cv/image_salient_detection_pipeline.py View File

@@ -0,0 +1,47 @@
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks


@PIPELINES.register_module(
Tasks.image_segmentation, module_name=Pipelines.salient_detection)
class ImageSalientDetectionPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
model: model id on modelscope hub.
"""
super().__init__(model=model, auto_collate=False, **kwargs)

def preprocess(self, input: Input) -> Dict[str, Any]:

img = LoadImage.convert_to_ndarray(input)
img_h, img_w, _ = img.shape
img = self.model.preprocess(img)
result = {'img': img, 'img_w': img_w, 'img_h': img_h}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:

outputs = self.model.inference(input['img'])
result = {
'data': outputs,
'img_w': input['img_w'],
'img_h': input['img_h']
}
return result

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:

data = self.model.postprocess(inputs)
outputs = {
OutputKeys.SCORES: None,
OutputKeys.LABELS: None,
OutputKeys.MASKS: data
}
return outputs
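A hedged usage sketch; the salient-detection model id is a placeholder, not taken from this diff:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

salient = pipeline(Tasks.image_segmentation, model='<u2net-salient-detection-model-id>')
result = salient('/path/to/image.jpg')   # a local path or url is loaded via LoadImage
mask = result[OutputKeys.MASKS]          # saliency mask; SCORES and LABELS are None here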

+ 80
- 0
modelscope/pipelines/cv/video_single_object_tracking_pipeline.py View File

@@ -0,0 +1,80 @@
import os.path as osp
from typing import Any, Dict

import cv2

from modelscope.metainfo import Pipelines
from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
cfg
from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \
OSTrack
from modelscope.models.cv.video_single_object_tracking.utils.utils import \
check_box
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.video_single_object_tracking,
module_name=Pipelines.video_single_object_tracking)
class VideoSingleObjectTrackingPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create a single object tracking pipeline
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
self.cfg = cfg
ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE)
logger.info(f'loading model from {ckpt_path}')
self.tracker = OSTrack(ckpt_path, self.device)
logger.info('init tracker done')

def preprocess(self, input) -> Input:
self.video_path = input[0]
self.init_bbox = input[1]
return input

def forward(self, input: Input) -> Dict[str, Any]:
output_boxes = []
cap = cv2.VideoCapture(self.video_path)
success, frame = cap.read()
if success is False:
raise Exception(
'modelscope error: %s can not be decoded by OpenCV.' %
(self.video_path))

init_box = self.init_bbox
frame_h, frame_w = frame.shape[0:2]
if not check_box(init_box, frame_h, frame_w):
raise Exception('modelscope error: init_box out of image range ',
init_box)
output_boxes.append(init_box.copy())
init_box[2] = init_box[2] - init_box[0]
init_box[3] = init_box[3] - init_box[1]
self.tracker.initialize(frame, {'init_bbox': init_box})
logger.info('init bbox done')

while True:
ret, frame = cap.read()
if frame is None:
break
out = self.tracker.track(frame)
state = [int(s) for s in out['target_bbox']]
output_boxes.append(state)
cap.release()
logger.info('tracking process done')

return {
OutputKeys.BOXES: output_boxes,
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
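A hedged usage sketch; it relies on the default model registered for this task in builder.py above, and the video path and initial box are placeholders:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

tracker = pipeline(Tasks.video_single_object_tracking)
init_bbox = [100, 100, 300, 300]                # placeholder [x1, y1, x2, y2]
result = tracker(('/path/to/video.mp4', init_bbox))
print(result[OutputKeys.BOXES])                 # one [x1, y1, x2, y2] per frame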

+ 19
- 5
modelscope/pipelines/multi_modal/image_captioning_pipeline.py View File

@@ -1,11 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, Optional, Union

import torch

from modelscope.metainfo import Pipelines
from modelscope.models.multi_modal import OfaForAllTasks
from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import OfaPreprocessor, Preprocessor
from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor,
Preprocessor)
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

@@ -35,9 +39,19 @@ class ImageCaptioningPipeline(Pipeline):
else:
raise NotImplementedError
pipe_model.model.eval()
if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
if preprocessor is None:
if isinstance(pipe_model, OfaForAllTasks):
preprocessor = OfaPreprocessor(pipe_model.model_dir)
elif isinstance(pipe_model, MPlugForAllTasks):
preprocessor = MPlugPreprocessor(pipe_model.model_dir)
super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
if isinstance(self.model, OfaForAllTasks):
return inputs
return {OutputKeys.CAPTION: inputs}
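A hedged end-to-end sketch for the mplug branch; the model id is a placeholder, and the input is assumed to be a PIL image because MPlugPreprocessor's caption path calls image.convert('RGB'):

from PIL import Image
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

captioner = pipeline(Tasks.image_captioning, model='<mplug-image-captioning-model-id>')
result = captioner(Image.open('/path/to/image.jpg'))
print(result[OutputKeys.CAPTION])   # decoded caption string for the mplug model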

+ 10
- 26
modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py View File

@@ -5,13 +5,12 @@ import torch

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.multi_modal import (MPlugForVisualQuestionAnswering,
OfaForAllTasks)
from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline, Tensor
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import (MPlugVisualQuestionAnsweringPreprocessor,
OfaPreprocessor)
from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor,
Preprocessor)
from modelscope.utils.constant import Tasks

__all__ = ['VisualQuestionAnsweringPipeline']
@@ -23,9 +22,8 @@ __all__ = ['VisualQuestionAnsweringPipeline']
class VisualQuestionAnsweringPipeline(Pipeline):

def __init__(self,
model: Union[MPlugForVisualQuestionAnswering, str],
preprocessor: Optional[
MPlugVisualQuestionAnsweringPreprocessor] = None,
model: Union[Model, str],
preprocessor: Optional[Preprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a visual question answering pipeline for prediction

@@ -35,18 +33,12 @@ class VisualQuestionAnsweringPipeline(Pipeline):
"""
model = model if isinstance(model,
Model) else Model.from_pretrained(model)
self.tokenizer = None
if preprocessor is None:
if isinstance(model, OfaForAllTasks):
preprocessor = OfaPreprocessor(model.model_dir)
elif isinstance(model, MPlugForVisualQuestionAnswering):
preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
model.model_dir)
if isinstance(model, MPlugForVisualQuestionAnswering):
model.eval()
self.tokenizer = model.tokenizer
else:
model.model.eval()
elif isinstance(model, MPlugForAllTasks):
preprocessor = MPlugPreprocessor(model.model_dir)
model.model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)

def forward(self, inputs: Dict[str, Any],
@@ -64,14 +56,6 @@ class VisualQuestionAnsweringPipeline(Pipeline):
Returns:
Dict[str, str]: the prediction results
"""
if self.tokenizer is None:
if isinstance(self.model, OfaForAllTasks):
return inputs
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))

pred_string = self.tokenizer.decode(inputs[0][0])
for _old, _new in replace_tokens_bert:
pred_string = pred_string.replace(_old, _new)
pred_string.strip()
return {OutputKeys.TEXT: pred_string}
return {OutputKeys.TEXT: inputs}

+ 4
- 6
modelscope/preprocessors/__init__.py View File

@@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .common import Compose, ToTensor, Filter
from .asr import WavToScp
from .audio import LinearAECAndFbank
from .image import (LoadImage, load_image,
@@ -14,8 +14,7 @@ if TYPE_CHECKING:
ImageInstanceSegmentationPreprocessor,
ImageDenoisePreprocessor)
from .kws import WavToLists
from .multi_modal import (OfaPreprocessor,
MPlugVisualQuestionAnsweringPreprocessor)
from .multi_modal import (OfaPreprocessor, MPlugPreprocessor)
from .nlp import (Tokenize, SequenceClassificationPreprocessor,
TextGenerationPreprocessor,
TokenClassificationPreprocessor,
@@ -33,7 +32,7 @@ else:
_import_structure = {
'base': ['Preprocessor'],
'builder': ['PREPROCESSORS', 'build_preprocessor'],
'common': ['Compose'],
'common': ['Compose', 'ToTensor', 'Filter'],
'audio': ['LinearAECAndFbank'],
'asr': ['WavToScp'],
'video': ['ReadVideoData'],
@@ -42,8 +41,7 @@ else:
'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor'
],
'kws': ['WavToLists'],
'multi_modal':
['OfaPreprocessor', 'MPlugVisualQuestionAnsweringPreprocessor'],
'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'],
'nlp': [
'Tokenize', 'SequenceClassificationPreprocessor',
'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',


+ 91
- 2
modelscope/preprocessors/common.py View File

@@ -2,6 +2,10 @@

import time
from collections.abc import Sequence
from typing import Mapping

import numpy as np
import torch

from .builder import PREPROCESSORS, build_preprocessor

@@ -25,12 +29,18 @@ class Compose(object):
if isinstance(transform, dict):
if self.field_name is None:
transform = build_preprocessor(transform, field_name)
self.transforms.append(transform)
else:
# if not found key in field_name, try field_name=None(default_group)
try:
transform = build_preprocessor(transform, field_name)
except KeyError:
transform = build_preprocessor(transform, None)
elif callable(transform):
self.transforms.append(transform)
pass
else:
raise TypeError('transform must be callable or a dict, but got'
f' {type(transform)}')
self.transforms.append(transform)

def __call__(self, data):
for t in self.transforms:
@@ -52,3 +62,82 @@ class Compose(object):
format_string += f'\n {t}'
format_string += '\n)'
return format_string


def to_tensor(data):
"""Convert objects of various python types to :obj:`torch.Tensor`.

Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.

Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
"""

if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not isinstance(data, str):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')


@PREPROCESSORS.register_module()
class ToTensor(object):
"""Convert target object to tensor.

Args:
keys (Sequence[str]): Keys of data to be converted to Tensor.
Only valid when data is of type `Mapping`. If `keys` is None,
all values will be converted to tensors by default.
"""

def __init__(self, keys=None):
self.keys = keys

def __call__(self, data):
if isinstance(data, Mapping):
if self.keys is None:
self.keys = list(data.keys())

for key in self.keys:
data[key] = to_tensor(data[key])
else:
data = to_tensor(data)

return data

def __repr__(self):
return self.__class__.__name__ + f'(keys={self.keys})'


@PREPROCESSORS.register_module()
class Filter(object):
"""This is usually the last stage of the dataloader transform.
Only data of reserved keys will be kept and passed directly to the model, others will be removed.

Args:
keys (Sequence[str]): Keys of data to be reserved, others will be removed.
"""

def __init__(self, reserved_keys):
self.reserved_keys = reserved_keys

def __call__(self, data):
assert isinstance(data, Mapping)

reserved_data = {}
for key in self.reserved_keys:
reserved_data[key] = data[key]

return reserved_data

def __repr__(self):
return self.__class__.__name__ + f'(keys={self.reserved_keys})'
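A minimal standalone sketch of the two new transforms chained through Compose (the sample keys are arbitrary placeholders):

import numpy as np
from modelscope.preprocessors import Compose, Filter, ToTensor

transforms = Compose([
    ToTensor(keys=['input', 'target']),          # numpy arrays -> torch tensors
    Filter(reserved_keys=['input', 'target']),   # drop every other key
])

sample = {'input': np.zeros((3, 8, 8)), 'target': np.ones((8, 8)), 'meta': 'dropped'}
print(transforms(sample).keys())                 # dict_keys(['input', 'target'])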

+ 8
- 0
modelscope/preprocessors/image.py View File

@@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor):
super().__init__(*args, **kwargs)
self.model_dir: str = model_dir

from .common import Filter

# TODO: `Filter` should be moved to the configuration file of each model
self._transforms = [Filter(reserved_keys=['input', 'target'])]

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""process the raw input data

@@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor):
Returns:
Dict[str, Any]: the preprocessed data
"""
for t in self._transforms:
data = t(data)

return data




+ 59
- 27
modelscope/preprocessors/multi_modal.py View File

@@ -19,7 +19,7 @@ from .ofa.utils.collate import collate_fn

__all__ = [
'OfaPreprocessor',
'MPlugVisualQuestionAnsweringPreprocessor',
'MPlugPreprocessor',
]


@@ -28,7 +28,7 @@ __all__ = [
class OfaPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path
@@ -102,39 +102,55 @@ class OfaPreprocessor(Preprocessor):


@PREPROCESSORS.register_module(
Fields.multi_modal,
module_name=Preprocessors.mplug_visual_question_answering)
class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor):
Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor)
class MPlugPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via 'bert-base-uncased' tokenizer and configuration

"""
from transformers import BertTokenizer
from modelscope.models.multi_modal.mplug import CONFIG_NAME, VOCAB_NAME, MPlugConfig

super().__init__(*args, **kwargs)
self.model_dir = model_dir

# tokenizer
self.tokenizer = BertTokenizer.from_pretrained(
osp.join(model_dir, VOCAB_NAME))
self._tokenizer = None
self._patch_resize_transform = None

# load configuration
config = MPlugConfig.from_yaml_file(osp.join(model_dir, CONFIG_NAME))
@property
def tokenizer(self):
from transformers import BertTokenizer

# Initialize transform
from torchvision import transforms
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
if self._tokenizer is None:
self._tokenizer = BertTokenizer.from_pretrained(self.model_dir)
return self._tokenizer

@property
def patch_resize_transform(self):
if self._patch_resize_transform is None:
from torchvision import transforms
from modelscope.models.multi_modal.mplug import CONFIG_NAME, MPlugConfig

config = MPlugConfig.from_yaml_file(
osp.join(self.model_dir, CONFIG_NAME))

mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)

self._patch_resize_transform = transforms.Compose([
transforms.Resize((config.image_res, config.image_res),
interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
return self._patch_resize_transform

def __call__(self, *args, **kwargs):
call_mapping = {
Tasks.visual_question_answering: self.vqa_call,
Tasks.image_captioning: self.caption_call
}

self.patch_resize_transform = transforms.Compose([
transforms.Resize((config.image_res, config.image_res),
interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
self.cfg = Config.from_file(
osp.join(self.model_dir, ModelFile.CONFIGURATION))
return call_mapping[self.cfg.task](*args, **kwargs)

def __call__(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]:
def vqa_call(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]:
image: Image.Image = data[0] if isinstance(data,
tuple) else data['image']
question: str = data[1] if isinstance(data,
@@ -147,3 +163,19 @@ class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor):
return_tensors='pt')

return {'image': image, 'question': question, 'train': False}

def caption_call(
self, data: Union[Image.Image, tuple,
Dict[str, Any]]) -> Dict[str, Any]:
if isinstance(data, Image.Image):
image = data
elif isinstance(data, tuple):
image = data[0]
else:
image = data['image']
image = image.convert('RGB')
image = self.patch_resize_transform(image)
image = torch.stack([image], dim=0)
question = self.tokenizer('', return_tensors='pt')

return {'image': image, 'question': question, 'train': False}
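A hedged sketch of the task-dispatching preprocessor in VQA mode; the model directory is a placeholder and is assumed to contain the mplug config/vocab files plus a configuration.json whose task selects vqa_call:

from PIL import Image
from modelscope.preprocessors import MPlugPreprocessor

preprocessor = MPlugPreprocessor('/path/to/mplug_vqa_model_dir')   # placeholder dir
inputs = preprocessor((Image.open('/path/to/image.jpg'),
                       'what is in the picture?'))
# -> dict with the processed 'image', the tokenized 'question' and 'train': False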

+ 12
- 7
modelscope/preprocessors/nlp.py View File

@@ -4,6 +4,7 @@ import os.path as osp
import uuid
from typing import Any, Dict, Iterable, Optional, Tuple, Union

import numpy as np
from transformers import AutoTokenizer

from modelscope.metainfo import Models, Preprocessors
@@ -43,7 +44,7 @@ class Tokenize(Preprocessor):
class SequenceClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path
@@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
text_b,
return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
**self.tokenize_kwargs)
output = {
k: np.array(v) if isinstance(v, list) else v
for k, v in output.items()
}
self.labels_to_id(labels, output)
return output

@@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
if labels is not None:
if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \
and self.label2id is not None:
output[OutputKeys.LABEL] = [
output[OutputKeys.LABELS] = [
self.label2id[str(label)] for label in labels
]
elif label_can_be_mapped(labels) and self.label2id is not None:
output[OutputKeys.LABEL] = self.label2id[str(labels)]
output[OutputKeys.LABELS] = self.label2id[str(labels)]
else:
output[OutputKeys.LABEL] = labels
output[OutputKeys.LABELS] = labels


@PREPROCESSORS.register_module(
@@ -286,7 +291,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase):
"""

def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path
@@ -517,7 +522,7 @@ class NERPreprocessor(Preprocessor):
"""

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path
@@ -609,7 +614,7 @@ class TextErrorCorrectionPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
from fairseq.data import Dictionary
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data via the vocab file from the `model_dir` path

Args:
model_dir (str): model path


+ 1
- 1
modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py View File

@@ -22,7 +22,7 @@ __all__ = ['DialogIntentPredictionPreprocessor']
class DialogIntentPredictionPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path


+ 1
- 1
modelscope/preprocessors/space/dialog_modeling_preprocessor.py View File

@@ -20,7 +20,7 @@ __all__ = ['DialogModelingPreprocessor']
class DialogModelingPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path


+ 1
- 1
modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py View File

@@ -17,7 +17,7 @@ __all__ = ['DialogStateTrackingPreprocessor']
class DialogStateTrackingPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path


+ 2
- 1
modelscope/preprocessors/space/fields/gen_field.py View File

@@ -8,6 +8,7 @@ from itertools import chain
import numpy as np

from modelscope.preprocessors.space.tokenizer import Tokenizer
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger
from modelscope.utils.nlp.space import ontology, utils
from modelscope.utils.nlp.space.db_ops import MultiWozDB
@@ -343,7 +344,7 @@ class MultiWOZBPETextField(BPETextField):
]
special_tokens.extend(self.add_sepcial_tokens())
self.tokenizer = Tokenizer(
vocab_path=os.path.join(model_dir, 'vocab.txt'),
vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE),
special_tokens=special_tokens,
tokenizer_type=config.BPETextField.tokenizer_type)
self.understand_ids = self.tokenizer.convert_tokens_to_ids(


+ 2
- 1
modelscope/preprocessors/space/fields/intent_field.py View File

@@ -14,6 +14,7 @@ import numpy as np
from tqdm import tqdm

from modelscope.preprocessors.space.tokenizer import Tokenizer
from modelscope.utils.constant import ModelFile
from modelscope.utils.nlp.space import ontology
from modelscope.utils.nlp.space.scores import hierarchical_set_score
from modelscope.utils.nlp.space.utils import list2np
@@ -50,7 +51,7 @@ class BPETextField(object):
]
special_tokens.extend(self.add_sepcial_tokens())
self.tokenizer = Tokenizer(
vocab_path=os.path.join(model_dir, 'vocab.txt'),
vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE),
special_tokens=special_tokens,
tokenizer_type=config.BPETextField.tokenizer_type)
self.understand_ids = self.numericalize(self.understand_tokens)


+ 1
- 1
modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py View File

@@ -28,7 +28,7 @@ __all__ = ['ConversationalTextToSqlPreprocessor']
class ConversationalTextToSqlPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path


+ 9
- 0
modelscope/preprocessors/star/fields/common_utils.py View File

@@ -193,6 +193,15 @@ class SubPreprocessor():

from nltk import data
data.path.append(os.path.join(self.model_dir, 'nltk_data'))

zippath = os.path.join(self.model_dir, 'nltk_data/tokenizers/punkt')
if os.path.exists(zippath):
print('punkt already exists!')
else:
import zipfile
with zipfile.ZipFile(zippath + '.zip') as zf:
zf.extractall(
os.path.join(self.model_dir, 'nltk_data/tokenizers/'))
question = nltk.word_tokenize(question)
question = mwtokenizer.tokenize(question)



+ 0
- 4
modelscope/trainers/cv/image_instance_segmentation_trainer.py View File

@@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer):

def prediction_step(self, model, inputs):
pass

def to_task_dataset(self, datasets, mode, preprocessor=None):
# wait for dataset interface to become stable...
return datasets.to_torch_dataset(preprocessor)

+ 0
- 1
modelscope/trainers/cv/image_portrait_enhancement_trainer.py View File

@@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer):

train_outputs = dict()
self._mode = ModeKeys.TRAIN
inputs = self.collate_fn(inputs)
# call model forward but not __call__ to skip postprocess
if isinstance(inputs, Mapping):
d_loss = model._train_forward_d(**inputs)


+ 1
- 1
modelscope/trainers/hooks/hook.py View File

@@ -192,7 +192,7 @@ class Hook:
Whether to reach the end of every epoch
Returns: bool
"""
return trainer.inner_iter + 1 == len(trainer.data_loader)
return trainer.inner_iter + 1 == trainer.iters_per_epoch

def is_last_epoch(self, trainer):
"""


+ 1
- 1
modelscope/trainers/hooks/logger/text_logger_hook.py View File

@@ -93,7 +93,7 @@ class TextLoggerHook(LoggerHook):
lr_str = f'{lr_key}: {log_dict[lr_key]:.3e}'

if self.by_epoch:
log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{len(trainer.data_loader)}]\t'
log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{trainer.iters_per_epoch}]\t'
else:
log_str = f'{iter_key} [{log_dict[iter_key]}/{trainer.max_iters}]\t'
log_str += f'{lr_str}, '


+ 39
- 15
modelscope/trainers/nlp_trainer.py View File

@@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
self.train_keys = build_dataset_keys(
self.cfg.dataset.train if hasattr(self.cfg, 'dataset')
and hasattr(self.cfg.dataset, 'train') else None)
# TODO eval may has special keys, which is now not supported.
# because there is only one preprocessor in the trainer, and it only supports one group of keys.
self.eval_keys = self.train_keys
self.eval_keys = build_dataset_keys(
self.cfg.dataset.val if hasattr(self.cfg, 'dataset')
and hasattr(self.cfg.dataset, 'val') else None)
if len(self.eval_keys) == 0:
self.eval_keys = self.train_keys

super().__init__(
model=model_dir,
@@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
elif isinstance(model, nn.Module):
return model

def build_preprocessor(self) -> Preprocessor:
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
"""Build the preprocessor.

User can override this method to implement custom logits.
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
model_args = {} if self.label2id is None else {
'label2id': self.label2id
}
cfg = ConfigDict({
**getattr(self.cfg, 'preprocessor'),
'model_dir':
self.model_dir,
**model_args,
'mode':
ModeKeys.TRAIN,
**self.train_keys,
})
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))

field_name = Tasks.find_field_by_task(self.cfg.task)
train_preprocessor, eval_preprocessor = None, None
_train_cfg, _eval_cfg = {}, {}

if 'type' not in self.cfg.preprocessor and (
'train' in self.cfg.preprocessor
or 'val' in self.cfg.preprocessor):
if 'train' in self.cfg.preprocessor:
_train_cfg = self.cfg.preprocessor.train
if 'val' in self.cfg.preprocessor:
_eval_cfg = self.cfg.preprocessor.val
else:
_train_cfg = self.cfg.preprocessor
_eval_cfg = self.cfg.preprocessor

if len(_train_cfg):
_train_cfg.update({
'model_dir': self.model_dir,
**model_args,
**self.train_keys, 'mode': ModeKeys.TRAIN
})
train_preprocessor = build_preprocessor(_train_cfg, field_name)
if len(_eval_cfg):
_eval_cfg.update({
'model_dir': self.model_dir,
**model_args,
**self.eval_keys, 'mode': ModeKeys.EVAL
})
eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

return train_preprocessor, eval_preprocessor
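The hunk above lets NlpEpochBasedTrainer build separate train and eval preprocessors when the preprocessor config is split into train/val sections. A hypothetical config fragment written as a Python dict (only the train/val split comes from the code above; the type names are placeholders):

# Hypothetical preprocessor section; the concrete type names are placeholders.
preprocessor_cfg = {
    'train': {'type': 'some-train-preprocessor'},
    'val': {'type': 'some-eval-preprocessor'},
}
# build_preprocessor() then injects model_dir, the optional label2id mapping,
# the dataset keys and the mode (ModeKeys.TRAIN / ModeKeys.EVAL) into each
# side before calling build_preprocessor(cfg, field_name).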


@TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer)
@@ -178,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer):
"""Veco evaluates the datasets one by one.

"""
from modelscope.task_datasets import VecoDataset
from modelscope.msdatasets.task_datasets import VecoDataset
self.model.eval()
self._mode = ModeKeys.EVAL
metric_values = {}


+ 151
- 76
modelscope/trainers/trainer.py View File

@@ -5,15 +5,15 @@ import time
from collections.abc import Mapping
from distutils.version import LooseVersion
from functools import partial
from typing import Callable, List, Optional, Tuple, Union
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union

import json
import numpy as np
import torch
from addict import Dict
from torch import distributed as dist
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataloader import default_collate
from torch.utils.data.distributed import DistributedSampler

from modelscope.hub.snapshot_download import snapshot_download
@@ -21,23 +21,26 @@ from modelscope.metainfo import Trainers
from modelscope.metrics import build_metric, task_default_metrics
from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.preprocessors import build_preprocessor
from modelscope.msdatasets.task_datasets.builder import build_task_dataset
from modelscope.msdatasets.task_datasets.torch_base_dataset import \
TorchTaskDataset
from modelscope.preprocessors.base import Preprocessor
from modelscope.task_datasets.builder import build_task_dataset
from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset
from modelscope.preprocessors.builder import build_preprocessor
from modelscope.preprocessors.common import Compose
from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.priority import Priority, get_priority
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
from modelscope.trainers.optimizer.builder import build_optimizer
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys,
ModelFile, Tasks, TrainerStages)
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields,
ConfigKeys, Hubs, ModeKeys, ModelFile,
Tasks, TrainerStages)
from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.logger import get_logger
from modelscope.utils.registry import build_from_cfg
from modelscope.utils.tensor_utils import torch_default_data_collator
from modelscope.utils.torch_utils import (broadcast, create_device,
get_dist_info, init_dist)
from modelscope.utils.torch_utils import (create_device, get_dist_info,
init_dist)
from .base import BaseTrainer
from .builder import TRAINERS
from .default_config import DEFAULT_CONFIG
@@ -83,7 +86,8 @@ class EpochBasedTrainer(BaseTrainer):
data_collator: Optional[Callable] = None,
train_dataset: Optional[Union[MsDataset, Dataset]] = None,
eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
preprocessor: Optional[Preprocessor] = None,
preprocessor: Optional[Union[Preprocessor,
Dict[str, Preprocessor]]] = None,
optimizers: Tuple[torch.optim.Optimizer,
torch.optim.lr_scheduler._LRScheduler] = (None,
None),
@@ -120,24 +124,46 @@ class EpochBasedTrainer(BaseTrainer):
else:
self.work_dir = self.cfg.train.get('work_dir', './work_dir')

self.preprocessor = None
self.train_preprocessor, self.eval_preprocessor = None, None
if isinstance(preprocessor, Preprocessor):
self.preprocessor = preprocessor
elif hasattr(self.cfg, 'preprocessor'):
self.preprocessor = self.build_preprocessor()
if self.preprocessor is not None:
self.preprocessor.mode = ModeKeys.TRAIN
self.train_preprocessor = preprocessor
self.eval_preprocessor = preprocessor
elif isinstance(preprocessor, Mapping):
if not (ConfigKeys.train in preprocessor
or ConfigKeys.val in preprocessor):
raise ValueError(
f'The preprocessor dict must contain a `{ConfigKeys.train}` or `{ConfigKeys.val}` key!'
)
if ConfigKeys.train in preprocessor:
assert isinstance(preprocessor[ConfigKeys.train], Preprocessor)
self.train_preprocessor = preprocessor[ConfigKeys.train]
if ConfigKeys.val in preprocessor:
assert isinstance(preprocessor[ConfigKeys.val], Preprocessor)
self.eval_preprocessor = preprocessor[ConfigKeys.val]
elif hasattr(self.cfg, ConfigFields.preprocessor):
self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor(
)

if self.train_preprocessor is not None:
self.train_preprocessor.mode = ModeKeys.TRAIN
if self.eval_preprocessor is not None:
self.eval_preprocessor.mode = ModeKeys.EVAL

device_name = kwargs.get('device', 'gpu')
assert device_name in ['gpu',
'cpu'], 'device should be either cpu or gpu.'
self.device = create_device(device_name == 'cpu')

self.train_dataset = self.to_task_dataset(
train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor)
train_dataset,
mode=ModeKeys.TRAIN,
preprocessor=self.train_preprocessor)
self.eval_dataset = self.to_task_dataset(
eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor)
eval_dataset,
mode=ModeKeys.EVAL,
preprocessor=self.eval_preprocessor)

self.data_collator = data_collator if data_collator is not None else torch_default_data_collator
self.data_collator = data_collator if data_collator is not None else default_collate
self.metrics = self.get_metrics()
self._metric_values = None
self.optimizers = optimizers
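With the change above, the trainer's preprocessor argument accepts either a single Preprocessor (reused for both modes) or a dict keyed by ConfigKeys.train / ConfigKeys.val. A minimal sketch of the dict form, assuming a trivial user-defined preprocessor just to show the accepted shape:

from modelscope.preprocessors.base import Preprocessor
from modelscope.utils.constant import ConfigKeys

class NoOpPreprocessor(Preprocessor):
    """Placeholder preprocessor, only here to illustrate the dict form."""

    def __call__(self, data):
        return data

preprocessor = {
    ConfigKeys.train: NoOpPreprocessor(),
    ConfigKeys.val: NoOpPreprocessor(),
}
# The trainer sets ModeKeys.TRAIN / ModeKeys.EVAL on the respective entry and
# uses it when wrapping the train / eval dataset.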
@@ -155,6 +181,16 @@ class EpochBasedTrainer(BaseTrainer):
else:
self._max_epochs = kwargs['max_epochs']

self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None)
self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None)
if self._train_iters_per_epoch is None and hasattr(
self.cfg.train, 'train_iters_per_epoch'):
self._train_iters_per_epoch = self.cfg.train.train_iters_per_epoch
if self._eval_iters_per_epoch is None and hasattr(
self.cfg, 'evaluation') and hasattr(self.cfg.evaluation,
'val_iters_per_epoch'):
self._eval_iters_per_epoch = self.cfg.evaluation.val_iters_per_epoch

self.use_fp16 = kwargs.get('use_fp16', False)

# TODO @wenmeng.zwm add seed init fn
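The new per-mode iteration caps can be supplied either as trainer kwargs or through the configuration. A hypothetical config fragment showing where the trainer looks for them (the numbers are placeholders, other fields omitted):

# Only the two *_iters_per_epoch keys and their location come from the code
# above; the values are placeholders.
cfg_fragment = {
    'train': {
        'train_iters_per_epoch': 1000,
    },
    'evaluation': {
        'val_iters_per_epoch': 100,
    },
}
# Equivalent via kwargs:
#   EpochBasedTrainer(..., train_iters_per_epoch=1000, val_iters_per_epoch=100)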
@@ -211,7 +247,32 @@ class EpochBasedTrainer(BaseTrainer):
@property
def max_iters(self):
"""int: Maximum training iterations."""
return self._max_epochs * len(self.data_loader)
return self._max_epochs * self.iters_per_epoch

@property
def iters_per_epoch(self):
"""int: Total iterations of one epoch"""

def _get_data_len(data_loader):
try:
return len(data_loader)
except Exception as e:
self.logger.error(e)
raise ValueError(
'Please implement the ``__len__`` method for your dataset, '
'or add `train_iters_per_epoch` / `val_iters_per_epoch` '
'to your configuration file or kwargs')

if self.mode == ModeKeys.TRAIN:
if self._train_iters_per_epoch is not None:
return self._train_iters_per_epoch
else:
return _get_data_len(self.train_dataloader)
elif self.mode == ModeKeys.EVAL:
if self._eval_iters_per_epoch is not None:
return self._eval_iters_per_epoch
else:
return _get_data_len(self.eval_dataloader)
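The fallback above means a map-style dataset only needs __len__; for an iterable dataset without a length, one of the caps has to be supplied explicitly. A minimal sketch of the case that would otherwise fail (the dataset is illustrative):

import torch
from torch.utils.data import DataLoader, IterableDataset

class StreamingDataset(IterableDataset):
    """Illustrative dataset without __len__, e.g. a sample stream."""

    def __iter__(self):
        while True:
            yield {'input': torch.zeros(4)}

loader = DataLoader(StreamingDataset(), batch_size=2)
# len(loader) raises TypeError here, so iters_per_epoch cannot be inferred;
# pass train_iters_per_epoch / val_iters_per_epoch via kwargs or config instead.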

def to_task_dataset(self,
datasets: Union[Dataset, List[Dataset]],
@@ -228,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer):
if isinstance(datasets, TorchTaskDataset):
return datasets
elif isinstance(datasets, MsDataset):
datasets = datasets.to_torch_dataset(
preprocessors=self.preprocessor)
return datasets
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
else ConfigDict(type=None, mode=mode)
return datasets.to_torch_dataset(
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=preprocessor)
elif isinstance(datasets, List) and isinstance(
datasets[0], MsDataset):
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
else ConfigDict(type=None, mode=mode)
datasets = [
d.to_torch_dataset(preprocessor=self.preprocessor)
for d in datasets
d.to_torch_dataset(
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=preprocessor) for d in datasets
]
cfg = ConfigDict(
type=self.cfg.task, mode=mode, datasets=datasets)
@@ -258,24 +326,44 @@ class EpochBasedTrainer(BaseTrainer):
else:
return datasets

def build_preprocessor(self) -> Preprocessor:
"""Build the preprocessor.
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
"""Build train and eval preprocessor.

Users can override this method to implement custom logic.

Returns: The preprocessor instance.
Returns: The train and eval preprocessor instances.

"""
# TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor
# when they are different ones in training and evaluation
cfg = ConfigDict({
**getattr(self.cfg, 'preprocessor'),
'model_dir':
self.model_dir,
'mode':
ModeKeys.TRAIN,
})
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))
field_name = Tasks.find_field_by_task(self.cfg.task)
train_preprocessor, eval_preprocessor = None, None
_train_cfg, _eval_cfg = {}, {}
_default_args = {'model_dir': self.model_dir}

if 'type' not in self.cfg.preprocessor and (
'train' in self.cfg.preprocessor
or 'val' in self.cfg.preprocessor):
if 'train' in self.cfg.preprocessor:
_train_cfg = self.cfg.preprocessor.train
if 'val' in self.cfg.preprocessor:
_eval_cfg = self.cfg.preprocessor.val
else:
_train_cfg = self.cfg.preprocessor
_eval_cfg = self.cfg.preprocessor

if len(_train_cfg):
if isinstance(_train_cfg, Sequence):
# TODO: for Sequence configs, adapt the `mode` and `model_dir` args,
# and add mode support to Compose (or another plan)
raise NotImplementedError('Not supported yet!')
_train_cfg.update(_default_args)
train_preprocessor = build_preprocessor(_train_cfg, field_name)
if len(_eval_cfg):
if isinstance(_eval_cfg, Sequence):
raise NotImplementedError('Not supported yet!')
_eval_cfg.update(_default_args)
eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

return train_preprocessor, eval_preprocessor
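When the preprocessor config keeps a single top-level type (no train/val split), the same config is reused for both sides, preserving the previous behaviour; list-style (Sequence) configs are rejected for now. Both accepted shapes, with placeholder type names:

# Shared form: one config used for training and evaluation alike.
shared_cfg = {'type': 'some-preprocessor'}

# Split form: separate configs selected by the branches above.
split_cfg = {
    'train': {'type': 'some-train-preprocessor'},
    'val': {'type': 'some-eval-preprocessor'},
}
# In both cases model_dir is injected before build_preprocessor() is called.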

def get_metrics(self) -> List[str]:
"""Get the metric class types.
@@ -373,34 +461,6 @@ class EpochBasedTrainer(BaseTrainer):

return build_parallel(dp_cfg)

def collate_fn(self, data):
"""Prepare the input just before the forward function.
This method will move the tensors to the right device.
Usually this method does not need to be overridden.

Args:
data: The data out of the dataloader.

Returns: The processed data.

"""
from torch.utils.data.dataloader import default_collate
if isinstance(data, dict) or isinstance(data, Mapping):
return type(data)({k: self.collate_fn(v) for k, v in data.items()})
elif isinstance(data, (tuple, list)):
if isinstance(data[0], (int, float)):
return default_collate(data).to(self.device)
else:
return type(data)(self.collate_fn(v) for v in data)
elif isinstance(data, np.ndarray):
return self.collate_fn(torch.from_numpy(data))
elif isinstance(data, torch.Tensor):
return data.to(self.device)
elif isinstance(data, (str, int, float, bool)):
return data
else:
raise ValueError(f'Unsupported data type {type(data)}')
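Device placement now happens once per batch via to_device, and batching falls back to PyTorch's default_collate, so the recursive collate_fn above is no longer needed. A simplified stand-in for such a recursive device-move helper (not the actual modelscope.utils.data_utils.to_device implementation):

from collections.abc import Mapping

import torch

def move_to_device(data, device):
    """Recursively move tensors in nested containers to the target device."""
    if isinstance(data, torch.Tensor):
        return data.to(device)
    if isinstance(data, Mapping):
        return type(data)(
            {k: move_to_device(v, device) for k, v in data.items()})
    if isinstance(data, (list, tuple)):
        return type(data)(move_to_device(v, device) for v in data)
    return data  # str/int/float/bool etc. are left untouched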

def train_step(self, model, inputs):
""" Perform a training step on a batch of inputs.

@@ -421,7 +481,6 @@ class EpochBasedTrainer(BaseTrainer):
# TODO: find more pretty way to change mode
model.train()
self._mode = ModeKeys.TRAIN
inputs = self.collate_fn(inputs)
# call model forward but not __call__ to skip postprocess
if isinstance(inputs,
Mapping) and not func_receive_dict_inputs(model.forward):
@@ -486,7 +545,9 @@ class EpochBasedTrainer(BaseTrainer):
if self.train_dataset is None:
train_data = self.cfg.dataset.train
self.train_dataset = self.build_dataset(
train_data, mode=ModeKeys.TRAIN)
train_data,
mode=ModeKeys.TRAIN,
preprocessor=self.train_preprocessor)

data_loader = self._build_dataloader_with_dataset(
self.train_dataset,
@@ -505,7 +566,9 @@ class EpochBasedTrainer(BaseTrainer):
if self.eval_dataset is None:
val_data = self.cfg.dataset.val
self.eval_dataset = self.build_dataset(
val_data, mode=ModeKeys.EVAL)
val_data,
mode=ModeKeys.EVAL,
preprocessor=self.eval_preprocessor)

batch_size = self.cfg.evaluation.batch_size
workers = self.cfg.evaluation.workers
@@ -521,7 +584,7 @@ class EpochBasedTrainer(BaseTrainer):
)
return data_loader

def build_dataset(self, data_cfg, mode):
def build_dataset(self, data_cfg, mode, preprocessor=None):
""" Build torch dataset object using data config
"""
dataset = MsDataset.load(
@@ -530,9 +593,13 @@ class EpochBasedTrainer(BaseTrainer):
subset_name=data_cfg.subset_name if hasattr(
data_cfg, 'subset_name') else None,
hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
**data_cfg,
)
cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
torch_dataset = dataset.to_torch_dataset(
preprocessors=self.preprocessor, )
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=preprocessor)
dataset = self.to_task_dataset(torch_dataset, mode)
return dataset
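build_dataset now forwards the whole dataset config to MsDataset.load and wraps the result as a task dataset carrying the model type and mode. A hypothetical dataset section of the config (subset_name and hub are the fields read explicitly above; dataset_name and split are assumptions added for illustration):

# Hypothetical dataset section; only subset_name and hub appear in the code
# above, the remaining keys and all values are placeholders.
dataset_cfg = {
    'train': {
        'dataset_name': 'some/dataset-id',
        'subset_name': 'default',
        'split': 'train',
        'hub': 'modelscope',
    },
    'val': {
        'dataset_name': 'some/dataset-id',
        'subset_name': 'default',
        'split': 'validation',
        'hub': 'modelscope',
    },
}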

@@ -698,6 +765,7 @@ class EpochBasedTrainer(BaseTrainer):
self.invoke_hook(TrainerStages.before_train_epoch)
time.sleep(2) # Prevent possible deadlock during epoch transition
for i, data_batch in enumerate(data_loader):
data_batch = to_device(data_batch, self.device)
self.data_batch = data_batch
self._inner_iter = i
self.invoke_hook(TrainerStages.before_train_iter)
@@ -706,6 +774,9 @@ class EpochBasedTrainer(BaseTrainer):
del self.data_batch
self._iter += 1

if i + 1 >= self.iters_per_epoch:
break

self.invoke_hook(TrainerStages.after_train_epoch)
self._epoch += 1

@@ -721,17 +792,21 @@ class EpochBasedTrainer(BaseTrainer):
metric_values = multi_gpu_test(
self.model,
data_loader,
device=self.device,
tmpdir=None,
gpu_collect=False,
data_collate_fn=self.collate_fn,
metric_classes=metric_classes)
metric_classes=metric_classes,
data_loader_iters_per_gpu=self.iters_per_epoch)
else:
from modelscope.trainers.utils.inference import single_gpu_test
metric_values = single_gpu_test(
self.model,
data_loader,
data_collate_fn=self.collate_fn,
metric_classes=metric_classes)
device=self.device,
metric_classes=metric_classes,
data_loader_iters=self.iters_per_epoch)

self._inner_iter = self.iters_per_epoch - 1 # start from index 0

return metric_values
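The evaluation helpers now receive the device and an explicit per-epoch iteration cap instead of a collate function. A sketch of the single-GPU call as wired above (trainer stands for an EpochBasedTrainer instance; metric_classes are the metric objects built from its config):

from modelscope.trainers.utils.inference import single_gpu_test

# Mirrors the call site above; all arguments come from the diff, only the
# surrounding variable names are illustrative.
metric_values = single_gpu_test(
    trainer.model,
    data_loader,
    device=trainer.device,
    metric_classes=metric_classes,
    data_loader_iters=trainer.iters_per_epoch)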



Some files were not shown because too many files changed in this diff
