From 291f8fe68c3462abc6462c5e408e7f349203f630 Mon Sep 17 00:00:00 2001 From: "lllcho.lc" Date: Thu, 1 Sep 2022 18:14:37 +0800 Subject: [PATCH 01/28] [to #42322933] Add action-detection model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加新的action-detection task Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9898947 --- .../videos/action_detection_test_video.mp4 | 3 + modelscope/metainfo.py | 1 + .../models/cv/action_detection/__init__.py | 21 +++ .../action_detection/action_detection_onnx.py | 177 ++++++++++++++++++ modelscope/outputs.py | 15 ++ modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 2 + .../pipelines/cv/action_detection_pipeline.py | 63 +++++++ modelscope/utils/constant.py | 1 + tests/pipelines/test_action_detection.py | 22 +++ 10 files changed, 307 insertions(+) create mode 100644 data/test/videos/action_detection_test_video.mp4 create mode 100644 modelscope/models/cv/action_detection/__init__.py create mode 100644 modelscope/models/cv/action_detection/action_detection_onnx.py create mode 100644 modelscope/pipelines/cv/action_detection_pipeline.py create mode 100644 tests/pipelines/test_action_detection.py diff --git a/data/test/videos/action_detection_test_video.mp4 b/data/test/videos/action_detection_test_video.mp4 new file mode 100644 index 00000000..e2ea1d80 --- /dev/null +++ b/data/test/videos/action_detection_test_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b7c3bc7c82ea5fee9d83130041df01046d89143ff77058b04577455ff6fdc92 +size 3191059 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 6f34b1a3..7c5afe80 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -133,6 +133,7 @@ class Pipelines(object): skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' crowd_counting = 'hrnet-crowd-counting' + action_detection = 'ResNetC3D-action-detection' video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' image_panoptic_segmentation = 'image-panoptic-segmentation' video_summarization = 'googlenet_pgl_video_summarization' diff --git a/modelscope/models/cv/action_detection/__init__.py b/modelscope/models/cv/action_detection/__init__.py new file mode 100644 index 00000000..fedbe19c --- /dev/null +++ b/modelscope/models/cv/action_detection/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .action_detection_onnx import ActionDetONNX + +else: + _import_structure = {'action_detection_onnx': ['ActionDetONNX']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/action_detection/action_detection_onnx.py b/modelscope/models/cv/action_detection/action_detection_onnx.py new file mode 100644 index 00000000..3c171473 --- /dev/null +++ b/modelscope/models/cv/action_detection/action_detection_onnx.py @@ -0,0 +1,177 @@ +import os +import os.path as osp +import shutil +import subprocess + +import cv2 +import numpy as np +import onnxruntime as rt + +from modelscope.models import Model +from modelscope.utils.constant import Devices +from modelscope.utils.device import verify_device + + +class ActionDetONNX(Model): + + def __init__(self, model_dir, config, *args, **kwargs): + super().__init__(self, model_dir, *args, **kwargs) + model_file = osp.join(config['model_file']) + device_type, device_id = verify_device(self._device_name) + options = rt.SessionOptions() + options.intra_op_num_threads = 1 + options.inter_op_num_threads = 1 + if device_type == Devices.gpu: + sess = rt.InferenceSession( + model_file, + providers=['CUDAExecutionProvider'], + sess_options=options, + provider_options=[{ + 'device_id': device_id + }]) + else: + sess = rt.InferenceSession( + model_file, + providers=['CPUExecutionProvider'], + sess_options=options) + self.input_name = sess.get_inputs()[0].name + self.sess = sess + self.num_stride = len(config['fpn_strides']) + self.score_thresh = np.asarray( + config['pre_nms_thresh'], dtype='float32').reshape((1, -1)) + self.size_divisibility = config['size_divisibility'] + self.nms_threshold = config['nms_thresh'] + self.tmp_dir = config['tmp_dir'] + self.temporal_stride = config['step'] + self.input_data_type = config['input_type'] + self.action_names = config['action_names'] + self.video_length_limit = config['video_length_limit'] + + def resize_box(self, det, height, width, scale_h, scale_w): + bboxs = det[0] + bboxs[:, [0, 2]] *= scale_w + bboxs[:, [1, 3]] *= scale_h + bboxs[:, [0, 2]] = bboxs[:, [0, 2]].clip(0, width - 1) + bboxs[:, [1, 3]] = bboxs[:, [1, 3]].clip(0, height - 1) + result = { + 'boxes': bboxs.round().astype('int32').tolist(), + 'scores': det[1].tolist(), + 'labels': [self.action_names[i] for i in det[2].tolist()] + } + return result + + def parse_frames(self, frame_names): + imgs = [cv2.imread(name)[:, :, ::-1] for name in frame_names] + imgs = np.stack(imgs).astype(self.input_data_type).transpose( + (3, 0, 1, 2)) # c,t,h,w + imgs = imgs[None] + return imgs + + def forward_img(self, imgs, h, w): + pred = self.sess.run(None, { + self.input_name: imgs, + 'height': np.asarray(h), + 'width': np.asarray(w) + }) + dets = self.post_nms( + pred, + score_threshold=self.score_thresh, + nms_threshold=self.nms_threshold) + return dets + + def forward_video(self, video_name, scale): + min_size, max_size = self._get_sizes(scale) + + tmp_dir = osp.join(self.tmp_dir, osp.basename(video_name)[:-4]) + if osp.exists(tmp_dir): + shutil.rmtree(tmp_dir) + os.makedirs(tmp_dir) + frame_rate = 2 + cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \ + f' -i {video_name} -r {frame_rate} -f image2 {tmp_dir}/%06d.jpg' + + cmd = cmd.split(' ') + subprocess.call(cmd) + + frame_names = [ + 
osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir)) + if name.endswith('.jpg') + ] + frame_names = [ + frame_names[i:i + frame_rate * 2] + for i in range(0, + len(frame_names) - frame_rate * 2 + 1, frame_rate + * self.temporal_stride) + ] + timestamp = list( + range(1, + len(frame_names) * self.temporal_stride, + self.temporal_stride)) + batch_imgs = [self.parse_frames(names) for names in frame_names] + + N, _, T, H, W = batch_imgs[0].shape + scale_min = min_size / min(H, W) + h, w = min(int(scale_min * H), + max_size), min(int(scale_min * W), max_size) + h = round(h / self.size_divisibility) * self.size_divisibility + w = round(w / self.size_divisibility) * self.size_divisibility + scale_h, scale_w = H / h, W / w + + results = [] + for imgs in batch_imgs: + det = self.forward_img(imgs, h, w) + det = self.resize_box(det[0], H, W, scale_h, scale_w) + results.append(det) + results = [{ + 'timestamp': t, + 'actions': res + } for t, res in zip(timestamp, results)] + shutil.rmtree(tmp_dir) + return results + + def forward(self, video_name): + return self.forward_video(video_name, scale=1) + + def post_nms(self, pred, score_threshold, nms_threshold=0.3): + pred_bboxes, pred_scores = pred + N = len(pred_bboxes) + dets = [] + for i in range(N): + bboxes, scores = pred_bboxes[i], pred_scores[i] + candidate_inds = scores > score_threshold + scores = scores[candidate_inds] + candidate_nonzeros = candidate_inds.nonzero() + bboxes = bboxes[candidate_nonzeros[0]] + labels = candidate_nonzeros[1] + keep = self._nms(bboxes, scores, labels, nms_threshold) + bbox = bboxes[keep] + score = scores[keep] + label = labels[keep] + dets.append((bbox, score, label)) + return dets + + def _nms(self, boxes, scores, idxs, nms_threshold): + if len(boxes) == 0: + return [] + max_coordinate = boxes.max() + offsets = idxs * (max_coordinate + 1) + boxes_for_nms = boxes + offsets[:, None].astype('float32') + boxes_for_nms[:, 2] = boxes_for_nms[:, 2] - boxes_for_nms[:, 0] + boxes_for_nms[:, 3] = boxes_for_nms[:, 3] - boxes_for_nms[:, 1] + keep = cv2.dnn.NMSBoxes( + boxes_for_nms.tolist(), + scores.tolist(), + score_threshold=0, + nms_threshold=nms_threshold) + if len(keep.shape) == 2: + keep = np.squeeze(keep, 1) + return keep + + def _get_sizes(self, scale): + if scale == 1: + min_size, max_size = 512, 896 + elif scale == 2: + min_size, max_size = 768, 1280 + else: + min_size, max_size = 1024, 1792 + return min_size, max_size diff --git a/modelscope/outputs.py b/modelscope/outputs.py index aebb9138..7d6cdb59 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -35,6 +35,7 @@ class OutputKeys(object): UUID = 'uuid' WORD = 'word' KWS_LIST = 'kws_list' + TIMESTAMPS = 'timestamps' SPLIT_VIDEO_NUM = 'split_video_num' SPLIT_META_DICT = 'split_meta_dict' @@ -541,6 +542,19 @@ TASK_OUTPUTS = { # } Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS], + # { + # 'labels': ['吸烟', '打电话', '吸烟'], + # 'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487], + # 'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]], + # 'timestamps': [1, 3, 5] + # } + Tasks.action_detection: [ + OutputKeys.TIMESTAMPS, + OutputKeys.LABELS, + OutputKeys.SCORES, + OutputKeys.BOXES, + ], + # { # 'output': [ # [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509}, @@ -551,6 +565,7 @@ TASK_OUTPUTS = { # {'label': '13421097', 'score': 2.75914817393641e-06}]] # } Tasks.faq_question_answering: [OutputKeys.OUTPUT], + # image person reid result for 
single sample # { # "img_embedding": np.array with shape [1, D], diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 8a1a3646..c9f0c252 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -71,6 +71,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'), Tasks.action_recognition: (Pipelines.action_recognition, 'damo/cv_TAdaConv_action-recognition'), + Tasks.action_detection: (Pipelines.action_detection, + 'damo/cv_ResNetC3D_action-detection_detection2d'), Tasks.live_category: (Pipelines.live_category, 'damo/cv_resnet50_live-category'), Tasks.video_category: (Pipelines.video_category, diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 01c69758..f4e6792b 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -5,6 +5,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .action_recognition_pipeline import ActionRecognitionPipeline + from .action_detection_pipeline import ActionDetectionPipeline from .animal_recognition_pipeline import AnimalRecognitionPipeline from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline @@ -48,6 +49,7 @@ if TYPE_CHECKING: else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], + 'action_detection_pipeline': ['ActionDetectionPipeline'], 'animal_recognition_pipeline': ['AnimalRecognitionPipeline'], 'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'], 'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'], diff --git a/modelscope/pipelines/cv/action_detection_pipeline.py b/modelscope/pipelines/cv/action_detection_pipeline.py new file mode 100644 index 00000000..72335d5b --- /dev/null +++ b/modelscope/pipelines/cv/action_detection_pipeline.py @@ -0,0 +1,63 @@ +import math +import os.path as osp +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.action_detection import ActionDetONNX +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.action_detection, module_name=Pipelines.action_detection) +class ActionDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a action detection pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + model_path = osp.join(self.model, ModelFile.ONNX_MODEL_FILE) + logger.info(f'loading model from {model_path}') + config_path = osp.join(self.model, ModelFile.CONFIGURATION) + logger.info(f'loading config from {config_path}') + self.cfg = Config.from_file(config_path) + self.cfg.MODEL.model_file = model_path + self.model = ActionDetONNX(self.model, self.cfg.MODEL, + self.device_name) + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + video_name = input + else: + raise TypeError(f'input should be a str,' + f' but got {type(input)}') + result = {'video_name': video_name} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + preds = self.model.forward(input['video_name']) + labels = sum([pred['actions']['labels'] for pred in preds], []) + scores = sum([pred['actions']['scores'] for pred in preds], []) + boxes = sum([pred['actions']['boxes'] for pred in preds], []) + timestamps = sum([[pred['timestamp']] * len(pred['actions']['labels']) + for pred in preds], []) + out = { + OutputKeys.TIMESTAMPS: timestamps, + OutputKeys.LABELS: labels, + OutputKeys.SCORES: scores, + OutputKeys.BOXES: boxes + } + return out + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 960e9600..2265ef5a 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -58,6 +58,7 @@ class CVTasks(object): # video recognition live_category = 'live-category' action_recognition = 'action-recognition' + action_detection = 'action-detection' video_category = 'video-category' video_embedding = 'video-embedding' virtual_try_on = 'virtual-try-on' diff --git a/tests/pipelines/test_action_detection.py b/tests/pipelines/test_action_detection.py new file mode 100644 index 00000000..c752dc78 --- /dev/null +++ b/tests/pipelines/test_action_detection.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.test_utils import test_level + + +class ActionDetectionTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run(self): + action_detection_pipline = pipeline( + Tasks.action_detection, + model='damo/cv_ResNetC3D_action-detection_detection2d') + result = action_detection_pipline( + 'data/test/videos/action_detection_test_video.mp4') + print('action detection results:', result) + + +if __name__ == '__main__': + unittest.main() From f5fb8cf5318f3dfb0015484557dd0e03b9c42a8b Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Thu, 1 Sep 2022 18:56:51 +0800 Subject: [PATCH 02/28] [to #42322933] fix bug about loading new trained model and update doc string Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9987197 --- modelscope/models/audio/ans/__init__.py | 4 +- modelscope/models/audio/ans/complex_nn.py | 6 ++ modelscope/models/audio/ans/conv_stft.py | 1 + modelscope/models/audio/ans/frcrn.py | 62 +++---------------- .../models/audio/ans/se_module_complex.py | 1 + modelscope/models/audio/ans/unet.py | 4 ++ modelscope/trainers/audio/ans_trainer.py | 7 +-- modelscope/utils/audio/audio_utils.py | 18 +++--- 8 files changed, 32 insertions(+), 71 deletions(-) diff --git a/modelscope/models/audio/ans/__init__.py b/modelscope/models/audio/ans/__init__.py index b602ad01..afcdf314 100644 --- a/modelscope/models/audio/ans/__init__.py +++ b/modelscope/models/audio/ans/__init__.py @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .frcrn import FRCRNModel + from .frcrn import FRCRNDecorator else: _import_structure = { - 'frcrn': ['FRCRNModel'], + 'frcrn': ['FRCRNDecorator'], } import sys diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py index 69dec41e..c61446c2 100644 --- a/modelscope/models/audio/ans/complex_nn.py +++ b/modelscope/models/audio/ans/complex_nn.py @@ -1,3 +1,9 @@ +""" +class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d are the work of +Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ). +from https://github.com/sweetcocoa/DeepComplexUNetPyTorch + +""" import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/audio/ans/conv_stft.py b/modelscope/models/audio/ans/conv_stft.py index a47d7817..4b393a4c 100644 --- a/modelscope/models/audio/ans/conv_stft.py +++ b/modelscope/models/audio/ans/conv_stft.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import numpy as np import torch import torch.nn as nn diff --git a/modelscope/models/audio/ans/frcrn.py b/modelscope/models/audio/ans/frcrn.py index 59411fbe..b74fc273 100644 --- a/modelscope/models/audio/ans/frcrn.py +++ b/modelscope/models/audio/ans/frcrn.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os from typing import Dict @@ -14,54 +15,10 @@ from .conv_stft import ConviSTFT, ConvSTFT from .unet import UNet -class FTB(nn.Module): - - def __init__(self, input_dim=257, in_channel=9, r_channel=5): - - super(FTB, self).__init__() - self.in_channel = in_channel - self.conv1 = nn.Sequential( - nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]), - nn.BatchNorm2d(r_channel), nn.ReLU()) - - self.conv1d = nn.Sequential( - nn.Conv1d( - r_channel * input_dim, in_channel, kernel_size=9, padding=4), - nn.BatchNorm1d(in_channel), nn.ReLU()) - self.freq_fc = nn.Linear(input_dim, input_dim, bias=False) - - self.conv2 = nn.Sequential( - nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]), - nn.BatchNorm2d(in_channel), nn.ReLU()) - - def forward(self, inputs): - ''' - inputs should be [Batch, Ca, Dim, Time] - ''' - # T-F attention - conv1_out = self.conv1(inputs) - B, C, D, T = conv1_out.size() - reshape1_out = torch.reshape(conv1_out, [B, C * D, T]) - conv1d_out = self.conv1d(reshape1_out) - conv1d_out = torch.reshape(conv1d_out, [B, self.in_channel, 1, T]) - - # now is also [B,C,D,T] - att_out = conv1d_out * inputs - - # tranpose to [B,C,T,D] - att_out = torch.transpose(att_out, 2, 3) - freqfc_out = self.freq_fc(att_out) - att_out = torch.transpose(freqfc_out, 2, 3) - - cat_out = torch.cat([att_out, inputs], 1) - outputs = self.conv2(cat_out) - return outputs - - @MODELS.register_module( Tasks.acoustic_noise_suppression, module_name=Models.speech_frcrn_ans_cirm_16k) -class FRCRNModel(TorchModel): +class FRCRNDecorator(TorchModel): r""" A decorator of FRCRN for integrating into modelscope framework """ def __init__(self, model_dir: str, *args, **kwargs): @@ -78,13 +35,14 @@ class FRCRNModel(TorchModel): checkpoint = torch.load( model_bin_file, map_location=torch.device('cpu')) if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: - self.model.load_state_dict( - checkpoint['state_dict'], strict=False) + # the new trained model by user is based on FRCRNDecorator + self.load_state_dict(checkpoint['state_dict']) else: + # The released model on Modelscope is based on FRCRN self.model.load_state_dict(checkpoint, strict=False) - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - result_list = self.model.forward(input['noisy']) + def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + result_list = self.model.forward(inputs['noisy']) output = { 'spec_l1': result_list[0], 'wav_l1': result_list[1], @@ -93,12 +51,12 @@ class FRCRNModel(TorchModel): 'wav_l2': result_list[4], 'mask_l2': result_list[5] } - if 'clean' in input: + if 'clean' in inputs: mix_result = self.model.loss( - input['noisy'], input['clean'], result_list, mode='Mix') + inputs['noisy'], inputs['clean'], result_list, mode='Mix') output.update(mix_result) sisnr_result = self.model.loss( - input['noisy'], input['clean'], result_list, mode='SiSNR') + inputs['noisy'], inputs['clean'], result_list, mode='SiSNR') output.update(sisnr_result) # logger hooker will use items under 'log_vars' output['log_vars'] = {k: mix_result[k].item() for k in mix_result} diff --git a/modelscope/models/audio/ans/se_module_complex.py b/modelscope/models/audio/ans/se_module_complex.py index f62fe523..b58eb6ba 100644 --- a/modelscope/models/audio/ans/se_module_complex.py +++ b/modelscope/models/audio/ans/se_module_complex.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import torch from torch import nn diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py index aa5a4254..ae66eb69 100644 --- a/modelscope/models/audio/ans/unet.py +++ b/modelscope/models/audio/ans/unet.py @@ -1,3 +1,7 @@ +""" +Based on the work of Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ). +from https://github.com/sweetcocoa/DeepComplexUNetPyTorch +""" import torch import torch.nn as nn diff --git a/modelscope/trainers/audio/ans_trainer.py b/modelscope/trainers/audio/ans_trainer.py index f782b836..37b201ce 100644 --- a/modelscope/trainers/audio/ans_trainer.py +++ b/modelscope/trainers/audio/ans_trainer.py @@ -1,10 +1,5 @@ -import time -from typing import List, Optional, Union - -from datasets import Dataset - +# Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.metainfo import Trainers -from modelscope.preprocessors import Preprocessor from modelscope.trainers import EpochBasedTrainer from modelscope.trainers.builder import TRAINERS from modelscope.utils.constant import TrainerStages diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py index 14374c65..61964345 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -1,5 +1,4 @@ -import numpy as np - +# Copyright (c) Alibaba, Inc. and its affiliates. SEGMENT_LENGTH_TRAIN = 16000 @@ -9,16 +8,13 @@ def to_segment(batch, segment_length=SEGMENT_LENGTH_TRAIN): It only works in batch mode. """ noisy_arrays = [] - for x in batch['noisy']: - length = len(x['array']) - noisy = np.array(x['array']) - for offset in range(segment_length, length, segment_length): - noisy_arrays.append(noisy[offset - segment_length:offset]) clean_arrays = [] - for x in batch['clean']: - length = len(x['array']) - clean = np.array(x['array']) - for offset in range(segment_length, length, segment_length): + for x, y in zip(batch['noisy'], batch['clean']): + length = min(len(x['array']), len(y['array'])) + noisy = x['array'] + clean = y['array'] + for offset in range(segment_length, length + 1, segment_length): + noisy_arrays.append(noisy[offset - segment_length:offset]) clean_arrays.append(clean[offset - segment_length:offset]) return {'noisy': noisy_arrays, 'clean': clean_arrays} From af4c6f70c296cbffdc6a5962791eed179ed611c7 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 1 Sep 2022 20:06:42 +0800 Subject: [PATCH 03/28] [to #42322933]allow none decorator registry in ast --- modelscope/utils/ast_utils.py | 65 ++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index 990a9571..263a81b3 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -36,6 +36,7 @@ SCAN_SUB_FOLDERS = [ ] INDEXER_FILE = 'ast_indexer' DECORATOR_KEY = 'decorators' +EXPRESS_KEY = 'express' FROM_IMPORT_KEY = 'from_imports' IMPORT_KEY = 'imports' FILE_NAME_KEY = 'filepath' @@ -45,6 +46,9 @@ INDEX_KEY = 'index' REQUIREMENT_KEY = 'requirements' MODULE_KEY = 'module' CLASS_NAME = 'class_name' +GROUP_KEY = 'group_key' +MODULE_NAME = 'module_name' +MODULE_CLS = 'module_cls' class AstScaning(object): @@ -53,6 +57,7 @@ class AstScaning(object): self.result_import = dict() self.result_from_import = dict() self.result_decorator = [] + self.express = [] def _is_sub_node(self, node: object) -> bool: return isinstance(node, @@ -108,6 +113,7 @@ class AstScaning(object): self.result_import = dict() self.result_from_import = 
dict() self.result_decorator = [] + self.result_express = [] def scan_ast(self, node: Union[ast.AST, None, str]): self._setup_global() @@ -243,13 +249,19 @@ class AstScaning(object): setattr(item, CLASS_NAME, node.name) self.result_decorator.extend(attr) + if attr != [] and type( + attr + ).__name__ == 'Call' and parent_node_name == 'Expr': + self.result_express.append(attr) + out += f'{indentstr()}{field}={representation},\n' out += indentstr() + ')' return { IMPORT_KEY: self.result_import, FROM_IMPORT_KEY: self.result_from_import, - DECORATOR_KEY: self.result_decorator + DECORATOR_KEY: self.result_decorator, + EXPRESS_KEY: self.result_express }, out def _parse_decorator(self, node: ast.AST) -> tuple: @@ -267,7 +279,10 @@ class AstScaning(object): def _get_args_name(nodes: list) -> list: result = [] for node in nodes: - result.append(_get_attribute_item(node)) + if type(node).__name__ == 'Str': + result.append((node.s, None)) + else: + result.append(_get_attribute_item(node)) return result def _get_keyword_name(nodes: ast.AST) -> list: @@ -276,9 +291,11 @@ class AstScaning(object): if type(node).__name__ == 'keyword': attribute_node = getattr(node, 'value') if type(attribute_node).__name__ == 'Str': - result.append((attribute_node.s, None)) + result.append((getattr(node, + 'arg'), attribute_node.s, None)) else: - result.append(_get_attribute_item(attribute_node)) + result.append((getattr(node, 'arg'), ) + + _get_attribute_item(attribute_node)) return result functions = _get_attribute_item(node.func) @@ -315,10 +332,26 @@ class AstScaning(object): args_list.append(default_group) if len(keyword_list) == 0 and len(args_list) == 1: args_list.append(class_name) - if len(keyword_list) == 1 and len(args_list) == 0: + + if len(keyword_list) > 0 and len(args_list) == 0: + remove_group_item = None + for item in keyword_list: + key, name, attr = item + if key == GROUP_KEY: + args_list.append((name, attr)) + remove_group_item = item + if remove_group_item is not None: + keyword_list.remove(remove_group_item) + + if len(args_list) == 0: args_list.append(default_group) - args_list.extend(keyword_list) + for item in keyword_list: + key, name, attr = item + if key == MODULE_CLS: + class_name = name + else: + args_list.append((name, attr)) for item in args_list: # the case empty input @@ -347,9 +380,14 @@ class AstScaning(object): for node in nodes: if type(node).__name__ != 'Call': continue + class_name = getattr(node, CLASS_NAME, None) + func = getattr(node, 'func') + + if getattr(func, 'attr', None) != REGISTER_MODULE: + continue + parse_output = self._parse_decorator(node) - index = self._registry_indexer(parse_output, - getattr(node, CLASS_NAME)) + index = self._registry_indexer(parse_output, class_name) if None is not index: results.append(index) return results @@ -363,6 +401,8 @@ class AstScaning(object): node = gast.parse(data) output, _ = self.scan_import(node, indent=' ', show_offsets=False) output[DECORATOR_KEY] = self.parse_decorators(output[DECORATOR_KEY]) + output[EXPRESS_KEY] = self.parse_decorators(output[EXPRESS_KEY]) + output[DECORATOR_KEY].extend(output[EXPRESS_KEY]) return output @@ -481,6 +521,13 @@ class FilesAstScaning(object): module_import[value_dict[MODULE_KEY]] = value_dict[IMPORT_KEY] return module_import + def _ignore_useless_keys(self, inverted_index): + if ('OPTIMIZERS', 'default', 'name') in inverted_index: + del inverted_index[('OPTIMIZERS', 'default', 'name')] + if ('LR_SCHEDULER', 'default', 'name') in inverted_index: + del inverted_index[('LR_SCHEDULER', 'default', 
'name')] + return inverted_index + def get_files_scan_results(self, target_dir=MODELSCOPE_PATH, target_folders=SCAN_SUB_FOLDERS): @@ -514,6 +561,8 @@ class FilesAstScaning(object): MODULE_KEY: module_name } inverted_index_with_results = self._inverted_index(result) + inverted_index_with_results = self._ignore_useless_keys( + inverted_index_with_results) module_import = self._module_import(result) index = { INDEX_KEY: inverted_index_with_results, From 780330897a47bf24437090e48cf4350dae7af8ed Mon Sep 17 00:00:00 2001 From: "peter.lx" Date: Thu, 1 Sep 2022 22:17:14 +0800 Subject: [PATCH 04/28] [to #42322933] add Deberta v2 modeling and fill_mask task, with master merged add Deberta v2 modeling and fill_mask task, with master merged Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9966511 --- modelscope/metainfo.py | 1 + modelscope/models/nlp/__init__.py | 16 +- modelscope/models/nlp/deberta_v2/__init__.py | 73 + .../deberta_v2/configuration_deberta_v2.py | 130 ++ .../nlp/deberta_v2/modeling_deberta_v2.py | 1789 +++++++++++++++++ .../nlp/deberta_v2/tokenization_deberta_v2.py | 546 +++++ .../tokenization_deberta_v2_fast.py | 241 +++ modelscope/models/nlp/masked_language.py | 39 + .../pipelines/nlp/fill_mask_pipeline.py | 16 +- modelscope/preprocessors/nlp.py | 3 + tests/pipelines/test_deberta_tasks.py | 62 + 11 files changed, 2907 insertions(+), 9 deletions(-) create mode 100644 modelscope/models/nlp/deberta_v2/__init__.py create mode 100644 modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py create mode 100644 modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py create mode 100644 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py create mode 100644 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py create mode 100644 tests/pipelines/test_deberta_tasks.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 7c5afe80..971dd3f1 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -37,6 +37,7 @@ class Models(object): bert = 'bert' palm = 'palm-v2' structbert = 'structbert' + deberta_v2 = 'deberta_v2' veco = 'veco' translation = 'csanmt-translation' space_dst = 'space-dst' diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index e17a1d31..fd61e40b 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -9,12 +9,15 @@ if TYPE_CHECKING: from .bert_for_sequence_classification import BertForSequenceClassification from .bert_for_document_segmentation import BertForDocumentSegmentation from .csanmt_for_translation import CsanmtForTranslation - from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, - BertForMaskedLM) + from .masked_language import ( + StructBertForMaskedLM, + VecoForMaskedLM, + BertForMaskedLM, + DebertaV2ForMaskedLM, + ) from .nncrf_for_named_entity_recognition import ( TransformerCRFForNamedEntityRecognition, LSTMCRFForNamedEntityRecognition) - from .palm_v2 import PalmForTextGeneration from .token_classification import SbertForTokenClassification from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification from .space import SpaceForDialogIntent @@ -22,7 +25,6 @@ if TYPE_CHECKING: from .space import SpaceForDialogStateTracking from .star_text_to_sql import StarForTextToSql from .task_models import (InformationExtractionModel, - SequenceClassificationModel, SingleBackboneTaskModelBase) from .bart_for_text_error_correction import BartForTextErrorCorrection from .gpt3 import 
GPT3ForTextGeneration @@ -36,8 +38,10 @@ else: 'csanmt_for_translation': ['CsanmtForTranslation'], 'bert_for_sequence_classification': ['BertForSequenceClassification'], 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], - 'masked_language': - ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'], + 'masked_language': [ + 'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM', + 'DebertaV2ForMaskedLM' + ], 'nncrf_for_named_entity_recognition': [ 'TransformerCRFForNamedEntityRecognition', 'LSTMCRFForNamedEntityRecognition' diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py new file mode 100644 index 00000000..664fc6c6 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/__init__.py @@ -0,0 +1,73 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +_import_structure = { + 'configuration_deberta_v2': [ + 'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config', + 'DebertaV2OnnxConfig' + ], + 'tokenization_deberta_v2': ['DebertaV2Tokenizer'], +} + +if TYPE_CHECKING: + from .configuration_deberta_v2 import DebertaV2Config + from .tokenization_deberta_v2 import DebertaV2Tokenizer + from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast + + from .modeling_deberta_v2 import ( + DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST, + DebertaV2ForMaskedLM, + DebertaV2ForMultipleChoice, + DebertaV2ForQuestionAnswering, + DebertaV2ForSequenceClassification, + DebertaV2ForTokenClassification, + DebertaV2Model, + DebertaV2PreTrainedModel, + ) + +else: + _import_structure = { + 'configuration_deberta_v2': + ['DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config'], + 'tokenization_deberta_v2': ['DebertaV2Tokenizer'] + } + _import_structure['tokenization_deberta_v2_fast'] = [ + 'DebertaV2TokenizerFast' + ] + _import_structure['modeling_deberta_v2'] = [ + 'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST', + 'DebertaV2ForMaskedLM', + 'DebertaV2ForMultipleChoice', + 'DebertaV2ForQuestionAnswering', + 'DebertaV2ForSequenceClassification', + 'DebertaV2ForTokenClassification', + 'DebertaV2Model', + 'DebertaV2PreTrainedModel', + ] + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__) diff --git a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py b/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py new file mode 100644 index 00000000..65e8f0b7 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py @@ -0,0 +1,130 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. 
+# Copyright 2020, Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config""" +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional, Union + +from transformers import PretrainedConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class DebertaV2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a + DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the DeBERTa + [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Arguments: + vocab_size (`int`, *optional*, defaults to 128100): + Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`DebertaV2Model`]. + hidden_size (`int`, *optional*, defaults to 1536): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 24): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"`, `"gelu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` + are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 0): + The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ layer_norm_eps (`float`, *optional*, defaults to 1e-7): + The epsilon used by the layer normalization layers. + relative_attention (`bool`, *optional*, defaults to `True`): + Whether use relative position encoding. + max_relative_positions (`int`, *optional*, defaults to -1): + The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value + as `max_position_embeddings`. + pad_token_id (`int`, *optional*, defaults to 0): + The value used to pad input_ids. + position_biased_input (`bool`, *optional*, defaults to `False`): + Whether add absolute position embedding to content embedding. + pos_att_type (`List[str]`, *optional*): + The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`, + `["p2c", "c2p"]`, `["p2c", "c2p"]`. + layer_norm_eps (`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + """ + model_type = 'deberta_v2' + + def __init__(self, + vocab_size=128100, + hidden_size=1536, + num_hidden_layers=24, + num_attention_heads=24, + intermediate_size=6144, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=0, + initializer_range=0.02, + layer_norm_eps=1e-7, + relative_attention=False, + max_relative_positions=-1, + pad_token_id=0, + position_biased_input=True, + pos_att_type=None, + pooler_dropout=0, + pooler_hidden_act='gelu', + **kwargs): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.relative_attention = relative_attention + self.max_relative_positions = max_relative_positions + self.pad_token_id = pad_token_id + self.position_biased_input = position_biased_input + + # Backwards compatibility + if type(pos_att_type) == str: + pos_att_type = [x.strip() for x in pos_att_type.lower().split('|')] + + self.pos_att_type = pos_att_type + self.vocab_size = vocab_size + self.layer_norm_eps = layer_norm_eps + + self.pooler_hidden_size = kwargs.get('pooler_hidden_size', hidden_size) + self.pooler_dropout = pooler_dropout + self.pooler_hidden_act = pooler_hidden_act diff --git a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py b/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py new file mode 100644 index 00000000..1c6b9071 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py @@ -0,0 +1,1789 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the Hugging Face Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch DeBERTa-v2 model.""" + +from collections.abc import Sequence +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward) +from transformers.modeling_outputs import (BaseModelOutput, MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import softmax_backward_data + +from modelscope.utils import logger as logging +from .configuration_deberta_v2 import DebertaV2Config + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = 'DebertaV2Config' +_TOKENIZER_FOR_DOC = 'DebertaV2Tokenizer' +_CHECKPOINT_FOR_DOC = 'nlp_debertav2_fill-mask_chinese-lite' + + +# Copied from transformers.models.deberta.modeling_deberta.ContextPooler +class ContextPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.pooler_hidden_size, + config.pooler_hidden_size) + self.dropout = StableDropout(config.pooler_dropout) + self.config = config + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) + return pooled_output + + @property + def output_dim(self): + return self.config.hidden_size + + +# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2 +class XSoftmax(torch.autograd.Function): + """ + Masked Softmax which is optimized for saving memory + + Args: + input (`torch.tensor`): The input tensor that will apply softmax. + mask (`torch.IntTensor`): + The mask matrix where 0 indicate that element will be ignored in the softmax calculation. 
+ dim (int): The dimension that will apply softmax + + Example: + + ```python + >>> import torch + >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax + + >>> # Make a tensor + >>> x = torch.randn([4, 20, 100]) + + >>> # Create a mask + >>> mask = (x > 0).int() + + >>> # Specify the dimension to apply softmax + >>> dim = -1 + + >>> y = XSoftmax.apply(x, mask, dim) + ```""" + + @staticmethod + def forward(self, input, mask, dim): + self.dim = dim + rmask = ~(mask.to(torch.bool)) + + output = input.masked_fill(rmask, + torch.tensor(torch.finfo(input.dtype).min)) + output = torch.softmax(output, self.dim) + output.masked_fill_(rmask, 0) + self.save_for_backward(output) + return output + + @staticmethod + def backward(self, grad_output): + (output, ) = self.saved_tensors + inputGrad = softmax_backward_data(self, grad_output, output, self.dim, + output) + return inputGrad, None, None + + @staticmethod + def symbolic(g, self, mask, dim): + import torch.onnx.symbolic_helper as sym_help + from torch.onnx.symbolic_opset9 import masked_fill, softmax + + mask_cast_value = g.op( + 'Cast', mask, to_i=sym_help.cast_pytorch_to_onnx['Long']) + r_mask = g.op( + 'Cast', + g.op('Sub', + g.op('Constant', value_t=torch.tensor(1, dtype=torch.int64)), + mask_cast_value), + to_i=sym_help.cast_pytorch_to_onnx['Byte'], + ) + output = masked_fill( + g, self, r_mask, + g.op( + 'Constant', + value_t=torch.tensor(torch.finfo(self.type().dtype()).min))) + output = softmax(g, output, dim) + return masked_fill( + g, output, r_mask, + g.op('Constant', value_t=torch.tensor(0, dtype=torch.uint8))) + + +# Copied from transformers.models.deberta.modeling_deberta.DropoutContext +class DropoutContext(object): + + def __init__(self): + self.dropout = 0 + self.mask = None + self.scale = 1 + self.reuse_mask = True + + +# Copied from transformers.models.deberta.modeling_deberta.get_mask +def get_mask(input, local_context): + if not isinstance(local_context, DropoutContext): + dropout = local_context + mask = None + else: + dropout = local_context.dropout + dropout *= local_context.scale + mask = local_context.mask if local_context.reuse_mask else None + + if dropout > 0 and mask is None: + mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to( + torch.bool) + + if isinstance(local_context, DropoutContext): + if local_context.mask is None: + local_context.mask = mask + + return mask, dropout + + +# Copied from transformers.models.deberta.modeling_deberta.XDropout +class XDropout(torch.autograd.Function): + """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" + + @staticmethod + def forward(ctx, input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + ctx.scale = 1.0 / (1 - dropout) + if dropout > 0: + ctx.save_for_backward(mask) + return input.masked_fill(mask, 0) * ctx.scale + else: + return input + + @staticmethod + def backward(ctx, grad_output): + if ctx.scale > 1: + (mask, ) = ctx.saved_tensors + return grad_output.masked_fill(mask, 0) * ctx.scale, None + else: + return grad_output, None + + @staticmethod + def symbolic(g: torch._C.Graph, input: torch._C.Value, + local_ctx: Union[float, DropoutContext]) -> torch._C.Value: + from torch.onnx import symbolic_opset12 + + dropout_p = local_ctx + if isinstance(local_ctx, DropoutContext): + dropout_p = local_ctx.dropout + # StableDropout only calls this function when training. 
+ train = True + # TODO: We should check if the opset_version being used to export + # is > 12 here, but there's no good way to do that. As-is, if the + # opset_version < 12, export will fail with a CheckerError. + # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like: + # if opset_version < 12: + # return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train) + return symbolic_opset12.dropout(g, input, dropout_p, train) + + +# Copied from transformers.models.deberta.modeling_deberta.StableDropout +class StableDropout(nn.Module): + """ + Optimized dropout module for stabilizing the training + + Args: + drop_prob (float): the dropout probabilities + """ + + def __init__(self, drop_prob): + super().__init__() + self.drop_prob = drop_prob + self.count = 0 + self.context_stack = None + + def forward(self, x): + """ + Call the module + + Args: + x (`torch.tensor`): The input tensor to apply dropout + """ + if self.training and self.drop_prob > 0: + return XDropout.apply(x, self.get_context()) + return x + + def clear_context(self): + self.count = 0 + self.context_stack = None + + def init_context(self, reuse_mask=True, scale=1): + if self.context_stack is None: + self.context_stack = [] + self.count = 0 + for c in self.context_stack: + c.reuse_mask = reuse_mask + c.scale = scale + + def get_context(self): + if self.context_stack is not None: + if self.count >= len(self.context_stack): + self.context_stack.append(DropoutContext()) + ctx = self.context_stack[self.count] + ctx.dropout = self.drop_prob + self.count += 1 + return ctx + else: + return self.drop_prob + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm +class DebertaV2SelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2 +class DebertaV2Attention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaV2SelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + self_output = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + self_output, att_matrix = self_output + if query_states is None: + query_states = hidden_states + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return attention_output + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2 +class DebertaV2Intermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + 
self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm +class DebertaV2Output(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2 +class DebertaV2Layer(nn.Module): + + def __init__(self, config): + super().__init__() + self.attention = DebertaV2Attention(config) + self.intermediate = DebertaV2Intermediate(config) + self.output = DebertaV2Output(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions=False, + ): + attention_output = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + attention_output, att_matrix = attention_output + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + if output_attentions: + return (layer_output, att_matrix) + else: + return layer_output + + +class ConvLayer(nn.Module): + + def __init__(self, config): + super().__init__() + kernel_size = getattr(config, 'conv_kernel_size', 3) + groups = getattr(config, 'conv_groups', 1) + self.conv_act = getattr(config, 'conv_act', 'tanh') + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size, + padding=(kernel_size - 1) // 2, + groups=groups) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, residual_states, input_mask): + out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute( + 0, 2, 1).contiguous() + rmask = (1 - input_mask).bool() + out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0) + out = ACT2FN[self.conv_act](self.dropout(out)) + + layer_norm_input = residual_states + out + output = self.LayerNorm(layer_norm_input).to(layer_norm_input) + + if input_mask is None: + output_states = output + else: + if input_mask.dim() != layer_norm_input.dim(): + if input_mask.dim() == 4: + input_mask = input_mask.squeeze(1).squeeze(1) + input_mask = input_mask.unsqueeze(2) + + input_mask = input_mask.to(output.dtype) + output_states = output * input_mask + + return output_states + + +class DebertaV2Encoder(nn.Module): + """Modified BertEncoder with relative position bias support""" + + def __init__(self, config): + super().__init__() + + self.layer = nn.ModuleList( + [DebertaV2Layer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, 'relative_attention', False) + + if self.relative_attention: + self.max_relative_positions = 
getattr(config, + 'max_relative_positions', -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + + self.position_buckets = getattr(config, 'position_buckets', -1) + pos_ebd_size = self.max_relative_positions * 2 + + if self.position_buckets > 0: + pos_ebd_size = self.position_buckets * 2 + + self.rel_embeddings = nn.Embedding(pos_ebd_size, + config.hidden_size) + + self.norm_rel_ebd = [ + x.strip() + for x in getattr(config, 'norm_rel_ebd', 'none').lower().split('|') + ] + + if 'layer_norm' in self.norm_rel_ebd: + self.LayerNorm = LayerNorm( + config.hidden_size, + config.layer_norm_eps, + elementwise_affine=True) + + self.conv = ConvLayer(config) if getattr(config, 'conv_kernel_size', + 0) > 0 else None + self.gradient_checkpointing = False + + def get_rel_embedding(self): + rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None + if rel_embeddings is not None and ('layer_norm' in self.norm_rel_ebd): + rel_embeddings = self.LayerNorm(rel_embeddings) + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze( + -2).unsqueeze(-1) + attention_mask = attention_mask.byte() + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + q = query_states.size( + -2) if query_states is not None else hidden_states.size(-2) + relative_pos = build_relative_position( + q, + hidden_states.size(-2), + bucket_size=self.position_buckets, + max_position=self.max_relative_positions) + return relative_pos + + def forward( + self, + hidden_states, + attention_mask, + output_hidden_states=True, + output_attentions=False, + query_states=None, + relative_pos=None, + return_dict=True, + ): + if attention_mask.dim() <= 2: + input_mask = attention_mask + else: + input_mask = (attention_mask.sum(-2) > 0).byte() + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, + relative_pos) + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[0] + else: + next_kv = hidden_states + rel_embeddings = self.get_rel_embedding() + output_states = next_kv + for i, layer_module in enumerate(self.layer): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states, ) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + output_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + next_kv, + attention_mask, + query_states, + relative_pos, + rel_embeddings, + ) + else: + output_states = layer_module( + next_kv, + attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) + + if output_attentions: + output_states, att_m = output_states + + if i == 0 and self.conv is not None: + output_states = self.conv(hidden_states, output_states, + input_mask) + + if query_states is not None: + query_states = output_states + 
if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len( + self.layer) else None + else: + next_kv = output_states + + if output_attentions: + all_attentions = all_attentions + (att_m, ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states, ) + + if not return_dict: + return tuple( + v for v in [output_states, all_hidden_states, all_attentions] + if v is not None) + return BaseModelOutput( + last_hidden_state=output_states, + hidden_states=all_hidden_states, + attentions=all_attentions) + + +def make_log_bucket_position(relative_pos, bucket_size, max_position): + sign = torch.sign(relative_pos) + mid = bucket_size // 2 + abs_pos = torch.where( + (relative_pos < mid) & (relative_pos > -mid), + torch.tensor(mid - 1).type_as(relative_pos), + torch.abs(relative_pos), + ) + log_pos = ( + torch.ceil( + torch.log(abs_pos / mid) + / torch.log(torch.tensor( + (max_position - 1) / mid)) * (mid - 1)) + mid) + bucket_pos = torch.where(abs_pos <= mid, relative_pos.type_as(log_pos), + log_pos * sign) + return bucket_pos + + +def build_relative_position(query_size, + key_size, + bucket_size=-1, + max_position=-1): + """ + Build relative position according to the query and key + + We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key + \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q - + P_k\\) + + Args: + query_size (int): the length of query + key_size (int): the length of key + bucket_size (int): the size of position bucket + max_position (int): the maximum allowed absolute position + + Return: + `torch.LongTensor`: A tensor with shape [1, query_size, key_size] + + """ + q_ids = torch.arange(0, query_size) + k_ids = torch.arange(0, key_size) + rel_pos_ids = q_ids[:, None] - k_ids[None, :] + if bucket_size > 0 and max_position > 0: + rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, + max_position) + rel_pos_ids = rel_pos_ids.to(torch.long) + rel_pos_ids = rel_pos_ids[:query_size, :] + rel_pos_ids = rel_pos_ids.unsqueeze(0) + return rel_pos_ids + + +@torch.jit.script +# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand +def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): + return c2p_pos.expand([ + query_layer.size(0), + query_layer.size(1), + query_layer.size(2), + relative_pos.size(-1) + ]) + + +@torch.jit.script +# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand +def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): + return c2p_pos.expand([ + query_layer.size(0), + query_layer.size(1), + key_layer.size(-2), + key_layer.size(-2) + ]) + + +@torch.jit.script +# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand +def pos_dynamic_expand(pos_index, p2c_att, key_layer): + return pos_index.expand(p2c_att.size()[:2] + + (pos_index.size(-2), key_layer.size(-2))) + + +class DisentangledSelfAttention(nn.Module): + """ + Disentangled self-attention module + + Parameters: + config (`DebertaV2Config`): + A model config class instance with the configuration to build a new model. 
The schema is similar to + *BertConfig*, for more details, please refer [`DebertaV2Config`] + + """ + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + self.num_attention_heads = config.num_attention_heads + _attention_head_size = config.hidden_size // config.num_attention_heads + self.attention_head_size = getattr(config, 'attention_head_size', + _attention_head_size) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query_proj = nn.Linear( + config.hidden_size, self.all_head_size, bias=True) + self.key_proj = nn.Linear( + config.hidden_size, self.all_head_size, bias=True) + self.value_proj = nn.Linear( + config.hidden_size, self.all_head_size, bias=True) + + self.share_att_key = getattr(config, 'share_att_key', False) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + self.relative_attention = getattr(config, 'relative_attention', False) + + if self.relative_attention: + self.position_buckets = getattr(config, 'position_buckets', -1) + self.max_relative_positions = getattr(config, + 'max_relative_positions', -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_ebd_size = self.max_relative_positions + if self.position_buckets > 0: + self.pos_ebd_size = self.position_buckets + + self.pos_dropout = StableDropout(config.hidden_dropout_prob) + + if not self.share_att_key: + if 'c2p' in self.pos_att_type: + self.pos_key_proj = nn.Linear( + config.hidden_size, self.all_head_size, bias=True) + if 'p2c' in self.pos_att_type: + self.pos_query_proj = nn.Linear(config.hidden_size, + self.all_head_size) + + self.dropout = StableDropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, attention_heads): + new_x_shape = x.size()[:-1] + (attention_heads, -1) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), + x.size(-1)) + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + """ + Call the module + + Args: + hidden_states (`torch.FloatTensor`): + Input states to the module usually the output from previous layer, it will be the Q,K and V in + *Attention(Q,K,V)* + + attention_mask (`torch.ByteTensor`): + An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum + sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* + th token. + + output_attentions (`bool`, optional): + Whether return the attention matrix. + + query_states (`torch.FloatTensor`, optional): + The *Q* state in *Attention(Q,K,V)*. + + relative_pos (`torch.LongTensor`): + The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with + values ranging in [*-max_relative_positions*, *max_relative_positions*]. + + rel_embeddings (`torch.FloatTensor`): + The embedding of relative distances. It's a tensor of shape [\\(2 \\times + \\text{max_relative_positions}\\), *hidden_size*]. 
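+
+        Note:
+            As a rough sketch (not the exact computation below): with both `c2p` and
+            `p2c` enabled, the unnormalized score between query i and key j is
+            approximately Q_i·K_j + Q_i·P_{d(i,j)} + K_j·P'_{d(j,i)}, all scaled by
+            1/sqrt(3 * head_dim), where P and P' are projections of `rel_embeddings`
+            and d(i,j) is the bucketed relative distance computed by
+            `build_relative_position`.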
+ + + """ + if query_states is None: + query_states = hidden_states + query_layer = self.transpose_for_scores( + self.query_proj(query_states), self.num_attention_heads) + key_layer = self.transpose_for_scores( + self.key_proj(hidden_states), self.num_attention_heads) + value_layer = self.transpose_for_scores( + self.value_proj(hidden_states), self.num_attention_heads) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. + scale_factor = 1 + if 'c2p' in self.pos_att_type: + scale_factor += 1 + if 'p2c' in self.pos_att_type: + scale_factor += 1 + scale = torch.sqrt( + torch.tensor(query_layer.size(-1), dtype=torch.float) + * scale_factor) + attention_scores = torch.bmm(query_layer, key_layer.transpose( + -1, -2)) / torch.tensor( + scale, dtype=query_layer.dtype) + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_attention_bias(query_layer, key_layer, + relative_pos, + rel_embeddings, + scale_factor) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + attention_scores = attention_scores + attention_scores = attention_scores.view(-1, self.num_attention_heads, + attention_scores.size(-2), + attention_scores.size(-1)) + + # bsz x height x length x dimension + attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + attention_probs = self.dropout(attention_probs) + context_layer = torch.bmm( + attention_probs.view(-1, attention_probs.size(-2), + attention_probs.size(-1)), value_layer) + context_layer = ( + context_layer.view(-1, self.num_attention_heads, + context_layer.size(-2), + context_layer.size(-1)).permute(0, 2, 1, + 3).contiguous()) + new_context_layer_shape = context_layer.size()[:-2] + (-1, ) + context_layer = context_layer.view(new_context_layer_shape) + if output_attentions: + return (context_layer, attention_probs) + else: + return context_layer + + def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, + rel_embeddings, scale_factor): + if relative_pos is None: + q = query_layer.size(-2) + relative_pos = build_relative_position( + q, + key_layer.size(-2), + bucket_size=self.position_buckets, + max_position=self.max_relative_positions) + if relative_pos.dim() == 2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.dim() == 3: + relative_pos = relative_pos.unsqueeze(1) + # bsz x height x query x key + elif relative_pos.dim() != 4: + raise ValueError( + f'Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}' + ) + + att_span = self.pos_ebd_size + relative_pos = relative_pos.long().to(query_layer.device) + + rel_embeddings = rel_embeddings[0:att_span * 2, :].unsqueeze(0) + if self.share_att_key: + pos_query_layer = self.transpose_for_scores( + self.query_proj(rel_embeddings), + self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1) + pos_key_layer = self.transpose_for_scores( + self.key_proj(rel_embeddings), + self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1) + else: + if 'c2p' in self.pos_att_type: + pos_key_layer = self.transpose_for_scores( + self.pos_key_proj(rel_embeddings), + self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, + 1) # .split(self.all_head_size, dim=-1) + if 'p2c' in self.pos_att_type: + pos_query_layer = self.transpose_for_scores( + self.pos_query_proj(rel_embeddings), + self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, + 1) # .split(self.all_head_size, dim=-1) + + score = 0 + # content->position + if 'c2p' in self.pos_att_type: + scale = torch.sqrt( + torch.tensor(pos_key_layer.size(-1), dtype=torch.float) + * scale_factor) + c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2)) + c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = torch.gather( + c2p_att, + dim=-1, + index=c2p_pos.squeeze(0).expand([ + query_layer.size(0), + query_layer.size(1), + relative_pos.size(-1) + ]), + ) + score += c2p_att / torch.tensor(scale, dtype=c2p_att.dtype) + + # position->content + if 'p2c' in self.pos_att_type: + scale = torch.sqrt( + torch.tensor(pos_query_layer.size(-1), dtype=torch.float) + * scale_factor) + if key_layer.size(-2) != query_layer.size(-2): + r_pos = build_relative_position( + key_layer.size(-2), + key_layer.size(-2), + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ).to(query_layer.device) + r_pos = r_pos.unsqueeze(0) + else: + r_pos = relative_pos + + p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1) + p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2)) + p2c_att = torch.gather( + p2c_att, + dim=-1, + index=p2c_pos.squeeze(0).expand([ + query_layer.size(0), + key_layer.size(-2), + key_layer.size(-2) + ]), + ).transpose(-1, -2) + score += p2c_att / torch.tensor(scale, dtype=p2c_att.dtype) + + return score + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm +class DebertaV2Embeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + pad_token_id = getattr(config, 'pad_token_id', 0) + self.embedding_size = getattr(config, 'embedding_size', + config.hidden_size) + self.word_embeddings = nn.Embedding( + config.vocab_size, self.embedding_size, padding_idx=pad_token_id) + + self.position_biased_input = getattr(config, 'position_biased_input', + True) + if not self.position_biased_input: + self.position_embeddings = None + else: + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, self.embedding_size) + + if config.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + self.embedding_size) + + if self.embedding_size != config.hidden_size: + self.embed_proj = nn.Linear( + self.embedding_size, config.hidden_size, bias=False) + self.LayerNorm = LayerNorm(config.hidden_size, 
config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + mask=None, + inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.position_embeddings is not None: + position_embeddings = self.position_embeddings(position_ids.long()) + else: + position_embeddings = torch.zeros_like(inputs_embeds) + + embeddings = inputs_embeds + if self.position_biased_input: + embeddings += position_embeddings + if self.config.type_vocab_size > 0: + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings += token_type_embeddings + + if self.embedding_size != self.config.hidden_size: + embeddings = self.embed_proj(embeddings) + + embeddings = self.LayerNorm(embeddings) + + if mask is not None: + if mask.dim() != embeddings.dim(): + if mask.dim() == 4: + mask = mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + mask = mask.to(embeddings.dtype) + + embeddings = embeddings * mask + + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 +class DebertaV2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DebertaV2Config + base_model_prefix = 'deberta' + _keys_to_ignore_on_load_missing = ['position_ids'] + _keys_to_ignore_on_load_unexpected = ['position_embeddings'] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DebertaV2Encoder): + module.gradient_checkpointing = value + + +DEBERTA_START_DOCSTRING = r""" + The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled + Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build + on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two + improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + + Parameters: + config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + 'The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.', + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 +class DebertaV2Model(DebertaV2PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.embeddings = DebertaV2Embeddings(config) + self.encoder = DebertaV2Encoder(config) + self.z_steps = 0 + self.config = config + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError( + 'The prune function is not implemented in DeBERTa model.') + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + output_hidden_states=True, + output_attentions=output_attentions, + return_dict=return_dict, + ) + encoded_layers = encoder_outputs[1] + + if self.z_steps > 1: + hidden_states = encoded_layers[-2] + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] + query_states = encoded_layers[-1] + rel_embeddings = self.encoder.get_rel_embedding() + attention_mask = self.encoder.get_attention_mask(attention_mask) + rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: + query_states = layer( + hidden_states, + attention_mask, + output_attentions=False, + query_states=query_states, + relative_pos=rel_pos, + rel_embeddings=rel_embeddings, + ) + encoded_layers.append(query_states) + + sequence_output = encoded_layers[-1] + + if not return_dict: + return (sequence_output, ) + encoder_outputs[ + (1 if output_hidden_states else 2):] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states + if output_hidden_states else None, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """DeBERTa Model with a `language modeling` head on top.""", + DEBERTA_START_DOCSTRING) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2 +class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + 
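+    # `pooler.*` weights found in a checkpoint are ignored (this MLM model has no
+    # pooler), while `position_ids` and the prediction decoder bias are created when
+    # the model is constructed, so they need not be present in the checkpoint.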
_keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.deberta = DebertaV2Model(config) + self.cls = DebertaV2OnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[1:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta +class DebertaV2PredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta +class DebertaV2LMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = DebertaV2PredictionHeadTransform(config) + + # The output weights are 
the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta +class DebertaV2OnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = DebertaV2LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +@add_start_docstrings( + """ + DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2 +class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, 'num_labels', 2) + self.num_labels = num_labels + + self.deberta = DebertaV2Model(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, num_labels) + drop_out = getattr(config, 'cls_dropout', None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = StableDropout(drop_out) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
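+
+        Example (illustrative sketch; the path is a placeholder for a local directory
+        containing a fine-tuned classification checkpoint together with its `spm.model`,
+        and imports are omitted since they depend on how the package exposes these
+        classes):
+
+            >>> tokenizer = DebertaV2Tokenizer.from_pretrained('/path/to/checkpoint')
+            >>> model = DebertaV2ForSequenceClassification.from_pretrained('/path/to/checkpoint')
+            >>> inputs = tokenizer('这家餐厅的菜很好吃', return_tensors='pt')
+            >>> logits = model(**inputs).logits
+            >>> predicted_label_id = logits.argmax(-1).item()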
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + # regression task + loss_fn = nn.MSELoss() + logits = logits.view(-1).to(labels.dtype) + loss = loss_fn(logits, labels.view(-1)) + elif labels.dim() == 1 or labels.size(-1) == 1: + label_index = (labels >= 0).nonzero() + labels = labels.long() + if label_index.size(0) > 0: + labeled_logits = torch.gather( + logits, 0, + label_index.expand( + label_index.size(0), logits.size(1))) + labels = torch.gather(labels, 0, label_index.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct( + labeled_logits.view(-1, self.num_labels).float(), + labels.view(-1)) + else: + loss = torch.tensor(0).to(logits) + else: + log_softmax = nn.LogSoftmax(-1) + loss = -((log_softmax(logits) * labels).sum(-1)).mean() + elif self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[1:] + return ((loss, ) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions) + + +@add_start_docstrings( + """ + DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 +class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaV2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[1:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions) + + +@add_start_docstrings( + """ + DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2 +class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaV2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
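+
+        Example (illustrative sketch; `tokenizer` and `model` are assumed to be a
+        `DebertaV2Tokenizer` and a `DebertaV2ForQuestionAnswering` loaded from a
+        placeholder checkpoint directory):
+
+            >>> inputs = tokenizer('DeBERTa是谁提出的？', 'DeBERTa由微软的研究人员提出。', return_tensors='pt')
+            >>> outputs = model(**inputs)
+            >>> start = outputs.start_logits.argmax(-1).item()
+            >>> end = outputs.end_logits.argmax(-1).item()
+            >>> answer = tokenizer.decode(inputs['input_ids'][0][start:end + 1])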
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + DEBERTA_START_DOCSTRING, +) +class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, 'num_labels', 2) + self.num_labels = num_labels + + self.deberta = DebertaV2Model(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, 1) + drop_out = getattr(config, 'cls_dropout', None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = StableDropout(drop_out) + + self.init_weights() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self.deberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits, ) + outputs[1:] + return ((loss, ) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py new file mode 100644 index 00000000..adb60288 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py @@ -0,0 +1,546 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DeBERTa. mainly copied from :module:`~transformers.tokenization_deberta`""" + +import os +import unicodedata +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as sp +from transformers.tokenization_utils import PreTrainedTokenizer + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + +PRETRAINED_INIT_CONFIGURATION = {} + +VOCAB_FILES_NAMES = {'vocab_file': 'spm.model'} + + +class DebertaV2Tokenizer(PreTrainedTokenizer): + r""" + Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece) + and [jieba](https://github.com/fxsjy/jieba). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. 
+ do_lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + bos_token (`string`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + eos_token (`string`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. When building a sequence using special tokens, this is not the token that is + used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
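+
+    Example (illustrative sketch; the path is a placeholder for a real SentencePiece
+    model file):
+
+        >>> tokenizer = DebertaV2Tokenizer(vocab_file='/path/to/spm.model')
+        >>> tokens = tokenizer.tokenize('今天天气不错')  # jieba word segmentation, then SentencePiece pieces
+        >>> ids = tokenizer.encode('今天天气不错')  # wraps the token ids in [CLS] ... [SEP]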
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, + vocab_file, + do_lower_case=False, + split_by_punct=False, + split_chinese=True, + bos_token='[CLS]', + eos_token='[SEP]', + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + do_lower_case=do_lower_case, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + split_by_punct=split_by_punct, + split_chinese=split_chinese, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + ' model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + ) + self.do_lower_case = do_lower_case + self.split_by_punct = split_by_punct + self.split_chinese = split_chinese + self.vocab_file = vocab_file + self._tokenizer = SPMTokenizer( + vocab_file, + split_by_punct=split_by_punct, + sp_model_kwargs=self.sp_model_kwargs) + self.jieba = None + if self.split_chinese: + try: + import jieba + except ImportError: + raise ImportError( + 'You need to install jieba to split chinese and use DebertaV2Tokenizer. ' + 'See https://pypi.org/project/jieba/ for installation.') + self.jieba = jieba + + @property + def vocab_size(self): + return len(self.vocab) + + @property + def vocab(self): + return self._tokenizer.vocab + + def get_vocab(self): + vocab = self.vocab.copy() + vocab.update(self.get_added_vocab()) + return vocab + + def _tokenize(self, text: str) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + if self.do_lower_case: + text = text.lower() + if self.split_chinese: + seg_list = [x for x in self.jieba.cut(text)] + text = ' '.join(seg_list) + return self._tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self._tokenizer.spm.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self._tokenizer.spm.IdToPiece( + index) if index < self.vocab_size else self.unk_token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + return self._tokenizer.decode(tokens) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A DeBERTa sequence has the following format: + + - single sequence: [CLS] X [SEP] + - pair of sequences: [CLS] A [SEP] B [SEP] + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
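+
+        Example (schematic; `cls` and `sep` stand for the actual special token ids):
+
+            >>> tokenizer.build_inputs_with_special_tokens([10, 11])        # [cls, 10, 11, sep]
+            >>> tokenizer.build_inputs_with_special_tokens([10, 11], [12])  # [cls, 10, 11, sep, 12, sep]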
+ """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def prepare_for_tokenization(self, + text, + is_split_into_words=False, + **kwargs): + add_prefix_space = kwargs.pop('add_prefix_space', False) + if is_split_into_words or add_prefix_space: + text = ' ' + text + return (text, kwargs) + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + return self._tokenizer.save_pretrained( + save_directory, filename_prefix=filename_prefix) + + +class SPMTokenizer: + r""" + Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. 
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + def __init__(self, + vocab_file, + split_by_punct=False, + sp_model_kwargs: Optional[Dict[str, Any]] = None): + self.split_by_punct = split_by_punct + self.vocab_file = vocab_file + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) + if not os.path.exists(vocab_file): + raise FileNotFoundError(f'{vocab_file} does not exist!') + spm.load(vocab_file) + bpe_vocab_size = spm.GetPieceSize() + # Token map + # 0+1 + # 1+1 + # 2+1 + self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)} + self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)] + # self.vocab['[PAD]'] = 0 + # self.vocab['[CLS]'] = 1 + # self.vocab['[SEP]'] = 2 + # self.vocab['[UNK]'] = 3 + + self.spm = spm + + def __getstate__(self): + state = self.__dict__.copy() + state['spm'] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, 'sp_model_kwargs'): + self.sp_model_kwargs = {} + + self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) + self.spm.Load(self.vocab_file) + + def tokenize(self, text): + return self._encode_as_pieces(text) + + def convert_ids_to_tokens(self, ids): + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def decode(self, tokens, start=-1, end=-1, raw_text=None): + if raw_text is None: + return self.spm.decode_pieces([t for t in tokens]) + else: + words = self.split_to_words(raw_text) + word_tokens = [self.tokenize(w) for w in words] + token2words = [0] * len(tokens) + tid = 0 + for i, w in enumerate(word_tokens): + for k, t in enumerate(w): + token2words[tid] = i + tid += 1 + word_start = token2words[start] + word_end = token2words[end] if end < len(tokens) else len(words) + text = ''.join(words[word_start:word_end]) + return text + + def add_special_token(self, token): + if token not in self.special_tokens: + self.special_tokens.append(token) + if token not in self.vocab: + self.vocab[token] = len(self.vocab) - 1 + self.ids_to_tokens.append(token) + return self.id(token) + + def part_of_whole_word(self, token, is_bos=False): + if is_bos: + return True + if (len(token) == 1 and (_is_whitespace(list(token)[0]))): + return False + if _is_control(list(token)[0]): + return False + if _is_punctuation(list(token)[0]): + return False + if token in self.add_special_token: + return False + + word_start = b'\xe2\x96\x81'.decode('utf-8') + return not token.startswith(word_start) + + def pad(self): + return '[PAD]' + + def bos(self): + return '[CLS]' + + def eos(self): + return '[SEP]' + + def unk(self): + return '[UNK]' + + def mask(self): + return '[MASK]' + + def sym(self, id): + return self.ids_to_tokens[id] + + def id(self, sym): + return self.vocab[sym] if sym in self.vocab else 1 + + def _encode_as_pieces(self, text): + text = convert_to_unicode(text) + if self.split_by_punct: + words = self._run_split_on_punc(text) + pieces = [self.spm.encode(w, out_type=str) for w in words] + return [p for w in pieces for p in w] + else: + return self.spm.encode(text, out_type=str) + + def split_to_words(self, text): + pieces = self._encode_as_pieces(text) + word_start = b'\xe2\x96\x81'.decode('utf-8') + words = [] + 
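+        # Re-assemble surface words from the SentencePiece pieces: a piece starting with
+        # the word marker '▁' (U+2581) opens a new word, while offset/prev_end track the
+        # matching character positions in the original text.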
offset = 0 + prev_end = 0 + for i, p in enumerate(pieces): + if p.startswith(word_start): + if offset > prev_end: + words.append(text[prev_end:offset]) + prev_end = offset + w = p.replace(word_start, '') + else: + w = p + try: + s = text.index(w, offset) + pn = '' + k = i + 1 + while k < len(pieces): + pn = pieces[k].replace(word_start, '') + if len(pn) > 0: + break + k += 1 + + if len(pn) > 0 and pn in text[offset:s]: + offset = offset + 1 + else: + offset = s + len(w) + except Exception: + offset = offset + 1 + + if prev_end < offset: + words.append(text[prev_end:offset]) + + return words + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def save_pretrained(self, path: str, filename_prefix: str = None): + filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]] + if filename_prefix is not None: + filename = filename_prefix + '-' + filename + full_path = os.path.join(path, filename) + with open(full_path, 'wb') as fs: + fs.write(self.spm.serialized_model_proto()) + return (full_path, ) + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == ' ' or char == '\t' or char == '\n' or char == '\r': + return True + cat = unicodedata.category(char) + if cat == 'Zs': + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == '\t' or char == '\n' or char == '\r': + return False + cat = unicodedata.category(char) + if cat.startswith('C'): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
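+    # These four ASCII ranges are '!'..'/', ':'..'@', '['..'`' and '{'..'~'.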
+ if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( + cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith('P'): + return True + return False + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode('utf-8', 'ignore') + else: + raise ValueError(f'Unsupported string type: {type(text)}') diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py new file mode 100644 index 00000000..a1fcecf4 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py @@ -0,0 +1,241 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization class for model DeBERTa.""" + +import os +from shutil import copyfile +from typing import Optional, Tuple + +from transformers.file_utils import is_sentencepiece_available +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + +from modelscope.utils import logger as logging + +if is_sentencepiece_available(): + from .tokenization_deberta_v2 import DebertaV2Tokenizer +else: + DebertaV2Tokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'spm.model', + 'tokenizer_file': 'tokenizer.json' +} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + +PRETRAINED_INIT_CONFIGURATION = {} + + +class DebertaV2TokenizerFast(PreTrainedTokenizerFast): + r""" + Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece) + and [rjieba-py](https://github.com/messense/rjieba-py). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + bos_token (`string`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + eos_token (`string`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. When building a sequence using special tokens, this is not the token that is + used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = DebertaV2Tokenizer + + def __init__(self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=False, + split_by_punct=False, + split_chinese=True, + bos_token='[CLS]', + eos_token='[SEP]', + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + **kwargs) -> None: + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + split_by_punct=split_by_punct, + split_chinese=split_chinese, + **kwargs, + ) + + self.do_lower_case = do_lower_case + self.split_by_punct = split_by_punct + self.split_chinese = split_chinese + self.vocab_file = vocab_file + self.can_save_slow_tokenizer = False if not self.vocab_file else True + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A DeBERTa sequence has the following format: + + - single sequence: [CLS] X [SEP] + - pair of sequences: [CLS] A [SEP] B [SEP] + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
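+
+        Example (illustrative ids, assuming `cls_token_id=1` and `sep_token_id=2`):
+
+            tokenizer.build_inputs_with_special_tokens([5, 6])        # -> [1, 5, 6, 2]
+            tokenizer.build_inputs_with_special_tokens([5, 6], [7])   # -> [1, 5, 6, 2, 7, 2]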
+ """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
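+
+        Example (illustrative): for `token_ids_0=[5, 6]` and `token_ids_1=[7]` the returned mask is
+        `[0, 0, 0, 0, 1, 1]`: zeros cover `[CLS] A [SEP]`, ones cover `B [SEP]`.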
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow ' + 'tokenizer.') + + if not os.path.isdir(save_directory): + logger.error( + f'Vocabulary path ({save_directory}) should be a directory') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file, ) diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index 17324be9..4f466c23 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -6,6 +6,8 @@ from transformers import BertForMaskedLM as BertForMaskedLMTransformer from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.deberta_v2 import \ + DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer from modelscope.models.nlp.structbert import SbertForMaskedLM from modelscope.models.nlp.veco import \ VecoForMaskedLM as VecoForMaskedLMTransformer @@ -125,3 +127,40 @@ class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): VecoForMaskedLM).from_pretrained( pretrained_model_name_or_path=model_dir, model_dir=model_dir) + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2) +class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer): + """Deberta v2 for MLM model. + + Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets. 
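+
+    Registered for `Tasks.fill_mask` under `Models.deberta_v2`, so it is built by
+    `Model.from_pretrained(model_dir)` via the `_instantiate` classmethod below, which loads the
+    checkpoint from the local model directory.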
+ """ + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + DebertaV2ForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = DebertaV2ForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(DebertaV2ForMaskedLMTransformer, + DebertaV2ForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 60a9631b..caba4122 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -13,7 +13,10 @@ from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks __all__ = ['FillMaskPipeline'] -_type_map = {'veco': 'roberta', 'sbert': 'bert'} +_type_map = { + 'veco': 'roberta', + 'sbert': 'bert', +} @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask) @@ -65,7 +68,7 @@ class FillMaskPipeline(Pipeline): self.config = Config.from_file( os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) self.tokenizer = preprocessor.tokenizer - self.mask_id = {'roberta': 250001, 'bert': 103} + self.mask_id = {'roberta': 250001, 'bert': 103, 'deberta_v2': 4} self.rep_map = { 'bert': { @@ -85,7 +88,14 @@ class FillMaskPipeline(Pipeline): '': '', '': '', '': ' ' - } + }, + 'deberta_v2': { + '[PAD]': '', + r' +': ' ', + '[SEP]': '', + '[CLS]': '', + '[UNK]': '' + }, } def forward(self, inputs: Dict[str, Any], diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 4882c477..825611d6 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -170,6 +170,9 @@ class NLPTokenizerPreprocessorBase(Preprocessor): elif model_type == Models.veco: from modelscope.models.nlp.veco import VecoTokenizer return VecoTokenizer.from_pretrained(model_dir) + elif model_type == Models.deberta_v2: + from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer + return DebertaV2Tokenizer.from_pretrained(model_dir) else: return AutoTokenizer.from_pretrained(model_dir, use_fast=False) diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py new file mode 100644 index 00000000..4f3206cd --- /dev/null +++ b/tests/pipelines/test_deberta_tasks.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
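+# Pipeline-level tests for the Chinese DeBERTa-v2 lite fill-mask model: the same masked
+# sentence is run through a directly constructed FillMaskPipeline, a pipeline() built from a
+# downloaded model, and a pipeline() resolved from the bare model id.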
+import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import DebertaV2ForMaskedLM +from modelscope.models.nlp.deberta_v2 import (DebertaV2Tokenizer, + DebertaV2TokenizerFast) +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FillMaskPipeline +from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class DeBERTaV2TaskTest(unittest.TestCase): + model_id_deberta = 'damo/nlp_debertav2_fill-mask_chinese-lite' + + ori_text = '你师父差得动你,你师父可差不动我。' + test_input = '你师父差得动你,你师父可[MASK]不动我。' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + model_dir = snapshot_download(self.model_id_deberta) + preprocessor = FillMaskPreprocessor( + model_dir, first_sequence='sentence', second_sequence=None) + model = DebertaV2ForMaskedLM.from_pretrained(model_dir) + pipeline1 = FillMaskPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + ori_text = self.ori_text + test_input = self.test_input + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' + f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + # sbert + print(self.model_id_deberta) + model = Model.from_pretrained(self.model_id_deberta) + preprocessor = FillMaskPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + task=Tasks.fill_mask, model=model, preprocessor=preprocessor) + print( + f'\nori_text: {self.ori_text}\ninput: {self.test_input}\npipeline: ' + f'{pipeline_ins(self.test_input)}\n') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.fill_mask, model=self.model_id_deberta) + ori_text = self.ori_text + test_input = self.test_input + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + f'{pipeline_ins(test_input)}\n') + + +if __name__ == '__main__': + unittest.main() From 9e14d6727b7583fed29f0684a1171754a505388d Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Fri, 2 Sep 2022 11:02:43 +0800 Subject: [PATCH 05/28] [to #44571845]fix: ci support multiple image Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9974293 --- .dev_scripts/ci_container_test.sh | 3 - .dev_scripts/dockerci.sh | 5 +- requirements/tensorflow1x.txt | 1 + tests/isolated_cases.txt | 6 - tests/run.py | 191 ++++++++++++++++++++---------- tests/run_config.yaml | 31 +++++ 6 files changed, 165 insertions(+), 72 deletions(-) create mode 100644 requirements/tensorflow1x.txt delete mode 100644 tests/isolated_cases.txt create mode 100644 tests/run_config.yaml diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 2f18aff7..a53c08c6 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -4,8 +4,6 @@ pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html pip install -r 
requirements/tests.txt -# install numpy<=1.18 for tensorflow==1.15.x -pip install "numpy<=1.18" git config --global --add safe.directory /Maas-lib @@ -26,4 +24,3 @@ else fi echo "Running case with command: $ci_command" $ci_command -#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index dbb79514..e76f2f14 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -7,7 +7,8 @@ gpus='7 6 5 4 3 2 1 0' cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58' cpu_sets_arr=($cpu_sets) is_get_file_lock=false -CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND} +# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml' +CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND} echo "ci command: $CI_COMMAND" for gpu in $gpus do @@ -16,6 +17,7 @@ do echo "get gpu lock $gpu" CONTAINER_NAME="modelscope-ci-$gpu" let is_get_file_lock=true + # pull image if there are update docker pull ${IMAGE_NAME}:${IMAGE_VERSION} docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ @@ -38,6 +40,7 @@ do --net host \ ${IMAGE_NAME}:${IMAGE_VERSION} \ $CI_COMMAND + if [ $? -ne 0 ]; then echo "Running test case failed, please check the log!" exit -1 diff --git a/requirements/tensorflow1x.txt b/requirements/tensorflow1x.txt new file mode 100644 index 00000000..b139efe1 --- /dev/null +++ b/requirements/tensorflow1x.txt @@ -0,0 +1 @@ +numpy==1.18.5 diff --git a/tests/isolated_cases.txt b/tests/isolated_cases.txt deleted file mode 100644 index be85142a..00000000 --- a/tests/isolated_cases.txt +++ /dev/null @@ -1,6 +0,0 @@ - test_text_to_speech.py - test_multi_modal_embedding.py - test_ofa_tasks.py - test_video_summarization.py - test_dialog_modeling.py - test_csanmt_translation.py diff --git a/tests/run.py b/tests/run.py index 79509745..478cb9d6 100644 --- a/tests/run.py +++ b/tests/run.py @@ -21,6 +21,7 @@ import pandas # if 'import tensorflow' in front of 'import torch'. # Puting a 'import torch' here can bypass this incompatibility. 
import torch +import yaml from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import set_test_level, test_level @@ -61,6 +62,7 @@ def statistics_test_result(df): result, total_cases, success_cases, failures_cases, error_cases, skipped_cases, expected_failure_cases, unexpected_success_cases) + print('Testing result summary.') print(result_msg) if result == 'FAILED': sys.exit(1) @@ -88,6 +90,7 @@ def gather_test_suites_files(test_dir, pattern): for file in filenames: if fnmatch(file, pattern): case_file_list.append(file) + return case_file_list @@ -125,18 +128,6 @@ def collect_test_results(case_results): return result_list -class TestSuiteRunner: - - def run(self, msg_queue, test_dir, test_suite_file): - test_suite = unittest.TestSuite() - test_case = unittest.defaultTestLoader.discover( - start_dir=test_dir, pattern=test_suite_file) - test_suite.addTest(test_case) - runner = TimeCostTextTestRunner() - test_suite_result = runner.run(test_suite) - msg_queue.put(collect_test_results(test_suite_result)) - - def run_command_with_popen(cmd): with subprocess.Popen( cmd, @@ -148,55 +139,126 @@ def run_command_with_popen(cmd): sys.stdout.write(line) +def save_test_result(df, args): + if args.result_dir is not None: + file_name = str(int(datetime.datetime.now().timestamp() * 1000)) + os.umask(0) + Path(args.result_dir).mkdir(mode=0o777, parents=True, exist_ok=True) + Path(os.path.join(args.result_dir, file_name)).touch( + mode=0o666, exist_ok=True) + df.to_pickle(os.path.join(args.result_dir, file_name)) + + +def run_command(cmd): + logger.info('Running command: %s' % ' '.join(cmd)) + response = subprocess.run( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + response.check_returncode() + logger.info(response.stdout.decode('utf8')) + except subprocess.CalledProcessError as error: + logger.error( + 'stdout: %s, stderr: %s' % + (response.stdout.decode('utf8'), error.stderr.decode('utf8'))) + + +def install_packages(pkgs): + cmd = [sys.executable, '-m', 'pip', 'install'] + for pkg in pkgs: + cmd.append(pkg) + + run_command(cmd) + + +def install_requirements(requirements): + for req in requirements: + cmd = [ + sys.executable, '-m', 'pip', 'install', '-r', + 'requirements/%s' % req, '-f', + 'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html' + ] + run_command(cmd) + + +def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases, + result_dir): + # install requirements and deps # run_config['envs'][env] + if 'requirements' in env: + install_requirements(env['requirements']) + if 'dependencies' in env: + install_packages(env['dependencies']) + + for test_suite_file in isolated_cases: # run case in subprocess + if test_suite_file in test_suite_env_map and test_suite_env_map[ + test_suite_file] == env_name: + cmd = [ + 'python', + 'tests/run.py', + '--pattern', + test_suite_file, + '--result_dir', + result_dir, + ] + run_command_with_popen(cmd) + else: + pass # case not in run list. + + # run remain cases in a process. 
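+    # The non-isolated suites mapped to this env are batched into one child process below,
+    # e.g. `python tests/run.py --result_dir <tmp_dir> --suites <suite_1.py> <suite_2.py>`.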
+ remain_suite_files = [] + for k, v in test_suite_env_map.items(): + if k not in isolated_cases and v == env_name: + remain_suite_files.append(k) + if len(remain_suite_files) == 0: + return + cmd = ['python', 'tests/run.py', '--result_dir', result_dir, '--suites'] + for suite in remain_suite_files: + cmd.append(suite) + run_command_with_popen(cmd) + + def run_in_subprocess(args): # only case args.isolated_cases run in subporcess, all other run in a subprocess test_suite_files = gather_test_suites_files( os.path.abspath(args.test_dir), args.pattern) + run_config = None + isolated_cases = [] + test_suite_env_map = {} + # put all the case in default env. + for test_suite_file in test_suite_files: + test_suite_env_map[test_suite_file] = 'default' + + if args.run_config is not None and Path(args.run_config).exists(): + with open(args.run_config) as f: + run_config = yaml.load(f, Loader=yaml.FullLoader) + if 'isolated' in run_config: + isolated_cases = run_config['isolated'] + + if 'envs' in run_config: + for env in run_config['envs']: + if env != 'default': + for test_suite in run_config['envs'][env]['tests']: + if test_suite in test_suite_env_map: + test_suite_env_map[test_suite] = env if args.subprocess: # run all case in subprocess isolated_cases = test_suite_files - else: - isolated_cases = [] - with open(args.isolated_cases, 'r') as f: - for line in f: - if line.strip() in test_suite_files: - isolated_cases.append(line.strip()) - - if not args.list_tests: - with tempfile.TemporaryDirectory() as temp_result_dir: - for test_suite_file in isolated_cases: # run case in subprocess - cmd = [ - 'python', 'tests/run.py', '--pattern', test_suite_file, - '--result_dir', temp_result_dir - ] - run_command_with_popen(cmd) - result_dfs = [] - # run remain cases in a process. - remain_suite_files = [ - item for item in test_suite_files if item not in isolated_cases - ] - test_suite = gather_test_suites_in_files(args.test_dir, - remain_suite_files, - args.list_tests) - if test_suite.countTestCases() > 0: - runner = TimeCostTextTestRunner() - result = runner.run(test_suite) - result = collect_test_results(result) - df = test_cases_result_to_df(result) - result_dfs.append(df) - # collect test results - result_path = Path(temp_result_dir) - for result in result_path.iterdir(): - if Path.is_file(result): - df = pandas.read_pickle(result) - result_dfs.append(df) + with tempfile.TemporaryDirectory() as temp_result_dir: + for env in set(test_suite_env_map.values()): + run_case_in_env(env, run_config['envs'][env], test_suite_env_map, + isolated_cases, temp_result_dir) - result_pd = pandas.concat( - result_dfs) # merge result of every test suite. - print_table_result(result_pd) - print_abnormal_case_info(result_pd) - statistics_test_result(result_pd) + result_dfs = [] + result_path = Path(temp_result_dir) + for result in result_path.iterdir(): + if Path.is_file(result): + df = pandas.read_pickle(result) + result_dfs.append(df) + result_pd = pandas.concat( + result_dfs) # merge result of every test suite. 
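+    # Print the merged per-case table and abnormal-case details, then the overall summary,
+    # which exits non-zero when any case failed.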
+ print_table_result(result_pd) + print_abnormal_case_info(result_pd) + statistics_test_result(result_pd) def get_object_full_name(obj): @@ -293,15 +355,19 @@ def print_table_result(df): def main(args): runner = TimeCostTextTestRunner() - test_suite = gather_test_cases( - os.path.abspath(args.test_dir), args.pattern, args.list_tests) + if args.suites is not None and len(args.suites) > 0: + logger.info('Running: %s' % ' '.join(args.suites)) + test_suite = gather_test_suites_in_files(args.test_dir, args.suites, + args.list_tests) + else: + test_suite = gather_test_cases( + os.path.abspath(args.test_dir), args.pattern, args.list_tests) if not args.list_tests: result = runner.run(test_suite) result = collect_test_results(result) df = test_cases_result_to_df(result) if args.result_dir is not None: - file_name = str(int(datetime.datetime.now().timestamp() * 1000)) - df.to_pickle(os.path.join(args.result_dir, file_name)) + save_test_result(df, args) else: print_table_result(df) print_abnormal_case_info(df) @@ -321,9 +387,9 @@ if __name__ == '__main__': parser.add_argument( '--disable_profile', action='store_true', help='disable profiling') parser.add_argument( - '--isolated_cases', + '--run_config', default=None, - help='specified isolated cases config file') + help='specified case run config file(yaml file)') parser.add_argument( '--subprocess', action='store_true', @@ -332,6 +398,10 @@ if __name__ == '__main__': '--result_dir', default=None, help='Save result to directory, internal use only') + parser.add_argument( + '--suites', + nargs='*', + help='Run specified test suites(test suite file list)') args = parser.parse_args() set_test_level(args.level) os.environ['REGRESSION_BASELINE'] = '1' @@ -340,10 +410,7 @@ if __name__ == '__main__': from utils import profiler logger.info('enable profile ...') profiler.enable() - if args.isolated_cases is not None or args.subprocess: + if args.run_config is not None or args.subprocess: run_in_subprocess(args) - elif args.isolated_cases is not None and args.subprocess: - print('isolated_cases and subporcess conflict') - sys.exit(1) else: main(args) diff --git a/tests/run_config.yaml b/tests/run_config.yaml new file mode 100644 index 00000000..591dcd66 --- /dev/null +++ b/tests/run_config.yaml @@ -0,0 +1,31 @@ +# envs option allows fine-grained control for test executoin, for example, +# python tests/run.py --env pytorch +# would only trigger exeutions of all pytorch cases. +# envs option defaults to None for backward compatbility +isolated: # test cases that may require excessive anmount of GPU memory, which will be executed in dedicagted process. + - test_text_to_speech.py + - test_multi_modal_embedding.py + - test_ofa_tasks.py + - test_video_summarization.py + - test_dialog_modeling.py + - test_csanmt_translation.py + +envs: + default: # default env, case not in other env will in default, pytorch. + dependencies: # requirement packages,pip install before test case run. + - numpy>=1.20 + tensorflow1x: # cases excuted tensorflow1.x framework. + requirements: # requirements files run before test case run. + - tensorflow1x.txt + dependencies: # requirement packages,pip install before test case run. 
+ - numpy==1.18.5 + tests: + - test_text_to_speech.py + - test_csanmt_translation.py + - test_translation_trainer.py + - test_ocr_detection.py + - test_automatic_speech_recognition.py + - test_image_matting.py + - test_person_image_cartoon.py + - test_skin_retouching.py + - test_image_style_transfer.py From 1bac4f3349cbd1c343f4fbe1d9ec80198afd1a32 Mon Sep 17 00:00:00 2001 From: "xianzhe.xxz" Date: Fri, 2 Sep 2022 13:10:31 +0800 Subject: [PATCH 06/28] [to #42322933]add tinynas-detection pipeline and models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 接入tinynas-detection,新增tinynas object detection pipeline以及tinynas models。 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9938220 --- modelscope/metainfo.py | 3 + .../models/cv/tinynas_detection/__init__.py | 24 + .../cv/tinynas_detection/backbone/__init__.py | 16 + .../cv/tinynas_detection/backbone/darknet.py | 126 ++++ .../cv/tinynas_detection/backbone/tinynas.py | 347 +++++++++ .../cv/tinynas_detection/core/__init__.py | 2 + .../cv/tinynas_detection/core/base_ops.py | 474 +++++++++++++ .../cv/tinynas_detection/core/neck_ops.py | 324 +++++++++ .../cv/tinynas_detection/core/repvgg_block.py | 205 ++++++ .../models/cv/tinynas_detection/core/utils.py | 196 ++++++ .../models/cv/tinynas_detection/detector.py | 181 +++++ .../cv/tinynas_detection/head/__init__.py | 16 + .../tinynas_detection/head/gfocal_v2_tiny.py | 361 ++++++++++ .../cv/tinynas_detection/neck/__init__.py | 16 + .../tinynas_detection/neck/giraffe_config.py | 235 +++++++ .../cv/tinynas_detection/neck/giraffe_fpn.py | 661 ++++++++++++++++++ .../tinynas_detection/neck/giraffe_fpn_v2.py | 203 ++++++ .../cv/tinynas_detection/tinynas_detector.py | 16 + .../models/cv/tinynas_detection/utils.py | 30 + .../cv/tinynas_detection_pipeline.py | 61 ++ tests/pipelines/test_tinynas_detection.py | 20 + 21 files changed, 3517 insertions(+) create mode 100644 modelscope/models/cv/tinynas_detection/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/backbone/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/backbone/darknet.py create mode 100755 modelscope/models/cv/tinynas_detection/backbone/tinynas.py create mode 100644 modelscope/models/cv/tinynas_detection/core/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/core/base_ops.py create mode 100644 modelscope/models/cv/tinynas_detection/core/neck_ops.py create mode 100644 modelscope/models/cv/tinynas_detection/core/repvgg_block.py create mode 100644 modelscope/models/cv/tinynas_detection/core/utils.py create mode 100644 modelscope/models/cv/tinynas_detection/detector.py create mode 100644 modelscope/models/cv/tinynas_detection/head/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py create mode 100644 modelscope/models/cv/tinynas_detection/neck/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_config.py create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py create mode 100644 modelscope/models/cv/tinynas_detection/tinynas_detector.py create mode 100644 modelscope/models/cv/tinynas_detection/utils.py create mode 100644 modelscope/pipelines/cv/tinynas_detection_pipeline.py create mode 100644 tests/pipelines/test_tinynas_detection.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 971dd3f1..fd653bac 100644 --- a/modelscope/metainfo.py +++ 
b/modelscope/metainfo.py @@ -9,6 +9,8 @@ class Models(object): Model name should only contain model info but not task info. """ + tinynas_detection = 'tinynas-detection' + # vision models detection = 'detection' realtime_object_detection = 'realtime-object-detection' @@ -133,6 +135,7 @@ class Pipelines(object): image_to_image_generation = 'image-to-image-generation' skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' + tinynas_detection = 'tinynas-detection' crowd_counting = 'hrnet-crowd-counting' action_detection = 'ResNetC3D-action-detection' video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' diff --git a/modelscope/models/cv/tinynas_detection/__init__.py b/modelscope/models/cv/tinynas_detection/__init__.py new file mode 100644 index 00000000..13532d10 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .tinynas_detector import Tinynas_detector + +else: + _import_structure = { + 'tinynas_detector': ['TinynasDetector'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/tinynas_detection/backbone/__init__.py b/modelscope/models/cv/tinynas_detection/backbone/__init__.py new file mode 100644 index 00000000..186d06a3 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/backbone/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import copy + +from .darknet import CSPDarknet +from .tinynas import load_tinynas_net + + +def build_backbone(cfg): + backbone_cfg = copy.deepcopy(cfg) + name = backbone_cfg.pop('name') + if name == 'CSPDarknet': + return CSPDarknet(**backbone_cfg) + elif name == 'TinyNAS': + return load_tinynas_net(backbone_cfg) diff --git a/modelscope/models/cv/tinynas_detection/backbone/darknet.py b/modelscope/models/cv/tinynas_detection/backbone/darknet.py new file mode 100644 index 00000000..d3294f0d --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/backbone/darknet.py @@ -0,0 +1,126 @@ +# Copyright (c) Megvii Inc. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
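+#
+# CSPDarknet backbone (YOLOX-style): a Focus stem followed by the dark2-dark5 stages built
+# from CSPLayer blocks, with an SPPBottleneck inside dark5. forward() returns the stem and
+# the four stage outputs as a multi-scale feature list.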
+ +import torch +from torch import nn + +from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer, + SPPBottleneck) + + +class CSPDarknet(nn.Module): + + def __init__( + self, + dep_mul, + wid_mul, + out_features=('dark3', 'dark4', 'dark5'), + depthwise=False, + act='silu', + reparam=False, + ): + super(CSPDarknet, self).__init__() + assert out_features, 'please provide output features of Darknet' + self.out_features = out_features + Conv = DWConv if depthwise else BaseConv + + base_channels = int(wid_mul * 64) # 64 + base_depth = max(round(dep_mul * 3), 1) # 3 + + # stem + # self.stem = Focus(3, base_channels, ksize=3, act=act) + self.stem = Focus(3, base_channels, 3, act=act) + + # dark2 + self.dark2 = nn.Sequential( + Conv(base_channels, base_channels * 2, 3, 2, act=act), + CSPLayer( + base_channels * 2, + base_channels * 2, + n=base_depth, + depthwise=depthwise, + act=act, + reparam=reparam, + ), + ) + + # dark3 + self.dark3 = nn.Sequential( + Conv(base_channels * 2, base_channels * 4, 3, 2, act=act), + CSPLayer( + base_channels * 4, + base_channels * 4, + n=base_depth * 3, + depthwise=depthwise, + act=act, + reparam=reparam, + ), + ) + + # dark4 + self.dark4 = nn.Sequential( + Conv(base_channels * 4, base_channels * 8, 3, 2, act=act), + CSPLayer( + base_channels * 8, + base_channels * 8, + n=base_depth * 3, + depthwise=depthwise, + act=act, + reparam=reparam, + ), + ) + + # dark5 + self.dark5 = nn.Sequential( + Conv(base_channels * 8, base_channels * 16, 3, 2, act=act), + SPPBottleneck( + base_channels * 16, base_channels * 16, activation=act), + CSPLayer( + base_channels * 16, + base_channels * 16, + n=base_depth, + shortcut=False, + depthwise=depthwise, + act=act, + reparam=reparam, + ), + ) + + def init_weights(self, pretrain=None): + + if pretrain is None: + return + else: + pretrained_dict = torch.load( + pretrain, map_location='cpu')['state_dict'] + new_params = self.state_dict().copy() + for k, v in pretrained_dict.items(): + ks = k.split('.') + if ks[0] == 'fc' or ks[-1] == 'total_ops' or ks[ + -1] == 'total_params': + continue + else: + new_params[k] = v + + self.load_state_dict(new_params) + print(f' load pretrain backbone from {pretrain}') + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs['stem'] = x + x = self.dark2(x) + outputs['dark2'] = x + x = self.dark3(x) + outputs['dark3'] = x + x = self.dark4(x) + outputs['dark4'] = x + x = self.dark5(x) + outputs['dark5'] = x + features_out = [ + outputs['stem'], outputs['dark2'], outputs['dark3'], + outputs['dark4'], outputs['dark5'] + ] + + return features_out diff --git a/modelscope/models/cv/tinynas_detection/backbone/tinynas.py b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py new file mode 100755 index 00000000..814ee550 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py @@ -0,0 +1,347 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
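+#
+# TinyNAS backbone: load_tinynas_net() parses a searched structure string (a list of block
+# dicts with 'class', 'in', 'out', 'btn', 'k', 's' and 'L' fields) via ast.literal_eval and
+# assembles ConvKXBNRELU / SuperResConvK1KX / SuperResConvKXKX stages built on RepVgg blocks;
+# forward() returns the features selected by out_indices, optionally projected by 1x1 convs.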
+ +import torch +import torch.nn as nn + +from ..core.base_ops import Focus, SPPBottleneck, get_activation +from ..core.repvgg_block import RepVggBlock + + +class ConvKXBN(nn.Module): + + def __init__(self, in_c, out_c, kernel_size, stride): + super(ConvKXBN, self).__init__() + self.conv1 = nn.Conv2d( + in_c, + out_c, + kernel_size, + stride, (kernel_size - 1) // 2, + groups=1, + bias=False) + self.bn1 = nn.BatchNorm2d(out_c) + + def forward(self, x): + return self.bn1(self.conv1(x)) + + +class ConvKXBNRELU(nn.Module): + + def __init__(self, in_c, out_c, kernel_size, stride, act='silu'): + super(ConvKXBNRELU, self).__init__() + self.conv = ConvKXBN(in_c, out_c, kernel_size, stride) + if act is None: + self.activation_function = torch.relu + else: + self.activation_function = get_activation(act) + + def forward(self, x): + output = self.conv(x) + return self.activation_function(output) + + +class ResConvK1KX(nn.Module): + + def __init__(self, + in_c, + out_c, + btn_c, + kernel_size, + stride, + force_resproj=False, + act='silu'): + super(ResConvK1KX, self).__init__() + self.stride = stride + self.conv1 = ConvKXBN(in_c, btn_c, 1, 1) + self.conv2 = RepVggBlock( + btn_c, out_c, kernel_size, stride, act='identity') + + if act is None: + self.activation_function = torch.relu + else: + self.activation_function = get_activation(act) + + if stride == 2: + self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2) + else: + self.residual_downsample = nn.Identity() + + if in_c != out_c or force_resproj: + self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) + else: + self.residual_proj = nn.Identity() + + def forward(self, x): + if self.stride != 2: + reslink = self.residual_downsample(x) + reslink = self.residual_proj(reslink) + + output = x + output = self.conv1(output) + output = self.activation_function(output) + output = self.conv2(output) + if self.stride != 2: + output = output + reslink + output = self.activation_function(output) + + return output + + +class SuperResConvK1KX(nn.Module): + + def __init__(self, + in_c, + out_c, + btn_c, + kernel_size, + stride, + num_blocks, + with_spp=False, + act='silu'): + super(SuperResConvK1KX, self).__init__() + if act is None: + self.act = torch.relu + else: + self.act = get_activation(act) + self.block_list = nn.ModuleList() + for block_id in range(num_blocks): + if block_id == 0: + in_channels = in_c + out_channels = out_c + this_stride = stride + force_resproj = False # as a part of CSPLayer, DO NOT need this flag + this_kernel_size = kernel_size + else: + in_channels = out_c + out_channels = out_c + this_stride = 1 + force_resproj = False + this_kernel_size = kernel_size + the_block = ResConvK1KX( + in_channels, + out_channels, + btn_c, + this_kernel_size, + this_stride, + force_resproj, + act=act) + self.block_list.append(the_block) + if block_id == 0 and with_spp: + self.block_list.append( + SPPBottleneck(out_channels, out_channels)) + + def forward(self, x): + output = x + for block in self.block_list: + output = block(output) + return output + + +class ResConvKXKX(nn.Module): + + def __init__(self, + in_c, + out_c, + btn_c, + kernel_size, + stride, + force_resproj=False, + act='silu'): + super(ResConvKXKX, self).__init__() + self.stride = stride + if self.stride == 2: + self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act) + else: + self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1) + self.conv2 = RepVggBlock( + btn_c, out_c, kernel_size, stride, act='identity') + + if act is None: + self.activation_function = torch.relu + else: + 
self.activation_function = get_activation(act) + + if stride == 2: + self.residual_downsample = nn.AvgPool2d( + kernel_size=2, stride=2) + else: + self.residual_downsample = nn.Identity() + + if in_c != out_c or force_resproj: + self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) + else: + self.residual_proj = nn.Identity() + + def forward(self, x): + if self.stride == 2: + return self.downsampler(x) + reslink = self.residual_downsample(x) + reslink = self.residual_proj(reslink) + + output = x + output = self.conv1(output) + output = self.activation_function(output) + output = self.conv2(output) + + output = output + reslink + output = self.activation_function(output) + + return output + + +class SuperResConvKXKX(nn.Module): + + def __init__(self, + in_c, + out_c, + btn_c, + kernel_size, + stride, + num_blocks, + with_spp=False, + act='silu'): + super(SuperResConvKXKX, self).__init__() + if act is None: + self.act = torch.relu + else: + self.act = get_activation(act) + self.block_list = nn.ModuleList() + for block_id in range(num_blocks): + if block_id == 0: + in_channels = in_c + out_channels = out_c + this_stride = stride + force_resproj = False # as a part of CSPLayer, DO NOT need this flag + this_kernel_size = kernel_size + else: + in_channels = out_c + out_channels = out_c + this_stride = 1 + force_resproj = False + this_kernel_size = kernel_size + the_block = ResConvKXKX( + in_channels, + out_channels, + btn_c, + this_kernel_size, + this_stride, + force_resproj, + act=act) + self.block_list.append(the_block) + if block_id == 0 and with_spp: + self.block_list.append( + SPPBottleneck(out_channels, out_channels)) + + def forward(self, x): + output = x + for block in self.block_list: + output = block(output) + return output + + +class TinyNAS(nn.Module): + + def __init__(self, + structure_info=None, + out_indices=[0, 1, 2, 4, 5], + out_channels=[None, None, 128, 256, 512], + with_spp=False, + use_focus=False, + need_conv1=True, + act='silu'): + super(TinyNAS, self).__init__() + assert len(out_indices) == len(out_channels) + self.out_indices = out_indices + self.need_conv1 = need_conv1 + + self.block_list = nn.ModuleList() + if need_conv1: + self.conv1_list = nn.ModuleList() + for idx, block_info in enumerate(structure_info): + the_block_class = block_info['class'] + if the_block_class == 'ConvKXBNRELU': + if use_focus: + the_block = Focus(block_info['in'], block_info['out'], + block_info['k']) + else: + the_block = ConvKXBNRELU( + block_info['in'], + block_info['out'], + block_info['k'], + block_info['s'], + act=act) + self.block_list.append(the_block) + elif the_block_class == 'SuperResConvK1KX': + spp = with_spp if idx == len(structure_info) - 1 else False + the_block = SuperResConvK1KX( + block_info['in'], + block_info['out'], + block_info['btn'], + block_info['k'], + block_info['s'], + block_info['L'], + spp, + act=act) + self.block_list.append(the_block) + elif the_block_class == 'SuperResConvKXKX': + spp = with_spp if idx == len(structure_info) - 1 else False + the_block = SuperResConvKXKX( + block_info['in'], + block_info['out'], + block_info['btn'], + block_info['k'], + block_info['s'], + block_info['L'], + spp, + act=act) + self.block_list.append(the_block) + if need_conv1: + if idx in self.out_indices and out_channels[ + self.out_indices.index(idx)] is not None: + self.conv1_list.append( + nn.Conv2d(block_info['out'], + out_channels[self.out_indices.index(idx)], + 1)) + else: + self.conv1_list.append(None) + + def init_weights(self, pretrain=None): + pass + + def forward(self, x): 
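+        """Run the stages sequentially and collect the outputs whose index is listed in
+        `out_indices`, applying the optional 1x1 projection convs when they are configured."""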
+ output = x + stage_feature_list = [] + for idx, block in enumerate(self.block_list): + output = block(output) + if idx in self.out_indices: + if self.need_conv1 and self.conv1_list[idx] is not None: + true_out = self.conv1_list[idx](output) + stage_feature_list.append(true_out) + else: + stage_feature_list.append(output) + return stage_feature_list + + +def load_tinynas_net(backbone_cfg): + # load masternet model to path + import ast + + struct_str = ''.join([x.strip() for x in backbone_cfg.net_structure_str]) + struct_info = ast.literal_eval(struct_str) + for layer in struct_info: + if 'nbitsA' in layer: + del layer['nbitsA'] + if 'nbitsW' in layer: + del layer['nbitsW'] + + model = TinyNAS( + structure_info=struct_info, + out_indices=backbone_cfg.out_indices, + out_channels=backbone_cfg.out_channels, + with_spp=backbone_cfg.with_spp, + use_focus=backbone_cfg.use_focus, + act=backbone_cfg.act, + need_conv1=backbone_cfg.need_conv1, + ) + + return model diff --git a/modelscope/models/cv/tinynas_detection/core/__init__.py b/modelscope/models/cv/tinynas_detection/core/__init__.py new file mode 100644 index 00000000..3dad5e72 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. diff --git a/modelscope/models/cv/tinynas_detection/core/base_ops.py b/modelscope/models/cv/tinynas_detection/core/base_ops.py new file mode 100644 index 00000000..62729ca2 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/base_ops.py @@ -0,0 +1,474 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
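+#
+# Shared convolution blocks used by the detection backbones: BaseConv (conv + BN + activation),
+# DepthWiseConv / DWConv, Bottleneck, CSPLayer (the YOLOv5 C3 block), Focus, SPPBottleneck and
+# ShuffleNet-style blocks (ShuffleBlock, ShuffleCSPLayer, channel_shuffle).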
+import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .repvgg_block import RepVggBlock + + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +def get_activation(name='silu', inplace=True): + if name == 'silu': + module = nn.SiLU(inplace=inplace) + elif name == 'relu': + module = nn.ReLU(inplace=inplace) + elif name == 'lrelu': + module = nn.LeakyReLU(0.1, inplace=inplace) + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + return module + + +def get_norm(name, out_channels, inplace=True): + if name == 'bn': + module = nn.BatchNorm2d(out_channels) + elif name == 'gn': + module = nn.GroupNorm(num_channels=out_channels, num_groups=32) + return module + + +class BaseConv(nn.Module): + """A Conv2d -> Batchnorm -> silu/leaky relu block""" + + def __init__(self, + in_channels, + out_channels, + ksize, + stride=1, + groups=1, + bias=False, + act='silu', + norm='bn'): + super().__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + if norm is not None: + self.bn = get_norm(norm, out_channels, inplace=True) + if act is not None: + self.act = get_activation(act, inplace=True) + self.with_norm = norm is not None + self.with_act = act is not None + + def forward(self, x): + x = self.conv(x) + if self.with_norm: + # x = self.norm(x) + x = self.bn(x) + if self.with_act: + x = self.act(x) + return x + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DepthWiseConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + ksize, + stride=1, + groups=None, + bias=False, + act='silu', + norm='bn'): + super().__init__() + padding = (ksize - 1) // 2 + self.depthwise = nn.Conv2d( + in_channels, + in_channels, + kernel_size=ksize, + stride=stride, + padding=padding, + groups=in_channels, + bias=bias, + ) + + self.pointwise = nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias) + if norm is not None: + self.dwnorm = get_norm(norm, in_channels, inplace=True) + self.pwnorm = get_norm(norm, out_channels, inplace=True) + if act is not None: + self.act = get_activation(act, inplace=True) + + self.with_norm = norm is not None + self.with_act = act is not None + self.order = ['depthwise', 'dwnorm', 'pointwise', 'act'] + + def forward(self, x): + + for layer_name in self.order: + layer = self.__getattr__(layer_name) + if layer is not None: + x = layer(x) + return x + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + + def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'): + super().__init__() + self.dconv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + act=act, + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, act=act) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act='silu', + reparam=False, + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + k_conv1 = 3 if reparam else 1 + self.conv1 = BaseConv( + in_channels, hidden_channels, k_conv1, stride=1, act=act) + if reparam: + self.conv2 = 
RepVggBlock( + hidden_channels, out_channels, 3, stride=1, act=act) + else: + self.conv2 = Conv( + hidden_channels, out_channels, 3, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class ResLayer(nn.Module): + 'Residual layer with `in_channels` inputs.' + + def __init__(self, in_channels: int): + super().__init__() + mid_channels = in_channels // 2 + self.layer1 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, act='lrelu') + self.layer2 = BaseConv( + mid_channels, in_channels, ksize=3, stride=1, act='lrelu') + + def forward(self, x): + out = self.layer2(self.layer1(x)) + return x + out + + +class SPPBottleneck(nn.Module): + """Spatial pyramid pooling layer used in YOLOv3-SPP""" + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + activation='silu'): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv( + conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act='silu', + reparam=False, + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. 
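+            shortcut (bool): add the residual connection inside each Bottleneck. Default value: True.
+            expansion (float): ratio of hidden channels to output channels. Default value: 0.5.
+            depthwise (bool): use depthwise separable convs inside the Bottlenecks. Default value: False.
+            act (str): activation name used by the conv blocks. Default value: 'silu'.
+            reparam (bool): build the Bottlenecks with re-parameterizable RepVgg 3x3 blocks. Default value: False.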
+ """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv3 = BaseConv( + 2 * hidden_channels, out_channels, 1, stride=1, act=act) + module_list = [ + Bottleneck( + hidden_channels, + hidden_channels, + shortcut, + 1.0, + depthwise, + act=act, + reparam=reparam) for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + return self.conv3(x) + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + + def __init__(self, + in_channels, + out_channels, + ksize=1, + stride=1, + act='silu'): + super().__init__() + self.conv = BaseConv( + in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + +class fast_Focus(nn.Module): + + def __init__(self, + in_channels, + out_channels, + ksize=1, + stride=1, + act='silu'): + super(Focus, self).__init__() + self.conv1 = self.focus_conv(w1=1.0) + self.conv2 = self.focus_conv(w3=1.0) + self.conv3 = self.focus_conv(w2=1.0) + self.conv4 = self.focus_conv(w4=1.0) + + self.conv = BaseConv( + in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + return self.conv( + torch.cat( + [self.conv1(x), + self.conv2(x), + self.conv3(x), + self.conv4(x)], 1)) + + def focus_conv(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0): + conv = nn.Conv2d(3, 3, 2, 2, groups=3, bias=False) + conv.weight = self.init_weights_constant(w1, w2, w3, w4) + conv.weight.requires_grad = False + return conv + + def init_weights_constant(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0): + return nn.Parameter( + torch.tensor([[[[w1, w2], [w3, w4]]], [[[w1, w2], [w3, w4]]], + [[[w1, w2], [w3, w4]]]])) + + +# shufflenet block +def channel_shuffle(x, groups=2): + bat_size, channels, w, h = x.shape + group_c = channels // groups + x = x.view(bat_size, groups, group_c, w, h) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(bat_size, -1, w, h) + return x + + +def conv_1x1_bn(in_c, out_c, stride=1): + return nn.Sequential( + nn.Conv2d(in_c, out_c, 1, stride, 0, bias=False), + nn.BatchNorm2d(out_c), nn.ReLU(True)) + + +def conv_bn(in_c, out_c, stride=2): + return nn.Sequential( + nn.Conv2d(in_c, out_c, 3, stride, 1, bias=False), + nn.BatchNorm2d(out_c), nn.ReLU(True)) + + +class ShuffleBlock(nn.Module): + + def __init__(self, in_c, out_c, downsample=False): + super(ShuffleBlock, self).__init__() + self.downsample = downsample + half_c = out_c // 2 + if downsample: + self.branch1 = nn.Sequential( + # 3*3 dw conv, stride = 2 + # nn.Conv2d(in_c, in_c, 3, 2, 1, groups=in_c, bias=False), + nn.Conv2d(in_c, in_c, 3, 1, 1, groups=in_c, bias=False), + nn.BatchNorm2d(in_c), + # 1*1 pw conv + nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True)) + + self.branch2 = nn.Sequential( + # 1*1 pw conv + nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True), + # 3*3 dw 
conv, stride = 2 + # nn.Conv2d(half_c, half_c, 3, 2, 1, groups=half_c, bias=False), + nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False), + nn.BatchNorm2d(half_c), + # 1*1 pw conv + nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True)) + else: + # in_c = out_c + assert in_c == out_c + + self.branch2 = nn.Sequential( + # 1*1 pw conv + nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True), + # 3*3 dw conv, stride = 1 + nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False), + nn.BatchNorm2d(half_c), + # 1*1 pw conv + nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True)) + + def forward(self, x): + out = None + if self.downsample: + # if it is downsampling, we don't need to do channel split + out = torch.cat((self.branch1(x), self.branch2(x)), 1) + else: + # channel split + channels = x.shape[1] + c = channels // 2 + x1 = x[:, :c, :, :] + x2 = x[:, c:, :, :] + out = torch.cat((x1, self.branch2(x2)), 1) + return channel_shuffle(out, 2) + + +class ShuffleCSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act='silu', + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. + """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + module_list = [ + Bottleneck( + hidden_channels, + hidden_channels, + shortcut, + 1.0, + depthwise, + act=act) for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + # add channel shuffle + return channel_shuffle(x, 2) diff --git a/modelscope/models/cv/tinynas_detection/core/neck_ops.py b/modelscope/models/cv/tinynas_detection/core/neck_ops.py new file mode 100644 index 00000000..7f481665 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/neck_ops.py @@ -0,0 +1,324 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
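For readers skimming the diff, here is a minimal standalone sketch of the 2-group channel shuffle that ShuffleBlock and ShuffleCSPLayer above rely on. It mirrors the logic of channel_shuffle in base_ops.py; the script below is illustrative only and is not part of the patch.

import torch


def channel_shuffle(x: torch.Tensor, groups: int = 2) -> torch.Tensor:
    b, c, h, w = x.shape
    x = x.view(b, groups, c // groups, h, w)   # split channels into groups
    x = x.transpose(1, 2).contiguous()         # interleave the groups
    return x.view(b, -1, h, w)


if __name__ == '__main__':
    x = torch.arange(8, dtype=torch.float32).view(1, 8, 1, 1)
    y = channel_shuffle(x, groups=2)
    # channels 0..3 and 4..7 end up interleaved: 0, 4, 1, 5, 2, 6, 3, 7
    print(y.flatten().tolist())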
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Swish(nn.Module): + + def __init__(self, inplace=True): + super(Swish, self).__init__() + self.inplace = inplace + + def forward(self, x): + if self.inplace: + x.mul_(F.sigmoid(x)) + return x + else: + return x * F.sigmoid(x) + + +def get_activation(name='silu', inplace=True): + if name is None: + return nn.Identity() + + if isinstance(name, str): + if name == 'silu': + module = nn.SiLU(inplace=inplace) + elif name == 'relu': + module = nn.ReLU(inplace=inplace) + elif name == 'lrelu': + module = nn.LeakyReLU(0.1, inplace=inplace) + elif name == 'swish': + module = Swish(inplace=inplace) + elif name == 'hardsigmoid': + module = nn.Hardsigmoid(inplace=inplace) + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + return module + elif isinstance(name, nn.Module): + return name + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + + +class ConvBNLayer(nn.Module): + + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + act=None): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + self.bn = nn.BatchNorm2d(ch_out, ) + self.act = get_activation(act, inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + return x + + +class RepVGGBlock(nn.Module): + + def __init__(self, ch_in, ch_out, act='relu', deploy=False): + super(RepVGGBlock, self).__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.deploy = deploy + self.in_channels = ch_in + self.groups = 1 + if self.deploy is False: + self.rbr_dense = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=None) + self.rbr_1x1 = ConvBNLayer( + ch_in, ch_out, 1, stride=1, padding=0, act=None) + # self.rbr_identity = nn.BatchNorm2d(num_features=ch_in) if ch_out == ch_in else None + self.rbr_identity = None + else: + self.rbr_reparam = nn.Conv2d( + in_channels=self.ch_in, + out_channels=self.ch_out, + kernel_size=3, + stride=1, + padding=1, + groups=1) + self.act = get_activation(act) if act is None or isinstance( + act, (str, dict)) else act + + def forward(self, x): + if self.deploy: + print('----------deploy----------') + y = self.rbr_reparam(x) + else: + if self.rbr_identity is None: + y = self.rbr_dense(x) + self.rbr_1x1(x) + else: + y = self.rbr_dense(x) + self.rbr_1x1(x) + self.rbr_identity(x) + + y = self.act(y) + return y + + def switch_to_deploy(self): + print('switch') + if not hasattr(self, 'rbr_reparam'): + # return + self.rbr_reparam = nn.Conv2d( + in_channels=self.ch_in, + out_channels=self.ch_out, + kernel_size=3, + stride=1, + padding=1, + groups=1) + print('switch') + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + # self.__delattr__(self.rbr_dense) + # self.__delattr__(self.rbr_1x1) + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return 
kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + # if isinstance(branch, nn.Sequential): + if isinstance(branch, ConvBNLayer): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), + dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to( + branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + +class BasicBlock(nn.Module): + + def __init__(self, ch_in, ch_out, act='relu', shortcut=True): + super(BasicBlock, self).__init__() + assert ch_in == ch_out + # self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) + # self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act) + self.conv2 = RepVGGBlock(ch_in, ch_out, act=act) + self.shortcut = shortcut + + def forward(self, x): + # y = self.conv1(x) + y = self.conv2(x) + if self.shortcut: + return x + y + else: + return y + + +class BasicBlock_3x3(nn.Module): + + def __init__(self, ch_in, ch_out, act='relu', shortcut=True): + super(BasicBlock_3x3, self).__init__() + assert ch_in == ch_out + self.conv1 = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=act) + # self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act) + self.conv2 = RepVGGBlock(ch_in, ch_out, act=act) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return x + y + else: + return y + + +class BasicBlock_3x3_Reverse(nn.Module): + + def __init__(self, ch_in, ch_out, act='relu', shortcut=True): + super(BasicBlock_3x3_Reverse, self).__init__() + assert ch_in == ch_out + self.conv1 = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=act) + # self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act) + self.conv2 = RepVGGBlock(ch_in, ch_out, act=act) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv2(x) + y = self.conv1(y) + if self.shortcut: + return x + y + else: + return y + + +class SPP(nn.Module): + + def __init__( + self, + ch_in, + ch_out, + k, + pool_size, + act='swish', + ): + super(SPP, self).__init__() + self.pool = [] + for i, size in enumerate(pool_size): + pool = nn.MaxPool2d( + kernel_size=size, stride=1, padding=size // 2, ceil_mode=False) + self.add_module('pool{}'.format(i), pool) + self.pool.append(pool) + self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act) + + def forward(self, x): + outs = [x] + + for pool in self.pool: + outs.append(pool(x)) + y = torch.cat(outs, axis=1) + + y = self.conv(y) + return y + + +class CSPStage(nn.Module): + + def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False): + super(CSPStage, self).__init__() + + ch_mid = 
int(ch_out // 2) + self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act) + self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act) + # self.conv2 = ConvBNLayer(ch_in, ch_mid, 3, stride=1, padding=1, act=act) + self.convs = nn.Sequential() + + next_ch_in = ch_mid + for i in range(n): + if block_fn == 'BasicBlock': + self.convs.add_module( + str(i), + BasicBlock(next_ch_in, ch_mid, act=act, shortcut=False)) + elif block_fn == 'BasicBlock_3x3': + self.convs.add_module( + str(i), + BasicBlock_3x3(next_ch_in, ch_mid, act=act, shortcut=True)) + elif block_fn == 'BasicBlock_3x3_Reverse': + self.convs.add_module( + str(i), + BasicBlock_3x3_Reverse( + next_ch_in, ch_mid, act=act, shortcut=True)) + else: + raise NotImplementedError + if i == (n - 1) // 2 and spp: + self.convs.add_module( + 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) + next_ch_in = ch_mid + # self.convs = nn.Sequential(*convs) + self.conv3 = ConvBNLayer(ch_mid * (n + 1), ch_out, 1, act=act) + + def forward(self, x): + y1 = self.conv1(x) + y2 = self.conv2(x) + + mid_out = [y1] + for conv in self.convs: + y2 = conv(y2) + mid_out.append(y2) + y = torch.cat(mid_out, axis=1) + y = self.conv3(y) + return y diff --git a/modelscope/models/cv/tinynas_detection/core/repvgg_block.py b/modelscope/models/cv/tinynas_detection/core/repvgg_block.py new file mode 100644 index 00000000..06966a4e --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/repvgg_block.py @@ -0,0 +1,205 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from torch.nn.parameter import Parameter + + +def get_activation(name='silu', inplace=True): + if name == 'silu': + module = nn.SiLU(inplace=inplace) + elif name == 'relu': + module = nn.ReLU(inplace=inplace) + elif name == 'lrelu': + module = nn.LeakyReLU(0.1, inplace=inplace) + elif name == 'identity': + module = nn.Identity() + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + return module + + +def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): + '''Basic cell for rep-style block, including conv and bn''' + result = nn.Sequential() + result.add_module( + 'conv', + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False)) + result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) + return result + + +class RepVggBlock(nn.Module): + '''RepVggBlock is a basic rep-style block, including training and deploy status + This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py + ''' + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + padding_mode='zeros', + deploy=False, + use_se=False, + act='relu', + norm=None): + super(RepVggBlock, self).__init__() + """ Initialization of the class. + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 1 + dilation (int or tuple, optional): Spacing between kernel elements. 
Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + padding_mode (string, optional): Default: 'zeros' + deploy: Whether to be deploy status or training status. Default: False + use_se: Whether to use se. Default: False + """ + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + self.out_channels = out_channels + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + if isinstance(act, str): + self.nonlinearity = get_activation(act) + else: + self.nonlinearity = act + + if use_se: + raise NotImplementedError('se block not supported yet') + else: + self.se = nn.Identity() + + if deploy: + self.rbr_reparam = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + padding_mode=padding_mode) + + else: + self.rbr_identity = None + self.rbr_dense = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups) + self.rbr_1x1 = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups) + + def forward(self, inputs): + '''Forward process''' + if hasattr(self, 'rbr_reparam'): + return self.nonlinearity(self.se(self.rbr_reparam(inputs))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + + return self.nonlinearity( + self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, nn.Sequential): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), + dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to( + branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d( + in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, + stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, + dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, + 
bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True diff --git a/modelscope/models/cv/tinynas_detection/core/utils.py b/modelscope/models/cv/tinynas_detection/core/utils.py new file mode 100644 index 00000000..482f12fb --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/utils.py @@ -0,0 +1,196 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import numpy as np +import torch +import torchvision + +__all__ = [ + 'filter_box', + 'postprocess_airdet', + 'bboxes_iou', + 'matrix_iou', + 'adjust_box_anns', + 'xyxy2xywh', + 'xyxy2cxcywh', +] + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + iou_thr, + max_num=100, + score_factors=None): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. + score_factors (Tensor): The factors multiplied to scores before + applying NMS + + Returns: + tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels \ + are 0-based. + """ + num_classes = multi_scores.size(1) + # exclude background category + if multi_bboxes.shape[1] > 4: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, 4) + scores = multi_scores + # filter out boxes with low scores + valid_mask = scores > score_thr # 1000 * 80 bool + + # We use masked_select for ONNX exporting purpose, + # which is equivalent to bboxes = bboxes[valid_mask] + # (TODO): as ONNX does not support repeat now, + # we have to use this ugly code + # bboxes -> 1000, 4 + bboxes = torch.masked_select( + bboxes, + torch.stack((valid_mask, valid_mask, valid_mask, valid_mask), + -1)).view(-1, 4) # mask-> 1000*80*4, 80000*4 + if score_factors is not None: + scores = scores * score_factors[:, None] + scores = torch.masked_select(scores, valid_mask) + labels = valid_mask.nonzero(as_tuple=False)[:, 1] + + if bboxes.numel() == 0: + bboxes = multi_bboxes.new_zeros((0, 5)) + labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) + scores = multi_bboxes.new_zeros((0, )) + + return bboxes, scores, labels + + keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr) + + if max_num > 0: + keep = keep[:max_num] + + return bboxes[keep], scores[keep], labels[keep] + + +def filter_box(output, scale_range): + """ + output: (N, 5+class) shape + """ + min_scale, max_scale = scale_range + w = output[:, 2] - output[:, 0] + h = output[:, 3] - output[:, 1] + keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale) + return output[keep] + + +def filter_results(boxlist, num_classes, nms_thre): + boxes = boxlist.bbox + scores = boxlist.get_field('scores') + cls = boxlist.get_field('labels') + nms_out_index = torchvision.ops.batched_nms( + boxes, + scores, + cls, + 
nms_thre, + ) + boxlist = boxlist[nms_out_index] + + return boxlist + + +def postprocess_airdet(prediction, + num_classes, + conf_thre=0.7, + nms_thre=0.45, + imgs=None): + box_corner = prediction.new(prediction.shape) + box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 + box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 + box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 + box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 + prediction[:, :, :4] = box_corner[:, :, :4] + output = [None for _ in range(len(prediction))] + for i, image_pred in enumerate(prediction): + # If none are remaining => process next image + if not image_pred.size(0): + continue + multi_bboxes = image_pred[:, :4] + multi_scores = image_pred[:, 5:] + detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores, + conf_thre, nms_thre, 500) + detections = torch.cat( + (detections, scores[:, None], scores[:, None], labels[:, None]), + dim=1) + + if output[i] is None: + output[i] = detections + else: + output[i] = torch.cat((output[i], detections)) + return output + + +def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): + if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: + raise IndexError + + if xyxy: + tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) + br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + else: + tl = torch.max( + (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), + ) + br = torch.min( + (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), + ) + + area_a = torch.prod(bboxes_a[:, 2:], 1) + area_b = torch.prod(bboxes_b[:, 2:], 1) + en = (tl < br).type(tl.type()).prod(dim=2) + area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) + return area_i / (area_a[:, None] + area_b - area_i) + + +def matrix_iou(a, b): + """ + return iou of a and b, numpy version for data augenmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12) + + +def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max): + bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max) + bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max) + return bbox + + +def xyxy2xywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + return bboxes + + +def xyxy2cxcywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5 + bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5 + return bboxes diff --git a/modelscope/models/cv/tinynas_detection/detector.py b/modelscope/models/cv/tinynas_detection/detector.py new file mode 100644 index 00000000..615b13a8 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/detector.py @@ -0,0 +1,181 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
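A quick standalone check of the box conventions used in core/utils.py above: the xyxy (x1, y1, x2, y2) to cxcywh (center, size) conversion, written copy-based here for clarity (the patch version modifies its input in place), plus a pairwise IoU in the spirit of matrix_iou. The helper names are illustrative, not part of the patch.

import numpy as np


def xyxy2cxcywh(boxes: np.ndarray) -> np.ndarray:
    out = boxes.copy()
    out[:, 2] -= out[:, 0]            # width
    out[:, 3] -= out[:, 1]            # height
    out[:, 0] += out[:, 2] * 0.5      # center x
    out[:, 1] += out[:, 3] * 0.5      # center y
    return out


def pairwise_iou(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    lt = np.maximum(a[:, None, :2], b[:, :2])
    rb = np.minimum(a[:, None, 2:], b[:, 2:])
    inter = np.prod(np.clip(rb - lt, 0, None), axis=2)
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    return inter / (area_a[:, None] + area_b - inter + 1e-12)


if __name__ == '__main__':
    boxes = np.array([[10., 20., 50., 60.]])
    print(xyxy2cxcywh(boxes))                 # [[30. 40. 40. 40.]]
    print(pairwise_iou(boxes, boxes + 20.))   # partial overlap, ~0.14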
+ +import os.path as osp +import pickle + +import cv2 +import torch +import torchvision + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .backbone import build_backbone +from .head import build_head +from .neck import build_neck +from .utils import parse_config + + +class SingleStageDetector(TorchModel): + """ + The base class of single stage detector. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + """ + init model by cfg + """ + super().__init__(model_dir, *args, **kwargs) + + config_path = osp.join(model_dir, 'airdet_s.py') + config = parse_config(config_path) + self.cfg = config + model_path = osp.join(model_dir, config.model.name) + label_map = osp.join(model_dir, config.model.class_map) + self.label_map = pickle.load(open(label_map, 'rb')) + self.size_divisible = config.dataset.size_divisibility + self.num_classes = config.model.head.num_classes + self.conf_thre = config.model.head.nms_conf_thre + self.nms_thre = config.model.head.nms_iou_thre + + self.backbone = build_backbone(self.cfg.model.backbone) + self.neck = build_neck(self.cfg.model.neck) + self.head = build_head(self.cfg.model.head) + + self.load_pretrain_model(model_path) + + def load_pretrain_model(self, pretrain_model): + + state_dict = torch.load(pretrain_model, map_location='cpu')['model'] + new_state_dict = {} + for k, v in state_dict.items(): + k = k.replace('module.', '') + new_state_dict[k] = v + self.load_state_dict(new_state_dict, strict=True) + + def inference(self, x): + + if self.training: + return self.forward_train(x) + else: + return self.forward_eval(x) + + def forward_train(self, x): + + pass + + def forward_eval(self, x): + + x = self.backbone(x) + x = self.neck(x) + prediction = self.head(x) + + return prediction + + def preprocess(self, image): + image = torch.from_numpy(image).type(torch.float32) + image = image.permute(2, 0, 1) + shape = image.shape # c, h, w + if self.size_divisible > 0: + import math + stride = self.size_divisible + shape = list(shape) + shape[1] = int(math.ceil(shape[1] / stride) * stride) + shape[2] = int(math.ceil(shape[2] / stride) * stride) + shape = tuple(shape) + pad_img = image.new(*shape).zero_() + pad_img[:, :image.shape[1], :image.shape[2]].copy_(image) + pad_img = pad_img.unsqueeze(0) + + return pad_img + + def postprocess(self, preds): + bboxes, scores, labels_idx = postprocess_gfocal( + preds, self.num_classes, self.conf_thre, self.nms_thre) + bboxes = bboxes.cpu().numpy() + scores = scores.cpu().numpy() + labels_idx = labels_idx.cpu().numpy() + labels = [self.label_map[idx + 1][0]['name'] for idx in labels_idx] + + return (bboxes, scores, labels) + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + iou_thr, + max_num=100, + score_factors=None): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. 
+ score_factors (Tensor): The factors multiplied to scores before + applying NMS + + Returns: + tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels \ + are 0-based. + """ + num_classes = multi_scores.size(1) + # exclude background category + if multi_bboxes.shape[1] > 4: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, 4) + scores = multi_scores + # filter out boxes with low scores + valid_mask = scores > score_thr # 1000 * 80 bool + + # We use masked_select for ONNX exporting purpose, + # which is equivalent to bboxes = bboxes[valid_mask] + # (TODO): as ONNX does not support repeat now, + # we have to use this ugly code + # bboxes -> 1000, 4 + bboxes = torch.masked_select( + bboxes, + torch.stack((valid_mask, valid_mask, valid_mask, valid_mask), + -1)).view(-1, 4) # mask-> 1000*80*4, 80000*4 + if score_factors is not None: + scores = scores * score_factors[:, None] + scores = torch.masked_select(scores, valid_mask) + labels = valid_mask.nonzero(as_tuple=False)[:, 1] + + if bboxes.numel() == 0: + bboxes = multi_bboxes.new_zeros((0, 5)) + labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) + scores = multi_bboxes.new_zeros((0, )) + + return bboxes, scores, labels + + keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr) + + if max_num > 0: + keep = keep[:max_num] + + return bboxes[keep], scores[keep], labels[keep] + + +def postprocess_gfocal(prediction, num_classes, conf_thre=0.05, nms_thre=0.7): + assert prediction.shape[0] == 1 + for i, image_pred in enumerate(prediction): + # If none are remaining => process next image + if not image_pred.size(0): + continue + multi_bboxes = image_pred[:, :4] + multi_scores = image_pred[:, 4:] + detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores, + conf_thre, nms_thre, 500) + + return detections, scores, labels diff --git a/modelscope/models/cv/tinynas_detection/head/__init__.py b/modelscope/models/cv/tinynas_detection/head/__init__.py new file mode 100644 index 00000000..f870fae1 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/head/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import copy + +from .gfocal_v2_tiny import GFocalHead_Tiny + + +def build_head(cfg): + + head_cfg = copy.deepcopy(cfg) + name = head_cfg.pop('name') + if name == 'GFocalV2': + return GFocalHead_Tiny(**head_cfg) + else: + raise NotImplementedError diff --git a/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py new file mode 100644 index 00000000..41f35968 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py @@ -0,0 +1,361 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
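A minimal sketch of the size-divisibility padding performed in SingleStageDetector.preprocess above: height and width are rounded up to a multiple of the network stride and the image is zero-padded at the bottom/right before a batch dimension is added. It assumes the tensor is already in (C, H, W) layout; pad_to_divisible is an illustrative helper, not part of the patch.

import math

import torch


def pad_to_divisible(image: torch.Tensor, divisor: int = 32) -> torch.Tensor:
    # image: (C, H, W) float tensor
    c, h, w = image.shape
    new_h = int(math.ceil(h / divisor) * divisor)
    new_w = int(math.ceil(w / divisor) * divisor)
    padded = image.new_zeros((c, new_h, new_w))
    padded[:, :h, :w] = image            # keep content in the top-left corner
    return padded.unsqueeze(0)           # add batch dim, as preprocess does


if __name__ == '__main__':
    img = torch.rand(3, 500, 707)
    print(pad_to_divisible(img, 32).shape)   # torch.Size([1, 3, 512, 736])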
+ +import functools +from functools import partial + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..core.base_ops import BaseConv, DWConv + + +class Scale(nn.Module): + + def __init__(self, scale=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x): + return x * self.scale + + +def multi_apply(func, *args, **kwargs): + + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def xyxy2CxCywh(xyxy, size=None): + x1 = xyxy[..., 0] + y1 = xyxy[..., 1] + x2 = xyxy[..., 2] + y2 = xyxy[..., 3] + + cx = (x1 + x2) / 2 + cy = (y1 + y2) / 2 + + w = x2 - x1 + h = y2 - y1 + if size is not None: + w = w.clamp(min=0, max=size[1]) + h = h.clamp(min=0, max=size[0]) + return torch.stack([cx, cy, w, h], axis=-1) + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + """ + x1 = points[..., 0] - distance[..., 0] + y1 = points[..., 1] - distance[..., 1] + x2 = points[..., 0] + distance[..., 2] + y2 = points[..., 1] + distance[..., 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return torch.stack([x1, y1, x2, y2], -1) + + +def bbox2distance(points, bbox, max_dis=None, eps=0.1): + """Decode bounding box based on distances. + """ + left = points[:, 0] - bbox[:, 0] + top = points[:, 1] - bbox[:, 1] + right = bbox[:, 2] - points[:, 0] + bottom = bbox[:, 3] - points[:, 1] + if max_dis is not None: + left = left.clamp(min=0, max=max_dis - eps) + top = top.clamp(min=0, max=max_dis - eps) + right = right.clamp(min=0, max=max_dis - eps) + bottom = bottom.clamp(min=0, max=max_dis - eps) + return torch.stack([left, top, right, bottom], -1) + + +class Integral(nn.Module): + """A fixed layer for calculating integral result from distribution. + """ + + def __init__(self, reg_max=16): + super(Integral, self).__init__() + self.reg_max = reg_max + self.register_buffer('project', + torch.linspace(0, self.reg_max, self.reg_max + 1)) + + def forward(self, x): + """Forward feature from the regression head to get integral result of + bounding box location. + """ + shape = x.size() + x = F.softmax(x.reshape(*shape[:-1], 4, self.reg_max + 1), dim=-1) + b, nb, ne, _ = x.size() + x = x.reshape(b * nb * ne, self.reg_max + 1) + y = self.project.type_as(x).unsqueeze(1) + x = torch.matmul(x, y).reshape(b, nb, 4) + return x + + +class GFocalHead_Tiny(nn.Module): + """Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality + Estimation for Dense Object Detection. + """ + + def __init__( + self, + num_classes, + in_channels, + stacked_convs=4, # 4 + feat_channels=256, + reg_max=12, + reg_topk=4, + reg_channels=64, + strides=[8, 16, 32], + add_mean=True, + norm='gn', + act='relu', + start_kernel_size=3, + conv_groups=1, + conv_type='BaseConv', + simOTA_cls_weight=1.0, + simOTA_iou_weight=3.0, + octbase=8, + simlqe=False, + **kwargs): + self.simlqe = simlqe + self.num_classes = num_classes + self.in_channels = in_channels + self.strides = strides + self.feat_channels = feat_channels if isinstance(feat_channels, list) \ + else [feat_channels] * len(self.strides) + + self.cls_out_channels = num_classes + 1 # add 1 for keep consistance with former models + # and will be deprecated in future. 
+ self.stacked_convs = stacked_convs + self.conv_groups = conv_groups + self.reg_max = reg_max + self.reg_topk = reg_topk + self.reg_channels = reg_channels + self.add_mean = add_mean + self.total_dim = reg_topk + self.start_kernel_size = start_kernel_size + + self.norm = norm + self.act = act + self.conv_module = DWConv if conv_type == 'DWConv' else BaseConv + + if add_mean: + self.total_dim += 1 + + super(GFocalHead_Tiny, self).__init__() + self.integral = Integral(self.reg_max) + + self._init_layers() + + def _build_not_shared_convs(self, in_channel, feat_channels): + self.relu = nn.ReLU(inplace=True) + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + + for i in range(self.stacked_convs): + chn = feat_channels if i > 0 else in_channel + kernel_size = 3 if i > 0 else self.start_kernel_size + cls_convs.append( + self.conv_module( + chn, + feat_channels, + kernel_size, + stride=1, + groups=self.conv_groups, + norm=self.norm, + act=self.act)) + reg_convs.append( + self.conv_module( + chn, + feat_channels, + kernel_size, + stride=1, + groups=self.conv_groups, + norm=self.norm, + act=self.act)) + if not self.simlqe: + conf_vector = [nn.Conv2d(4 * self.total_dim, self.reg_channels, 1)] + else: + conf_vector = [ + nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1) + ] + conf_vector += [self.relu] + conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()] + reg_conf = nn.Sequential(*conf_vector) + + return cls_convs, reg_convs, reg_conf + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.reg_confs = nn.ModuleList() + + for i in range(len(self.strides)): + cls_convs, reg_convs, reg_conf = self._build_not_shared_convs( + self.in_channels[i], self.feat_channels[i]) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + self.reg_confs.append(reg_conf) + + self.gfl_cls = nn.ModuleList([ + nn.Conv2d( + self.feat_channels[i], self.cls_out_channels, 3, padding=1) + for i in range(len(self.strides)) + ]) + + self.gfl_reg = nn.ModuleList([ + nn.Conv2d( + self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1) + for i in range(len(self.strides)) + ]) + + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def forward(self, + xin, + labels=None, + imgs=None, + conf_thre=0.05, + nms_thre=0.7): + + # prepare labels during training + b, c, h, w = xin[0].shape + if labels is not None: + gt_bbox_list = [] + gt_cls_list = [] + for label in labels: + gt_bbox_list.append(label.bbox) + gt_cls_list.append((label.get_field('labels') + - 1).long()) # labels starts from 1 + + # prepare priors for label assignment and bbox decode + mlvl_priors_list = [ + self.get_single_level_center_priors( + xin[i].shape[0], + xin[i].shape[-2:], + stride, + dtype=torch.float32, + device=xin[0].device) for i, stride in enumerate(self.strides) + ] + mlvl_priors = torch.cat(mlvl_priors_list, dim=1) + + # forward for bboxes and classification prediction + cls_scores, bbox_preds = multi_apply( + self.forward_single, + xin, + self.cls_convs, + self.reg_convs, + self.gfl_cls, + self.gfl_reg, + self.reg_confs, + self.scales, + ) + flatten_cls_scores = torch.cat(cls_scores, dim=1) + flatten_bbox_preds = torch.cat(bbox_preds, dim=1) + + # calculating losses or bboxes decoded + if self.training: + loss = self.loss(flatten_cls_scores, flatten_bbox_preds, + gt_bbox_list, gt_cls_list, mlvl_priors) + return loss + else: + output = 
self.get_bboxes(flatten_cls_scores, flatten_bbox_preds, + mlvl_priors) + return output + + def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg, + reg_conf, scale): + """Forward feature of a single scale level. + + """ + cls_feat = x + reg_feat = x + + for cls_conv in cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in reg_convs: + reg_feat = reg_conv(reg_feat) + + bbox_pred = scale(gfl_reg(reg_feat)).float() + N, C, H, W = bbox_pred.size() + prob = F.softmax( + bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2) + if not self.simlqe: + prob_topk, _ = prob.topk(self.reg_topk, dim=2) + + if self.add_mean: + stat = torch.cat( + [prob_topk, prob_topk.mean(dim=2, keepdim=True)], dim=2) + else: + stat = prob_topk + + quality_score = reg_conf(stat.reshape(N, 4 * self.total_dim, H, W)) + else: + quality_score = reg_conf( + bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W)) + + cls_score = gfl_cls(cls_feat).sigmoid() * quality_score + + flatten_cls_score = cls_score.flatten(start_dim=2).transpose(1, 2) + flatten_bbox_pred = bbox_pred.flatten(start_dim=2).transpose(1, 2) + return flatten_cls_score, flatten_bbox_pred + + def get_single_level_center_priors(self, batch_size, featmap_size, stride, + dtype, device): + + h, w = featmap_size + x_range = (torch.arange(0, int(w), dtype=dtype, + device=device)) * stride + y_range = (torch.arange(0, int(h), dtype=dtype, + device=device)) * stride + + x = x_range.repeat(h, 1) + y = y_range.unsqueeze(-1).repeat(1, w) + + y = y.flatten() + x = x.flatten() + strides = x.new_full((x.shape[0], ), stride) + priors = torch.stack([x, y, strides, strides], dim=-1) + + return priors.unsqueeze(0).repeat(batch_size, 1, 1) + + def sample(self, assign_result, gt_bboxes): + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_bboxes.numel() == 0: + # hack for index error case + assert pos_assigned_gt_inds.numel() == 0 + pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] + + return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds + + def get_bboxes(self, + cls_preds, + reg_preds, + mlvl_center_priors, + img_meta=None): + + dis_preds = self.integral(reg_preds) * mlvl_center_priors[..., 2, None] + bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds) + + res = torch.cat([bboxes, cls_preds[..., 0:self.num_classes]], dim=-1) + + return res diff --git a/modelscope/models/cv/tinynas_detection/neck/__init__.py b/modelscope/models/cv/tinynas_detection/neck/__init__.py new file mode 100644 index 00000000..3c418c29 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/neck/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
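A compact, standalone sketch of the GFL-style box decoding that Integral, distance2bbox and GFocalHead_Tiny.get_bboxes above implement together: each side of a box is predicted as a distribution over reg_max + 1 bins, the expectation of that distribution is scaled by the anchor point's stride, and the four distances are turned into (x1, y1, x2, y2). decode_boxes is an illustrative helper, not part of the patch.

import torch
import torch.nn.functional as F


def decode_boxes(reg_logits: torch.Tensor, priors: torch.Tensor,
                 reg_max: int = 12) -> torch.Tensor:
    # reg_logits: (N, 4 * (reg_max + 1)); priors: (N, 4) = (x, y, stride, stride)
    project = torch.linspace(0, reg_max, reg_max + 1)
    prob = F.softmax(reg_logits.view(-1, 4, reg_max + 1), dim=-1)
    dist = (prob * project).sum(-1) * priors[:, 2:3]   # expected l, t, r, b in pixels
    x1y1 = priors[:, :2] - dist[:, :2]
    x2y2 = priors[:, :2] + dist[:, 2:]
    return torch.cat([x1y1, x2y2], dim=-1)


if __name__ == '__main__':
    logits = torch.randn(2, 4 * 13)                    # reg_max = 12
    priors = torch.tensor([[64., 64., 8., 8.], [128., 96., 16., 16.]])
    print(decode_boxes(logits, priors, reg_max=12))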
+ +import copy + +from .giraffe_fpn import GiraffeNeck +from .giraffe_fpn_v2 import GiraffeNeckV2 + + +def build_neck(cfg): + neck_cfg = copy.deepcopy(cfg) + name = neck_cfg.pop('name') + if name == 'GiraffeNeck': + return GiraffeNeck(**neck_cfg) + elif name == 'GiraffeNeckV2': + return GiraffeNeckV2(**neck_cfg) diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py new file mode 100644 index 00000000..289fdfd2 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py @@ -0,0 +1,235 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import collections +import itertools +import os + +import networkx as nx +from omegaconf import OmegaConf + +Node = collections.namedtuple('Node', ['id', 'inputs', 'type']) + + +def get_graph_info(graph): + input_nodes = [] + output_nodes = [] + Nodes = [] + for node in range(graph.number_of_nodes()): + tmp = list(graph.neighbors(node)) + tmp.sort() + type = -1 + if node < tmp[0]: + input_nodes.append(node) + type = 0 + if node > tmp[-1]: + output_nodes.append(node) + type = 1 + Nodes.append(Node(node, [n for n in tmp if n < node], type)) + return Nodes, input_nodes, output_nodes + + +def nodeid_trans(id, cur_level, num_levels): + if id % 2 == 1: + gap = int(((id + 1) // 2) * num_levels * 2) + else: + a = (num_levels - cur_level) * 2 - 1 + b = ((id + 1) // 2) * num_levels * 2 + gap = int(a + b) + return cur_level + gap + + +def gen_log2n_graph_file(log2n_graph_file, depth_multiplier): + f = open(log2n_graph_file, 'w') + for i in range(depth_multiplier): + for j in [1, 2, 4, 8, 16, 32]: + if i - j < 0: + break + else: + f.write('%d,%d\n' % (i - j, i)) + f.close() + + +def get_log2n_graph(depth_multiplier): + nodes = [] + connnections = [] + + for i in range(depth_multiplier): + nodes.append(i) + for j in [1, 2, 4, 8, 16, 32]: + if i - j < 0: + break + else: + connnections.append((i - j, i)) + return nodes, connnections + + +def get_dense_graph(depth_multiplier): + nodes = [] + connections = [] + + for i in range(depth_multiplier): + nodes.append(i) + for j in range(i): + connections.append((j, i)) + return nodes, connections + + +def giraffeneck_config(min_level, + max_level, + weight_method=None, + depth_multiplier=5, + with_backslash=False, + with_slash=False, + with_skip_connect=False, + skip_connect_type='dense'): + """Graph config with log2n merge and panet""" + if skip_connect_type == 'dense': + nodes, connections = get_dense_graph(depth_multiplier) + elif skip_connect_type == 'log2n': + nodes, connections = get_log2n_graph(depth_multiplier) + graph = nx.Graph() + graph.add_nodes_from(nodes) + graph.add_edges_from(connections) + + drop_node = [] + nodes, input_nodes, output_nodes = get_graph_info(graph) + + weight_method = weight_method or 'fastattn' + + num_levels = max_level - min_level + 1 + node_ids = {min_level + i: [i] for i in range(num_levels)} + node_ids_per_layer = {} + + pnodes = {} + + def update_drop_node(new_id, input_offsets): + if new_id not in drop_node: + new_id = new_id + else: + while new_id in drop_node: + if new_id in pnodes: + for n in pnodes[new_id]['inputs_offsets']: + if n not in input_offsets and n not in drop_node: + input_offsets.append(n) + new_id = new_id - 1 + if new_id not in input_offsets: + input_offsets.append(new_id) + + # top-down layer + for i in range(max_level, min_level - 1, -1): + 
node_ids_per_layer[i] = [] + for id, node in enumerate(nodes): + input_offsets = [] + if id in input_nodes: + input_offsets.append(node_ids[i][0]) + else: + if with_skip_connect: + for input_id in node.inputs: + new_id = nodeid_trans(input_id, i - min_level, + num_levels) + update_drop_node(new_id, input_offsets) + + # add top2down + new_id = nodeid_trans(id, i - min_level, num_levels) + + # add backslash node + def cal_backslash_node(id): + ind = id // num_levels + mod = id % num_levels + if ind % 2 == 0: # even + if mod == (num_levels - 1): + last = -1 + else: + last = (ind - 1) * num_levels + ( + num_levels - 1 - mod - 1) + else: # odd + if mod == 0: + last = -1 + else: + last = (ind - 1) * num_levels + ( + num_levels - 1 - mod + 1) + + return last + + # add slash node + def cal_slash_node(id): + ind = id // num_levels + mod = id % num_levels + if ind % 2 == 1: # odd + if mod == (num_levels - 1): + last = -1 + else: + last = (ind - 1) * num_levels + ( + num_levels - 1 - mod - 1) + else: # even + if mod == 0: + last = -1 + else: + last = (ind - 1) * num_levels + ( + num_levels - 1 - mod + 1) + + return last + + # add last node + last = new_id - 1 + update_drop_node(last, input_offsets) + + if with_backslash: + backslash = cal_backslash_node(new_id) + if backslash != -1 and backslash not in input_offsets: + input_offsets.append(backslash) + + if with_slash: + slash = cal_slash_node(new_id) + if slash != -1 and slash not in input_offsets: + input_offsets.append(slash) + + if new_id in drop_node: + input_offsets = [] + + pnodes[new_id] = { + 'reduction': 1 << i, + 'inputs_offsets': input_offsets, + 'weight_method': weight_method, + 'is_out': 0, + } + + input_offsets = [] + for out_id in output_nodes: + new_id = nodeid_trans(out_id, i - min_level, num_levels) + input_offsets.append(new_id) + + pnodes[node_ids[i][0] + num_levels * (len(nodes) + 1)] = { + 'reduction': 1 << i, + 'inputs_offsets': input_offsets, + 'weight_method': weight_method, + 'is_out': 1, + } + + pnodes = dict(sorted(pnodes.items(), key=lambda x: x[0])) + return pnodes + + +def get_graph_config(fpn_name, + min_level=3, + max_level=7, + weight_method='concat', + depth_multiplier=5, + with_backslash=False, + with_slash=False, + with_skip_connect=False, + skip_connect_type='dense'): + name_to_config = { + 'giraffeneck': + giraffeneck_config( + min_level=min_level, + max_level=max_level, + weight_method=weight_method, + depth_multiplier=depth_multiplier, + with_backslash=with_backslash, + with_slash=with_slash, + with_skip_connect=with_skip_connect, + skip_connect_type=skip_connect_type), + } + return name_to_config[fpn_name] diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py new file mode 100644 index 00000000..b7087779 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py @@ -0,0 +1,661 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
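To make the graph construction in giraffe_config.py above easier to follow, here is a standalone illustration of the two skip-connect patterns ('dense' and 'log2n') that giraffeneck_config feeds into networkx. The helpers below mirror get_dense_graph / get_log2n_graph and are illustrative only, not part of the patch.

def dense_graph(n):
    # every earlier node is connected to the current one
    return [(j, i) for i in range(n) for j in range(i)]


def log2n_graph(n):
    # only nodes 1, 2, 4, 8, ... steps back are connected
    edges = []
    for i in range(n):
        for gap in (1, 2, 4, 8, 16, 32):
            if i - gap < 0:
                break
            edges.append((i - gap, i))
    return edges


if __name__ == '__main__':
    print(dense_graph(5))    # 10 edges: every (j, i) with j < i
    print(log2n_graph(5))    # e.g. node 4 links back to nodes 3, 2 and 0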
+ +import logging +import math +from collections import OrderedDict +from functools import partial +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm import create_model +from timm.models.layers import (Swish, create_conv2d, create_pool2d, + get_act_layer) + +from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer +from .giraffe_config import get_graph_config + +_ACT_LAYER = Swish + + +class SequentialList(nn.Sequential): + """ This module exists to work around torchscript typing issues list -> list""" + + def __init__(self, *args): + super(SequentialList, self).__init__(*args) + + def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: + for module in self: + x = module(x) + return x + + +class ConvBnAct2d(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + padding='', + bias=False, + norm_layer=nn.BatchNorm2d, + act_layer=_ACT_LAYER): + super(ConvBnAct2d, self).__init__() + + self.conv = create_conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + bias=bias) + self.bn = None if norm_layer is None else norm_layer(out_channels) + self.act = None if act_layer is None else act_layer(inplace=True) + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +class SeparableConv2d(nn.Module): + """ Separable Conv + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + padding='', + bias=False, + channel_multiplier=1.0, + pw_kernel_size=1, + norm_layer=nn.BatchNorm2d, + act_layer=_ACT_LAYER): + super(SeparableConv2d, self).__init__() + self.conv_dw = create_conv2d( + in_channels, + int(in_channels * channel_multiplier), + kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + depthwise=True) + + self.conv_pw = create_conv2d( + int(in_channels * channel_multiplier), + out_channels, + pw_kernel_size, + padding=padding, + bias=bias) + + self.bn = None if norm_layer is None else norm_layer(out_channels) + self.act = None if act_layer is None else act_layer(inplace=True) + + def forward(self, x): + x = self.conv_dw(x) + x = self.conv_pw(x) + if self.bn is not None: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +def _init_weight( + m, + n='', +): + """ Weight initialization as per Tensorflow official implementations. + """ + + def _fan_in_out(w, groups=1): + dimensions = w.dim() + if dimensions < 2: + raise ValueError( + 'Fan in and fan out can not be computed for tensor with fewer than 2 dimensions' + ) + num_input_fmaps = w.size(1) + num_output_fmaps = w.size(0) + receptive_field_size = 1 + if w.dim() > 2: + receptive_field_size = w[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + fan_out //= groups + return fan_in, fan_out + + def _glorot_uniform(w, gain=1, groups=1): + fan_in, fan_out = _fan_in_out(w, groups) + gain /= max(1., (fan_in + fan_out) / 2.) 
# fan avg + limit = math.sqrt(3.0 * gain) + w.data.uniform_(-limit, limit) + + def _variance_scaling(w, gain=1, groups=1): + fan_in, fan_out = _fan_in_out(w, groups) + gain /= max(1., fan_in) # fan in + std = math.sqrt(gain) + w.data.normal_(std=std) + + if isinstance(m, SeparableConv2d): + if 'box_net' in n or 'class_net' in n: + _variance_scaling(m.conv_dw.weight, groups=m.conv_dw.groups) + _variance_scaling(m.conv_pw.weight) + if m.conv_pw.bias is not None: + if 'class_net.predict' in n: + m.conv_pw.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.conv_pw.bias.data.zero_() + else: + _glorot_uniform(m.conv_dw.weight, groups=m.conv_dw.groups) + _glorot_uniform(m.conv_pw.weight) + if m.conv_pw.bias is not None: + m.conv_pw.bias.data.zero_() + elif isinstance(m, ConvBnAct2d): + if 'box_net' in n or 'class_net' in n: + m.conv.weight.data.normal_(std=.01) + if m.conv.bias is not None: + if 'class_net.predict' in n: + m.conv.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.conv.bias.data.zero_() + else: + _glorot_uniform(m.conv.weight) + if m.conv.bias is not None: + m.conv.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + + +def _init_weight_alt( + m, + n='', +): + """ Weight initialization alternative, based on EfficientNet bacbkone init w/ class bias addition + NOTE: this will likely be removed after some experimentation + """ + if isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + if 'class_net.predict' in n: + m.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + + +class Interpolate2d(nn.Module): + r"""Resamples a 2d Image + + The input data is assumed to be of the form + `minibatch x channels x [optional depth] x [optional height] x width`. + Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor. + + The algorithms available for upsampling are nearest neighbor and linear, + bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor, + respectively. + + One can either give a :attr:`scale_factor` or the target output :attr:`size` to + calculate the output size. (You cannot give both, as it is ambiguous) + + Args: + size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional): + output spatial sizes + scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional): + multiplier for spatial size. Has to match input size if it is a tuple. + mode (str, optional): the upsampling algorithm: one of ``'nearest'``, + ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``. + Default: ``'nearest'`` + align_corners (bool, optional): if ``True``, the corner pixels of the input + and output tensors are aligned, and thus preserving the values at + those pixels. This only has effect when :attr:`mode` is + ``'linear'``, ``'bilinear'``, or ``'trilinear'``. 
Default: ``False`` + """ + __constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name'] + name: str + size: Optional[Union[int, Tuple[int, int]]] + scale_factor: Optional[Union[float, Tuple[float, float]]] + mode: str + align_corners: Optional[bool] + + def __init__(self, + size: Optional[Union[int, Tuple[int, int]]] = None, + scale_factor: Optional[Union[float, Tuple[float, + float]]] = None, + mode: str = 'nearest', + align_corners: bool = False) -> None: + super(Interpolate2d, self).__init__() + self.name = type(self).__name__ + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = None if mode == 'nearest' else align_corners + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.interpolate( + input, + self.size, + self.scale_factor, + self.mode, + self.align_corners, + recompute_scale_factor=False) + + +class ResampleFeatureMap(nn.Sequential): + + def __init__(self, + in_channels, + out_channels, + reduction_ratio=1., + pad_type='', + downsample=None, + upsample=None, + norm_layer=nn.BatchNorm2d, + apply_bn=False, + conv_after_downsample=False, + redundant_bias=False): + super(ResampleFeatureMap, self).__init__() + downsample = downsample or 'max' + upsample = upsample or 'nearest' + self.in_channels = in_channels + self.out_channels = out_channels + self.reduction_ratio = reduction_ratio + self.conv_after_downsample = conv_after_downsample + + conv = None + if in_channels != out_channels: + conv = ConvBnAct2d( + in_channels, + out_channels, + kernel_size=1, + padding=pad_type, + norm_layer=norm_layer if apply_bn else None, + bias=not apply_bn or redundant_bias, + act_layer=None) + + if reduction_ratio > 1: + if conv is not None and not self.conv_after_downsample: + self.add_module('conv', conv) + if downsample in ('max', 'avg'): + stride_size = int(reduction_ratio) + downsample = create_pool2d( + downsample, + kernel_size=stride_size + 1, + stride=stride_size, + padding=pad_type) + else: + downsample = Interpolate2d( + scale_factor=1. 
/ reduction_ratio, mode=downsample) + self.add_module('downsample', downsample) + if conv is not None and self.conv_after_downsample: + self.add_module('conv', conv) + else: + if conv is not None: + self.add_module('conv', conv) + if reduction_ratio < 1: + scale = int(1 // reduction_ratio) + self.add_module( + 'upsample', + Interpolate2d(scale_factor=scale, mode=upsample)) + + +class GiraffeCombine(nn.Module): + + def __init__(self, + feature_info, + fpn_config, + fpn_channels, + inputs_offsets, + target_reduction, + pad_type='', + downsample=None, + upsample=None, + norm_layer=nn.BatchNorm2d, + apply_resample_bn=False, + conv_after_downsample=False, + redundant_bias=False, + weight_method='attn'): + super(GiraffeCombine, self).__init__() + self.inputs_offsets = inputs_offsets + self.weight_method = weight_method + + self.resample = nn.ModuleDict() + reduction_base = feature_info[0]['reduction'] + + target_channels_idx = int( + math.log(target_reduction // reduction_base, 2)) + for idx, offset in enumerate(inputs_offsets): + if offset < len(feature_info): + in_channels = feature_info[offset]['num_chs'] + input_reduction = feature_info[offset]['reduction'] + else: + node_idx = offset + input_reduction = fpn_config[node_idx]['reduction'] + # in_channels = fpn_config[node_idx]['num_chs'] + input_channels_idx = int( + math.log(input_reduction // reduction_base, 2)) + in_channels = feature_info[input_channels_idx]['num_chs'] + + reduction_ratio = target_reduction / input_reduction + if weight_method == 'concat': + self.resample[str(offset)] = ResampleFeatureMap( + in_channels, + in_channels, + reduction_ratio=reduction_ratio, + pad_type=pad_type, + downsample=downsample, + upsample=upsample, + norm_layer=norm_layer, + apply_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias) + else: + self.resample[str(offset)] = ResampleFeatureMap( + in_channels, + fpn_channels[target_channels_idx], + reduction_ratio=reduction_ratio, + pad_type=pad_type, + downsample=downsample, + upsample=upsample, + norm_layer=norm_layer, + apply_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias) + + if weight_method == 'attn' or weight_method == 'fastattn': + self.edge_weights = nn.Parameter( + torch.ones(len(inputs_offsets)), requires_grad=True) # WSM + else: + self.edge_weights = None + + def forward(self, x: List[torch.Tensor]): + dtype = x[0].dtype + nodes = [] + if len(self.inputs_offsets) == 0: + return None + for offset, resample in zip(self.inputs_offsets, + self.resample.values()): + input_node = x[offset] + input_node = resample(input_node) + nodes.append(input_node) + + if self.weight_method == 'attn': + normalized_weights = torch.softmax( + self.edge_weights.to(dtype=dtype), dim=0) + out = torch.stack(nodes, dim=-1) * normalized_weights + out = torch.sum(out, dim=-1) + elif self.weight_method == 'fastattn': + edge_weights = nn.functional.relu( + self.edge_weights.to(dtype=dtype)) + weights_sum = torch.sum(edge_weights) + weights_norm = weights_sum + 0.0001 + out = torch.stack([(nodes[i] * edge_weights[i]) / weights_norm + for i in range(len(nodes))], + dim=-1) + + out = torch.sum(out, dim=-1) + elif self.weight_method == 'sum': + out = torch.stack(nodes, dim=-1) + out = torch.sum(out, dim=-1) + elif self.weight_method == 'concat': + out = torch.cat(nodes, dim=1) + else: + raise ValueError('unknown weight_method {}'.format( + self.weight_method)) + return out + + +class GiraffeNode(nn.Module): + """ A simple 
wrapper used in place of nn.Sequential for torchscript typing + Handles input type List[Tensor] -> output type Tensor + """ + + def __init__(self, combine: nn.Module, after_combine: nn.Module): + super(GiraffeNode, self).__init__() + self.combine = combine + self.after_combine = after_combine + + def forward(self, x: List[torch.Tensor]) -> torch.Tensor: + combine_feat = self.combine(x) + if combine_feat is None: + return None + else: + return self.after_combine(combine_feat) + + +class GiraffeLayer(nn.Module): + + def __init__(self, + feature_info, + fpn_config, + inner_fpn_channels, + outer_fpn_channels, + num_levels=5, + pad_type='', + downsample=None, + upsample=None, + norm_layer=nn.BatchNorm2d, + act_layer=_ACT_LAYER, + apply_resample_bn=False, + conv_after_downsample=True, + conv_bn_relu_pattern=False, + separable_conv=True, + redundant_bias=False, + merge_type='conv'): + super(GiraffeLayer, self).__init__() + self.num_levels = num_levels + self.conv_bn_relu_pattern = False + + self.feature_info = {} + for idx, feat in enumerate(feature_info): + self.feature_info[idx] = feat + + self.fnode = nn.ModuleList() + reduction_base = feature_info[0]['reduction'] + for i, fnode_cfg in fpn_config.items(): + logging.debug('fnode {} : {}'.format(i, fnode_cfg)) + + if fnode_cfg['is_out'] == 1: + fpn_channels = outer_fpn_channels + else: + fpn_channels = inner_fpn_channels + + reduction = fnode_cfg['reduction'] + fpn_channels_idx = int(math.log(reduction // reduction_base, 2)) + combine = GiraffeCombine( + self.feature_info, + fpn_config, + fpn_channels, + tuple(fnode_cfg['inputs_offsets']), + target_reduction=reduction, + pad_type=pad_type, + downsample=downsample, + upsample=upsample, + norm_layer=norm_layer, + apply_resample_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias, + weight_method=fnode_cfg['weight_method']) + + after_combine = nn.Sequential() + + in_channels = 0 + out_channels = 0 + for input_offset in fnode_cfg['inputs_offsets']: + in_channels += self.feature_info[input_offset]['num_chs'] + + out_channels = fpn_channels[fpn_channels_idx] + + if merge_type == 'csp': + after_combine.add_module( + 'CspLayer', + CSPLayer( + in_channels, + out_channels, + 2, + shortcut=True, + depthwise=False, + act='silu')) + elif merge_type == 'shuffle': + after_combine.add_module( + 'shuffleBlock', ShuffleBlock(in_channels, in_channels)) + after_combine.add_module( + 'conv1x1', + create_conv2d(in_channels, out_channels, kernel_size=1)) + elif merge_type == 'conv': + after_combine.add_module( + 'conv1x1', + create_conv2d(in_channels, out_channels, kernel_size=1)) + conv_kwargs = dict( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + padding=pad_type, + bias=False, + norm_layer=norm_layer, + act_layer=act_layer) + if not conv_bn_relu_pattern: + conv_kwargs['bias'] = redundant_bias + conv_kwargs['act_layer'] = None + after_combine.add_module('act', act_layer(inplace=True)) + after_combine.add_module( + 'conv', + SeparableConv2d(**conv_kwargs) + if separable_conv else ConvBnAct2d(**conv_kwargs)) + + self.fnode.append( + GiraffeNode(combine=combine, after_combine=after_combine)) + self.feature_info[i] = dict( + num_chs=fpn_channels[fpn_channels_idx], reduction=reduction) + + self.out_feature_info = [] + out_node = list(self.feature_info.keys())[-num_levels::] + for i in out_node: + self.out_feature_info.append(self.feature_info[i]) + + self.feature_info = self.out_feature_info + + def forward(self, x: List[torch.Tensor]): + for 
fn in self.fnode: + x.append(fn(x)) + return x[-self.num_levels::] + + +class GiraffeNeck(nn.Module): + + def __init__(self, min_level, max_level, num_levels, norm_layer, + norm_kwargs, act_type, fpn_config, fpn_name, fpn_channels, + out_fpn_channels, weight_method, depth_multiplier, + width_multiplier, with_backslash, with_slash, + with_skip_connect, skip_connect_type, separable_conv, + feature_info, merge_type, pad_type, downsample_type, + upsample_type, apply_resample_bn, conv_after_downsample, + redundant_bias, conv_bn_relu_pattern, alternate_init): + super(GiraffeNeck, self).__init__() + + self.num_levels = num_levels + self.min_level = min_level + self.in_features = [0, 1, 2, 3, 4, 5, + 6][self.min_level - 1:self.min_level - 1 + + num_levels] + self.alternate_init = alternate_init + norm_layer = norm_layer or nn.BatchNorm2d + if norm_kwargs: + norm_layer = partial(norm_layer, **norm_kwargs) + act_layer = get_act_layer(act_type) or _ACT_LAYER + fpn_config = fpn_config or get_graph_config( + fpn_name, + min_level=min_level, + max_level=max_level, + weight_method=weight_method, + depth_multiplier=depth_multiplier, + with_backslash=with_backslash, + with_slash=with_slash, + with_skip_connect=with_skip_connect, + skip_connect_type=skip_connect_type) + + # width scale + for i in range(len(fpn_channels)): + fpn_channels[i] = int(fpn_channels[i] * width_multiplier) + + self.resample = nn.ModuleDict() + for level in range(num_levels): + if level < len(feature_info): + in_chs = feature_info[level]['num_chs'] + reduction = feature_info[level]['reduction'] + else: + # Adds a coarser level by downsampling the last feature map + reduction_ratio = 2 + self.resample[str(level)] = ResampleFeatureMap( + in_channels=in_chs, + out_channels=feature_info[level - 1]['num_chs'], + pad_type=pad_type, + downsample=downsample_type, + upsample=upsample_type, + norm_layer=norm_layer, + reduction_ratio=reduction_ratio, + apply_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias, + ) + in_chs = feature_info[level - 1]['num_chs'] + reduction = int(reduction * reduction_ratio) + feature_info.append(dict(num_chs=in_chs, reduction=reduction)) + + self.cell = SequentialList() + logging.debug('building giraffeNeck') + giraffe_layer = GiraffeLayer( + feature_info=feature_info, + fpn_config=fpn_config, + inner_fpn_channels=fpn_channels, + outer_fpn_channels=out_fpn_channels, + num_levels=num_levels, + pad_type=pad_type, + downsample=downsample_type, + upsample=upsample_type, + norm_layer=norm_layer, + act_layer=act_layer, + separable_conv=separable_conv, + apply_resample_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + conv_bn_relu_pattern=conv_bn_relu_pattern, + redundant_bias=redundant_bias, + merge_type=merge_type) + self.cell.add_module('giraffeNeck', giraffe_layer) + feature_info = giraffe_layer.feature_info + + def init_weights(self, pretrained=False): + for n, m in self.named_modules(): + if 'backbone' not in n: + if self.alternate_init: + _init_weight_alt(m, n) + else: + _init_weight(m, n) + + def forward(self, x: List[torch.Tensor]): + if type(x) is tuple: + x = list(x) + x = [x[f] for f in self.in_features] + for resample in self.resample.values(): + x.append(resample(x[-1])) + x = self.cell(x) + return x diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py new file mode 100644 index 00000000..b710572f --- /dev/null +++ 
b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py @@ -0,0 +1,203 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import torch +import torch.nn as nn + +from ..core.base_ops import BaseConv, CSPLayer, DWConv +from ..core.neck_ops import CSPStage + + +class GiraffeNeckV2(nn.Module): + + def __init__( + self, + depth=1.0, + width=1.0, + in_features=[2, 3, 4], + in_channels=[256, 512, 1024], + out_channels=[256, 512, 1024], + depthwise=False, + act='silu', + spp=True, + reparam_mode=True, + block_name='BasicBlock', + ): + super().__init__() + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + reparam_mode = reparam_mode + + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + + # node x3: input x0, x1 + self.bu_conv13 = Conv( + int(in_channels[1] * width), + int(in_channels[1] * width), + 3, + 2, + act=act) + if reparam_mode: + self.merge_3 = CSPStage( + block_name, + int((in_channels[1] + in_channels[2]) * width), + int(in_channels[2] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_3 = CSPLayer( + int((in_channels[1] + in_channels[2]) * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + # node x4: input x1, x2, x3 + self.bu_conv24 = Conv( + int(in_channels[0] * width), + int(in_channels[0] * width), + 3, + 2, + act=act) + if reparam_mode: + self.merge_4 = CSPStage( + block_name, + int((in_channels[0] + in_channels[1] + in_channels[2]) + * width), + int(in_channels[1] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_4 = CSPLayer( + int((in_channels[0] + in_channels[1] + in_channels[2]) + * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + # node x5: input x2, x4 + if reparam_mode: + self.merge_5 = CSPStage( + block_name, + int((in_channels[1] + in_channels[0]) * width), + int(out_channels[0] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_5 = CSPLayer( + int((in_channels[1] + in_channels[0]) * width), + int(out_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + # node x7: input x4, x5 + self.bu_conv57 = Conv( + int(out_channels[0] * width), + int(out_channels[0] * width), + 3, + 2, + act=act) + if reparam_mode: + self.merge_7 = CSPStage( + block_name, + int((out_channels[0] + in_channels[1]) * width), + int(out_channels[1] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_7 = CSPLayer( + int((out_channels[0] + in_channels[1]) * width), + int(out_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + # node x6: input x3, x4, x7 + self.bu_conv46 = Conv( + int(in_channels[1] * width), + int(in_channels[1] * width), + 3, + 2, + act=act) + self.bu_conv76 = Conv( + int(out_channels[1] * width), + int(out_channels[1] * width), + 3, + 2, + act=act) + if reparam_mode: + self.merge_6 = CSPStage( + block_name, + int((in_channels[1] + out_channels[1] + in_channels[2]) + * width), + int(out_channels[2] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_6 = CSPLayer( + int((in_channels[1] + out_channels[1] + in_channels[2]) + * width), + int(out_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + def init_weights(self): + pass + + def forward(self, out_features): + 
""" + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. + """ + + # backbone + features = [out_features[f] for f in self.in_features] + [x2, x1, x0] = features + + # node x3 + x13 = self.bu_conv13(x1) + x3 = torch.cat([x0, x13], 1) + x3 = self.merge_3(x3) + + # node x4 + x34 = self.upsample(x3) + x24 = self.bu_conv24(x2) + x4 = torch.cat([x1, x24, x34], 1) + x4 = self.merge_4(x4) + + # node x5 + x45 = self.upsample(x4) + x5 = torch.cat([x2, x45], 1) + x5 = self.merge_5(x5) + + # node x7 + x57 = self.bu_conv57(x5) + x7 = torch.cat([x4, x57], 1) + x7 = self.merge_7(x7) + + # node x6 + x46 = self.bu_conv46(x4) + x76 = self.bu_conv76(x7) + x6 = torch.cat([x3, x46, x76], 1) + x6 = self.merge_6(x6) + + outputs = (x5, x7, x6) + return outputs diff --git a/modelscope/models/cv/tinynas_detection/tinynas_detector.py b/modelscope/models/cv/tinynas_detection/tinynas_detector.py new file mode 100644 index 00000000..e6f144df --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/tinynas_detector.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .detector import SingleStageDetector + + +@MODELS.register_module( + Tasks.image_object_detection, module_name=Models.tinynas_detection) +class TinynasDetector(SingleStageDetector): + + def __init__(self, model_dir, *args, **kwargs): + + super(TinynasDetector, self).__init__(model_dir, *args, **kwargs) diff --git a/modelscope/models/cv/tinynas_detection/utils.py b/modelscope/models/cv/tinynas_detection/utils.py new file mode 100644 index 00000000..d67d3a36 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/utils.py @@ -0,0 +1,30 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import importlib +import os +import sys +from os.path import dirname, join + + +def get_config_by_file(config_file): + try: + sys.path.append(os.path.dirname(config_file)) + current_config = importlib.import_module( + os.path.basename(config_file).split('.')[0]) + exp = current_config.Config() + except Exception: + raise ImportError( + "{} doesn't contains class named 'Config'".format(config_file)) + return exp + + +def parse_config(config_file): + """ + get config object by file. + Args: + config_file (str): file path of config. + """ + assert (config_file is not None), 'plz provide config file' + if config_file is not None: + return get_config_by_file(config_file) diff --git a/modelscope/pipelines/cv/tinynas_detection_pipeline.py b/modelscope/pipelines/cv/tinynas_detection_pipeline.py new file mode 100644 index 00000000..b2063629 --- /dev/null +++ b/modelscope/pipelines/cv/tinynas_detection_pipeline.py @@ -0,0 +1,61 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +from typing import Any, Dict + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_object_detection, module_name=Pipelines.tinynas_detection) +class TinynasDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. + """ + super().__init__(model=model, auto_collate=False, **kwargs) + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' + self.model.to(self.device) + self.model.eval() + + def preprocess(self, input: Input) -> Dict[str, Any]: + + img = LoadImage.convert_to_ndarray(input) + self.img = img + img = img.astype(np.float) + img = self.model.preprocess(img) + result = {'img': img.to(self.device)} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + outputs = self.model.inference(input['img']) + result = {'data': outputs} + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + + bboxes, scores, labels = self.model.postprocess(inputs['data']) + if bboxes is None: + return None + outputs = { + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + OutputKeys.BOXES: bboxes + } + return outputs diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py new file mode 100644 index 00000000..6b2ecd0b --- /dev/null +++ b/tests/pipelines/test_tinynas_detection.py @@ -0,0 +1,20 @@ +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class TinynasObjectDetectionTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run(self): + tinynas_object_detection = pipeline( + Tasks.image_object_detection, model='damo/cv_tinynas_detection') + result = tinynas_object_detection( + 'data/test/images/image_detection.jpg') + print(result) + + +if __name__ == '__main__': + unittest.main() From 1a22fa02228f0884bcb48bdaccc4f90a24c85009 Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Fri, 2 Sep 2022 14:06:08 +0800 Subject: [PATCH 07/28] fix trainer unittest Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9970626 * fix trainer unittest --- tests/trainers/test_trainer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index 17fa97f9..86909f74 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -17,7 +17,7 @@ from modelscope.metrics.builder import MetricKeys from modelscope.models.base import Model from modelscope.trainers import build_trainer from modelscope.trainers.base import DummyTrainer -from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile +from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks from modelscope.utils.test_utils import create_dummy_test_dataset, test_level @@ -67,6 +67,7 @@ class TrainerTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_train_0(self): json_cfg = { + 'task': Tasks.image_classification, 'train': { 
'work_dir': self.tmp_dir, @@ -141,6 +142,7 @@ class TrainerTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_train_1(self): json_cfg = { + 'task': Tasks.image_classification, 'train': { 'work_dir': self.tmp_dir, @@ -201,6 +203,7 @@ class TrainerTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_train_with_default_config(self): json_cfg = { + 'task': Tasks.image_classification, 'train': { 'work_dir': self.tmp_dir, 'dataloader': { @@ -319,6 +322,7 @@ class TrainerTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_train_with_iters_per_epoch(self): json_cfg = { + 'task': Tasks.image_classification, 'train': { 'work_dir': self.tmp_dir, 'dataloader': { From 4d3716cf4ebd0efc814818709234f93eef8e73c5 Mon Sep 17 00:00:00 2001 From: "xingguang.zxg" Date: Fri, 2 Sep 2022 14:14:47 +0800 Subject: [PATCH 08/28] [to #42322933] Text-driven semantic segmentation model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Text-driven semantic segmentation model: based on the input text, it segments the object described by the text out of the image. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9942863 --- data/test/images/text_driven_segmentation.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/text_driven_segmentation/__init__.py | 1 + .../cv/text_driven_segmentation/clip.py | 170 ++++++ .../cv/text_driven_segmentation/lseg_base.py | 28 + .../text_driven_segmentation/lseg_blocks.py | 334 +++++++++++ .../cv/text_driven_segmentation/lseg_model.py | 107 ++++ .../cv/text_driven_segmentation/lseg_net.py | 197 +++++++ .../cv/text_driven_segmentation/lseg_vit.py | 543 ++++++++++++++++++ .../cv/text_driven_segmentation/model.py | 458 +++++++++++++++ .../simple_tokenizer.py | 156 +++++ modelscope/outputs.py | 7 + modelscope/pipelines/builder.py | 3 + modelscope/pipelines/cv/__init__.py | 3 + .../cv/text_driven_segmentation_pipleline.py | 51 ++ modelscope/utils/constant.py | 1 + .../test_text_driven_segmentation.py | 28 + 17 files changed, 2092 insertions(+) create mode 100644 data/test/images/text_driven_segmentation.jpg create mode 100644 modelscope/models/cv/text_driven_segmentation/__init__.py create mode 100644 modelscope/models/cv/text_driven_segmentation/clip.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_base.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_blocks.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_model.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_net.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_vit.py create mode 100644 modelscope/models/cv/text_driven_segmentation/model.py create mode 100644 modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py create mode 100644 modelscope/pipelines/cv/text_driven_segmentation_pipleline.py create mode 100644 tests/pipelines/test_text_driven_segmentation.py diff --git a/data/test/images/text_driven_segmentation.jpg b/data/test/images/text_driven_segmentation.jpg new file mode 100644 index 00000000..e3320b1f --- /dev/null +++ b/data/test/images/text_driven_segmentation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c7d2f279e3b317f1d0de18410a0585e122166fa2464c17b88a0c813f6c58bd4 +size 67861 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 
fd653bac..3225710a 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -29,6 +29,7 @@ class Models(object): video_summarization = 'pgl-video-summarization' swinL_semantic_segmentation = 'swinL-semantic-segmentation' vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' + text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' # EasyCV models @@ -143,6 +144,7 @@ class Pipelines(object): video_summarization = 'googlenet_pgl_video_summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_reid_person = 'passvitb-image-reid-person' + text_driven_segmentation = 'text-driven-segmentation' movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' # nlp tasks diff --git a/modelscope/models/cv/text_driven_segmentation/__init__.py b/modelscope/models/cv/text_driven_segmentation/__init__.py new file mode 100644 index 00000000..46daad78 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/__init__.py @@ -0,0 +1 @@ +from .lseg_base import TextDrivenSegmentation diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py new file mode 100644 index 00000000..440cccea --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/clip.py @@ -0,0 +1,170 @@ +""" CLIP +Adapted from https://github.com/openai/CLIP. +Originally MIT License, Copyright (c) 2021 OpenAI. +""" + +import hashlib +import os +import urllib +import warnings +from typing import Any, List, Union + +import torch +from PIL import Image +from pkg_resources import packaging +from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, + ToTensor) +from tqdm import tqdm + +from .model import build_model +from .simple_tokenizer import SimpleTokenizer as _Tokenizer + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + +if packaging.version.parse( + torch.__version__) < packaging.version.parse('1.7.1'): + warnings.warn('PyTorch version 1.7.1 or higher is recommended') +__all__ = ['load', 'tokenize'] + + +def _convert_image_to_rgb(image): + return image.convert('RGB') + + +def _transform(n_px): + return Compose([ + Resize(n_px, interpolation=BICUBIC), + CenterCrop(n_px), + _convert_image_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + + +def load(name: str, + device: Union[str, torch.device] = 'cuda' + if torch.cuda.is_available() else 'cpu', + jit: bool = False, + root: str = None): + + if not jit: + model = build_model().to(device) + if str(device) == 'cpu': + model.float() + return model, _transform(model.visual.input_resolution) + + # patch the device names + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [ + n for n in device_holder.graph.findAllNodes('prim::Constant') + if 'Device' in repr(n) + ][-1] + + def patch_device(module): + try: + graphs = [module.graph] if hasattr(module, 'graph') else [] + except RuntimeError: + graphs = [] + + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('prim::Constant'): + if 'value' in node.attributeNames() and str( + node['value']).startswith('cuda'): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to 
float32 on CPU + if str(device) == 'cpu': + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode('aten::to').inputs())[1] + float_node = float_input.node() + + def patch_float(module): + try: + graphs = [module.graph] if hasattr(module, 'graph') else [] + except RuntimeError: + graphs = [] + + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('aten::to'): + inputs = list(node.inputs()) + for i in [ + 1, 2 + ]: # dtype can be the second or third argument to aten::to() + if inputs[i].node()['value'] == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, _transform(model.input_resolution.item()) + + +def tokenize( + _tokenizer, + texts: Union[str, List[str]], + context_length: int = 77, + truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]: + """ + Returns the tokenized representation of given input string(s) + + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + + context_length : int + The context length to use; all CLIP models use 77 as the context length + + truncate: bool + Whether to truncate the text in case its encoding is longer than the context length + + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]. + We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long. + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder['<|startoftext|>'] + eot_token = _tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] + for text in texts] + if packaging.version.parse( + torch.__version__) < packaging.version.parse('1.8.0'): + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + else: + result = torch.zeros(len(all_tokens), context_length, dtype=torch.int) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + if truncate: + tokens = tokens[:context_length] + tokens[-1] = eot_token + else: + raise RuntimeError( + f'Input {texts[i]} is too long for context length {context_length}' + ) + result[i, :len(tokens)] = torch.tensor(tokens) + + return result diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_base.py b/modelscope/models/cv/text_driven_segmentation/lseg_base.py new file mode 100644 index 00000000..20915396 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_base.py @@ -0,0 +1,28 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. +Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. 
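A minimal usage sketch for the tokenize() helper above; the vocab path below is a placeholder, and only the SOT/EOT padding and the [N, 77] output shape come from the code in this patch:

from modelscope.models.cv.text_driven_segmentation.clip import tokenize
from modelscope.models.cv.text_driven_segmentation.simple_tokenizer import SimpleTokenizer

# placeholder path; the real bpe_simple_vocab_16e6.txt.gz ships with the model files
tokenizer = SimpleTokenizer('/path/to/bpe_simple_vocab_16e6.txt.gz')
tokens = tokenize(tokenizer, ['others', 'a red car'])
print(tokens.shape)  # torch.Size([2, 77]); dtype is int or long depending on the torch version
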
+""" + +import torch +import torch.nn as nn + +from .lseg_net import LSeg + + +class TextDrivenSegmentation(nn.Module): + + def __init__(self, model_dir): + super(TextDrivenSegmentation, self).__init__() + self.net = LSeg(model_dir=model_dir) + self.model_dir = model_dir + + def forward(self, img, txt_list): + b = img.size()[0] + batch_name_list = txt_list + xout_list = [] + for i in range(b): + labelset = ['others', batch_name_list[i]] + xout = self.net(img[i:i + 1], labelset=labelset) + xout_list.append(xout) + score_map = torch.cat(xout_list, dim=0) + return score_map diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py new file mode 100644 index 00000000..cb550ab7 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py @@ -0,0 +1,334 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. +Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. +""" + +import torch +import torch.nn as nn + +from .lseg_vit import _make_pretrained_clip_vitl16_384, forward_vit + + +def _make_encoder( + backbone, + features, + use_pretrained=True, + groups=1, + expand=False, + exportable=True, + hooks=None, + use_vit_only=False, + use_readout='ignore', + enable_attention_hooks=False, +): + if backbone == 'clip_vitl16_384': + clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384( + use_pretrained, + hooks=hooks, + use_readout=use_readout, + enable_attention_hooks=enable_attention_hooks, + ) + scratch = _make_scratch([256, 512, 1024, 1024], + features, + groups=groups, + expand=expand) + else: + raise NotImplementedError(f"Backbone '{backbone}' not implemented") + + return clip_pretrained, pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand is True: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], + out_shape1, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], + out_shape2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], + out_shape3, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], + out_shape4, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + + return scratch + + +class Interpolate(nn.Module): + """Interpolation module.""" + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners, + ) + + return x + + +class ResidualConvUnit(nn.Module): + """Residual convolution module.""" + + def __init__(self, features): + """Init. 
+ + Args: + features (int): number of features + """ + super().__init__() + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + out = self.relu(x) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + return out + x + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block.""" + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.resConfUnit1 = ResidualConvUnit(features) + self.resConfUnit2 = ResidualConvUnit(features) + + def forward(self, *xs): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + output += self.resConfUnit1(xs[1]) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode='bilinear', align_corners=True) + + return output + + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module.""" + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups = 1 + + self.conv1 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + self.conv2 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + if self.bn is True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn is True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn is True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block.""" + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + ): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups = 1 + + self.expand = expand + out_features = features + if self.expand is True: + out_features = features // 2 + + self.out_conv = nn.Conv2d( + features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + groups=1, + ) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, *xs): + """Forward pass. 
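A shape-level sketch of how these fusion blocks are wired into the LSeg decoder later in this patch; the tensors are random and the sizes correspond to a 640x640 input, so treat the numbers as illustrative only:

import torch
import torch.nn as nn

from modelscope.models.cv.text_driven_segmentation.lseg_blocks import FeatureFusionBlock_custom

# coarser decoder path and a same-scale skip connection, both already at 256 channels
fuse = FeatureFusionBlock_custom(256, nn.ReLU(False), bn=True, align_corners=True)
path_4 = torch.randn(1, 256, 40, 40)
layer_3_rn = torch.randn(1, 256, 40, 40)
print(fuse(path_4, layer_3_rn).shape)  # torch.Size([1, 256, 80, 80]) after the 2x upsample
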
+ + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners) + + output = self.out_conv(output) + return output diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_model.py b/modelscope/models/cv/text_driven_segmentation/lseg_model.py new file mode 100644 index 00000000..1d7ebdd1 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_model.py @@ -0,0 +1,107 @@ +import os.path as osp +from typing import Any, Dict + +import json +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.text_driven_segmentation import \ + TextDrivenSegmentation +from modelscope.outputs import OutputKeys +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() +__all__ = ['TextDrivenSeg'] + + +@MODELS.register_module( + Tasks.text_driven_segmentation, + module_name=Models.text_driven_segmentation) +class TextDrivenSeg(TorchModel): + """ text driven segmentation model. + """ + + def __init__(self, model_dir, device_id=0, *args, **kwargs): + super().__init__( + model_dir=model_dir, device_id=device_id, *args, **kwargs) + self.model = TextDrivenSegmentation(model_dir=model_dir) + pretrained_params = torch.load('{}/{}'.format( + model_dir, ModelFile.TORCH_MODEL_BIN_FILE)) + self.model.load_state_dict(pretrained_params) + self.model.eval() + if device_id >= 0 and torch.cuda.is_available(): + self.model.to('cuda:{}'.format(device_id)) + logger.info('Use GPU: {}'.format(device_id)) + else: + device_id = -1 + logger.info('Use CPU for inference') + self.device_id = device_id + + def preprocess(self, img, size=640): + mean = [0.48145466, 0.4578275, 0.40821073] + std = [0.26862954, 0.26130258, 0.27577711] + h, w, c = img.shape + max_hw = max(h, w) + ratio = 1.0 * size / max_hw + crop_h, crop_w = int(ratio * h), int(ratio * w) + pil_img = Image.fromarray(img) + pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR) + np_img = np.array(pil_img, dtype=np.float32) / 255. + for j in range(3): + np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j] + img_pad = np.zeros((size, size, 3), dtype=np.float32) + img_pad[:crop_h, :crop_w] = np_img + img_pad = torch.from_numpy(img_pad).permute(2, 0, + 1).unsqueeze(0).float() + return img_pad, h, w, crop_h, crop_w + + def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w): + output = np.clip(tensors * 255., a_min=0, a_max=255.) 
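# Worked example for preprocess() above (illustrative numbers): a 320x512
# (h x w) uint8 image gives max_hw = 512 and ratio = 640 / 512 = 1.25, so
# crop_h = 400 and crop_w = 640; the image is resized to 400x640, normalized
# with the CLIP mean/std, zero-padded into a 640x640 canvas and returned as a
# (1, 3, 640, 640) float tensor. crop_h/crop_w are kept so that postprocess()
# can drop the padded region before resizing the mask back to ori_h x ori_w.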
+ crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8) + pil_output = Image.fromarray(crop_output) + pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR) + np_output = np.array(pil_output, dtype=np.uint8) + np_output[np_output < 128] = 0 + np_output[np_output >= 128] = 255 + np_output = np.uint8(np_output) + return np_output + + def forward(self, image, text): + """ + image should be numpy array, dtype=np.uint8, shape: height*width*3 + """ + image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess( + image, size=640) + pred = self.inference(image_tensor, text) + msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w) + outputs = {OutputKeys.MASKS: msk} + return outputs + + def inference(self, image, text): + """ + image should be tensor, 1 * 3 * 640 * 640 + """ + with torch.no_grad(): + if self.device_id == -1: + output = self.model(image, [text]) + else: + device = torch.device('cuda', self.device_id) + output = self.model(image.to(device), [text]) + output = F.interpolate(output, size=(640, 640), mode='bilinear') + output = F.softmax(output, dim=1) + output = torch.argmax(output, dim=1) + output = output[0] + if self.device_id == -1: + pred = output.data.numpy() + else: + pred = output.data.cpu().numpy() + del output + return pred diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_net.py b/modelscope/models/cv/text_driven_segmentation/lseg_net.py new file mode 100644 index 00000000..1a558c5c --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_net.py @@ -0,0 +1,197 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. +Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. +""" + +import numpy as np +import torch +import torch.nn as nn + +from . import clip +from .lseg_blocks import (FeatureFusionBlock, FeatureFusionBlock_custom, + Interpolate, _make_encoder, forward_vit) +from .simple_tokenizer import SimpleTokenizer + + +class depthwise_clipseg_conv(nn.Module): + + def __init__(self): + super(depthwise_clipseg_conv, self).__init__() + self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1) + + def depthwise_clipseg(self, x, channels): + x = torch.cat( + [self.depthwise(x[:, i].unsqueeze(1)) for i in range(channels)], + dim=1) + return x + + def forward(self, x): + channels = x.shape[1] + out = self.depthwise_clipseg(x, channels) + return out + + +class depthwise_conv(nn.Module): + + def __init__(self, kernel_size=3, stride=1, padding=1): + super(depthwise_conv, self).__init__() + self.depthwise = nn.Conv2d( + 1, 1, kernel_size=kernel_size, stride=stride, padding=padding) + + def forward(self, x): + # support for 4D tensor with NCHW + C, H, W = x.shape[1:] + x = x.reshape(-1, 1, H, W) + x = self.depthwise(x) + x = x.view(-1, C, H, W) + return x + + +class depthwise_block(nn.Module): + + def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'): + super(depthwise_block, self).__init__() + self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1) + if activation == 'relu': + self.activation = nn.ReLU() + elif activation == 'lrelu': + self.activation = nn.LeakyReLU() + elif activation == 'tanh': + self.activation = nn.Tanh() + + def forward(self, x, act=True): + x = self.depthwise(x) + if act: + x = self.activation(x) + return x + + +class bottleneck_block(nn.Module): + + def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'): + super(bottleneck_block, self).__init__() + self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1) + if activation == 
'relu': + self.activation = nn.ReLU() + elif activation == 'lrelu': + self.activation = nn.LeakyReLU() + elif activation == 'tanh': + self.activation = nn.Tanh() + + def forward(self, x, act=True): + sum_layer = x.max(dim=1, keepdim=True)[0] + x = self.depthwise(x) + x = x + sum_layer + if act: + x = self.activation(x) + return x + + +class BaseModel(torch.nn.Module): + + def load(self, path): + """Load model from file. + Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device('cpu')) + + if 'optimizer' in parameters: + parameters = parameters['model'] + + self.load_state_dict(parameters) + + +def _make_fusion_block(features, use_bn): + return FeatureFusionBlock_custom( + features, + activation=nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + ) + + +class LSeg(BaseModel): + + def __init__( + self, + features=256, + backbone='clip_vitl16_384', + readout='project', + use_bn=True, + model_dir=None, + ): + super(LSeg, self).__init__() + hooks = { + 'clip_vitl16_384': [5, 11, 17, 23], + } + + # Instantiate backbone and reassemble blocks + self.clip_pretrained, self.pretrained, self.scratch = _make_encoder( + backbone, + features, + groups=1, + expand=False, + exportable=False, + hooks=hooks[backbone], + use_readout=readout, + ) + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.logit_scale = nn.Parameter(torch.ones([]) + * np.log(1 / 0.07)).exp() + self.out_c = 512 + self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1) + + self.scratch.output_conv = nn.Sequential( + Interpolate(scale_factor=2, mode='bilinear', align_corners=True), ) + + self.tau = 0.07 + self.model_dir = model_dir + self.tokenizer = SimpleTokenizer(model_dir + + '/bpe_simple_vocab_16e6.txt.gz') + + def forward(self, x, labelset=''): + text = clip.tokenize(self.tokenizer, labelset) + + layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + text = text.to(x.device) + text_features = self.clip_pretrained.encode_text(text) + + image_features = self.scratch.head1(path_1) + + imshape = image_features.shape + image_features = image_features.permute(0, 2, 3, + 1).reshape(-1, self.out_c) + + # normalized features + image_features = image_features / image_features.norm( + dim=-1, keepdim=True) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + + logits_per_image = image_features @ text_features.t() / self.tau + + out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3], + -1).permute(0, 3, 1, 2) + + out = self.scratch.output_conv(out) + + return out diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py new file mode 100644 index 00000000..be2813c2 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py @@ -0,0 +1,543 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. 
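The core of the LSeg head above is a per-pixel cosine similarity between pixel embeddings and the CLIP text embeddings, scaled by 1/tau. A shape-only sketch with random tensors standing in for the real encoders; only the 512-dim embedding width and tau = 0.07 are taken from the code above, the spatial size is just an example:

import torch
import torch.nn.functional as F

B, C, H, W = 1, 512, 320, 320      # example pixel-embedding resolution
K, tau = 2, 0.07                   # 'others' plus one query prompt

image_features = F.normalize(torch.randn(B, C, H, W), dim=1)
text_features = F.normalize(torch.randn(K, C), dim=1)

# (B, C, H, W) x (K, C) -> (B, K, H, W) per-pixel logits
logits = torch.einsum('bchw,kc->bkhw', image_features, text_features) / tau
mask = logits.softmax(dim=1).argmax(dim=1)      # per-pixel label index
print(logits.shape, mask.shape)                 # [1, 2, 320, 320] and [1, 320, 320]
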
+Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. +""" + +import math +import types + +import timm +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from . import clip + +activations = {} + + +def get_activation(name): + + def hook(model, input, output): + activations[name] = output + + return hook + + +attention = {} + + +def get_attention(name): + + def hook(module, input, output): + x = input[0] + B, N, C = x.shape + qkv = ( + module.qkv(x).reshape(B, N, 3, module.num_heads, + C // module.num_heads).permute( + 2, 0, 3, 1, 4)) + q, k, _ = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * module.scale + + attn = attn.softmax(dim=-1) # [:,:,1,1:] + attention[name] = attn + + return hook + + +def get_mean_attention_map(attn, token, shape): + attn = attn[:, :, token, 1:] + attn = attn.unflatten(2, torch.Size([shape[2] // 16, + shape[3] // 16])).float() + attn = torch.nn.functional.interpolate( + attn, size=shape[2:], mode='bicubic', align_corners=False).squeeze(0) + + all_attn = torch.mean(attn, 0) + + return all_attn + + +class Slice(nn.Module): + + def __init__(self, start_index=1): + super(Slice, self).__init__() + self.start_index = start_index + + def forward(self, x): + return x[:, self.start_index:] + + +class AddReadout(nn.Module): + + def __init__(self, start_index=1): + super(AddReadout, self).__init__() + self.start_index = start_index + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + readout = x[:, 0] + return x[:, self.start_index:] + readout.unsqueeze(1) + + +class ProjectReadout(nn.Module): + + def __init__(self, in_features, start_index=1): + super(ProjectReadout, self).__init__() + self.start_index = start_index + + self.project = nn.Sequential( + nn.Linear(2 * in_features, in_features), nn.GELU()) + + def forward(self, x): + readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) + features = torch.cat((x[:, self.start_index:], readout), -1) + + return self.project(features) + + +class Transpose(nn.Module): + + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) + return x + + +def forward_vit(pretrained, x): + b, c, h, w = x.shape + + # encoder + _ = pretrained.model.forward_flex(x) + + layer_1 = pretrained.activations['1'] + layer_2 = pretrained.activations['2'] + layer_3 = pretrained.activations['3'] + layer_4 = pretrained.activations['4'] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size([ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ]), + )) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)]( + layer_1) + layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)]( + layer_2) + layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)]( + layer_3) + layer_4 = 
pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)]( + layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, :self.start_index], + posemb[0, self.start_index:], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, + -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate( + posemb_grid, size=(gs_h, gs_w), mode='bilinear') + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + + pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1], + w // self.patch_size[0]) + + B = x.shape[0] + + if hasattr(self.patch_embed, 'backbone'): + x = self.patch_embed.backbone(x) + if isinstance(x, (list, tuple)): + x = x[ + -1] # last feature if backbone outputs list/tuple of features + x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) + + if getattr(self, 'dist_token', None) is not None: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + dist_token = self.dist_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, dist_token, x), dim=1) + else: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + x = x + pos_embed + x = self.pos_drop(x) + + gradient_checkpoint = False + for blk in self.blocks: + if gradient_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + x = self.norm(x) + + return x + + +def get_readout_oper(vit_features, features, use_readout, start_index=1): + if use_readout == 'ignore': + readout_oper = [Slice(start_index)] * len(features) + elif use_readout == 'add': + readout_oper = [AddReadout(start_index)] * len(features) + elif use_readout == 'project': + readout_oper = [ + ProjectReadout(vit_features, start_index) for out_feat in features + ] + else: + assert ( + False + ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" + + return readout_oper + + +def adapt_input_conv(in_chans, conv_weight): + conv_type = conv_weight.dtype + conv_weight = conv_weight.float( + ) # Some weights are in torch.half, ensure it's float for sum on CPU + O, II, J, K = conv_weight.shape + if in_chans == 1: + if II > 3: + assert conv_weight.shape[1] % 3 == 0 + # For models with space2depth stems + conv_weight = conv_weight.reshape(O, II // 3, 3, J, K) + conv_weight = conv_weight.sum(dim=2, keepdim=False) + else: + conv_weight = conv_weight.sum(dim=1, keepdim=True) + elif in_chans != 3: + if II != 3: + raise NotImplementedError( + 'Weight format not supported by conversion.') + else: + # NOTE this strategy should be better than random init, but there could be other combinations of + # the original RGB input layer weights that'd work better for specific cases. 
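A quick shape check for the adapt_input_conv() defined here, using random weights; the (1024, 3, 16, 16) kernel matches a ViT-L/16 patch embedding and is only an example:

import torch

from modelscope.models.cv.text_driven_segmentation.lseg_vit import adapt_input_conv

rgb_weight = torch.randn(1024, 3, 16, 16)       # O, I, H, W of a ViT-L/16 patch embed
print(adapt_input_conv(1, rgb_weight).shape)    # torch.Size([1024, 1, 16, 16]), RGB kernels summed
print(adapt_input_conv(4, rgb_weight).shape)    # torch.Size([1024, 4, 16, 16]), tiled and rescaled
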
+ repeat = int(math.ceil(in_chans / 3)) + conv_weight = conv_weight.repeat(1, repeat, 1, + 1)[:, :in_chans, :, :] + conv_weight *= (3 / float(in_chans)) + conv_weight = conv_weight.to(conv_type) + return conv_weight + + +@torch.no_grad() +def _load_weights(model, checkpoint_path, prefix=''): + """ Load weights from .npz checkpoints for official Google Brain Flax implementation + """ + import numpy as np + + def _n2p(w, t=True): + if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1: + w = w.flatten() + if t: + if w.ndim == 4: + w = w.transpose([3, 2, 0, 1]) + elif w.ndim == 3: + w = w.transpose([2, 0, 1]) + elif w.ndim == 2: + w = w.transpose([1, 0]) + return torch.from_numpy(w) + + w = np.load(checkpoint_path) + if not prefix and 'opt/target/embedding/kernel' in w: + prefix = 'opt/target/' + + if hasattr(model.patch_embed, 'backbone'): + # hybrid + backbone = model.patch_embed.backbone + stem_only = not hasattr(backbone, 'stem') + stem = backbone if stem_only else backbone.stem + stem.conv.weight.copy_( + adapt_input_conv(stem.conv.weight.shape[1], + _n2p(w[f'{prefix}conv_root/kernel']))) + stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale'])) + stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias'])) + if not stem_only: + for i, stage in enumerate(backbone.stages): + for j, block in enumerate(stage.blocks): + bp = f'{prefix}block{i + 1}/unit{j + 1}/' + for r in range(3): + getattr(block, f'conv{r + 1}').weight.copy_( + _n2p(w[f'{bp}conv{r + 1}/kernel'])) + getattr(block, f'norm{r + 1}').weight.copy_( + _n2p(w[f'{bp}gn{r + 1}/scale'])) + getattr(block, f'norm{r + 1}').bias.copy_( + _n2p(w[f'{bp}gn{r + 1}/bias'])) + if block.downsample is not None: + block.downsample.conv.weight.copy_( + _n2p(w[f'{bp}conv_proj/kernel'])) + block.downsample.norm.weight.copy_( + _n2p(w[f'{bp}gn_proj/scale'])) + block.downsample.norm.bias.copy_( + _n2p(w[f'{bp}gn_proj/bias'])) + embed_conv_w = _n2p(w[f'{prefix}embedding/kernel']) + else: + embed_conv_w = adapt_input_conv(model.patch_embed.proj.weight.shape[1], + _n2p(w[f'{prefix}embedding/kernel'])) + model.patch_embed.proj.weight.copy_(embed_conv_w) + model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias'])) + model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False)) + pos_embed_w = _n2p( + w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False) + if pos_embed_w.shape != model.pos_embed.shape: + pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights + pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens', + 1), + model.patch_embed.grid_size) + model.pos_embed.copy_(pos_embed_w) + model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale'])) + model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias'])) + if isinstance( + model.head, nn.Linear + ) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]: + model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel'])) + model.head.bias.copy_(_n2p(w[f'{prefix}head/bias'])) + # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights + # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w: + # model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel'])) + # model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias'])) + for i, block in enumerate(model.blocks.children()): + block_prefix = f'{prefix}Transformer/encoderblock_{i}/' + mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/' + 
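For orientation, _n2p() above converts Flax-layout arrays to PyTorch layout: 4-D conv kernels go from HWIO to OIHW and 2-D dense kernels are transposed. A tiny standalone illustration of the 4-D case with a dummy array:

import numpy as np
import torch

flax_kernel = np.zeros((16, 16, 3, 1024), dtype=np.float32)            # H, W, I, O
torch_kernel = torch.from_numpy(flax_kernel.transpose([3, 2, 0, 1]))   # same transpose as _n2p
print(torch_kernel.shape)                                              # torch.Size([1024, 3, 16, 16]) -> O, I, H, W
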
block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'])) + block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'])) + block.attn.qkv.weight.copy_( + torch.cat([ + _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T + for n in ('query', 'key', 'value') + ])) + block.attn.qkv.bias.copy_( + torch.cat([ + _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) + for n in ('query', 'key', 'value') + ])) + block.attn.proj.weight.copy_( + _n2p(w[f'{mha_prefix}out/kernel']).flatten(1)) + block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'])) + for r in range(2): + getattr(block.mlp, f'fc{r + 1}').weight.copy_( + _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel'])) + getattr(block.mlp, f'fc{r + 1}').bias.copy_( + _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias'])) + block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale'])) + block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias'])) + + +def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()): + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + ntok_new = posemb_new.shape[1] + if num_prefix_tokens: + posemb_prefix, posemb_grid = posemb[:, :num_prefix_tokens], posemb[ + 0, num_prefix_tokens:] + ntok_new -= num_prefix_tokens + else: + posemb_prefix, posemb_grid = posemb[:, :0], posemb[0] + gs_old = int(math.sqrt(len(posemb_grid))) + if not len(gs_new): # backwards compatibility + gs_new = [int(math.sqrt(ntok_new))] * 2 + assert len(gs_new) >= 2 + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, + -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate( + posemb_grid, size=gs_new, mode='bicubic', align_corners=False) + posemb_grid = posemb_grid.permute(0, 2, 3, + 1).reshape(1, gs_new[0] * gs_new[1], -1) + posemb = torch.cat([posemb_prefix, posemb_grid], dim=1) + return posemb + + +def _make_pretrained_clip_vitl16_384(pretrained, + use_readout='ignore', + hooks=None, + enable_attention_hooks=False): + clip_pretrained, _ = clip.load('ViT-B/32', device='cpu', jit=False) + + # model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) + model = timm.create_model('vit_large_patch16_384', pretrained=False) + hooks = [5, 11, 17, 23] if hooks is None else hooks + pretrained = _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + enable_attention_hooks=enable_attention_hooks, + ) + return clip_pretrained, pretrained + + +def _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout='ignore', + start_index=1, + enable_attention_hooks=False, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook( + get_activation('1')) + pretrained.model.blocks[hooks[1]].register_forward_hook( + get_activation('2')) + pretrained.model.blocks[hooks[2]].register_forward_hook( + get_activation('3')) + pretrained.model.blocks[hooks[3]].register_forward_hook( + get_activation('4')) + + pretrained.activations = activations + + if enable_attention_hooks: + pretrained.model.blocks[hooks[0]].attn.register_forward_hook( + get_attention('attn_1')) + pretrained.model.blocks[hooks[1]].attn.register_forward_hook( + get_attention('attn_2')) + pretrained.model.blocks[hooks[2]].attn.register_forward_hook( + 
get_attention('attn_3')) + pretrained.model.blocks[hooks[3]].attn.register_forward_hook( + get_attention('attn_4')) + pretrained.attention = attention + + readout_oper = get_readout_oper(vit_features, features, use_readout, + start_index) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, + pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model) + + return pretrained diff --git a/modelscope/models/cv/text_driven_segmentation/model.py b/modelscope/models/cv/text_driven_segmentation/model.py new file mode 100644 index 00000000..ece10bab --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/model.py @@ -0,0 +1,458 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. +Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. +""" + +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.relu1 = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu2 = nn.ReLU(inplace=True) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu3 = nn.ReLU(inplace=True) + + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu1(self.bn1(self.conv1(x))) + out = self.relu2(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu3(out) + return out + + +class AttentionPool2d(nn.Module): + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x[:1], + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + return x.squeeze(0) + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim, + heads, + input_resolution=224, + width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.relu3 = nn.ReLU(inplace=True) + self.avgpool = nn.AvgPool2d(2) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, + heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, width, layers, heads, attn_mask=None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor): 
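+        # The blocks are wrapped in nn.Sequential, so a single call applies every ResidualAttentionBlock in order.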
+ return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x1 = self.class_embedding.to(x.dtype) + x2 = torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([x1 + x2, x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(nn.Module): + + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width) + else: + vision_heads = vision_width // 64 + self.visual = VisionTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features**-0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block 
in [ + self.visual.layer1, self.visual.layer2, self.visual.layer3, + self.visual.layer4 + ]: + for name, param in resnet_block.named_parameters(): + if name.endswith('bn3.weight'): + nn.init.zeros_(param) + + proj_std = (self.transformer.width**-0.5) * ( + (2 * self.transformer.layers)**-0.5) + attn_std = self.transformer.width**-0.5 + fc_std = (2 * self.transformer.width)**-0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_( + self.text_projection, std=self.transformer.width**-0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + x = self.token_embedding(text).type(self.dtype) + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1)] @ self.text_projection + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm( + dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logits_per_image.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(ll): + if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Linear)): + ll.weight.data = ll.weight.data.half() + if ll.bias is not None: + ll.bias.data = ll.bias.data.half() + + if isinstance(ll, nn.MultiheadAttention): + for attr in [ + *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], + 'in_proj_bias', 'bias_k', 'bias_v' + ]: + tensor = getattr(ll, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ['text_projection', 'proj']: + if hasattr(ll, name): + attr = getattr(ll, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(): + model = CLIP(512, 224, 12, 768, 32, 77, 49408, 512, 8, 12) + convert_weights(model) + return model.eval() diff --git a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py new file mode 100644 index 00000000..250d680f --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py @@ -0,0 +1,156 @@ +""" CLIP +Adapted from https://github.com/openai/CLIP. +Originally MIT License, Copyright (c) 2021 OpenAI. 
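+
+A minimal usage sketch (assumes the bundled bpe_simple_vocab_16e6.txt.gz sits next to this file,
+as default_bpe() below expects):
+    tokenizer = SimpleTokenizer()
+    token_ids = tokenizer.encode('a photo of a dog')
+    text = tokenizer.decode(token_ids)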
+""" + +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + error_list = [] + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except Exception as err: + new_word.extend(word[i:]) + error_list.append(err) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def 
encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 7d6cdb59..6fada2b0 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -243,6 +243,13 @@ TASK_OUTPUTS = { # "output_img": np.ndarray with shape [height, width, 3] # } Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG], + # text driven segmentation result for single sample + # { + # "masks": [ + # np.array # 2D array containing only 0, 255 + # ] + # } + Tasks.text_driven_segmentation: [OutputKeys.MASKS], # movide scene segmentation result for a single video # { diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index c9f0c252..40c237c8 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -149,6 +149,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vitb_video-single-object-tracking_ostrack'), Tasks.image_reid_person: (Pipelines.image_reid_person, 'damo/cv_passvitb_image-reid-person_market'), + Tasks.text_driven_segmentation: + (Pipelines.text_driven_segmentation, + 'damo/cv_vitl16_segmentation_text-driven-seg'), Tasks.movie_scene_segmentation: (Pipelines.movie_scene_segmentation, 'damo/cv_resnet50-bert_video-scene-segmentation_movienet') diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index f4e6792b..c8cb0c6a 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -44,6 +44,7 @@ if TYPE_CHECKING: from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline + from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline else: @@ -97,6 +98,8 @@ else: 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], 'easycv_pipeline': ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'], + 'text_driven_segmentation_pipeline': + ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': ['MovieSceneSegmentationPipeline'], } diff --git a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py new file mode 100644 index 00000000..0985b835 --- /dev/null +++ b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py @@ -0,0 +1,51 @@ +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.text_driven_segmentation, + module_name=Pipelines.text_driven_segmentation) +class TextDrivenSegmentationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, auto_collate=False, **kwargs) + + def preprocess(self, input: Dict) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input['image']) + img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img) + result = { + 'img': img_tensor, + 'ori_h': ori_h, + 'ori_w': ori_w, + 'crop_h': crop_h, + 'crop_w': crop_w, + 'text': input['text'], + } + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + outputs = self.model.inference(input['img'], input['text']) + result = { + 'data': outputs, + 'ori_h': input['ori_h'], + 'ori_w': input['ori_w'], + 'crop_h': input['crop_h'], + 'crop_w': input['crop_w'], + } + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + data = self.model.postprocess(inputs['data'], inputs['crop_h'], + inputs['crop_w'], inputs['ori_h'], + inputs['ori_w']) + outputs = {OutputKeys.MASKS: data} + return outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2265ef5a..ed1ec798 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -36,6 +36,7 @@ class CVTasks(object): image_segmentation = 'image-segmentation' portrait_matting = 'portrait-matting' + text_driven_segmentation = 'text-driven-segmentation' # image editing skin_retouching = 'skin-retouching' diff --git a/tests/pipelines/test_text_driven_segmentation.py b/tests/pipelines/test_text_driven_segmentation.py new file mode 100644 index 00000000..741787d9 --- /dev/null +++ b/tests/pipelines/test_text_driven_segmentation.py @@ -0,0 +1,28 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class TextDrivenSegmentationTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_text_driven_segmentation(self): + input_location = 'data/test/images/text_driven_segmentation.jpg' + test_input = { + 'image': input_location, + 'text': 'bear', + } + model_id = 'damo/cv_vitl16_segmentation_text-driven-seg' + shop_seg = pipeline(Tasks.text_driven_segmentation, model=model_id) + result = shop_seg(test_input) + import cv2 + # result[OutputKeys.MASKS] is segment map result,other keys are not used + cv2.imwrite(input_location + '_lseg.jpg', result[OutputKeys.MASKS]) + + +if __name__ == '__main__': + unittest.main() From 5a2634610a3e1efca692327ab31988313574156d Mon Sep 17 00:00:00 2001 From: "suluyan.sly" Date: Fri, 2 Sep 2022 20:03:19 +0800 Subject: [PATCH 09/28] [to #42322933]skip sbert_en&bert_ch to save ci time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ![](https://cn-hangzhou.oss-cdn.aliyun-inc.com/git/force/uploads/comment/251924/40165669611078357/image.png) fill mask pipeline 测试时间过长 这个task测了4个模型。从保证代码正确性的功能角度看,只测一个bert类(比如sbert中文),一个roberta类(veco)。减少测试的模型数量以减少测试时长。 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006556 * skip sbert_en&bert_ch to save ci time --- tests/pipelines/test_fill_mask.py | 38 ++----------------------------- 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 1b709e27..6b37f6df 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -43,7 +43,7 @@ class FillMaskTest(unittest.TestCase): 
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): # sbert - for language in ['zh', 'en']: + for language in ['zh']: model_dir = snapshot_download(self.model_id_sbert[language]) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) @@ -74,24 +74,10 @@ class FillMaskTest(unittest.TestCase): f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n' ) - # zh bert - language = 'zh' - model_dir = snapshot_download(self.model_id_bert) - preprocessor = FillMaskPreprocessor( - model_dir, first_sequence='sentence', second_sequence=None) - model = BertForMaskedLM.from_pretrained(model_dir) - pipeline1 = FillMaskPipeline(model, preprocessor) - pipeline2 = pipeline( - Tasks.fill_mask, model=model, preprocessor=preprocessor) - ori_text = self.ori_texts[language] - test_input = self.test_inputs[language] - print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' - f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): # sbert - for language in ['zh', 'en']: + for language in ['zh']: print(self.model_id_sbert[language]) model = Model.from_pretrained(self.model_id_sbert[language]) preprocessor = FillMaskPreprocessor( @@ -121,20 +107,6 @@ class FillMaskTest(unittest.TestCase): f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') - # zh bert - model = Model.from_pretrained(self.model_id_bert) - preprocessor = FillMaskPreprocessor( - model.model_dir, first_sequence='sentence', second_sequence=None) - pipeline_ins = pipeline( - Tasks.fill_mask, model=model, preprocessor=preprocessor) - language = 'zh' - ori_text = self.ori_texts[language] - test_input = self.test_inputs[language] - with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'fill_mask_bert_zh'): - print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' - f'{pipeline_ins(test_input)}\n') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): # veco @@ -153,12 +125,6 @@ class FillMaskTest(unittest.TestCase): f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') - # bert - pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert) - print( - f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' - f'{pipeline_ins(self.test_inputs[language])}\n') - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.fill_mask) From 4073376f512af16fb62814bade1482d2deb55236 Mon Sep 17 00:00:00 2001 From: "shouzhou.bx" Date: Fri, 2 Sep 2022 20:53:29 +0800 Subject: [PATCH 10/28] [to #42322933]add face 2d keypoints by EasyCV Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9934673 * add face 2d keypoints --- .../test_img_face_2d_keypoints.png | 3 ++ modelscope/metainfo.py | 3 ++ modelscope/models/cv/__init__.py | 6 +-- .../models/cv/face_2d_keypoints/__init__.py | 20 +++++++++ .../face_2d_keypoints_align.py | 16 ++++++++ .../cv/face_2d_keypoins/__init__.py | 20 +++++++++ .../face_2d_keypoints_dataset.py | 13 ++++++ modelscope/outputs.py | 9 ++++ modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 8 ++-- 
.../pipelines/cv/easycv_pipelines/__init__.py | 4 +- .../face_2d_keypoints_pipeline.py | 41 +++++++++++++++++++ modelscope/utils/constant.py | 1 + tests/pipelines/test_face_2d_keypoints.py | 36 ++++++++++++++++ 14 files changed, 175 insertions(+), 7 deletions(-) create mode 100644 data/test/images/keypoints_detect/test_img_face_2d_keypoints.png create mode 100644 modelscope/models/cv/face_2d_keypoints/__init__.py create mode 100644 modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py create mode 100644 modelscope/msdatasets/cv/face_2d_keypoins/__init__.py create mode 100644 modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py create mode 100644 modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py create mode 100644 tests/pipelines/test_face_2d_keypoints.py diff --git a/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png b/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png new file mode 100644 index 00000000..00311c33 --- /dev/null +++ b/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331ead75033fa2f01f6be72a2f8e34d581fcb593308067815d4bb136bb13b766 +size 54390 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 3225710a..06b5a476 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -24,6 +24,7 @@ class Models(object): body_2d_keypoints = 'body-2d-keypoints' body_3d_keypoints = 'body-3d-keypoints' crowd_counting = 'HRNetCrowdCounting' + face_2d_keypoints = 'face-2d-keypoints' panoptic_segmentation = 'swinL-panoptic-segmentation' image_reid_person = 'passvitb' video_summarization = 'pgl-video-summarization' @@ -112,6 +113,7 @@ class Pipelines(object): object_detection = 'vit-object-detection' easycv_detection = 'easycv-detection' easycv_segmentation = 'easycv-segmentation' + face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment' salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' @@ -353,6 +355,7 @@ class Datasets(object): """ Names for different datasets. """ ClsDataset = 'ClsDataset' + Face2dKeypointsDataset = 'Face2dKeypointsDataset' SegDataset = 'SegDataset' DetDataset = 'DetDataset' DetImagesMixDataset = 'DetImagesMixDataset' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index 331f23bd..4db43d17 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -3,9 +3,9 @@ # yapf: disable from . import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, - crowd_counting, face_detection, face_generation, - image_classification, image_color_enhance, image_colorization, - image_denoise, image_instance_segmentation, + crowd_counting, face_2d_keypoints, face_detection, + face_generation, image_classification, image_color_enhance, + image_colorization, image_denoise, image_instance_segmentation, image_panoptic_segmentation, image_portrait_enhancement, image_reid_person, image_semantic_segmentation, image_to_image_generation, image_to_image_translation, diff --git a/modelscope/models/cv/face_2d_keypoints/__init__.py b/modelscope/models/cv/face_2d_keypoints/__init__.py new file mode 100644 index 00000000..636ba0f4 --- /dev/null +++ b/modelscope/models/cv/face_2d_keypoints/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
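+# Lazy-import wrapper: Face2DKeypoints (and its easycv dependency) is only resolved
+# when the attribute is first accessed. A typical way to exercise this model end to end,
+# mirroring tests/pipelines/test_face_2d_keypoints.py later in this patch:
+#   face_kps = pipeline(Tasks.face_2d_keypoints,
+#                       model='damo/cv_mobilenet_face-2d-keypoints_alignment')
+#   result = face_kps('data/test/images/keypoints_detect/test_img_face_2d_keypoints.png')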
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .face_2d_keypoints_align import Face2DKeypoints + +else: + _import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py b/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py new file mode 100644 index 00000000..468662a0 --- /dev/null +++ b/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.models.face.face_keypoint import FaceKeypoint + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints) +class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + FaceKeypoint.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py b/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py new file mode 100644 index 00000000..e9d76b7e --- /dev/null +++ b/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .face_2d_keypoints_dataset import FaceKeypointDataset + +else: + _import_structure = {'face_2d_keypoints_dataset': ['FaceKeypointDataset']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py new file mode 100644 index 00000000..a902999d --- /dev/null +++ b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py @@ -0,0 +1,13 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
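+# Registers EasyCV's FaceKeypointDataset in the ModelScope TASK_DATASETS registry
+# (group Tasks.face_2d_keypoints, name Datasets.Face2dKeypointsDataset) so trainers
+# can build it from a configuration file.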
+from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.face_2d_keypoints, + module_name=Datasets.Face2dKeypointsDataset) +class FaceKeypointDataset(_FaceKeypointDataset): + """EasyCV dataset for face 2d keypoints.""" diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 6fada2b0..e84c8dcc 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -57,6 +57,15 @@ TASK_OUTPUTS = { # } Tasks.ocr_recognition: [OutputKeys.TEXT], + # face 2d keypoint result for single sample + # { + # "keypoints": [ + # [x1, y1]*106 + # ], + # "poses": [pitch, roll, yaw] + # } + Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES], + # face detection result for single sample # { # "scores": [0.9, 0.1, 0.05, 0.05] diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 40c237c8..f43d152b 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -103,6 +103,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet_facedetection_scrfd10gkps'), Tasks.face_recognition: (Pipelines.face_recognition, 'damo/cv_ir101_facerecognition_cfglint'), + Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints, + 'damo/cv_mobilenet_face-2d-keypoints_alignment'), Tasks.video_multi_modal_embedding: (Pipelines.video_multi_modal_embedding, 'damo/multi_modal_clip_vtretrival_msrvtt_53'), diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index c8cb0c6a..9e7d80ee 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -43,7 +43,7 @@ if TYPE_CHECKING: from .tinynas_classification_pipeline import TinynasClassificationPipeline from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline - from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline + from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline @@ -96,8 +96,10 @@ else: 'tinynas_classification_pipeline': ['TinynasClassificationPipeline'], 'video_category_pipeline': ['VideoCategoryPipeline'], 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], - 'easycv_pipeline': - ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'], + 'easycv_pipeline': [ + 'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline', + 'Face2DKeypointsPipeline' + ], 'text_driven_segmentation_pipeline': ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py index 0984ff43..4f149130 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/__init__.py +++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .detection_pipeline import EasyCVDetectionPipeline from .segmentation_pipeline import EasyCVSegmentationPipeline + from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline else: _import_structure = { 'detection_pipeline': ['EasyCVDetectionPipeline'], - 'segmentation_pipeline': 
['EasyCVSegmentationPipeline'] + 'segmentation_pipeline': ['EasyCVSegmentationPipeline'], + 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'] } import sys diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py new file mode 100644 index 00000000..eb4d6c15 --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -0,0 +1,41 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from .base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints) +class Face2DKeypointsPipeline(EasyCVPipeline): + """Pipeline for face 2d keypoints detection.""" + + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. + """ + + super(Face2DKeypointsPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) + + def show_result(self, img, points, scale=2, save_path=None): + return self.predict_op.show_result(img, points, scale, save_path) + + def __call__(self, inputs) -> Any: + output = self.predict_op(inputs)[0][0] + points = output['point'] + poses = output['pose'] + + return {OutputKeys.KEYPOINTS: points, OutputKeys.POSES: poses} diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index ed1ec798..86808ea1 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -20,6 +20,7 @@ class CVTasks(object): animal_recognition = 'animal-recognition' face_detection = 'face-detection' face_recognition = 'face-recognition' + face_2d_keypoints = 'face-2d-keypoints' human_detection = 'human-detection' human_object_interaction = 'human-object-interaction' face_image_generation = 'face-image-generation' diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py new file mode 100644 index 00000000..a5e347e8 --- /dev/null +++ b/tests/pipelines/test_face_2d_keypoints.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
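+# Smoke test for the face-2d-keypoints pipeline: it should return 106 (x, y)
+# keypoints plus a 3-element (pitch, roll, yaw) pose for the sample image.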
+import unittest + +import cv2 + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_face_2d_keypoints(self): + img_path = 'data/test/images/keypoints_detect/test_img_face_2d_keypoints.png' + model_id = 'damo/cv_mobilenet_face-2d-keypoints_alignment' + + face_2d_keypoints_align = pipeline( + task=Tasks.face_2d_keypoints, model=model_id) + output = face_2d_keypoints_align(img_path) + + output_keypoints = output[OutputKeys.KEYPOINTS] + output_pose = output[OutputKeys.POSES] + + img = cv2.imread(img_path) + img = face_2d_keypoints_align.show_result( + img, output_keypoints, scale=2, save_path='face_keypoints.jpg') + + self.assertEqual(output_keypoints.shape[0], 106) + self.assertEqual(output_keypoints.shape[1], 2) + self.assertEqual(output_pose.shape[0], 3) + + +if __name__ == '__main__': + unittest.main() From 00487aa6e1ca1b7ac50b5ca90b3290f2a6068d77 Mon Sep 17 00:00:00 2001 From: "xixing.tj" Date: Sat, 3 Sep 2022 11:38:07 +0800 Subject: [PATCH 11/28] [to #42322933]add error msg when no text detected for ocr_detection task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ocr_detection加上当图片中没有文字时报错的error msg Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10001490 --- modelscope/pipelines/cv/ocr_detection_pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py index 62248714..b73f65a4 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -149,6 +149,8 @@ class OCRDetectionPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: rboxes = inputs['combined_rboxes'][0] count = inputs['combined_counts'][0] + if count == 0 or count < rboxes.shape[0]: + raise Exception('modelscope error: No text detected') rboxes = rboxes[:count, :] # convert rboxes to polygons and find its coordinates on the original image From 4f72134adf6f6154e5eb02602b33f2066426dbe4 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Sat, 3 Sep 2022 11:50:01 +0800 Subject: [PATCH 12/28] [to #42322933]update test video for movie scene segmentation Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10007852 * update test video for movie scene segmentation --- data/test/videos/movie_scene_segmentation_test_video.mp4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/test/videos/movie_scene_segmentation_test_video.mp4 b/data/test/videos/movie_scene_segmentation_test_video.mp4 index ee6ed528..21ea3cb1 100644 --- a/data/test/videos/movie_scene_segmentation_test_video.mp4 +++ b/data/test/videos/movie_scene_segmentation_test_video.mp4 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f -size 126815483 +oid sha256:03002807dc2aa180c3ae104e764c7a4d6c421d186a5d552f97d338467ae6c443 +size 12722029 From ba74cdf97e8944e724b78cdfaf43f2de0fed721b Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sat, 3 Sep 2022 12:10:16 +0800 Subject: [PATCH 13/28] [to #43878347] Rename runtime.txt to framework.txt Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10000642 * rename runtime.txt 
to framework.txt
---
 .readthedocs.yaml | 2 +-
 docker/Dockerfile.ubuntu | 2 +-
 requirements.txt | 2 +-
 requirements/{runtime.txt => framework.txt} | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename requirements/{runtime.txt => framework.txt} (100%)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index b88d734a..f7b9c7ea 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -25,4 +25,4 @@ python:
   install:
   - requirements: requirements/docs.txt
   - requirements: requirements/readthedocs.txt
-  - requirements: requirements/runtime.txt
+  - requirements: requirements/framework.txt
diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index 97881007..78da0b6f 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -64,7 +64,7 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
 # install modelscope
 COPY requirements /var/modelscope
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
+    pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
diff --git a/requirements.txt b/requirements.txt
index c6e294ba..0832e6ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
--r requirements/runtime.txt
+-r requirements/framework.txt
diff --git a/requirements/runtime.txt b/requirements/framework.txt
similarity index 100%
rename from requirements/runtime.txt
rename to requirements/framework.txt

From 39a309b6554070e68741a36593211ab47910a293 Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Sat, 3 Sep 2022 12:18:29 +0800
Subject: [PATCH 14/28] [to #42322933] reduce train epoch from 3 to 2

---
 tests/trainers/test_finetune_mplug.py | 2 +-
 tests/trainers/test_finetune_token_classificatin.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 351600c6..b46dbf45 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -35,7 +35,7 @@ class TestFinetuneMPlug(unittest.TestCase):
             }).rename_column('image:FILE',
                              'image').rename_column('answer:Value',
                                                     'answer'))
-        self.max_epochs = 3
+        self.max_epochs = 2

     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py
index c34410be..9bdab9b7 100644
--- a/tests/trainers/test_finetune_token_classificatin.py
+++ b/tests/trainers/test_finetune_token_classificatin.py
@@ -92,7 +92,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
             }
         }
         cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
-        cfg.train.max_epochs = 3
+        cfg.train.max_epochs = 2
         cfg.train.lr_scheduler = {
             'type': 'LinearLR',
             'start_factor': 1.0,

From 04516276265f27996b2ffb293f3ef6315055d0d7 Mon Sep 17 00:00:00 2001
From: "xingguang.zxg"
Date: Sat, 3 Sep 2022 13:21:31 +0800
Subject: =?UTF-8?q?[to=20#42322933]=E5=95=86=E5=93=81?=
 =?UTF-8?q?=E6=98=BE=E8=91=97=E6=80=A7=E5=88=86=E5=89=B2v1.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 商品显著性检测模型,依赖opencv,mmcv-full Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9909897 --- data/test/images/shop_segmentation.jpg | 3 + modelscope/metainfo.py | 2 + modelscope/models/cv/__init__.py | 2 +- .../models/cv/shop_segmentation/__init__.py | 1 + .../models/cv/shop_segmentation/common.py | 59 ++ .../models/cv/shop_segmentation/head_fpn.py | 122 +++ .../models/cv/shop_segmentation/models.py | 901 ++++++++++++++++++ .../models/cv/shop_segmentation/neck_fpn.py | 217 +++++ .../cv/shop_segmentation/shop_seg_base.py | 157 +++ .../cv/shop_segmentation/shop_seg_model.py | 115 +++ .../models/cv/shop_segmentation/utils.py | 199 ++++ modelscope/outputs.py | 8 +- modelscope/pipelines/builder.py | 4 +- modelscope/pipelines/cv/__init__.py | 3 +- .../cv/shop_segmentation_pipleline.py | 51 + modelscope/utils/constant.py | 1 + tests/pipelines/test_shop_segmentation.py | 24 + 17 files changed, 1865 insertions(+), 4 deletions(-) create mode 100644 data/test/images/shop_segmentation.jpg create mode 100644 modelscope/models/cv/shop_segmentation/__init__.py create mode 100644 modelscope/models/cv/shop_segmentation/common.py create mode 100644 modelscope/models/cv/shop_segmentation/head_fpn.py create mode 100644 modelscope/models/cv/shop_segmentation/models.py create mode 100644 modelscope/models/cv/shop_segmentation/neck_fpn.py create mode 100644 modelscope/models/cv/shop_segmentation/shop_seg_base.py create mode 100644 modelscope/models/cv/shop_segmentation/shop_seg_model.py create mode 100644 modelscope/models/cv/shop_segmentation/utils.py create mode 100644 modelscope/pipelines/cv/shop_segmentation_pipleline.py create mode 100644 tests/pipelines/test_shop_segmentation.py diff --git a/data/test/images/shop_segmentation.jpg b/data/test/images/shop_segmentation.jpg new file mode 100644 index 00000000..ec02881d --- /dev/null +++ b/data/test/images/shop_segmentation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ecc371c8b0ca09d0e11df89bc549000937eafc451929586426fe657ade25a0 +size 238607 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 06b5a476..b1bf9600 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -32,6 +32,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + shop_segmentation = 'shop-segmentation' # EasyCV models yolox = 'YOLOX' @@ -148,6 +149,7 @@ class Pipelines(object): image_reid_person = 'passvitb-image-reid-person' text_driven_segmentation = 'text-driven-segmentation' movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' + shop_segmentation = 'shop-segmentation' # nlp tasks sentence_similarity = 'sentence-similarity' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index 4db43d17..f2798b59 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -11,7 +11,7 @@ from . 
import (action_recognition, animal_recognition, body_2d_keypoints, image_to_image_generation, image_to_image_translation, movie_scene_segmentation, object_detection, product_retrieval_embedding, realtime_object_detection, - salient_detection, super_resolution, + salient_detection, shop_segmentation, super_resolution, video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/shop_segmentation/__init__.py b/modelscope/models/cv/shop_segmentation/__init__.py new file mode 100644 index 00000000..b40a0760 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/__init__.py @@ -0,0 +1 @@ +from .shop_seg_base import SHOPSEG diff --git a/modelscope/models/cv/shop_segmentation/common.py b/modelscope/models/cv/shop_segmentation/common.py new file mode 100644 index 00000000..00ba9996 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/common.py @@ -0,0 +1,59 @@ +""" +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. +""" + +import warnings + +import torch.nn as nn +import torch.nn.functional as F + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > input_w: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + return F.interpolate(input, size, scale_factor, mode, align_corners) + + +class Upsample(nn.Module): + + def __init__(self, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + super(Upsample, self).__init__() + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + if not self.size: + size = [int(t * self.scale_factor) for t in x.shape[-2:]] + else: + size = self.size + return resize(x, size, None, self.mode, self.align_corners) diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py new file mode 100644 index 00000000..b3faa9b8 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/head_fpn.py @@ -0,0 +1,122 @@ +""" FPNHead +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. 
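+This module provides the Semantic-FPN style decode head (FPNHead) used by the shop segmentation model.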
+""" + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from timm.models.layers import drop, drop_path, trunc_normal_ + +from .common import Upsample, resize + + +class FPNHead(nn.Module): + """Panoptic Feature Pyramid Networks. + This head is the implementation of `Semantic FPN + `_. + Args: + feature_strides (tuple[int]): The strides for input feature maps. + stack_lateral. All strides suppose to be power of 2. The first + one is of largest resolution. + """ + + def __init__(self, + channels, + num_classes, + dropout_ratio=0.1, + feature_strides=[4, 8, 16, 32], + align_corners=False, + **kwargs): + super(FPNHead, self).__init__() + self.act_cfg = dict(type='ReLU') + self.channels = channels + self.conv_cfg = None + self.norm_cfg = None + self.norm_cfg = dict(type='BN2d', requires_grad=True) + self.align_corners = align_corners + self.dropout_ratio = dropout_ratio + self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + self.in_index = [0, 1, 2, 3] + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + self.scale_heads = nn.ModuleList() + for i in range(len(feature_strides)): + head_length = max( + 1, + int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) + scale_head = [] + for k in range(head_length): + scale_head.append( + ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + if feature_strides[i] != feature_strides[0]: + scale_head.append( + Upsample( + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + self.apply(self._init_weights) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. 
+ + Returns: + Tensor: The transformed inputs + """ + inputs = [inputs[i] for i in self.in_index] + return inputs + + def cls_seg(self, feat): + """Classify each pixel.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + def forward(self, inputs): + x = self._transform_inputs(inputs) + output = self.scale_heads[0](x[0]) + for i in range(1, len(self.feature_strides)): + # non inplace + output = output + resize( + self.scale_heads[i](x[i]), + size=output.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + output = self.cls_seg(output) + return output + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + if m.bias is not None: + nn.init.constant_(m.bias.data, 0) diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py new file mode 100644 index 00000000..8b82d1d1 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -0,0 +1,901 @@ +""" +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. +""" + +import math +from collections import OrderedDict + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import drop, drop_path, trunc_normal_ +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + self.embed_dim = embed_dim + self.spacial_dim = spacial_dim + + def forward(self, x): + B, C, H, W = x.shape + x = x.reshape(x.shape[0], x.shape[1], + x.shape[2] * x.shape[3]).permute(2, 0, + 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + + cls_pos = self.positional_embedding[0:1, :] + spatial_pos = F.interpolate( + self.positional_embedding[1:, ].reshape(1, self.spacial_dim, + self.spacial_dim, + self.embed_dim).permute( + 0, 3, 1, 2), + size=(H, W), + mode='bilinear') + spatial_pos = spatial_pos.reshape(self.embed_dim, H * W).permute(1, 0) + positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) + + x = x + positional_embedding[:, None, :] + x, _ = F.multi_head_attention_forward( + query=x, + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + + x = x.permute(1, 2, 0) + global_feat = x[:, :, 0] + feature_map = x[:, :, 1:].reshape(B, -1, H, W) + return global_feat, feature_map + + +class CLIPResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim=512, + input_resolution=224, + width=64, + pretrained=None, + **kwargs): + super().__init__() + self.pretrained = pretrained + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('visual.'): + new_k = k.replace('visual.', '') + state_dict[new_k] = checkpoint[k] + + u, w = self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in CLIPResNet') + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), + (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + + outs = [] + x = self.layer1(x) + outs.append(x) + x = self.layer2(x) + outs.append(x) + x = self.layer3(x) + outs.append(x) + x = self.layer4(x) + outs.append(x) + + return tuple(outs) + + +class CLIPResNetWithAttention(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim=1024, + input_resolution=224, + width=64, + pretrained=None, + **kwargs): + super().__init__() + self.pretrained = pretrained + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32, + output_dim) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('visual.'): + new_k = k.replace('visual.', '') + state_dict[new_k] = checkpoint[k] + + if 'positional_embedding' in new_k: + if self.attnpool.positional_embedding.shape != state_dict[ + new_k].shape: + print( + f'Resize the pos_embed shape from {state_dict[new_k].shape}' + f' to {self.attnpool.positional_embedding.shape}' + ) + cls_pos = state_dict[new_k][0:1, :] + H = W = self.input_resolution // 32 + old_h = int( + math.sqrt(state_dict[new_k][1:, ].shape[0])) + spatial_pos = F.interpolate( + state_dict[new_k][1:, ].reshape( + 1, old_h, old_h, + cls_pos.shape[1]).permute(0, 3, 1, 2), + size=(H, W), + mode='bilinear') + spatial_pos = spatial_pos.reshape( + cls_pos.shape[1], H * W).permute(1, 0) + positional_embedding = torch.cat( + [cls_pos, spatial_pos], dim=0) + state_dict[new_k] = positional_embedding + assert self.attnpool.positional_embedding.shape == state_dict[ + new_k].shape + + u, w = self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in CLIPResNet') + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), + (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + + outs = [] + x = self.layer1(x) + outs.append(x) + x = self.layer2(x) + outs.append(x) + x = self.layer3(x) + outs.append(x) + x = self.layer4(x) + outs.append(x) + + x_global, x_local = self.attnpool(x) + outs.append([x_global, x_local]) + + return tuple(outs) + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle 
fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None, + drop_path=0.): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.drop_path(self.attention(self.ln_1(x))) + x = x + self.drop_path(self.mlp(self.ln_2(x))) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + drop_path_rate=0.): + super().__init__() + self.width = width + self.layers = layers + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, layers) + ] # stochastic depth decay rule + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask, dpr[i]) + for i in range(layers) + ]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.q_proj = nn.Linear(dim, dim, bias=qkv_bias) + self.k_proj = nn.Linear(dim, dim, bias=qkv_bias) + self.v_proj = nn.Linear(dim, dim, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, q, k, v): + B, N, C = q.shape + assert k.shape == v.shape + B, M, C = k.shape + q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads) + k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads) + v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads) + + attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale + + attn = attn.softmax(dim=-1) + + x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class TransformerDecoderLayer(nn.Module): + + def __init__( + self, + d_model, + nhead, + dropout=0.1, + ): + super().__init__() + self.self_attn = Attention(d_model, nhead, proj_drop=dropout) + self.cross_attn = Attention(d_model, nhead, proj_drop=dropout) 
+ + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.mlp = nn.Sequential( + nn.Linear(d_model, d_model * 4), nn.GELU(), nn.Dropout(dropout), + nn.Linear(d_model * 4, d_model)) + + def forward(self, x, mem): + q = k = v = self.norm1(x) + x = x + self.self_attn(q, k, v) + q = self.norm2(x) + x = x + self.cross_attn(q, mem, mem) + x = x + self.dropout(self.mlp(self.norm3(x))) + return x + + +class CLIPVisionTransformer(nn.Module): + + def __init__(self, + input_resolution=224, + patch_size=32, + width=768, + layers=12, + heads=12, + output_dim=512, + drop_path_rate=0.0, + out_indices=[3, 5, 7, 11], + pretrained=None, + get_embeddings=False, + **kwargs): + super().__init__() + self.pretrained = pretrained + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.spatial_size = input_resolution // patch_size + self.ln_pre = LayerNorm(width) + self.get_embeddings = get_embeddings + + self.transformer = Transformer( + width, layers, heads, drop_path_rate=drop_path_rate) + + self.out_indices = out_indices + + if get_embeddings: + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + embed_dim = width + + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + nn.SyncBatchNorm(embed_dim), + nn.GELU(), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn3 = nn.GroupNorm(1, embed_dim) + + self.fpn4 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.MaxPool2d(kernel_size=2, stride=2)) + + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.GroupNorm(1, embed_dim) + + self.fpn3 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + + self.fpn4 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.MaxPool2d(kernel_size=4, stride=4), + ) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('visual.'): + new_k = k.replace('visual.', '') + state_dict[new_k] = checkpoint[k] + + if 'positional_embedding' in state_dict.keys(): + if self.positional_embedding.shape != state_dict[ + 'positional_embedding'].shape: + print( + f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to' + f' {self.positional_embedding.shape}') + cls_pos = state_dict['positional_embedding'][0:1, :] + spatial_pos = F.interpolate( + state_dict['positional_embedding'][1:, ].reshape( + 1, 14, 14, 768).permute(0, 3, 1, 2), + size=(self.spatial_size, self.spatial_size), + mode='bilinear') + spatial_pos = spatial_pos.reshape( + 768, + self.spatial_size * 
self.spatial_size).permute(1, 0) + positional_embedding = torch.cat([cls_pos, spatial_pos], + dim=0) + state_dict['positional_embedding'] = positional_embedding + assert self.positional_embedding.shape == state_dict[ + 'positional_embedding'].shape + + u, w = self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in vision transformer') + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + B, C, H, W = x.shape + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x1 = self.class_embedding.to(x.dtype) + x2 = torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([x1 + x2, x], dim=1) + pos = self.positional_embedding.to(x.dtype) + cls_pos = pos[0, :] + self.class_embedding.to(x.dtype) + spatial_pos = F.interpolate( + pos[1:, ].reshape(1, self.spatial_size, self.spatial_size, + C).permute(0, 3, 1, 2), + size=(H, W), + mode='bilinear') + spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1) + pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1) + x = x + pos + x = self.ln_pre(x) + x = x.permute(1, 0, 2) # NLD -> LND + + gradientcheckpoint = False + + features = [] + for i, blk in enumerate(self.transformer.resblocks): + if gradientcheckpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + if i in self.out_indices: + xp = x.permute(1, 0, 2)[:, + 1:, :].permute(0, 2, + 1).reshape(B, -1, H, W) + features.append(xp.contiguous()) + + ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] + for i in range(len(features)): + features[i] = ops[i](features[i]) + + if self.get_embeddings: + x = x.permute(1, 0, 2) + x = self.ln_post(x) + x = x @ self.proj + + global_embedding = x[:, 0] + visual_embedding = x[:, 1:].reshape(B, H, W, + -1).permute(0, 3, 1, + 2) # B C H W + + features.append([global_embedding, visual_embedding]) + + return tuple(features) + + +class CLIPTextEncoder(nn.Module): + + def __init__(self, + context_length=77, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12, + embed_dim=1024, + out_dim=256, + pretrained=None, + **kwargs): + super().__init__() + + self.pretrained = pretrained + + self.context_length = context_length + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('transformer.'): + state_dict[k] = checkpoint[k] + + if k == 'positional_embedding' or k == 'text_projection' or k.startswith( + 'token_embedding') or k.startswith('ln_final'): + if k == 'positional_embedding' and checkpoint[k].size( + 0) > self.context_length: + checkpoint[k] = checkpoint[k][:self.context_length] + print('positional_embedding is tuncated from 77 to', + self.context_length) + state_dict[k] = checkpoint[k] + + u, w = 
self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in text encoder') + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + def forward(self, text): + x = self.token_embedding(text) + x = x + self.positional_embedding + x = x.permute(1, 0, 2) + x = self.transformer(x) + x = x.permute(1, 0, 2) + x = self.ln_final(x) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1), ...] @ self.text_projection + return x + + +class CLIPTextContextEncoder(nn.Module): + + def __init__(self, + context_length=22, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12, + embed_dim=1024, + out_dim=256, + pretrained=None, + **kwargs): + super().__init__() + + self.pretrained = pretrained + + self.context_length = context_length + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.embed_dim = embed_dim + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('transformer.'): + state_dict[k] = checkpoint[k] + + if k == 'positional_embedding' or k == 'text_projection' or k.startswith( + 'token_embedding') or k.startswith('ln_final'): + if k == 'positional_embedding' and checkpoint[k].size( + 0) > self.context_length: + checkpoint[k] = checkpoint[k][:self.context_length] + print('positional_embedding is tuncated from 77 to', + self.context_length) + state_dict[k] = checkpoint[k] + + u, w = self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in text encoder') + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + def forward(self, text, context=None): + x_text = self.token_embedding(text) # n_clas, n_text, C + K, N1, C = x_text.shape # 150类 * 5??? 
* 512 + B, N2, C = context.shape # 1 * 8 * 512 + + eos_indx = text.argmax(dim=-1) + N2 + eos_indx = eos_indx.reshape(1, K).expand(B, K).reshape(-1) + + x_text = x_text.reshape(1, K, N1, C).expand(B, K, N1, C) + context = context.reshape(B, 1, N2, C).expand(B, K, N2, C) + + x = torch.cat([x_text[:, :, 0:1], context, x_text[:, :, 1:]], + dim=2).reshape(B * K, N1 + N2, C) + x = x + self.positional_embedding + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x) + x = x[torch.arange(x.shape[0]), eos_indx] @ self.text_projection + x = x.reshape(B, K, self.embed_dim) + return x + + +class ContextDecoder(nn.Module): + + def __init__(self, + transformer_width=256, + transformer_heads=4, + transformer_layers=6, + visual_dim=1024, + dropout=0.1, + **kwargs): + super().__init__() + + self.memory_proj = nn.Sequential( + nn.LayerNorm(visual_dim), + nn.Linear(visual_dim, transformer_width), + nn.LayerNorm(transformer_width), + ) + + self.text_proj = nn.Sequential( + nn.LayerNorm(visual_dim), + nn.Linear(visual_dim, transformer_width), + ) + + self.decoder = nn.ModuleList([ + TransformerDecoderLayer(transformer_width, transformer_heads, + dropout) for _ in range(transformer_layers) + ]) + + self.out_proj = nn.Sequential( + nn.LayerNorm(transformer_width), + nn.Linear(transformer_width, visual_dim)) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, text, visual): + B, N, C = visual.shape + visual = self.memory_proj(visual) + x = self.text_proj(text) + + for layer in self.decoder: + x = layer(x, visual) + + return self.out_proj(x) diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py new file mode 100644 index 00000000..108cb043 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py @@ -0,0 +1,217 @@ +""" FPNneck +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. +""" + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from timm.models.layers import drop, drop_path, trunc_normal_ + +from .common import resize + + +class FPN(nn.Module): + """Feature Pyramid Network. + + This neck is the implementation of `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (list[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. 
+ If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs + on the original feature from the backbone. If True, + it is equivalent to `add_extra_convs='on_input'`. If False, it is + equivalent to set `add_extra_convs='on_output'`. Default to True. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: dict(mode='nearest'). + init_cfg (dict or list[dict], optional): Initialization config dict. + + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + extra_convs_on_inputs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest')): + super(FPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + if extra_convs_on_inputs: + # For compatibility with previous release + # TODO: deprecate `extra_convs_on_inputs` + self.add_extra_convs = 'on_input' + else: + self.add_extra_convs = 'on_output' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels 
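+                # each extra level takes the previous output (out_channels) as
+                # input, except the first one when add_extra_convs == 'on_input',
+                # which reads the raw backbone feature selected above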
+ extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + self.apply(self._init_weights) + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + if m.bias is not None: + nn.init.constant_(m.bias.data, 0) diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_base.py b/modelscope/models/cv/shop_segmentation/shop_seg_base.py new file mode 100644 index 00000000..e3ae0d54 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/shop_seg_base.py @@ -0,0 +1,157 @@ +""" +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .head_fpn import FPNHead +from .models import (CLIPTextContextEncoder, CLIPVisionTransformer, + ContextDecoder) +from .neck_fpn import FPN +from .utils import SimpleTokenizer, tokenize + + +class SHOPSEG(nn.Module): + """Encoder Decoder segmentors. + + EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. 
+ Note that auxiliary_head is only used for deep supervision during training, + which could be dumped during inference. + """ + + def __init__(self, + model_dir, + context_length=22, + context_feature='attention', + score_concat_index=2, + tau=0.07, + token_embed_dim=512, + text_dim=512, + **args): + super(SHOPSEG, self).__init__() + + self.model_dir = model_dir + self.tokenizer = SimpleTokenizer(model_dir + + '/bpe_simple_vocab_16e6.txt.gz') + + backbone = CLIPVisionTransformer( + input_resolution=1024, + patch_size=16, + width=768, + layers=12, + output_dim=512, + drop_path_rate=0.1, + pretrained=False, + get_embeddings=True) + + text_encoder = CLIPTextContextEncoder( + context_length=30, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12, + embed_dim=512, + pretrained=False) + + context_decoder = ContextDecoder( + transformer_width=256, + transformer_heads=4, + transformer_layers=3, + visual_dim=512, + dropout=0.1) + neck = FPN( + in_channels=[768, 768, 768 + 2, 768], out_channels=256, num_outs=4) + head_fpd = FPNHead(channels=256, num_classes=2) + + self.backbone = backbone + self.text_encoder = text_encoder + self.context_decoder = context_decoder + self.context_length = context_length + self.score_concat_index = score_concat_index + + self.context_feature = context_feature + self.tau = tau + context_length = self.text_encoder.context_length - self.context_length + self.contexts = nn.Parameter( + torch.randn(1, context_length, token_embed_dim)) + nn.init.trunc_normal_(self.contexts) + self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4) + + self.neck = neck + self.head_fpn = head_fpd + + self.tau = 0.07 + + def encode_text(self, text, context_length): + output = tokenize(self.tokenizer, text, context_length, True) + return output + + def extract_feat(self, img): + """Extract features from images.""" + x = self.backbone(img) + return x + + def after_extract_feat(self, x, name_list): + x_orig = list(x[0:4]) + global_feat, visual_embeddings = x[4] + B, C, H, W = visual_embeddings.shape + if self.context_feature == 'attention': + x1 = global_feat.reshape(B, C, 1) + x2 = visual_embeddings.reshape(B, C, H * W) + visual_context = torch.cat([x1, x2], dim=2).permute(0, 2, 1) + texts = torch.cat([ + self.encode_text(c, context_length=self.context_length) + for c in name_list + ]) + x1 = texts.to(global_feat.device) + x1 = self.text_encoder(x1, self.contexts) + text_embeddings = x1.expand(B, -1, -1) + # update text_embeddings by visual_context! 
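+            # the context decoder cross-attends the text embeddings to the visual
+            # tokens; the resulting residual is scaled by the learnable gamma below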
+ # (B, 1, C) + text_diff = self.context_decoder(text_embeddings, visual_context) + # (B, K, C) + text_embeddings = text_embeddings + self.gamma * text_diff + + # compute score map and concat + B, K, C = text_embeddings.shape + visual_embeddings = F.normalize(visual_embeddings, dim=1, p=2) + text = F.normalize(text_embeddings, dim=2, p=2) + score_map_list = [] + bsz = B + for i in range(bsz): + ind = 2 * i + sub_text = torch.cat( + [text[i:i + 1, ind:ind + 1], text[i:i + 1, ind + 1:ind + 2]], + dim=1) # 1 * 2 * h * w + + sub_score_map = torch.einsum('bchw,bkc->bkhw', + visual_embeddings[i:i + 1], + sub_text) # 1 * 2 * h * w + score_map_list.append(sub_score_map) + score_map = torch.cat(score_map_list, dim=0) # b * 2 * h * w + x_orig[self.score_concat_index] = torch.cat( + [x_orig[self.score_concat_index], score_map], dim=1) + return x_orig, score_map + + def forward(self, img, text_list=None): + if text_list is None: + bsz = img.size()[0] + text_list = ['foregeound'] * bsz + x = self.extract_feat(img) + _x_orig = [x[i] for i in range(4)] + name_list = [] + for name in text_list: + name_list.append('others') + name_list.append(name[0:20]) + x_orig, score_map = self.after_extract_feat(x, name_list) + x_orig = list(self.neck(x_orig)) + _x_orig = x_orig + pred = self.head_fpn(_x_orig) + return pred diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py new file mode 100644 index 00000000..409c583b --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py @@ -0,0 +1,115 @@ +import os.path as osp +from typing import Any, Dict + +import json +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.shop_segmentation import SHOPSEG +from modelscope.outputs import OutputKeys +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['ShopSegmentation'] + + +@MODELS.register_module( + Tasks.shop_segmentation, module_name=Models.shop_segmentation) +class ShopSegmentation(TorchModel): + """ shop segmentation model. + """ + + def __init__(self, model_dir, device_id=0, *args, **kwargs): + super().__init__( + model_dir=model_dir, device_id=device_id, *args, **kwargs) + + self.model = SHOPSEG(model_dir=model_dir) + pretrained_params = torch.load('{}/{}'.format( + model_dir, ModelFile.TORCH_MODEL_BIN_FILE)) + + self.model.load_state_dict(pretrained_params) + self.model.eval() + self.device_id = device_id + if self.device_id >= 0 and torch.cuda.is_available(): + self.model.to('cuda:{}'.format(self.device_id)) + logger.info('Use GPU: {}'.format(self.device_id)) + else: + self.device_id = -1 + logger.info('Use CPU for inference') + + def preprocess(self, img, size=1024): + mean = [0.48145466, 0.4578275, 0.40821073] + std = [0.26862954, 0.26130258, 0.27577711] + h, w, c = img.shape + max_hw = max(h, w) + ratio = 1.0 * size / max_hw + crop_h, crop_w = int(ratio * h), int(ratio * w) + pil_img = Image.fromarray(img) + pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR) + np_img = np.array(pil_img, dtype=np.float32) / 255. 
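+        # pixel values are now in [0, 1]; the loop below standardizes each
+        # channel with the CLIP image mean/std defined above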
+ + for j in range(3): + np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j] + + img_pad = np.zeros((size, size, 3), dtype=np.float32) + img_pad[:crop_h, :crop_w] = np_img + + img_pad = torch.from_numpy(img_pad).permute(2, 0, + 1).unsqueeze(0).float() + return img_pad, h, w, crop_h, crop_w + + def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w): + output = np.clip(tensors * 255., a_min=0, a_max=255.) + crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8) + + pil_output = Image.fromarray(crop_output) + pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR) + np_output = np.array(pil_output, dtype=np.uint8) + + np_output[np_output < 128] = 0 + np_output[np_output >= 128] = 255 + np_output = np.uint8(np_output) + return np_output + + def forward(self, image): + """ + image should be numpy array, dtype=np.uint8, shape: height*width*3 + """ + image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess( + image, size=1024) + pred = self.inference(image_tensor) + msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w, size=1024) + + outputs = {OutputKeys.MASKS: msk} + return outputs + + def inference(self, image): + """ + image should be tensor, 1 * 3 * 1024 * 1024 + """ + with torch.no_grad(): + if self.device_id == -1: + output = self.model(image) + else: + device = torch.device('cuda', self.device_id) + output = self.model(image.to(device)) + output = F.interpolate(output, size=(1024, 1024), mode='bilinear') + output = F.softmax(output, dim=1) + output = torch.argmax(output, dim=1) + output = output[0] + if self.device_id == -1: + pred = output.data.numpy() + else: + pred = output.data.cpu().numpy() + + del output + return pred diff --git a/modelscope/models/cv/shop_segmentation/utils.py b/modelscope/models/cv/shop_segmentation/utils.py new file mode 100644 index 00000000..c41f8a65 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/utils.py @@ -0,0 +1,199 @@ +""" CLIP Tokenizer +Adapted from https://github.com/openai/CLIP. +Originally MIT License, Copyright (c) 2021 OpenAI. +""" + +import gzip +import html +import os +from functools import lru_cache +from typing import Any, List, Union + +import ftfy +import regex as re +import torch + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + error_list = [] + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except Exception as err: + error_list.append(err) + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text + + +def tokenize(tokenizer, + texts, + context_length: int = 77, + truncate: bool = False) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + context_length : int + The context length to use; all CLIP models use 77 as the context length + truncate: bool + Whether to truncate the text in case its encoding is longer than the context length + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = tokenizer.encoder['<|startoftext|>'] + eot_token = 
tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + tokenizer.encode(text) + [eot_token] + for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + if truncate: + tokens = tokens[:context_length] + tokens[-1] = eot_token + else: + raise RuntimeError( + f'Input {texts[i]} is too long for context length {context_length}' + ) + result[i, :len(tokens)] = torch.tensor(tokens) + + return result diff --git a/modelscope/outputs.py b/modelscope/outputs.py index e84c8dcc..8fe71ec2 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -259,7 +259,13 @@ TASK_OUTPUTS = { # ] # } Tasks.text_driven_segmentation: [OutputKeys.MASKS], - + # shop segmentation result for single sample + # { + # "masks": [ + # np.array # 2D array containing only 0, 255 + # ] + # } + Tasks.shop_segmentation: [OutputKeys.MASKS], # movide scene segmentation result for a single video # { # "split_video_num":3, diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index f43d152b..f6381857 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -156,7 +156,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vitl16_segmentation_text-driven-seg'), Tasks.movie_scene_segmentation: (Pipelines.movie_scene_segmentation, - 'damo/cv_resnet50-bert_video-scene-segmentation_movienet') + 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'), + Tasks.shop_segmentation: (Pipelines.shop_segmentation, + 'damo/cv_vitb16_segmentation_shop-seg'), } diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 9e7d80ee..d3dba978 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -43,10 +43,10 @@ if TYPE_CHECKING: from .tinynas_classification_pipeline import TinynasClassificationPipeline from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline + from .shop_segmentation_pipleline import ShopSegmentationPipeline from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline - else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], @@ -96,6 +96,7 @@ else: 'tinynas_classification_pipeline': ['TinynasClassificationPipeline'], 'video_category_pipeline': ['VideoCategoryPipeline'], 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], + 'shop_segmentation_pipleline': ['ShopSegmentationPipeline'], 'easycv_pipeline': [ 'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline', 'Face2DKeypointsPipeline' diff --git a/modelscope/pipelines/cv/shop_segmentation_pipleline.py b/modelscope/pipelines/cv/shop_segmentation_pipleline.py new file mode 100644 index 00000000..b7fd90b4 --- /dev/null +++ b/modelscope/pipelines/cv/shop_segmentation_pipleline.py @@ -0,0 +1,51 @@ +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.shop_segmentation, module_name=Pipelines.shop_segmentation) +class ShopSegmentationPipeline(Pipeline): 
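+    """Shop segmentation pipeline.
+
+    A minimal usage sketch, mirroring tests/pipelines/test_shop_segmentation.py:
+
+        >>> from modelscope.outputs import OutputKeys
+        >>> from modelscope.pipelines import pipeline
+        >>> from modelscope.utils.constant import Tasks
+        >>> shop_seg = pipeline(Tasks.shop_segmentation,
+        >>>                     model='damo/cv_vitb16_segmentation_shop-seg')
+        >>> result = shop_seg('data/test/images/shop_segmentation.jpg')
+        >>> mask = result[OutputKeys.MASKS]  # 2D array containing only 0 and 255
+    """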
+ + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. + """ + super().__init__(model=model, auto_collate=False, **kwargs) + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img) + result = { + 'img': img_tensor, + 'ori_h': ori_h, + 'ori_w': ori_w, + 'crop_h': crop_h, + 'crop_w': crop_w + } + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + outputs = self.model.inference(input['img']) + result = { + 'data': outputs, + 'ori_h': input['ori_h'], + 'ori_w': input['ori_w'], + 'crop_h': input['crop_h'], + 'crop_w': input['crop_w'], + } + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + + data = self.model.postprocess(inputs['data'], inputs['crop_h'], + inputs['crop_w'], inputs['ori_h'], + inputs['ori_w']) + outputs = {OutputKeys.MASKS: data} + return outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 86808ea1..1b738bfe 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -38,6 +38,7 @@ class CVTasks(object): image_segmentation = 'image-segmentation' portrait_matting = 'portrait-matting' text_driven_segmentation = 'text-driven-segmentation' + shop_segmentation = 'shop-segmentation' # image editing skin_retouching = 'skin-retouching' diff --git a/tests/pipelines/test_shop_segmentation.py b/tests/pipelines/test_shop_segmentation.py new file mode 100644 index 00000000..58c56dd7 --- /dev/null +++ b/tests/pipelines/test_shop_segmentation.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class ShopSegmentationTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_shop_segmentation(self): + input_location = 'data/test/images/shop_segmentation.jpg' + model_id = 'damo/cv_vitb16_segmentation_shop-seg' + shop_seg = pipeline(Tasks.shop_segmentation, model=model_id) + result = shop_seg(input_location) + import cv2 + # result[OutputKeys.MASKS] is segment map result,other keys are not used + cv2.imwrite(input_location + '_shopseg.jpg', result[OutputKeys.MASKS]) + + +if __name__ == '__main__': + unittest.main() From f508be89183cc2d9047bbb6fcbe23685a239959d Mon Sep 17 00:00:00 2001 From: ly261666 Date: Sat, 3 Sep 2022 23:48:42 +0800 Subject: [PATCH 16/28] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9EReti?= =?UTF-8?q?naFace=E4=BA=BA=E8=84=B8=E6=A3=80=E6=B5=8B=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 新增人脸检测RetinaFace模型; 2. 
完成Maas-cv CR标准自查 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9945188 --- data/test/images/retina_face_detection.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/face_detection/retinaface/__init__.py | 0 .../cv/face_detection/retinaface/detection.py | 137 ++++++++++++++++ .../retinaface/models/__init__.py | 0 .../face_detection/retinaface/models/net.py | 149 ++++++++++++++++++ .../retinaface/models/retinaface.py | 145 +++++++++++++++++ .../cv/face_detection/retinaface/utils.py | 123 +++++++++++++++ modelscope/pipelines/base.py | 1 - .../cv/retina_face_detection_pipeline.py | 55 +++++++ tests/pipelines/test_retina_face_detection.py | 33 ++++ 11 files changed, 647 insertions(+), 1 deletion(-) create mode 100644 data/test/images/retina_face_detection.jpg create mode 100644 modelscope/models/cv/face_detection/retinaface/__init__.py create mode 100755 modelscope/models/cv/face_detection/retinaface/detection.py create mode 100755 modelscope/models/cv/face_detection/retinaface/models/__init__.py create mode 100755 modelscope/models/cv/face_detection/retinaface/models/net.py create mode 100755 modelscope/models/cv/face_detection/retinaface/models/retinaface.py create mode 100755 modelscope/models/cv/face_detection/retinaface/utils.py create mode 100644 modelscope/pipelines/cv/retina_face_detection_pipeline.py create mode 100644 tests/pipelines/test_retina_face_detection.py diff --git a/data/test/images/retina_face_detection.jpg b/data/test/images/retina_face_detection.jpg new file mode 100644 index 00000000..c95881fe --- /dev/null +++ b/data/test/images/retina_face_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 +size 87228 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index b1bf9600..9638268c 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -32,6 +32,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + retinaface = 'retinaface' shop_segmentation = 'shop-segmentation' # EasyCV models @@ -118,6 +119,7 @@ class Pipelines(object): salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + retina_face_detection = 'resnet50-face-detection-retinaface' live_category = 'live-category' general_image_classification = 'vit-base_image-classification_ImageNet-labels' daily_image_classification = 'vit-base_image-classification_Dailylife-labels' diff --git a/modelscope/models/cv/face_detection/retinaface/__init__.py b/modelscope/models/cv/face_detection/retinaface/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/retinaface/detection.py b/modelscope/models/cv/face_detection/retinaface/detection.py new file mode 100755 index 00000000..3dd31659 --- /dev/null +++ b/modelscope/models/cv/face_detection/retinaface/detection.py @@ -0,0 +1,137 @@ +# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface +import cv2 +import numpy as np +import torch +import torch.backends.cudnn as cudnn + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .models.retinaface import 
RetinaFace +from .utils import PriorBox, decode, decode_landm, py_cpu_nms + + +@MODELS.register_module(Tasks.face_detection, module_name=Models.retinaface) +class RetinaFaceDetection(TorchModel): + + def __init__(self, model_path, device='cuda'): + super().__init__(model_path) + torch.set_grad_enabled(False) + cudnn.benchmark = True + self.model_path = model_path + self.cfg = Config.from_file( + model_path.replace(ModelFile.TORCH_MODEL_FILE, + ModelFile.CONFIGURATION))['models'] + self.net = RetinaFace(cfg=self.cfg) + self.load_model() + self.device = device + self.net = self.net.to(self.device) + + self.mean = torch.tensor([[[[104]], [[117]], [[123]]]]).to(device) + + def check_keys(self, pretrained_state_dict): + ckpt_keys = set(pretrained_state_dict.keys()) + model_keys = set(self.net.state_dict().keys()) + used_pretrained_keys = model_keys & ckpt_keys + assert len( + used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' + return True + + def remove_prefix(self, state_dict, prefix): + new_state_dict = dict() + for k, v in state_dict.items(): + if k.startswith(prefix): + new_state_dict[k[len(prefix):]] = v + else: + new_state_dict[k] = v + return new_state_dict + + def load_model(self, load_to_cpu=False): + pretrained_dict = torch.load( + self.model_path, map_location=torch.device('cpu')) + if 'state_dict' in pretrained_dict.keys(): + pretrained_dict = self.remove_prefix(pretrained_dict['state_dict'], + 'module.') + else: + pretrained_dict = self.remove_prefix(pretrained_dict, 'module.') + self.check_keys(pretrained_dict) + self.net.load_state_dict(pretrained_dict, strict=False) + self.net.eval() + + def forward(self, input): + img_raw = input['img'].cpu().numpy() + img = np.float32(img_raw) + + im_height, im_width = img.shape[:2] + ss = 1.0 + # tricky + if max(im_height, im_width) > 1500: + ss = 1000.0 / max(im_height, im_width) + img = cv2.resize(img, (0, 0), fx=ss, fy=ss) + im_height, im_width = img.shape[:2] + + scale = torch.Tensor( + [img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) + img -= (104, 117, 123) + img = img.transpose(2, 0, 1) + img = torch.from_numpy(img).unsqueeze(0) + img = img.to(self.device) + scale = scale.to(self.device) + + loc, conf, landms = self.net(img) # forward pass + del img + + confidence_threshold = 0.9 + nms_threshold = 0.4 + top_k = 5000 + keep_top_k = 750 + + priorbox = PriorBox(self.cfg, image_size=(im_height, im_width)) + priors = priorbox.forward() + priors = priors.to(self.device) + prior_data = priors.data + boxes = decode(loc.data.squeeze(0), prior_data, self.cfg['variance']) + boxes = boxes * scale + boxes = boxes.cpu().numpy() + scores = conf.squeeze(0).data.cpu().numpy()[:, 1] + landms = decode_landm( + landms.data.squeeze(0), prior_data, self.cfg['variance']) + scale1 = torch.Tensor([ + im_width, im_height, im_width, im_height, im_width, im_height, + im_width, im_height, im_width, im_height + ]) + scale1 = scale1.to(self.device) + landms = landms * scale1 + landms = landms.cpu().numpy() + + # ignore low scores + inds = np.where(scores > confidence_threshold)[0] + boxes = boxes[inds] + landms = landms[inds] + scores = scores[inds] + + # keep top-K before NMS + order = scores.argsort()[::-1][:top_k] + boxes = boxes[order] + landms = landms[order] + scores = scores[order] + + # do NMS + dets = np.hstack((boxes, scores[:, np.newaxis])).astype( + np.float32, copy=False) + keep = py_cpu_nms(dets, nms_threshold) + dets = dets[keep, :] + landms = landms[keep] + + # keep top-K faster NMS + dets = dets[:keep_top_k, :] + landms = 
landms[:keep_top_k, :] + + landms = landms.reshape((-1, 5, 2)) + landms = landms.reshape( + -1, + 10, + ) + return dets / ss, landms / ss diff --git a/modelscope/models/cv/face_detection/retinaface/models/__init__.py b/modelscope/models/cv/face_detection/retinaface/models/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/retinaface/models/net.py b/modelscope/models/cv/face_detection/retinaface/models/net.py new file mode 100755 index 00000000..3be7c4b9 --- /dev/null +++ b/modelscope/models/cv/face_detection/retinaface/models/net.py @@ -0,0 +1,149 @@ +# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface +import time + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models +import torchvision.models._utils as _utils +from torch.autograd import Variable + + +def conv_bn(inp, oup, stride=1, leaky=0): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True)) + + +def conv_bn_no_relu(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + ) + + +def conv_bn1X1(inp, oup, stride, leaky=0): + return nn.Sequential( + nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), + nn.BatchNorm2d(oup), nn.LeakyReLU(negative_slope=leaky, inplace=True)) + + +def conv_dw(inp, oup, stride, leaky=0.1): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.LeakyReLU(negative_slope=leaky, inplace=True), + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True), + ) + + +class SSH(nn.Module): + + def __init__(self, in_channel, out_channel): + super(SSH, self).__init__() + assert out_channel % 4 == 0 + leaky = 0 + if (out_channel <= 64): + leaky = 0.1 + self.conv3X3 = conv_bn_no_relu(in_channel, out_channel // 2, stride=1) + + self.conv5X5_1 = conv_bn( + in_channel, out_channel // 4, stride=1, leaky=leaky) + self.conv5X5_2 = conv_bn_no_relu( + out_channel // 4, out_channel // 4, stride=1) + + self.conv7X7_2 = conv_bn( + out_channel // 4, out_channel // 4, stride=1, leaky=leaky) + self.conv7x7_3 = conv_bn_no_relu( + out_channel // 4, out_channel // 4, stride=1) + + def forward(self, input): + conv3X3 = self.conv3X3(input) + + conv5X5_1 = self.conv5X5_1(input) + conv5X5 = self.conv5X5_2(conv5X5_1) + + conv7X7_2 = self.conv7X7_2(conv5X5_1) + conv7X7 = self.conv7x7_3(conv7X7_2) + + out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) + out = F.relu(out) + return out + + +class FPN(nn.Module): + + def __init__(self, in_channels_list, out_channels): + super(FPN, self).__init__() + leaky = 0 + if (out_channels <= 64): + leaky = 0.1 + self.output1 = conv_bn1X1( + in_channels_list[0], out_channels, stride=1, leaky=leaky) + self.output2 = conv_bn1X1( + in_channels_list[1], out_channels, stride=1, leaky=leaky) + self.output3 = conv_bn1X1( + in_channels_list[2], out_channels, stride=1, leaky=leaky) + + self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky) + self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky) + + def forward(self, input): + # names = list(input.keys()) + input = list(input.values()) + + output1 = self.output1(input[0]) + output2 = self.output2(input[1]) + output3 = self.output3(input[2]) + + up3 = F.interpolate( + output3, size=[output2.size(2), output2.size(3)], mode='nearest') 
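        # Top-down FPN merge: the deeper, lower-resolution feature map is upsampled
        # with nearest-neighbor interpolation to the spatial size of the level above,
        # added element-wise, then smoothed by a 3x3 conv_bn "merge" block.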
+ output2 = output2 + up3 + output2 = self.merge2(output2) + + up2 = F.interpolate( + output2, size=[output1.size(2), output1.size(3)], mode='nearest') + output1 = output1 + up2 + output1 = self.merge1(output1) + + out = [output1, output2, output3] + return out + + +class MobileNetV1(nn.Module): + + def __init__(self): + super(MobileNetV1, self).__init__() + self.stage1 = nn.Sequential( + conv_bn(3, 8, 2, leaky=0.1), # 3 + conv_dw(8, 16, 1), # 7 + conv_dw(16, 32, 2), # 11 + conv_dw(32, 32, 1), # 19 + conv_dw(32, 64, 2), # 27 + conv_dw(64, 64, 1), # 43 + ) + self.stage2 = nn.Sequential( + conv_dw(64, 128, 2), # 43 + 16 = 59 + conv_dw(128, 128, 1), # 59 + 32 = 91 + conv_dw(128, 128, 1), # 91 + 32 = 123 + conv_dw(128, 128, 1), # 123 + 32 = 155 + conv_dw(128, 128, 1), # 155 + 32 = 187 + conv_dw(128, 128, 1), # 187 + 32 = 219 + ) + self.stage3 = nn.Sequential( + conv_dw(128, 256, 2), # 219 +3 2 = 241 + conv_dw(256, 256, 1), # 241 + 64 = 301 + ) + self.avg = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, 1000) + + def forward(self, x): + x = self.stage1(x) + x = self.stage2(x) + x = self.stage3(x) + x = self.avg(x) + x = x.view(-1, 256) + x = self.fc(x) + return x diff --git a/modelscope/models/cv/face_detection/retinaface/models/retinaface.py b/modelscope/models/cv/face_detection/retinaface/models/retinaface.py new file mode 100755 index 00000000..8d2001dd --- /dev/null +++ b/modelscope/models/cv/face_detection/retinaface/models/retinaface.py @@ -0,0 +1,145 @@ +# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models +import torchvision.models._utils as _utils +import torchvision.models.detection.backbone_utils as backbone_utils + +from .net import FPN, SSH, MobileNetV1 + + +class ClassHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(ClassHead, self).__init__() + self.num_anchors = num_anchors + self.conv1x1 = nn.Conv2d( + inchannels, + self.num_anchors * 2, + kernel_size=(1, 1), + stride=1, + padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 2) + + +class BboxHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(BboxHead, self).__init__() + self.conv1x1 = nn.Conv2d( + inchannels, + num_anchors * 4, + kernel_size=(1, 1), + stride=1, + padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 4) + + +class LandmarkHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(LandmarkHead, self).__init__() + self.conv1x1 = nn.Conv2d( + inchannels, + num_anchors * 10, + kernel_size=(1, 1), + stride=1, + padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 10) + + +class RetinaFace(nn.Module): + + def __init__(self, cfg=None): + """ + :param cfg: Network related settings. 
+ """ + super(RetinaFace, self).__init__() + backbone = None + if cfg['name'] == 'Resnet50': + backbone = models.resnet50(pretrained=cfg['pretrain']) + else: + raise Exception('Invalid name') + + self.body = _utils.IntermediateLayerGetter(backbone, + cfg['return_layers']) + in_channels_stage2 = cfg['in_channel'] + in_channels_list = [ + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ] + out_channels = cfg['out_channel'] + self.fpn = FPN(in_channels_list, out_channels) + self.ssh1 = SSH(out_channels, out_channels) + self.ssh2 = SSH(out_channels, out_channels) + self.ssh3 = SSH(out_channels, out_channels) + + self.ClassHead = self._make_class_head( + fpn_num=3, inchannels=cfg['out_channel']) + self.BboxHead = self._make_bbox_head( + fpn_num=3, inchannels=cfg['out_channel']) + self.LandmarkHead = self._make_landmark_head( + fpn_num=3, inchannels=cfg['out_channel']) + + def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2): + classhead = nn.ModuleList() + for i in range(fpn_num): + classhead.append(ClassHead(inchannels, anchor_num)) + return classhead + + def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2): + bboxhead = nn.ModuleList() + for i in range(fpn_num): + bboxhead.append(BboxHead(inchannels, anchor_num)) + return bboxhead + + def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2): + landmarkhead = nn.ModuleList() + for i in range(fpn_num): + landmarkhead.append(LandmarkHead(inchannels, anchor_num)) + return landmarkhead + + def forward(self, inputs): + out = self.body(inputs) + + # FPN + fpn = self.fpn(out) + + # SSH + feature1 = self.ssh1(fpn[0]) + feature2 = self.ssh2(fpn[1]) + feature3 = self.ssh3(fpn[2]) + features = [feature1, feature2, feature3] + + bbox_regressions = torch.cat( + [self.BboxHead[i](feature) for i, feature in enumerate(features)], + dim=1) + classifications = torch.cat( + [self.ClassHead[i](feature) for i, feature in enumerate(features)], + dim=1) + ldm_regressions = torch.cat( + [self.LandmarkHead[i](feat) for i, feat in enumerate(features)], + dim=1) + + output = (bbox_regressions, F.softmax(classifications, + dim=-1), ldm_regressions) + return output diff --git a/modelscope/models/cv/face_detection/retinaface/utils.py b/modelscope/models/cv/face_detection/retinaface/utils.py new file mode 100755 index 00000000..60c9e2dd --- /dev/null +++ b/modelscope/models/cv/face_detection/retinaface/utils.py @@ -0,0 +1,123 @@ +# -------------------------------------------------------- +# Modified from https://github.com/biubug6/Pytorch_Retinaface +# -------------------------------------------------------- + +from itertools import product as product +from math import ceil + +import numpy as np +import torch + + +class PriorBox(object): + + def __init__(self, cfg, image_size=None, phase='train'): + super(PriorBox, self).__init__() + self.min_sizes = cfg['min_sizes'] + self.steps = cfg['steps'] + self.clip = cfg['clip'] + self.image_size = image_size + self.feature_maps = [[ + ceil(self.image_size[0] / step), + ceil(self.image_size[1] / step) + ] for step in self.steps] + self.name = 's' + + def forward(self): + anchors = [] + for k, f in enumerate(self.feature_maps): + min_sizes = self.min_sizes[k] + for i, j in product(range(f[0]), range(f[1])): + for min_size in min_sizes: + s_kx = min_size / self.image_size[1] + s_ky = min_size / self.image_size[0] + dense_cx = [ + x * self.steps[k] / self.image_size[1] + for x in [j + 0.5] + ] + dense_cy = [ + y * self.steps[k] / self.image_size[0] + for y in [i + 
0.5] + ] + for cy, cx in product(dense_cy, dense_cx): + anchors += [cx, cy, s_kx, s_ky] + + # back to torch land + output = torch.Tensor(anchors).view(-1, 4) + if self.clip: + output.clamp_(max=1, min=0) + return output + + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + + +# Adapted from https://github.com/Hakuyume/chainer-ssd +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat( + (priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def decode_landm(pre, priors, variances): + """Decode landm from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + pre (tensor): landm predictions for loc layers, + Shape: [num_priors,10] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + decoded landm predictions + """ + a = priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:] + b = priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:] + c = priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:] + d = priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:] + e = priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:] + landms = torch.cat((a, b, c, d, e), dim=1) + return landms diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index c0f3cbd0..d4f9c6bf 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -2,7 +2,6 @@ import os.path as osp from abc import ABC, abstractmethod -from contextlib import contextmanager from threading import Lock from typing import Any, Dict, Generator, List, Mapping, Union diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py new file mode 100644 index 00000000..20111c11 --- /dev/null +++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py @@ -0,0 +1,55 @@ +import os.path as osp +from typing import Any, Dict + +import numpy as np + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_detection.retinaface import detection +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.face_detection, module_name=Pipelines.retina_face_detection) +class RetinaFaceDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {ckpt_path}') + detector = detection.RetinaFaceDetection( + model_path=ckpt_path, device=self.device) + self.detector = detector + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + img = img.astype(np.float32) + result = {'img': img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.detector(input) + assert result is not None + bboxes = result[0][:, :4].tolist() + scores = result[0][:, 4].tolist() + lms = result[1].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.BOXES: bboxes, + OutputKeys.KEYPOINTS: lms, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/tests/pipelines/test_retina_face_detection.py b/tests/pipelines/test_retina_face_detection.py new file mode 100644 index 00000000..343e1c91 --- /dev/null +++ b/tests/pipelines/test_retina_face_detection.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
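
For reference, a minimal sketch of consuming the new pipeline's output (model id and test image taken from the test case that follows; box and keypoint layout as produced by RetinaFaceDetection.forward above):

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    face_detection = pipeline(
        Tasks.face_detection,
        model='damo/cv_resnet50_face-detection_retinaface')
    result = face_detection('data/test/images/retina_face_detection.jpg')
    # Boxes are [x1, y1, x2, y2] in pixels of the original image; each keypoint
    # entry is five (x, y) landmarks flattened to a length-10 list.
    for box, score, kps in zip(result[OutputKeys.BOXES],
                               result[OutputKeys.SCORES],
                               result[OutputKeys.KEYPOINTS]):
        print(box, score, kps)
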
+import os.path as osp +import unittest + +import cv2 + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_face_detection_result +from modelscope.utils.test_utils import test_level + + +class RetinaFaceDetectionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_resnet50_face-detection_retinaface' + + def show_result(self, img_path, detection_result): + img = draw_face_detection_result(img_path, detection_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + face_detection = pipeline(Tasks.face_detection, model=self.model_id) + img_path = 'data/test/images/retina_face_detection.jpg' + + result = face_detection(img_path) + self.show_result(img_path, result) + + +if __name__ == '__main__': + unittest.main() From adab7d3391c636818372697edc48dffb5f2d25d4 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Mon, 5 Sep 2022 09:53:58 +0800 Subject: [PATCH 17/28] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9EFER?= =?UTF-8?q?=E4=BA=BA=E8=84=B8=E5=B1=9E=E6=80=A7=E8=AF=86=E5=88=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 完成Maas-cv CR自查; 新增个Task,已经跟产品确认可以增加,正在走流程中,目前还不在https://aone.alibaba-inc.com/v2/project/1181559/req#viewIdentifier=d7f112f9d023e2108fa1b0d8这里,后续会增加过来 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9976346 --- .../images/facial_expression_recognition.jpg | 3 + modelscope/metainfo.py | 2 + .../facial_expression_recognition/__init__.py | 0 .../fer/__init__.py | 0 .../fer/facial_expression_recognition.py | 72 ++++++++++ .../fer/transforms.py | 118 ++++++++++++++++ .../facial_expression_recognition/fer/vgg.py | 40 ++++++ modelscope/outputs.py | 8 ++ modelscope/pipelines/builder.py | 3 + .../facial_expression_recognition_pipeline.py | 128 ++++++++++++++++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 20 +++ .../test_facial_expression_recognition.py | 36 +++++ 13 files changed, 431 insertions(+) create mode 100644 data/test/images/facial_expression_recognition.jpg create mode 100644 modelscope/models/cv/facial_expression_recognition/__init__.py create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/__init__.py create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/transforms.py create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/vgg.py create mode 100644 modelscope/pipelines/cv/facial_expression_recognition_pipeline.py create mode 100644 tests/pipelines/test_facial_expression_recognition.py diff --git a/data/test/images/facial_expression_recognition.jpg b/data/test/images/facial_expression_recognition.jpg new file mode 100644 index 00000000..a943fa72 --- /dev/null +++ b/data/test/images/facial_expression_recognition.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdb1cef5a5fd5f938a856311011c4820ddc45946a470b9929c61e59b6a065633 +size 161535 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 9638268c..47608d02 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -32,6 +32,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 
'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + fer = 'fer' retinaface = 'retinaface' shop_segmentation = 'shop-segmentation' @@ -119,6 +120,7 @@ class Pipelines(object): salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' retina_face_detection = 'resnet50-face-detection-retinaface' live_category = 'live-category' general_image_classification = 'vit-base_image-classification_ImageNet-labels' diff --git a/modelscope/models/cv/facial_expression_recognition/__init__.py b/modelscope/models/cv/facial_expression_recognition/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py new file mode 100644 index 00000000..c5eb71a1 --- /dev/null +++ b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py @@ -0,0 +1,72 @@ +# The implementation is based on Facial-Expression-Recognition, available at +# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch +import os + +import cv2 +import numpy as np +import torch +import torch.backends.cudnn as cudnn +import torch.nn.functional as F +from PIL import Image +from torch.autograd import Variable + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from . 
import transforms +from .vgg import VGG + + +@MODELS.register_module( + Tasks.facial_expression_recognition, module_name=Models.fer) +class FacialExpressionRecognition(TorchModel): + + def __init__(self, model_path, device='cuda'): + super().__init__(model_path) + torch.set_grad_enabled(False) + cudnn.benchmark = True + self.model_path = model_path + self.device = device + self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE, + ModelFile.CONFIGURATION) + self.net = VGG('VGG19', cfg_path=self.cfg_path) + self.load_model() + self.net = self.net.to(device) + self.transform_test = transforms.Compose([ + transforms.TenCrop(44), + transforms.Lambda(lambda crops: torch.stack( + [transforms.ToTensor()(crop) for crop in crops])), + ]) + + self.mean = np.array([[104, 117, 123]]) + + def load_model(self, load_to_cpu=False): + pretrained_dict = torch.load( + self.model_path, map_location=torch.device('cpu')) + self.net.load_state_dict(pretrained_dict['net'], strict=True) + self.net.eval() + + def forward(self, input): + img = input['img'] + img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2GRAY) + img = cv2.resize(img, (48, 48)) + img = img[:, :, np.newaxis] + img = np.concatenate((img, img, img), axis=2) + + img = Image.fromarray(np.uint8(img)) + inputs = self.transform_test(img) + + ncrops, c, h, w = inputs.shape + + inputs = inputs.view(-1, c, h, w) + inputs = inputs.to(self.device) + inputs = Variable(inputs, volatile=True) + outputs = self.net(inputs) + + outputs_avg = outputs.view(ncrops, -1).mean(0) # avg over crops + + score = F.softmax(outputs_avg) + _, predicted = torch.max(outputs_avg.data, 0) + + return score, predicted diff --git a/modelscope/models/cv/facial_expression_recognition/fer/transforms.py b/modelscope/models/cv/facial_expression_recognition/fer/transforms.py new file mode 100644 index 00000000..a1448c49 --- /dev/null +++ b/modelscope/models/cv/facial_expression_recognition/fer/transforms.py @@ -0,0 +1,118 @@ +# The implementation is based on Facial-Expression-Recognition, available at +# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch +import numbers +import types + +import numpy as np +import torch +from PIL import Image + + +def to_tensor(pic): + + # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float().div(255) + else: + return img + + +def center_crop(img, output_size): + if isinstance(output_size, numbers.Number): + output_size = (int(output_size), int(output_size)) + w, h = img.size + th, tw = output_size + i = int(round((h - th) / 2.)) + j = int(round((w - tw) / 2.)) + return img.crop((j, i, j + tw, i + th)) + + +def five_crop(img, size): + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + else: + assert len( + size) == 2, 'Please provide only two dimensions (h, w) for size.' 
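# five_crop returns the four corner crops plus the center crop of a PIL image;
# TenCrop below applies it to the original and the flipped image, producing the
# ten crops (44x44 in the FER model above) whose predictions are averaged at
# inference time via outputs.view(ncrops, -1).mean(0).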
+ + w, h = img.size + crop_h, crop_w = size + if crop_w > w or crop_h > h: + raise ValueError( + 'Requested crop size {} is bigger than input size {}'.format( + size, (h, w))) + tl = img.crop((0, 0, crop_w, crop_h)) + tr = img.crop((w - crop_w, 0, w, crop_h)) + bl = img.crop((0, h - crop_h, crop_w, h)) + br = img.crop((w - crop_w, h - crop_h, w, h)) + center = center_crop(img, (crop_h, crop_w)) + return (tl, tr, bl, br, center) + + +class TenCrop(object): + + def __init__(self, size, vertical_flip=False): + self.size = size + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + assert len( + size + ) == 2, 'Please provide only two dimensions (h, w) for size.' + self.size = size + self.vertical_flip = vertical_flip + + def __call__(self, img): + first_five = five_crop(img, self.size) + + if self.vertical_flip: + img = img.transpose(Image.FLIP_TOP_BOTTOM) + else: + img = img.transpose(Image.FLIP_LEFT_RIGHT) + + second_five = five_crop(img, self.size) + + return first_five + second_five + + +class Compose(object): + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img): + for t in self.transforms: + img = t(img) + return img + + +class ToTensor(object): + + def __call__(self, pic): + return to_tensor(pic) + + +class Lambda(object): + + def __init__(self, lambd): + assert isinstance(lambd, types.LambdaType) + self.lambd = lambd + + def __call__(self, img): + return self.lambd(img) diff --git a/modelscope/models/cv/facial_expression_recognition/fer/vgg.py b/modelscope/models/cv/facial_expression_recognition/fer/vgg.py new file mode 100644 index 00000000..8120b6cc --- /dev/null +++ b/modelscope/models/cv/facial_expression_recognition/fer/vgg.py @@ -0,0 +1,40 @@ +# The implementation is based on Facial-Expression-Recognition, available at +# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable + +from modelscope.utils.config import Config + + +class VGG(nn.Module): + + def __init__(self, vgg_name, cfg_path): + super(VGG, self).__init__() + model_cfg = Config.from_file(cfg_path)['models'] + self.features = self._make_layers(model_cfg[vgg_name]) + self.classifier = nn.Linear(512, 7) + + def forward(self, x): + out = self.features(x) + out = out.view(out.size(0), -1) + out = F.dropout(out, p=0.5, training=self.training) + out = self.classifier(out) + return out + + def _make_layers(self, cfg): + layers = [] + in_channels = 3 + for x in cfg: + if x == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + layers += [ + nn.Conv2d(in_channels, x, kernel_size=3, padding=1), + nn.BatchNorm2d(x), + nn.ReLU(inplace=True) + ] + in_channels = x + layers += [nn.AvgPool2d(kernel_size=1, stride=1)] + return nn.Sequential(*layers) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 8fe71ec2..50668693 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -85,6 +85,14 @@ TASK_OUTPUTS = { Tasks.face_detection: [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # facial expression recognition result for single sample + # { + # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02], + # "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'] + # } + Tasks.facial_expression_recognition: + [OutputKeys.SCORES, OutputKeys.LABELS], + # face recognition result for single sample # { # "img_embedding": np.array with shape [1, D], diff --git 
a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index f6381857..6f901154 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -103,6 +103,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet_facedetection_scrfd10gkps'), Tasks.face_recognition: (Pipelines.face_recognition, 'damo/cv_ir101_facerecognition_cfglint'), + Tasks.facial_expression_recognition: + (Pipelines.facial_expression_recognition, + 'damo/cv_vgg19_facial-expression-recognition_fer'), Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints, 'damo/cv_mobilenet_face-2d-keypoints_alignment'), Tasks.video_multi_modal_embedding: diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py new file mode 100644 index 00000000..4a80878c --- /dev/null +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -0,0 +1,128 @@ +import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_recognition.align_face import align_face +from modelscope.models.cv.facial_expression_recognition.fer.facial_expression_recognition import \ + FacialExpressionRecognition +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.facial_expression_recognition, + module_name=Pipelines.facial_expression_recognition) +class FacialExpressionRecognitionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {ckpt_path}') + device = torch.device( + f'cuda:{0}' if torch.cuda.is_available() else 'cpu') + fer = FacialExpressionRecognition(model_path=ckpt_path, device=device) + self.fer = fer + self.device = device + logger.info('load model done') + + # face detect pipeline + det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + self.face_detection = pipeline( + Tasks.face_detection, model=det_model_id) + + def _choose_face(self, + det_result, + min_face=10, + top_face=1, + center_face=False): + ''' + choose face with maximum area + Args: + det_result: output of face detection pipeline + min_face: minimum size of valid face w/h + top_face: take faces with top max areas + center_face: choose the most centerd face from multi faces, only valid if top_face > 1 + ''' + bboxes = np.array(det_result[OutputKeys.BOXES]) + landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) + if bboxes.shape[0] == 0: + logger.info('Warning: No face detected!') + return None + # face idx with enough size + face_idx = [] + for i in range(bboxes.shape[0]): + box = bboxes[i] + if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: + face_idx += [i] + if len(face_idx) == 0: + logger.info( + f'Warning: Face size not enough, less than {min_face}x{min_face}!' 
+ ) + return None + bboxes = bboxes[face_idx] + landmarks = landmarks[face_idx] + # find max faces + boxes = np.array(bboxes) + area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + sort_idx = np.argsort(area)[-top_face:] + # find center face + if top_face > 1 and center_face and bboxes.shape[0] > 1: + img_center = [img.shape[1] // 2, img.shape[0] // 2] + min_dist = float('inf') + sel_idx = -1 + for _idx in sort_idx: + box = boxes[_idx] + dist = np.square( + np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square( + np.abs((box[1] + box[3]) / 2 - img_center[1])) + if dist < min_dist: + min_dist = dist + sel_idx = _idx + sort_idx = [sel_idx] + main_idx = sort_idx[-1] + return bboxes[main_idx], landmarks[main_idx] + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + img = img[:, :, ::-1] + det_result = self.face_detection(img.copy()) + rtn = self._choose_face(det_result) + face_img = None + if rtn is not None: + _, face_lmks = rtn + face_lmks = face_lmks.reshape(5, 2) + face_img, _ = align_face(img, (112, 112), face_lmks) + face_img = face_img.astype(np.float32) + result = {} + result['img'] = face_img + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.fer(input) + assert result is not None + scores = result[0].tolist() + labels = result[1].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 1b738bfe..32185fb9 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -20,6 +20,7 @@ class CVTasks(object): animal_recognition = 'animal-recognition' face_detection = 'face-detection' face_recognition = 'face-recognition' + facial_expression_recognition = 'facial-expression-recognition' face_2d_keypoints = 'face-2d-keypoints' human_detection = 'human-detection' human_object_interaction = 'human-object-interaction' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index ea1d95b5..cb07ba1a 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -89,6 +89,26 @@ def draw_keypoints(output, original_image): return image +def draw_facial_expression_result(img_path, facial_expression_result): + label_idx = facial_expression_result[OutputKeys.LABELS] + map_list = [ + 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' + ] + label = map_list[label_idx] + + img = cv2.imread(img_path) + assert img is not None, f"Can't read img: {img_path}" + cv2.putText( + img, + 'facial expression: {}'.format(label), (10, 10), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + print('facial expression: {}'.format(label)) + return img + + def draw_face_detection_result(img_path, detection_result): bboxes = np.array(detection_result[OutputKeys.BOXES]) kpss = np.array(detection_result[OutputKeys.KEYPOINTS]) diff --git a/tests/pipelines/test_facial_expression_recognition.py b/tests/pipelines/test_facial_expression_recognition.py new file mode 100644 index 00000000..fff83ad6 --- /dev/null +++ b/tests/pipelines/test_facial_expression_recognition.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
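
As implemented in the pipeline's forward above, OutputKeys.LABELS holds the predicted class index and OutputKeys.SCORES the 7-way softmax; draw_facial_expression_result indexes into its name list with that index. A minimal sketch of turning the result into a readable expression name (names, model id and test image taken from the code above and the test that follows):

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    expression_names = [
        'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'
    ]
    fer = pipeline(
        Tasks.facial_expression_recognition,
        model='damo/cv_vgg19_facial-expression-recognition_fer')
    result = fer('data/test/images/facial_expression_recognition.jpg')
    label_idx = result[OutputKeys.LABELS]   # index of the top-scoring class
    scores = result[OutputKeys.SCORES]      # softmax over the seven classes
    print(expression_names[label_idx], scores[label_idx])
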
+import os.path as osp +import unittest + +import cv2 +import numpy as np + +from modelscope.msdatasets import MsDataset +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_facial_expression_result +from modelscope.utils.test_utils import test_level + + +class FacialExpressionRecognitionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_vgg19_facial-expression-recognition_fer' + + def show_result(self, img_path, facial_expression_result): + img = draw_facial_expression_result(img_path, facial_expression_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + fer = pipeline( + Tasks.facial_expression_recognition, model=self.model_id) + img_path = 'data/test/images/facial_expression_recognition.jpg' + result = fer(img_path) + self.show_result(img_path, result) + + +if __name__ == '__main__': + unittest.main() From 3e92dac3283839fef9e9e9adbc1a9c7edbe5c714 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 5 Sep 2022 09:55:26 +0800 Subject: [PATCH 18/28] [to #42322933]lazy load activate for shop segmentation Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10009052 --- .../models/cv/shop_segmentation/__init__.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/modelscope/models/cv/shop_segmentation/__init__.py b/modelscope/models/cv/shop_segmentation/__init__.py index b40a0760..072628bd 100644 --- a/modelscope/models/cv/shop_segmentation/__init__.py +++ b/modelscope/models/cv/shop_segmentation/__init__.py @@ -1 +1,20 @@ -from .shop_seg_base import SHOPSEG +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .shop_seg_base import SHOPSEG + +else: + _import_structure = {'shop_seg_base': ['SHOPSEG']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) From a9c14e4eadd64e30820b689b47f5e2ebc19516f4 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 5 Sep 2022 11:07:48 +0800 Subject: [PATCH 19/28] [to #42322933] Support saving the best checkpoint for inference 1. Support saving the best checkpoint for inference 2. Fix a bug that _max_iters field does not exist in trainer 3. Fix a bug that function in lambda_lr field cannot be saved to file 4. Fix a bug that save_pretrained would not be called by iterating 5. Fix a bug that interval is not passed from BestCkptHook's init Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9972765 --- modelscope/trainers/hooks/checkpoint_hook.py | 44 ++++++++++--------- modelscope/trainers/hooks/hook.py | 4 +- modelscope/utils/checkpoint.py | 17 ++++--- modelscope/utils/config.py | 3 ++ .../trainers/test_finetune_text_generation.py | 22 +++++----- 5 files changed, 50 insertions(+), 40 deletions(-) diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index cf7a0f7a..fcd8e982 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -27,7 +27,7 @@ class CheckpointHook(Hook): save_last (bool): Whether to save the last checkpoint. Default: True. 
""" - PRIORITY = Priority.NORMAL + PRIORITY = Priority.LOW def __init__(self, interval=0, @@ -75,25 +75,27 @@ class CheckpointHook(Hook): self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth') save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) - self._save_pretrained(trainer) + if (self.is_last_epoch(trainer) + and self.by_epoch) or (self.is_last_iter(trainer) + and not self.by_epoch): + self._save_pretrained(trainer) def _save_pretrained(self, trainer): - if self.is_last_epoch(trainer) and self.by_epoch: - output_dir = os.path.join(self.save_dir, - ModelFile.TRAIN_OUTPUT_DIR) - from modelscope.trainers.parallel.utils import is_parallel - - if is_parallel(trainer.model): - model = trainer.model.module - else: - model = trainer.model - - if hasattr(model, 'save_pretrained'): - model.save_pretrained( - output_dir, - ModelFile.TORCH_MODEL_BIN_FILE, - save_function=save_checkpoint, - config=trainer.cfg.to_dict()) + output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR) + from modelscope.trainers.parallel.utils import is_parallel + + if is_parallel(trainer.model): + model = trainer.model.module + else: + model = trainer.model + + if hasattr(model, 'save_pretrained'): + model.save_pretrained( + output_dir, + ModelFile.TORCH_MODEL_BIN_FILE, + save_function=save_checkpoint, + config=trainer.cfg.to_dict(), + with_meta=False) def after_train_iter(self, trainer): if self.by_epoch: @@ -133,7 +135,7 @@ class BestCkptSaverHook(CheckpointHook): save_dir (str): Output directory to save best checkpoint. """ - PRIORITY = Priority.NORMAL + PRIORITY = Priority.LOW rule_map = {'max': lambda x, y: x > y, 'min': lambda x, y: x < y} def __init__(self, @@ -141,9 +143,11 @@ class BestCkptSaverHook(CheckpointHook): rule='max', by_epoch=True, save_optimizer=True, - save_dir=None): + save_dir=None, + interval=0): assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.' super().__init__( + interval=interval, by_epoch=by_epoch, save_optimizer=save_optimizer, save_dir=save_dir, diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py index 75cc226c..1c567f1c 100644 --- a/modelscope/trainers/hooks/hook.py +++ b/modelscope/trainers/hooks/hook.py @@ -199,14 +199,14 @@ class Hook: Whether to reach the last epoch Returns: bool """ - return trainer.epoch + 1 == trainer._max_epochs + return trainer.epoch + 1 == trainer.max_epochs def is_last_iter(self, trainer): """ Whether to reach the last iteration in the entire training process Returns: bool """ - return trainer.iter + 1 == trainer._max_iters + return trainer.iter + 1 == trainer.max_iters def get_triggered_stages(self): trigger_stages = set() diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index 8b9d027a..425d3312 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -40,7 +40,8 @@ def weights_to_cpu(state_dict): def save_checkpoint(model: torch.nn.Module, filename: str, optimizer: Optional[Optimizer] = None, - meta: Optional[dict] = None) -> None: + meta: Optional[dict] = None, + with_meta: bool = True) -> None: """Save checkpoint to file. 
The checkpoint will have 3 fields: ``meta``, ``state_dict`` and @@ -65,10 +66,14 @@ def save_checkpoint(model: torch.nn.Module, # save class name to the meta meta.update(CLASSES=model.CLASSES) - checkpoint = { - 'meta': meta, - 'state_dict': weights_to_cpu(model.state_dict()) - } + if with_meta: + checkpoint = { + 'meta': meta, + 'state_dict': weights_to_cpu(model.state_dict()) + } + else: + checkpoint = weights_to_cpu(model.state_dict()) + # save optimizer state dict in the checkpoint if isinstance(optimizer, Optimizer): checkpoint['optimizer'] = optimizer.state_dict() @@ -141,7 +146,7 @@ def save_pretrained(model, # Save the ckpt to the save directory try: - save_function(model, output_ckpt_path) + save_function(model, output_ckpt_path, **kwargs) except Exception as e: raise Exception( f'During saving checkpoints, the error of "{type(e).__name__} ' diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index 42985db6..7d972118 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -9,6 +9,7 @@ import sys import tempfile import types from pathlib import Path +from types import FunctionType from typing import Dict, Union import addict @@ -638,6 +639,8 @@ class JSONIteratorEncoder(json.JSONEncoder): """ def default(self, obj): + if isinstance(obj, FunctionType): + return None try: iterable = iter(obj) except TypeError: diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py index 8cdfdf01..a561effe 100644 --- a/tests/trainers/test_finetune_text_generation.py +++ b/tests/trainers/test_finetune_text_generation.py @@ -128,15 +128,14 @@ class TestFinetuneTextGeneration(unittest.TestCase): @unittest.skip def test_finetune_cnndm(self): - from datasets import load_dataset - dataset_dict = load_dataset('ccdv/cnn_dailymail', '3.0.0') - train_dataset = dataset_dict['train'] \ - .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \ - .remove_columns('id') - eval_dataset = dataset_dict['validation'] \ - .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \ - .remove_columns('id') - num_warmup_steps = 2000 + from modelscope.msdatasets import MsDataset + dataset_dict = MsDataset.load('dureader_robust_qg') + train_dataset = dataset_dict['train'].to_hf_dataset() \ + .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'}) + eval_dataset = dataset_dict['validation'].to_hf_dataset() \ + .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'}) + num_warmup_steps = 200 + os.environ['LOCAL_RANK'] = '0' def noam_lambda(current_step: int): current_step += 1 @@ -154,12 +153,11 @@ class TestFinetuneTextGeneration(unittest.TestCase): return cfg kwargs = dict( - model=self.model_id, + model='damo/nlp_palm2.0_text-generation_chinese-base', train_dataset=train_dataset, eval_dataset=eval_dataset, work_dir=self.tmp_dir, - cfg_modify_fn=cfg_modify_fn, - model_revision='beta') + cfg_modify_fn=cfg_modify_fn) trainer = build_trainer( name=Trainers.nlp_base_trainer, default_args=kwargs) trainer.train() From b870e4eed541405380c6bbca78e44a06f947aae7 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Mon, 5 Sep 2022 13:26:30 +0800 Subject: [PATCH 20/28] [to #42322933] test: use custom config to reduce test time Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10011826 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 、 --- modelscope/models/audio/ans/complex_nn.py | 6 +++--- modelscope/models/audio/ans/unet.py | 5 +++-- tests/trainers/audio/test_ans_trainer.py | 
10 +++++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py index c61446c2..9768eff7 100644 --- a/modelscope/models/audio/ans/complex_nn.py +++ b/modelscope/models/audio/ans/complex_nn.py @@ -1,7 +1,7 @@ """ -class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d are the work of -Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ). -from https://github.com/sweetcocoa/DeepComplexUNetPyTorch +The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d + here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) +and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch """ import torch diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py index ae66eb69..3a9c5549 100644 --- a/modelscope/models/audio/ans/unet.py +++ b/modelscope/models/audio/ans/unet.py @@ -1,6 +1,7 @@ """ -Based on the work of Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ). -from https://github.com/sweetcocoa/DeepComplexUNetPyTorch +The implementation here is modified based on + Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) +and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch """ import torch import torch.nn as nn diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py index 176c811f..ed8cd1fe 100644 --- a/tests/trainers/audio/test_ans_trainer.py +++ b/tests/trainers/audio/test_ans_trainer.py @@ -8,12 +8,14 @@ from modelscope.metainfo import Trainers from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.audio.audio_utils import to_segment +from modelscope.utils.hub import read_config from modelscope.utils.test_utils import test_level SEGMENT_LENGTH_TEST = 640 class TestANSTrainer(unittest.TestCase): + REVISION = 'beta' def setUp(self): self.tmp_dir = tempfile.TemporaryDirectory().name @@ -21,6 +23,11 @@ class TestANSTrainer(unittest.TestCase): os.makedirs(self.tmp_dir) self.model_id = 'damo/speech_frcrn_ans_cirm_16k' + cfg = read_config(self.model_id, revision=self.REVISION) + cfg.train.max_epochs = 2 + cfg.train.dataloader.batch_size_per_gpu = 1 + self.cfg_file = os.path.join(self.tmp_dir, 'train_config.json') + cfg.dump(self.cfg_file) hf_ds = MsDataset.load( 'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset() @@ -39,12 +46,13 @@ class TestANSTrainer(unittest.TestCase): def test_trainer(self): kwargs = dict( model=self.model_id, - model_revision='beta', + model_revision=self.REVISION, train_dataset=self.dataset, eval_dataset=self.dataset, max_epochs=2, train_iters_per_epoch=2, val_iters_per_epoch=1, + cfg_file=self.cfg_file, work_dir=self.tmp_dir) trainer = build_trainer( From c25e60c67dc7891a21065e912b30e276c77ccf7e Mon Sep 17 00:00:00 2001 From: ly261666 Date: Mon, 5 Sep 2022 13:52:54 +0800 Subject: [PATCH 21/28] [to #42322933]add lazy load Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10011795 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [to #42322933] 新增FER人脸属性识别 --- .../facial_expression_recognition/__init__.py | 20 +++++++++++++++++++ .../fer/__init__.py | 2 ++ modelscope/pipelines/cv/__init__.py | 4 ++++ .../facial_expression_recognition_pipeline.py | 2 +- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git 
a/modelscope/models/cv/facial_expression_recognition/__init__.py b/modelscope/models/cv/facial_expression_recognition/__init__.py index e69de29b..35a15d18 100644 --- a/modelscope/models/cv/facial_expression_recognition/__init__.py +++ b/modelscope/models/cv/facial_expression_recognition/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .fer import FacialExpressionRecognition + +else: + _import_structure = {'fer': ['FacialExpressionRecognition']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py index e69de29b..2546035b 100644 --- a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py +++ b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .facial_expression_recognition import FacialExpressionRecognition diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index d3dba978..ac1ed82c 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -47,6 +47,8 @@ if TYPE_CHECKING: from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline + from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline + else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], @@ -105,6 +107,8 @@ else: ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': ['MovieSceneSegmentationPipeline'], + 'facial_expression_recognition_pipelin': + ['FacialExpressionRecognitionPipeline'] } import sys diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index 4a80878c..c5577dcf 100644 --- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -8,7 +8,7 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.cv.face_recognition.align_face import align_face -from modelscope.models.cv.facial_expression_recognition.fer.facial_expression_recognition import \ +from modelscope.models.cv.facial_expression_recognition import \ FacialExpressionRecognition from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline From f4ca0b8aabe916d010f62dab685625cd3c84c28a Mon Sep 17 00:00:00 2001 From: ly261666 Date: Mon, 5 Sep 2022 15:54:57 +0800 Subject: [PATCH 22/28] [to #42322933]add lazy import Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10012143 --- .../models/cv/face_detection/__init__.py | 22 +++++++++++++++++++ .../cv/face_detection/retinaface/__init__.py | 1 + modelscope/pipelines/cv/__init__.py | 2 ++ .../cv/retina_face_detection_pipeline.py | 7 ++++-- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py index e69de29b..a3c47164 
100644 --- a/modelscope/models/cv/face_detection/__init__.py +++ b/modelscope/models/cv/face_detection/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .retinaface import RetinaFaceDetection + +else: + _import_structure = { + 'retinaface': ['RetinaFaceDetection'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/face_detection/retinaface/__init__.py b/modelscope/models/cv/face_detection/retinaface/__init__.py index e69de29b..779aaf1c 100644 --- a/modelscope/models/cv/face_detection/retinaface/__init__.py +++ b/modelscope/models/cv/face_detection/retinaface/__init__.py @@ -0,0 +1 @@ +from .detection import RetinaFaceDetection diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index ac1ed82c..960ed621 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -47,6 +47,7 @@ if TYPE_CHECKING: from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline + from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline else: @@ -107,6 +108,7 @@ else: ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': ['MovieSceneSegmentationPipeline'], + 'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'], 'facial_expression_recognition_pipelin': ['FacialExpressionRecognitionPipeline'] } diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py index 20111c11..b8c64405 100644 --- a/modelscope/pipelines/cv/retina_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py @@ -1,10 +1,13 @@ import os.path as osp from typing import Any, Dict +import cv2 import numpy as np +import PIL +import torch from modelscope.metainfo import Pipelines -from modelscope.models.cv.face_detection.retinaface import detection +from modelscope.models.cv.face_detection import RetinaFaceDetection from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES @@ -28,7 +31,7 @@ class RetinaFaceDetectionPipeline(Pipeline): super().__init__(model=model, **kwargs) ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) logger.info(f'loading model from {ckpt_path}') - detector = detection.RetinaFaceDetection( + detector = RetinaFaceDetection( model_path=ckpt_path, device=self.device) self.detector = detector logger.info('load model done') From 042cff7d68dce03f12a010d8b3723395fccde998 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 5 Sep 2022 16:08:50 +0800 Subject: [PATCH 23/28] [to #44702084]fix: ci pip install domain in single commands, find with requirement install failed is complicated. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10014958 * [to #44702084]fix: ci pip install domain in single commands, find with requirement install failed is complicated. 
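
The change below replaces the bulk pip install -r calls with per-package installs (via awk and xargs) so that a failing dependency is surfaced on its own. A rough, hypothetical Python equivalent of that per-package strategy, for illustration only:

    import subprocess
    import sys

    INDEX = 'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html'

    def install_one_by_one(requirements_file):
        # Mirrors the awk '/^[^#]/ { print $1 }' | xargs -n 1 pip install loop:
        # skip comment lines and install each requirement in its own pip call.
        with open(requirements_file) as f:
            for line in f:
                pkg = line.split(':')[0].strip()
                if not pkg or pkg.startswith('#'):
                    continue
                subprocess.run(
                    [sys.executable, '-m', 'pip', 'install', pkg, '-f', INDEX],
                    check=True)

    for req in ('requirements.txt', 'requirements/audio.txt',
                'requirements/cv.txt', 'requirements/multi-modal.txt',
                'requirements/nlp.txt'):
        install_one_by_one(req)
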
--- .dev_scripts/ci_container_test.sh | 10 +++++----- .dev_scripts/citest.sh | 19 ------------------- tests/run_config.yaml | 5 +---- 3 files changed, 6 insertions(+), 28 deletions(-) delete mode 100644 .dev_scripts/citest.sh diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index a53c08c6..194a48b3 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -1,8 +1,8 @@ -pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html pip install -r requirements/tests.txt git config --global --add safe.directory /Maas-lib diff --git a/.dev_scripts/citest.sh b/.dev_scripts/citest.sh deleted file mode 100644 index c6e0905f..00000000 --- a/.dev_scripts/citest.sh +++ /dev/null @@ -1,19 +0,0 @@ -pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html - -pip install -r requirements/tests.txt -# install numpy<=1.18 for tensorflow==1.15.x -pip install "numpy<=1.18" - -# linter test -# use internal project for pre-commit due to the network problem -pre-commit run --all-files -if [ $? -ne 0 ]; then - echo "linter test failed, please run 'pre-commit run --all-files' to check" - exit -1 -fi - -PYTHONPATH=. python tests/run.py diff --git a/tests/run_config.yaml b/tests/run_config.yaml index 591dcd66..f44053f6 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -1,7 +1,4 @@ -# envs option allows fine-grained control for test executoin, for example, -# python tests/run.py --env pytorch -# would only trigger exeutions of all pytorch cases. -# envs option defaults to None for backward compatbility +# isolate cases in env, we can install different dependencies in each env. isolated: # test cases that may require excessive anmount of GPU memory, which will be executed in dedicagted process. 
- test_text_to_speech.py - test_multi_modal_embedding.py From f660a119f02cbf767521eea322f96faf2bb883c8 Mon Sep 17 00:00:00 2001 From: "xingjun.wxj" Date: Mon, 5 Sep 2022 16:19:45 +0800 Subject: [PATCH 24/28] [to #42322933]Add resumable and large data upload. CR Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9995250 1. add resumable dataset upload 2. add large data upload (up to 48.8TB) --- modelscope/msdatasets/ms_dataset.py | 8 +------ modelscope/msdatasets/utils/oss_utils.py | 24 +++++++++++++++------ modelscope/msdatasets/utils/upload_utils.py | 22 +++++++++---------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 338c6333..28a95643 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -574,14 +574,8 @@ class MsDataset: None """ - from modelscope.hub.api import HubApi - _hub_api = HubApi() - cookies = _hub_api.check_cookies_upload_data(use_cookies=True) _upload_manager = DatasetUploadManager( - dataset_name=dataset_name, - namespace=namespace, - version=version, - cookies=cookies) + dataset_name=dataset_name, namespace=namespace, version=version) _upload_manager.upload(object_name, local_file_path) @staticmethod diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py index 63a1cf77..9a7040a1 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -18,6 +18,12 @@ class OssUtilities: self.oss_dir = oss_config['Dir'] self.oss_backup_dir = oss_config['BackupDir'] + self.upload_resumable_tmp_store = '/tmp/modelscope/tmp_dataset' + self.upload_multipart_threshold = 50 * 1024 * 1024 + self.upload_part_size = 1 * 1024 * 1024 + self.upload_num_threads = 4 + self.upload_max_retries = 3 + @staticmethod def _percentage(consumed_bytes, total_bytes): if total_bytes: @@ -42,21 +48,27 @@ class OssUtilities: progress_callback=self._percentage) return local_path - def upload(self, oss_file_name: str, local_file_path: str) -> str: - max_retries = 3 + def upload(self, oss_object_name: str, local_file_path: str) -> str: retry_count = 0 - object_key = os.path.join(self.oss_dir, oss_file_name) + object_key = os.path.join(self.oss_dir, oss_object_name) + resumable_store = oss2.ResumableStore( + root=self.upload_resumable_tmp_store) while True: try: retry_count += 1 - self.bucket.put_object_from_file( + oss2.resumable_upload( + self.bucket, object_key, local_file_path, - progress_callback=self._percentage) + store=resumable_store, + multipart_threshold=self.upload_multipart_threshold, + part_size=self.upload_part_size, + progress_callback=self._percentage, + num_threads=self.upload_num_threads) break except Exception: - if retry_count >= max_retries: + if retry_count >= self.upload_max_retries: raise return object_key diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py index eff3aca0..fbe5c531 100644 --- a/modelscope/msdatasets/utils/upload_utils.py +++ b/modelscope/msdatasets/utils/upload_utils.py @@ -1,23 +1,21 @@ -from http.cookiejar import CookieJar - from .oss_utils import OssUtilities class DatasetUploadManager(object): - def __init__(self, dataset_name: str, namespace: str, version: str, - cookies: CookieJar): + def __init__(self, dataset_name: str, namespace: str, version: str): from modelscope.hub.api import HubApi - api = HubApi() - oss_config = api.get_dataset_access_config_session( - cookies=cookies, + _hub_api 
= HubApi() + _cookies = _hub_api.check_cookies_upload_data(use_cookies=True) + _oss_config = _hub_api.get_dataset_access_config_session( + cookies=_cookies, dataset_name=dataset_name, namespace=namespace, revision=version) - self.oss_utilities = OssUtilities(oss_config) + self.oss_utilities = OssUtilities(_oss_config) - def upload(self, oss_file_name: str, local_file_path: str) -> str: - oss_object_key = self.oss_utilities.upload( - oss_file_name=oss_file_name, local_file_path=local_file_path) - return oss_object_key + def upload(self, object_name: str, local_file_path: str) -> str: + object_key = self.oss_utilities.upload( + oss_object_name=object_name, local_file_path=local_file_path) + return object_key From 4484dcaa04ca49b7e90954b032118922ee7811ba Mon Sep 17 00:00:00 2001 From: "liangting.zl" Date: Mon, 5 Sep 2022 16:42:40 +0800 Subject: [PATCH 25/28] [to #42322933] feat: add hand keypoints pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9961906 * feat: add hand keypoints pipeline --- data/test/images/hand_keypoints.jpg | 3 ++ modelscope/metainfo.py | 1 + modelscope/outputs.py | 15 ++++++ modelscope/pipelines/builder.py | 3 ++ modelscope/pipelines/cv/__init__.py | 2 + .../cv/hand_2d_keypoints_pipeline.py | 51 +++++++++++++++++++ modelscope/utils/constant.py | 1 + tests/pipelines/test_hand_2d_keypoints.py | 45 ++++++++++++++++ 8 files changed, 121 insertions(+) create mode 100644 data/test/images/hand_keypoints.jpg create mode 100644 modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py create mode 100644 tests/pipelines/test_hand_2d_keypoints.py diff --git a/data/test/images/hand_keypoints.jpg b/data/test/images/hand_keypoints.jpg new file mode 100644 index 00000000..cb445c26 --- /dev/null +++ b/data/test/images/hand_keypoints.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c05d58edee7398de37b8e479410676d6b97cfde69cc003e8356a348067e71988 +size 7750 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 47608d02..3ac2f2df 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -112,6 +112,7 @@ class Pipelines(object): hicossl_video_embedding = 'hicossl-s3dg-video_embedding' body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' body_3d_keypoints = 'canonical_body-3d-keypoints_video' + hand_2d_keypoints = 'hrnetv2w18_hand-2d-keypoints_image' human_detection = 'resnet18-human-detection' object_detection = 'vit-object-detection' easycv_detection = 'easycv-detection' diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 50668693..c6a7a619 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -219,6 +219,21 @@ TASK_OUTPUTS = { # } Tasks.body_3d_keypoints: [OutputKeys.POSES], + # 2D hand keypoints result for single sample + # { + # "keypoints": [ + # [[x, y, score] * 21], + # [[x, y, score] * 21], + # [[x, y, score] * 21], + # ], + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # } + Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES], + # video single object tracking result for single video # { # "boxes": [ diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 6f901154..9f265fb8 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -99,6 +99,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_hrnetv2w32_body-2d-keypoints_image'), Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints, 'damo/cv_canonical_body-3d-keypoints_video'), + Tasks.hand_2d_keypoints: + 
(Pipelines.hand_2d_keypoints, + 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), Tasks.face_detection: (Pipelines.face_detection, 'damo/cv_resnet_facedetection_scrfd10gkps'), Tasks.face_recognition: (Pipelines.face_recognition, diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 960ed621..72a225ff 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from .animal_recognition_pipeline import AnimalRecognitionPipeline from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline + from .hand_2d_keypoints_pipeline import Hand2DKeypointsPipeline from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline from .crowd_counting_pipeline import CrowdCountingPipeline @@ -57,6 +58,7 @@ else: 'animal_recognition_pipeline': ['AnimalRecognitionPipeline'], 'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'], 'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'], + 'hand_2d_keypoints_pipeline': ['Hand2DKeypointsPipeline'], 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], 'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'], 'crowd_counting_pipeline': ['CrowdCountingPipeline'], diff --git a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py new file mode 100644 index 00000000..db66f5d2 --- /dev/null +++ b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path + +from modelscope.metainfo import Pipelines +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks +from .easycv_pipelines.base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.hand_2d_keypoints, module_name=Pipelines.hand_2d_keypoints) +class Hand2DKeypointsPipeline(EasyCVPipeline): + """Pipeline for hand pose keypoint task.""" + + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. 
+ """ + self.model_dir = model + super(Hand2DKeypointsPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) + + def _build_predict_op(self): + """Build EasyCV predictor.""" + from easycv.predictors.builder import build_predictor + detection_predictor_type = self.cfg['DETECTION']['type'] + detection_model_path = os.path.join( + self.model_dir, self.cfg['DETECTION']['model_path']) + detection_cfg_file = os.path.join(self.model_dir, + self.cfg['DETECTION']['config_file']) + detection_score_threshold = self.cfg['DETECTION']['score_threshold'] + self.cfg.pipeline.predictor_config[ + 'detection_predictor_config'] = dict( + type=detection_predictor_type, + model_path=detection_model_path, + config_file=detection_cfg_file, + score_threshold=detection_score_threshold) + easycv_config = self._to_easycv_config() + pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, { + 'model_path': self.model_path, + 'config_file': easycv_config + }) + return pipeline_op diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 32185fb9..47d38dd7 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -27,6 +27,7 @@ class CVTasks(object): face_image_generation = 'face-image-generation' body_2d_keypoints = 'body-2d-keypoints' body_3d_keypoints = 'body-3d-keypoints' + hand_2d_keypoints = 'hand-2d-keypoints' general_recognition = 'general-recognition' image_classification = 'image-classification' diff --git a/tests/pipelines/test_hand_2d_keypoints.py b/tests/pipelines/test_hand_2d_keypoints.py new file mode 100644 index 00000000..86cd2d06 --- /dev/null +++ b/tests/pipelines/test_hand_2d_keypoints.py @@ -0,0 +1,45 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class Hand2DKeypointsPipelineTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_hand_2d_keypoints(self): + img_path = 'data/test/images/hand_keypoints.jpg' + model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody' + + hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints, model=model_id) + outputs = hand_keypoint(img_path) + self.assertEqual(len(outputs), 1) + + results = outputs[0] + self.assertIn(OutputKeys.KEYPOINTS, results.keys()) + self.assertIn(OutputKeys.BOXES, results.keys()) + self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) + self.assertEqual(results[OutputKeys.KEYPOINTS].shape[2], 3) + self.assertEqual(results[OutputKeys.BOXES].shape[1], 4) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_hand_2d_keypoints_with_default_model(self): + img_path = 'data/test/images/hand_keypoints.jpg' + + hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints) + outputs = hand_keypoint(img_path) + self.assertEqual(len(outputs), 1) + + results = outputs[0] + self.assertIn(OutputKeys.KEYPOINTS, results.keys()) + self.assertIn(OutputKeys.BOXES, results.keys()) + self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) + self.assertEqual(results[OutputKeys.KEYPOINTS].shape[2], 3) + self.assertEqual(results[OutputKeys.BOXES].shape[1], 4) + + +if __name__ == '__main__': + unittest.main() From 83dbf713020b7c45cd22b0ebcc366eb73ec5d899 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 5 Sep 2022 17:38:05 +0800 Subject: [PATCH 26/28] 
[to #44702084]fix: ci pip install domain in single commands, find with requirement install failed is complicated. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10019738 * [to #44702084]fix: ci pip install domain in single commands, find with requirement install failed is complicated. --- .dev_scripts/ci_container_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 194a48b3..129a6c25 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -1,4 +1,4 @@ -awk -F: '/^[^#]/ { print $1 }' requirements.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html From 3d3f9b45377abad27b9e9272ee294a2f2ee50ea9 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Mon, 5 Sep 2022 17:51:22 +0800 Subject: [PATCH 27/28] [to #42322933] fix checkpoint format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Fix the issue in the palm, gpt3 and mplug models where a checkpoint saved after finetuning has different key names from the original checkpoint and therefore cannot be loaded with from_pretrained. 2. Change test_finetune_mplug.py to save only the checkpoint at the end of training, to cut CI time. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10016517 --- .../multi_modal/mplug/modeling_mplug.py | 10 +++--- modelscope/models/nlp/gpt3/modeling_gpt3.py | 4 +++ .../models/nlp/palm_v2/modeling_palm.py | 16 ++++----- tests/trainers/test_finetune_mplug.py | 33 ++++++++++--------- 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py index 78f60f9b..f469c218 100755 --- a/modelscope/models/multi_modal/mplug/modeling_mplug.py +++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py @@ -1867,11 +1867,13 @@ class MPlug(PreTrainedModel): ModelFile.TORCH_MODEL_BIN_FILE) checkpoint = torch.load(checkpoint_path, map_location='cpu') if 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint['module'] + checkpoint = checkpoint['model'] + checkpoint = { + k.replace('model.', ''): v + for k, v in checkpoint.items() + } - msg = model.load_state_dict(state_dict, strict=False) + msg = model.load_state_dict(checkpoint, strict=False) print('load checkpoint from %s' % checkpoint_path) print(msg) return model diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index 4e30f697..69e9ba7c 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -339,5 +339,9 @@ class GPT3Model(PreTrainedModel): state_dict_file = os.path.join(pretrained_model_name_or_path, ModelFile.TORCH_MODEL_BIN_FILE) state_dict = torch.load(state_dict_file) + state_dict = { + k.replace('model.language_model', 'language_model'): v + for k, v in state_dict.items() + } model.load_state_dict(state_dict) return model diff
--git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py index ff6fd732..99b00454 100644 --- a/modelscope/models/nlp/palm_v2/modeling_palm.py +++ b/modelscope/models/nlp/palm_v2/modeling_palm.py @@ -592,11 +592,11 @@ class AbsSummarizer(PalmPreTrainedModel): # Model self.generator.dense.weight = self.decoder.embeddings.weight if checkpoint is not None: - for key in list(checkpoint['model'].keys()): - checkpoint['model'][key.replace('module.', - '')] = checkpoint['model'][key] - msg = self.load_state_dict(checkpoint['model'], strict=False) - print(msg) + if 'model' in checkpoint: + checkpoint = checkpoint['model'] + for key in list(checkpoint.keys()): + checkpoint[key.replace('model.palm.', '')] = checkpoint[key] + self.load_state_dict(checkpoint, strict=False) else: for module in self.decoder.modules(): if isinstance(module, (nn.Linear, nn.Embedding)): @@ -734,7 +734,7 @@ class PalmForConditionalGeneration(PalmPreTrainedModel): return addict.Dict(loss=loss) -class Translator(nn.Module): +class Translator(object): """ Uses a model to translate a batch of sentences. """ @@ -1298,8 +1298,8 @@ class Translator(nn.Module): return results - def forward(self, input_ids: torch.Tensor, - attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: + def __call__(self, input_ids: torch.Tensor, + attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: batch = self.Batch( batch_size=input_ids.size()[0], src=input_ids, diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py index b46dbf45..72196fba 100644 --- a/tests/trainers/test_finetune_mplug.py +++ b/tests/trainers/test_finetune_mplug.py @@ -41,6 +41,18 @@ class TestFinetuneMPlug(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() + def _cfg_modify_fn(self, cfg): + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': self.max_epochs + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }] + return cfg + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer_with_caption(self): kwargs = dict( @@ -48,15 +60,12 @@ class TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + cfg_modify_fn=self._cfg_modify_fn) trainer: EpochBasedTrainer = build_trainer( name=Trainers.nlp_base_trainer, default_args=kwargs) trainer.train() - results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(self.max_epochs): - self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_caption_with_model_and_args(self): @@ -86,15 +95,12 @@ class TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + cfg_modify_fn=self._cfg_modify_fn) trainer: EpochBasedTrainer = build_trainer( name=Trainers.nlp_base_trainer, default_args=kwargs) trainer.train() - results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(self.max_epochs): - self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_vqa_with_model_and_args(self): @@ -124,15 +130,12 @@ class 
TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + cfg_modify_fn=self._cfg_modify_fn) trainer: EpochBasedTrainer = build_trainer( name=Trainers.nlp_base_trainer, default_args=kwargs) trainer.train() - results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(self.max_epochs): - self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_retrieval_with_model_and_args(self): From e365023862995b921f74d902a69667933fa58060 Mon Sep 17 00:00:00 2001 From: "feiwu.yfw" Date: Mon, 5 Sep 2022 19:36:46 +0800 Subject: [PATCH 28/28] Fix the exception raised when torch.tensor outputs from the processor are converted to numpy Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10021802 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix to_torch_dataset --- modelscope/msdatasets/ms_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 28a95643..691db4fe 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -70,12 +70,12 @@ class MsIterableDataset(torch.utils.data.IterableDataset): for idx in range(iter_start, iter_end): item_dict = self.dataset[idx] res = { - k: np.array(item_dict[k]) + k: torch.tensor(item_dict[k]) for k in self.columns if k in self.retained_columns } for preprocessor in self.preprocessor_list: res.update({ - k: np.array(v) + k: torch.tensor(v) for k, v in preprocessor(item_dict).items() if k in self.retained_columns })
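The last hunk above swaps np.array(...) for torch.tensor(...) where MsIterableDataset gathers preprocessor outputs. The following is a small self-contained illustration of the difference, using a made-up stand-in preprocessor rather than a real modelscope one: values that are already torch.Tensor stay tensors instead of being silently turned into numpy arrays before they reach the torch DataLoader.

# Illustration only; fake_preprocessor is a stand-in, not a modelscope class.
import numpy as np
import torch


def fake_preprocessor(item):
    # Pretend this preprocessor already returns torch tensors, as many
    # modelscope preprocessors do.
    return {'input_ids': torch.tensor([101, 2054, 102]), 'label': item['label']}


out = fake_preprocessor({'label': 1})

# Old behaviour: every value is forced through np.array, so tensors come
# back out as numpy arrays.
old_style = {k: np.array(v) for k, v in out.items()}
# New behaviour: torch.tensor copies the value but keeps it a torch.Tensor
# (PyTorch may warn that t.clone().detach() is preferred for tensor inputs).
new_style = {k: torch.tensor(v) for k, v in out.items()}

print(type(old_style['input_ids']))  # <class 'numpy.ndarray'>
print(type(new_style['input_ids']))  # <class 'torch.Tensor'>

For preprocessor outputs that live on GPU or require grad, the numpy conversion is also where an exception would be raised, which appears to be the failure the commit title describes.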