From 0b16dbd66cbe6c1b7281370e3b3c1e19226ff95c Mon Sep 17 00:00:00 2001
From: "qianming.lm"
Date: Wed, 27 Jul 2022 23:42:27 +0800
Subject: [PATCH] [to #42322933] add cv_resnet50_video-category to maas lib
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a video content classification model.

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9479334
---
 .../test/videos/video_category_test_video.mp4 |   3 +
 modelscope/metainfo.py                        |   1 +
 modelscope/outputs.py                         |   7 +
 modelscope/pipelines/builder.py               |   2 +
 modelscope/pipelines/cv/__init__.py           |   2 +
 .../pipelines/cv/video_category_pipeline.py   | 397 ++++++++++++++++++
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_video_category.py        |  22 +
 8 files changed, 435 insertions(+)
 create mode 100644 data/test/videos/video_category_test_video.mp4
 create mode 100644 modelscope/pipelines/cv/video_category_pipeline.py
 create mode 100644 tests/pipelines/test_video_category.py

diff --git a/data/test/videos/video_category_test_video.mp4 b/data/test/videos/video_category_test_video.mp4
new file mode 100644
index 00000000..195af371
--- /dev/null
+++ b/data/test/videos/video_category_test_video.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc935328ecace53338050a6789250e08b9d17a52efa2339b0e133edc1fae9d4
+size 3943349
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index c6858794..0a80876d 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -76,6 +76,7 @@ class Pipelines(object):
     face_image_generation = 'gan-face-image-generation'
     style_transfer = 'AAMS-style-transfer'
     image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
+    video_category = 'video-category'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index ed2d680d..975a548f 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -85,6 +85,13 @@ TASK_OUTPUTS = {
     #   }
     Tasks.action_recognition: [OutputKeys.LABELS],
 
+    # video category recognition result for single video
+    # {
+    #   "scores": [0.7716429233551025],
+    #   "labels": ['生活>>好物推荐'],
+    # }
+    Tasks.video_category: [OutputKeys.SCORES, OutputKeys.LABELS],
+
     # pose estimation result for single sample
     # {
     #   "poses": np.array with shape [num_pose, num_keypoint, 3],
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index eb5f0e6d..c580bb72 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -61,6 +61,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
     Tasks.action_recognition: (Pipelines.action_recognition,
                                'damo/cv_TAdaConv_action-recognition'),
+    Tasks.video_category: (Pipelines.video_category,
+                           'damo/cv_resnet50_video-category'),
     Tasks.multi_modal_embedding:
     (Pipelines.multi_modal_embedding,
      'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding'),
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 5d5f93c1..5aa8f7d0 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -14,6 +14,7 @@ if TYPE_CHECKING:
     from .image_color_enhance_pipeline import ImageColorEnhancePipeline
     from .image_colorization_pipeline import ImageColorizationPipeline
     from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline
+    from .video_category_pipeline import VideoCategoryPipeline
     from .image_matting_pipeline import ImageMattingPipeline
     from .image_super_resolution_pipeline import ImageSuperResolutionPipeline
     from .style_transfer_pipeline import StyleTransferPipeline
@@ -38,6 +39,7 @@ else:
         'ocr_detection_pipeline': ['OCRDetectionPipeline'],
         'image_instance_segmentation_pipeline':
         ['ImageInstanceSegmentationPipeline'],
+        'video_category_pipeline': ['VideoCategoryPipeline'],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/video_category_pipeline.py b/modelscope/pipelines/cv/video_category_pipeline.py
new file mode 100644
index 00000000..2d38031a
--- /dev/null
+++ b/modelscope/pipelines/cv/video_category_pipeline.py
@@ -0,0 +1,397 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+import torchvision.transforms.functional as TF
+from PIL import Image
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_category, module_name=Pipelines.video_category)
+class VideoCategoryPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a video-category pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading configuration from {config_path}')
+        with open(config_path, 'r') as f:
+            config = json.load(f)
+            self.frame_num = config['frame_num']
+            self.level_1_num = config['level_1_num']
+            self.level_2_num = config['level_2_num']
+            self.resize = config['resize']
+            self.crop = config['crop']
+            self.mean = config['mean']
+            self.std = config['std']
+            self.cateproj_v3 = config['cateproj_v3']
+            self.class_name = config['class_name']
+            self.subclass_name = config['subclass_name']
+        logger.info('load configuration done')
+
+        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        self.infer_model = ModelWrapper(self.level_1_num, self.level_2_num,
+                                        self.frame_num)
+        self.device = torch.device(
+            'cuda' if torch.cuda.is_available() else 'cpu')
+        self.infer_model = self.infer_model.to(self.device).eval()
+        self.infer_model.load_state_dict(
+            torch.load(model_path, map_location=self.device))
+        logger.info('load model done')
+        self.transforms = VCompose([
+            VRescale(size=self.resize),
+            VCenterCrop(size=self.crop),
+            VToTensor(),
+            VNormalize(mean=self.mean, std=self.std)
+        ])
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            import decord
+            from decord import VideoReader, cpu
+            decord.bridge.set_bridge('native')
+            vr = VideoReader(input, ctx=cpu(0))
+            indices = np.linspace(0, len(vr) - 1, self.frame_num).astype(int)
+            frames = vr.get_batch(indices).asnumpy()
+            video_input_data = self.transforms(
+                [Image.fromarray(f) for f in frames])
+        else:
+            raise TypeError(f'input should be a str,'
+                            f' but got {type(input)}')
+        result = {'video_data': video_input_data}
+        return result
+
+    @torch.no_grad()
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        pred1, pred2 = self.infer_model(input['video_data'].to(self.device))
+
+        pred1 = F.softmax(pred1, dim=1)
+        pred2 = F.softmax(pred2, dim=1)
+
+        vals_2, preds_2 = pred2.cpu().topk(10, 1, True, True)
+        vals_2 = vals_2.detach().numpy()
+        preds_2 = preds_2.detach().numpy()
+
+        if vals_2[0][0] >= 0.3:
+            c2 = int(preds_2[0][0])
+            c1 = self.cateproj_v3[c2]
+
+            tag1 = self.class_name[c1]
+            tag2 = self.subclass_name[c2]
+
+            prob = float(vals_2[0][0])
+        else:
+            vals_1, preds_1 = pred1.cpu().topk(10, 1, True, True)
+            vals_1 = vals_1.detach().numpy()
+            preds_1 = preds_1.detach().numpy()
+
+            c1 = int(preds_1[0][0])
+
+            tag1 = self.class_name[c1]
+            tag2 = '其他'
+
+            prob = float(vals_1[0][0])
+
+        return {
+            OutputKeys.SCORES: [prob],
+            OutputKeys.LABELS: [tag1 + '>>' + tag2]
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
+
+
+class TimeFirstBatchNorm1d(nn.Module):
+
+    def __init__(self, dim, groups=None):
+        super().__init__()
+        self.groups = groups
+        self.bn = nn.BatchNorm1d(dim)
+
+    def forward(self, tensor):
+        _, length, dim = tensor.size()
+        if self.groups:
+            dim = dim // self.groups
+        tensor = tensor.view(-1, dim)
+        tensor = self.bn(tensor)
+        if self.groups:
+            return tensor.view(-1, length, self.groups, dim)
+        else:
+            return tensor.view(-1, length, dim)
+
+
+class NeXtVLAD(nn.Module):
+    """NeXtVLAD layer implementation
+    Adapted from https://github.com/linrongc/youtube-8m/blob/master/nextvlad.py
+    """
+
+    def __init__(self,
+                 num_clusters=64,
+                 dim=128,
+                 alpha=100.0,
+                 groups=8,
+                 expansion=2,
+                 normalize_input=True,
+                 p_drop=0.25,
+                 add_batchnorm=False):
+        """
+        Args:
+            num_clusters : int
+                The number of clusters
+            dim : int
+                Dimension of descriptors
+            alpha : float
+                Parameter of initialization. Larger value is harder assignment.
+            normalize_input : bool
+                If true, descriptor-wise L2 normalization is applied to input.
+        """
+        super(NeXtVLAD, self).__init__()
+        assert dim % groups == 0, '`dim` must be divisible by `groups`'
+        assert expansion > 1
+        self.p_drop = p_drop
+        self.cluster_dropout = nn.Dropout2d(p_drop)
+        self.num_clusters = num_clusters
+        self.dim = dim
+        self.expansion = expansion
+        self.grouped_dim = dim * expansion // groups
+        self.groups = groups
+        self.alpha = alpha
+        self.normalize_input = normalize_input
+        self.add_batchnorm = add_batchnorm
+        self.expansion_mapper = nn.Linear(dim, dim * expansion)
+        if add_batchnorm:
+            self.soft_assignment_mapper = nn.Sequential(
+                nn.Linear(dim * expansion, num_clusters * groups, bias=False),
+                TimeFirstBatchNorm1d(num_clusters, groups=groups))
+        else:
+            self.soft_assignment_mapper = nn.Linear(
+                dim * expansion, num_clusters * groups, bias=True)
+        self.attention_mapper = nn.Linear(dim * expansion, groups)
+        self.centroids = nn.Parameter(
+            torch.rand(num_clusters, self.grouped_dim))
+        self.final_bn = nn.BatchNorm1d(num_clusters * self.grouped_dim)
+        self._init_params()
+
+    def _init_params(self):
+        for component in (self.soft_assignment_mapper, self.attention_mapper,
+                          self.expansion_mapper):
+            for module in component.modules():
+                self.general_weight_initialization(module)
+        if self.add_batchnorm:
+            self.soft_assignment_mapper[0].weight = nn.Parameter(
+                (2.0 * self.alpha * self.centroids).repeat(
+                    (self.groups, self.groups)))
+            nn.init.constant_(self.soft_assignment_mapper[1].bn.weight, 1)
+            nn.init.constant_(self.soft_assignment_mapper[1].bn.bias, 0)
+        else:
+            self.soft_assignment_mapper.weight = nn.Parameter(
+                (2.0 * self.alpha * self.centroids).repeat(
+                    (self.groups, self.groups)))
+            self.soft_assignment_mapper.bias = nn.Parameter(
+                (-self.alpha * self.centroids.norm(dim=1)).repeat(
+                    (self.groups, )))
+
+    def general_weight_initialization(self, module):
+        if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
+            if module.weight is not None:
+                nn.init.uniform_(module.weight)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+
+    def forward(self, x, masks=None):
+        """NeXtVlad Adaptive Pooling
+        Arguments:
+            x {torch.Tensor} -- shape: (n_batch, len, dim)
+        Returns:
+            torch.Tensor -- shape (n_batch, n_cluster * dim / groups)
+        """
+        if self.normalize_input:
+            x = F.normalize(x, p=2, dim=2)  # across descriptor dim
+
+        # expansion
+        # shape: (n_batch, len, dim * expansion)
+        x = self.expansion_mapper(x)
+
+        # soft-assignment
+        # shape: (n_batch, len, n_cluster, groups)
+        soft_assign = self.soft_assignment_mapper(x).view(
+            x.size(0), x.size(1), self.num_clusters, self.groups)
+        soft_assign = F.softmax(soft_assign, dim=2)
+
+        # attention
+        # shape: (n_batch, len, groups)
+        attention = torch.sigmoid(self.attention_mapper(x))
+        if masks is not None:
+            # shape: (n_batch, len, groups)
+            attention = attention * masks[:, :, None]
+
+        # (n_batch, len, n_cluster, groups, dim / groups)
+        activation = (
+            attention[:, :, None, :, None] * soft_assign[:, :, :, :, None])
+
+        # calculate residuals to each clusters
+        # (n_batch, n_cluster, dim / groups)
+        second_term = (
+            activation.sum(dim=3).sum(dim=1) * self.centroids[None, :, :])
+        # (n_batch, n_cluster, dim / groups)
+        first_term = (
+            # (n_batch, len, n_cluster, groups, dim / groups)
+            activation
+            * x.view(x.size(0), x.size(1), 1, self.groups,
+                     self.grouped_dim)).sum(dim=3).sum(dim=1)
+
+        # vlad shape (n_batch, n_cluster, dim / groups)
+        vlad = first_term - second_term
+        vlad = F.normalize(vlad, p=2, dim=2)  # intra-normalization
+        # flatten shape (n_batch, n_cluster * dim / groups)
+        vlad = vlad.view(x.size(0), -1)  # flatten
+        # vlad = F.normalize(vlad, p=2, dim=1)  # L2 normalize
+        vlad = self.final_bn(vlad)
+        if self.p_drop:
+            vlad = self.cluster_dropout(
+                vlad.view(x.size(0), self.num_clusters, self.grouped_dim,
+                          1)).view(x.size(0), -1)
+        return vlad
+
+
+class ModelWrapper(nn.Module):
+
+    def __init__(self, class_num, subclass_num, frame_num):
+        super(ModelWrapper, self).__init__()
+        cnn = models.resnet50(pretrained=False)
+        cnn.fc = nn.Sequential()
+        self.model = cnn
+        # Use NextVlad
+        # output size: (n_batch, n_cluster * dim / groups)
+        nv_group = 2
+        expand = int(2 * frame_num / nv_group)
+        self.nextvlad = NeXtVLAD(
+            num_clusters=frame_num, dim=2048, groups=nv_group)
+        self.fc = nn.Linear(2048 * expand, 2048)
+        self.head1_p1 = nn.Sequential(
+            nn.Linear(2048, 2048),
+            nn.ReLU(),
+            nn.Linear(2048, 1024),
+        )
+        self.head1_p2 = nn.Sequential(
+            nn.Linear(1024, 1024),
+            nn.ReLU(),
+            nn.Linear(1024, class_num),
+        )
+        self.head2_p1 = nn.Sequential(
+            nn.Linear(2048, 2048),
+            nn.ReLU(),
+            nn.Linear(2048, 1024),
+        )
+        self.head2_p2 = nn.Sequential(
+            nn.Linear(2048, 1024),
+            nn.ReLU(),
+            nn.Linear(1024, subclass_num),
+        )
+        self.fn = frame_num
+
+    def forward(self, x):
+        x = x.view(-1, 3, 224, 224)
+        x = self.model(x)
+
+        x = x.view(-1, self.fn, 2048)
+        x = self.nextvlad(x)
+
+        x = self.fc(x)
+
+        x1 = self.head1_p1(x)
+        c1 = self.head1_p2(x1)
+
+        x2 = self.head2_p1(x)
+        c2 = self.head2_p2(torch.cat((x1, x2), dim=1))
+
+        return c1, c2
+
+
+class VCompose(object):
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, item):
+        for t in self.transforms:
+            item = t(item)
+        return item
+
+
+class VRescale(object):
+
+    def __init__(self, size=128):
+        self.size = size
+
+    def __call__(self, vclip):
+        w, h = vclip[0].size
+        scale = self.size / min(w, h)
+        out_w, out_h = int(round(w * scale)), int(round(h * scale))
+        vclip = [u.resize((out_w, out_h), Image.BILINEAR) for u in vclip]
+        return vclip
+
+
+class VCenterCrop(object):
+
+    def __init__(self, size=112):
+        self.size = size
+
+    def __call__(self, vclip):
+        w, h = vclip[0].size
+        assert min(w, h) >= self.size
+        x1 = (w - self.size) // 2
+        y1 = (h - self.size) // 2
+        vclip = [
+            u.crop((x1, y1, x1 + self.size, y1 + self.size)) for u in vclip
+        ]
+        return vclip
+
+
+class VToTensor(object):
+
+    def __call__(self, vclip):
+        vclip = torch.stack([TF.to_tensor(u) for u in vclip], dim=0)
+        return vclip
+
+
+class VNormalize(object):
+
+    def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, vclip):
+        assert vclip.min() > -0.1 and vclip.max() < 1.1, \
+            'vclip values should be in [0, 1]'
+        vclip = vclip.clone()
+        if not isinstance(self.mean, torch.Tensor):
+            self.mean = vclip.new_tensor(self.mean).view(1, -1, 1, 1)
+        if not isinstance(self.std, torch.Tensor):
+            self.std = vclip.new_tensor(self.std).view(1, -1, 1, 1)
+        vclip.sub_(self.mean).div_(self.std)
+        return vclip
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index fafb762f..54035b9e 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -34,6 +34,7 @@ class CVTasks(object):
     face_image_generation = 'face-image-generation'
     image_super_resolution = 'image-super-resolution'
     style_transfer = 'style-transfer'
+    video_category = 'video-category'
 
     image_classification_imagenet = 'image-classification-imagenet'
     image_classification_dailylife = 'image-classification-dailylife'
diff --git a/tests/pipelines/test_video_category.py b/tests/pipelines/test_video_category.py
new file mode 100644
index 00000000..aba56676
--- /dev/null
+++ b/tests/pipelines/test_video_category.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class VideoCategoryTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        category_pipeline = pipeline(
+            Tasks.video_category, model='damo/cv_resnet50_video-category')
+        result = category_pipeline(
+            'data/test/videos/video_category_test_video.mp4')
+
+        print(f'video category output: {result}.')
+
+
+if __name__ == '__main__':
+    unittest.main()
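
Note for reviewers trying this locally: VideoCategoryPipeline.__init__ reads ten keys from the model's configuration.json (frame_num, level_1_num, level_2_num, resize, crop, mean, std, cateproj_v3, class_name, subclass_name). The sketch below only illustrates that contract; the key names come from the code in this patch, while every value is an assumption for illustration. The real file ships with damo/cv_resnet50_video-category on the hub.

# Sketch of the configuration.json contract read by VideoCategoryPipeline.__init__.
# Key names come from this patch; every value below is an illustrative assumption.
config = {
    'frame_num': 16,  # frames sampled per video (assumed; also the NeXtVLAD cluster count)
    'level_1_num': 28,  # size of the coarse (level-1) label set (assumed)
    'level_2_num': 180,  # size of the fine (level-2) label set (assumed)
    'resize': 256,  # short-side size used by VRescale (assumed)
    'crop': 224,  # VCenterCrop size; matches x.view(-1, 3, 224, 224) in ModelWrapper
    'mean': [0.485, 0.456, 0.406],  # same ImageNet statistics as the VNormalize defaults
    'std': [0.229, 0.224, 0.225],
    'cateproj_v3': [0, 0, 1],  # level-2 index -> parent level-1 index (truncated example)
    'class_name': ['生活'],  # level-1 label names (truncated example)
    'subclass_name': ['好物推荐'],  # level-2 label names (truncated example)
}

cateproj_v3 maps each level-2 index to its parent level-1 index; forward() uses it to recover the coarse label when the fine prediction is confident (top level-2 score >= 0.3), and otherwise falls back to the level-1 head with '其他' (Other) as the fine label.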