From 0b16dbd66cbe6c1b7281370e3b3c1e19226ff95c Mon Sep 17 00:00:00 2001
From: "qianming.lm"
Date: Wed, 27 Jul 2022 23:42:27 +0800
Subject: [PATCH] [to #42322933] add cv_resnet50_video-category to maas lib
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a video content classification model.

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9479334
---
 .../test/videos/video_category_test_video.mp4 |   3 +
 modelscope/metainfo.py                        |   1 +
 modelscope/outputs.py                         |   7 +
 modelscope/pipelines/builder.py               |   2 +
 modelscope/pipelines/cv/__init__.py           |   2 +
 .../pipelines/cv/video_category_pipeline.py   | 397 ++++++++++++++++++
 modelscope/utils/constant.py                  |   1 +
 tests/pipelines/test_video_category.py        |  22 +
 8 files changed, 435 insertions(+)
 create mode 100644 data/test/videos/video_category_test_video.mp4
 create mode 100644 modelscope/pipelines/cv/video_category_pipeline.py
 create mode 100644 tests/pipelines/test_video_category.py

diff --git a/data/test/videos/video_category_test_video.mp4 b/data/test/videos/video_category_test_video.mp4
new file mode 100644
index 00000000..195af371
--- /dev/null
+++ b/data/test/videos/video_category_test_video.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc935328ecace53338050a6789250e08b9d17a52efa2339b0e133edc1fae9d4
+size 3943349
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index c6858794..0a80876d 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -76,6 +76,7 @@ class Pipelines(object):
     face_image_generation = 'gan-face-image-generation'
     style_transfer = 'AAMS-style-transfer'
     image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
+    video_category = 'video-category'
 
     # nlp tasks
     sentence_similarity = 'sentence-similarity'
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index ed2d680d..975a548f 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -85,6 +85,13 @@ TASK_OUTPUTS = {
     #   }
     Tasks.action_recognition: [OutputKeys.LABELS],
 
+    # video category recognition result for single video
+    # {
+    #   "scores": [0.7716429233551025],
+    #   "labels": ['生活>>好物推荐'],
+    # }
+    Tasks.video_category: [OutputKeys.SCORES, OutputKeys.LABELS],
+
     # pose estimation result for single sample
     # {
     #   "poses": np.array with shape [num_pose, num_keypoint, 3],
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index eb5f0e6d..c580bb72 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -61,6 +61,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
     Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
     Tasks.action_recognition: (Pipelines.action_recognition,
                                'damo/cv_TAdaConv_action-recognition'),
+    Tasks.video_category: (Pipelines.video_category,
+                           'damo/cv_resnet50_video-category'),
     Tasks.multi_modal_embedding:
     (Pipelines.multi_modal_embedding,
      'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding'),
diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index 5d5f93c1..5aa8f7d0 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -14,6 +14,7 @@ if TYPE_CHECKING:
     from .image_color_enhance_pipeline import ImageColorEnhancePipeline
     from .image_colorization_pipeline import ImageColorizationPipeline
     from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline
+    from .video_category_pipeline import VideoCategoryPipeline
     from .image_matting_pipeline import ImageMattingPipeline
     from .image_super_resolution_pipeline import ImageSuperResolutionPipeline
     from .style_transfer_pipeline import StyleTransferPipeline
@@ -38,6 +39,7 @@ else:
         'ocr_detection_pipeline': ['OCRDetectionPipeline'],
         'image_instance_segmentation_pipeline':
         ['ImageInstanceSegmentationPipeline'],
+        'video_category_pipeline': ['VideoCategoryPipeline'],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/video_category_pipeline.py b/modelscope/pipelines/cv/video_category_pipeline.py
new file mode 100644
index 00000000..2d38031a
--- /dev/null
+++ b/modelscope/pipelines/cv/video_category_pipeline.py
@@ -0,0 +1,397 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path as osp
+from typing import Any, Dict
+
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models as models
+import torchvision.transforms.functional as TF
+from PIL import Image
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.video_category, module_name=Pipelines.video_category)
+class VideoCategoryPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """
+        use `model` to create a video-category pipeline for prediction
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
+        logger.info(f'loading configuration from {config_path}')
+        with open(config_path, 'r') as f:
+            config = json.load(f)
+            self.frame_num = config['frame_num']
+            self.level_1_num = config['level_1_num']
+            self.level_2_num = config['level_2_num']
+            self.resize = config['resize']
+            self.crop = config['crop']
+            self.mean = config['mean']
+            self.std = config['std']
+            self.cateproj_v3 = config['cateproj_v3']
+            self.class_name = config['class_name']
+            self.subclass_name = config['subclass_name']
+        logger.info('load configuration done')
+
+        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+        logger.info(f'loading model from {model_path}')
+        self.infer_model = ModelWrapper(self.level_1_num, self.level_2_num,
+                                        self.frame_num)
+        self.device = torch.device(
+            'cuda' if torch.cuda.is_available() else 'cpu')
+        self.infer_model = self.infer_model.to(self.device).eval()
+        self.infer_model.load_state_dict(
+            torch.load(model_path, map_location=self.device))
+        logger.info('load model done')
+        self.transforms = VCompose([
+            VRescale(size=self.resize),
+            VCenterCrop(size=self.crop),
+            VToTensor(),
+            VNormalize(mean=self.mean, std=self.std)
+        ])
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            import decord
+            from decord import VideoReader, cpu
+            decord.bridge.set_bridge('native')
+            vr = VideoReader(input, ctx=cpu(0))
+            indices = np.linspace(0, len(vr) - 1, self.frame_num).astype(int)
+            frames = vr.get_batch(indices).asnumpy()
+            video_input_data = self.transforms(
+                [Image.fromarray(f) for f in frames])
+        else:
+            raise TypeError(f'input should be a str,'
+                            f' but got {type(input)}')
+        result = {'video_data': video_input_data}
+        return result
+
+    @torch.no_grad()
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        pred1, pred2 = self.infer_model(input['video_data'].to(self.device))
+
+        pred1 = F.softmax(pred1, dim=1)
+        pred2 = F.softmax(pred2, dim=1)
+
+        vals_2, preds_2 = pred2.cpu().topk(10, 1, True, True)
+        vals_2 = vals_2.detach().numpy()
+        preds_2 = preds_2.detach().numpy()
+
+        if vals_2[0][0] >= 0.3:
+            c2 = int(preds_2[0][0])
+            c1 = self.cateproj_v3[c2]
+
+            tag1 = self.class_name[c1]
+            tag2 = self.subclass_name[c2]
+
+            prob = float(vals_2[0][0])
+        else:
+            vals_1, preds_1 = pred1.cpu().topk(10, 1, True, True)
+            vals_1 = vals_1.detach().numpy()
+            preds_1 = preds_1.detach().numpy()
+
+            c1 = int(preds_1[0][0])
+
+            tag1 = self.class_name[c1]
+            tag2 = '其他'
+
+            prob = float(vals_1[0][0])
+
+        return {
+            OutputKeys.SCORES: [prob],
+            OutputKeys.LABELS: [tag1 + '>>' + tag2]
+        }
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
+
+
+class TimeFirstBatchNorm1d(nn.Module):
+
+    def __init__(self, dim, groups=None):
+        super().__init__()
+        self.groups = groups
+        self.bn = nn.BatchNorm1d(dim)
+
+    def forward(self, tensor):
+        _, length, dim = tensor.size()
+        if self.groups:
+            dim = dim // self.groups
+        tensor = tensor.view(-1, dim)
+        tensor = self.bn(tensor)
+        if self.groups:
+            return tensor.view(-1, length, self.groups, dim)
+        else:
+            return tensor.view(-1, length, dim)
+
+
+class NeXtVLAD(nn.Module):
+    """NeXtVLAD layer implementation
+    Adapted from https://github.com/linrongc/youtube-8m/blob/master/nextvlad.py
+    """
+
+    def __init__(self,
+                 num_clusters=64,
+                 dim=128,
+                 alpha=100.0,
+                 groups=8,
+                 expansion=2,
+                 normalize_input=True,
+                 p_drop=0.25,
+                 add_batchnorm=False):
+        """
+        Args:
+            num_clusters : int
+                The number of clusters
+            dim : int
+                Dimension of descriptors
+            alpha : float
+                Parameter of initialization. Larger value is harder assignment.
+            normalize_input : bool
+                If true, descriptor-wise L2 normalization is applied to input.
+        """
+        super(NeXtVLAD, self).__init__()
+        assert dim % groups == 0, '`dim` must be divisible by `groups`'
+        assert expansion > 1
+        self.p_drop = p_drop
+        self.cluster_dropout = nn.Dropout2d(p_drop)
+        self.num_clusters = num_clusters
+        self.dim = dim
+        self.expansion = expansion
+        self.grouped_dim = dim * expansion // groups
+        self.groups = groups
+        self.alpha = alpha
+        self.normalize_input = normalize_input
+        self.add_batchnorm = add_batchnorm
+        self.expansion_mapper = nn.Linear(dim, dim * expansion)
+        if add_batchnorm:
+            self.soft_assignment_mapper = nn.Sequential(
+                nn.Linear(dim * expansion, num_clusters * groups, bias=False),
+                TimeFirstBatchNorm1d(num_clusters, groups=groups))
+        else:
+            self.soft_assignment_mapper = nn.Linear(
+                dim * expansion, num_clusters * groups, bias=True)
+        self.attention_mapper = nn.Linear(dim * expansion, groups)
+        self.centroids = nn.Parameter(
+            torch.rand(num_clusters, self.grouped_dim))
+        self.final_bn = nn.BatchNorm1d(num_clusters * self.grouped_dim)
+        self._init_params()
+
+    def _init_params(self):
+        for component in (self.soft_assignment_mapper, self.attention_mapper,
+                          self.expansion_mapper):
+            for module in component.modules():
+                self.general_weight_initialization(module)
+        if self.add_batchnorm:
+            self.soft_assignment_mapper[0].weight = nn.Parameter(
+                (2.0 * self.alpha * self.centroids).repeat(
+                    (self.groups, self.groups)))
+            nn.init.constant_(self.soft_assignment_mapper[1].bn.weight, 1)
+            nn.init.constant_(self.soft_assignment_mapper[1].bn.bias, 0)
+        else:
+            self.soft_assignment_mapper.weight = nn.Parameter(
+                (2.0 * self.alpha * self.centroids).repeat(
+                    (self.groups, self.groups)))
+            self.soft_assignment_mapper.bias = nn.Parameter(
+                (-self.alpha * self.centroids.norm(dim=1)).repeat(
+                    (self.groups, )))
+
+    def general_weight_initialization(self, module):
+        if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
+            if module.weight is not None:
+                nn.init.uniform_(module.weight)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+
+    def forward(self, x, masks=None):
+        """NeXtVlad Adaptive Pooling
+        Arguments:
+            x {torch.Tensor} -- shape: (n_batch, len, dim)
+        Returns:
+            torch.Tensor -- shape (n_batch, n_cluster * dim / groups)
+        """
+        if self.normalize_input:
+            x = F.normalize(x, p=2, dim=2)  # across descriptor dim
+
+        # expansion
+        # shape: (n_batch, len, dim * expansion)
+        x = self.expansion_mapper(x)
+
+        # soft-assignment
+        # shape: (n_batch, len, n_cluster, groups)
+        soft_assign = self.soft_assignment_mapper(x).view(
+            x.size(0), x.size(1), self.num_clusters, self.groups)
+        soft_assign = F.softmax(soft_assign, dim=2)
+
+        # attention
+        # shape: (n_batch, len, groups)
+        attention = torch.sigmoid(self.attention_mapper(x))
+        if masks is not None:
+            # shape: (n_batch, len, groups)
+            attention = attention * masks[:, :, None]
+
+        # (n_batch, len, n_cluster, groups, dim / groups)
+        activation = (
+            attention[:, :, None, :, None] * soft_assign[:, :, :, :, None])
+
+        # calculate residuals to each clusters
+        # (n_batch, n_cluster, dim / groups)
+        second_term = (
+            activation.sum(dim=3).sum(dim=1) * self.centroids[None, :, :])
+        # (n_batch, n_cluster, dim / groups)
+        first_term = (
+            # (n_batch, len, n_cluster, groups, dim / groups)
+            activation
+            * x.view(x.size(0), x.size(1), 1, self.groups,
+                     self.grouped_dim)).sum(dim=3).sum(dim=1)
+
+        # vlad shape (n_batch, n_cluster, dim / groups)
+        vlad = first_term - second_term
+        vlad = F.normalize(vlad, p=2, dim=2)  # intra-normalization
+        # flatten shape (n_batch, n_cluster * dim / groups)
+        vlad = vlad.view(x.size(0), -1)  # flatten
+        # vlad = F.normalize(vlad, p=2, dim=1)  # L2 normalize
+        vlad = self.final_bn(vlad)
+        if self.p_drop:
+            vlad = self.cluster_dropout(
+                vlad.view(x.size(0), self.num_clusters, self.grouped_dim,
+                          1)).view(x.size(0), -1)
+        return vlad
+
+
+class ModelWrapper(nn.Module):
+
+    def __init__(self, class_num, subclass_num, frame_num):
+        super(ModelWrapper, self).__init__()
+        cnn = models.resnet50(pretrained=False)
+        cnn.fc = nn.Sequential()
+        self.model = cnn
+        # Use NextVlad
+        # output size: (n_batch, n_cluster * dim / groups)
+        nv_group = 2
+        expand = int(2 * frame_num / nv_group)
+        self.nextvlad = NeXtVLAD(
+            num_clusters=frame_num, dim=2048, groups=nv_group)
+        self.fc = nn.Linear(2048 * expand, 2048)
+        self.head1_p1 = nn.Sequential(
+            nn.Linear(2048, 2048),
+            nn.ReLU(),
+            nn.Linear(2048, 1024),
+        )
+        self.head1_p2 = nn.Sequential(
+            nn.Linear(1024, 1024),
+            nn.ReLU(),
+            nn.Linear(1024, class_num),
+        )
+        self.head2_p1 = nn.Sequential(
+            nn.Linear(2048, 2048),
+            nn.ReLU(),
+            nn.Linear(2048, 1024),
+        )
+        self.head2_p2 = nn.Sequential(
+            nn.Linear(2048, 1024),
+            nn.ReLU(),
+            nn.Linear(1024, subclass_num),
+        )
+        self.fn = frame_num
+
+    def forward(self, x):
+        x = x.view(-1, 3, 224, 224)
+        x = self.model(x)
+
+        x = x.view(-1, self.fn, 2048)
+        x = self.nextvlad(x)
+
+        x = self.fc(x)
+
+        x1 = self.head1_p1(x)
+        c1 = self.head1_p2(x1)
+
+        x2 = self.head2_p1(x)
+        c2 = self.head2_p2(torch.cat((x1, x2), dim=1))
+
+        return c1, c2
+
+
+class VCompose(object):
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, item):
+        for t in self.transforms:
+            item = t(item)
+        return item
+
+
+class VRescale(object):
+
+    def __init__(self, size=128):
+        self.size = size
+
+    def __call__(self, vclip):
+        w, h = vclip[0].size
+        scale = self.size / min(w, h)
+        out_w, out_h = int(round(w * scale)), int(round(h * scale))
+        vclip = [u.resize((out_w, out_h), Image.BILINEAR) for u in vclip]
+        return vclip
+
+
+class VCenterCrop(object):
+
+    def __init__(self, size=112):
+        self.size = size
+
+    def __call__(self, vclip):
+        w, h = vclip[0].size
+        assert min(w, h) >= self.size
+        x1 = (w - self.size) // 2
+        y1 = (h - self.size) // 2
+        vclip = [
+            u.crop((x1, y1, x1 + self.size, y1 + self.size)) for u in vclip
+        ]
+        return vclip
+
+
+class VToTensor(object):
+
+    def __call__(self, vclip):
+        vclip = torch.stack([TF.to_tensor(u) for u in vclip], dim=0)
+        return vclip
+
+
+class VNormalize(object):
+
+    def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, vclip):
+        assert vclip.min() > -0.1 and vclip.max() < 1.1, \
+            'vclip values should be in [0, 1]'
+        vclip = vclip.clone()
+        if not isinstance(self.mean, torch.Tensor):
+            self.mean = vclip.new_tensor(self.mean).view(1, -1, 1, 1)
+        if not isinstance(self.std, torch.Tensor):
+            self.std = vclip.new_tensor(self.std).view(1, -1, 1, 1)
+        vclip.sub_(self.mean).div_(self.std)
+        return vclip
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index fafb762f..54035b9e 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -34,6 +34,7 @@ class CVTasks(object):
     face_image_generation = 'face-image-generation'
     image_super_resolution = 'image-super-resolution'
     style_transfer = 'style-transfer'
+    video_category = 'video-category'
 
     image_classification_imagenet = 'image-classification-imagenet'
     image_classification_dailylife = 'image-classification-dailylife'
diff --git a/tests/pipelines/test_video_category.py b/tests/pipelines/test_video_category.py
new file mode 100644
index 00000000..aba56676
--- /dev/null
+++ b/tests/pipelines/test_video_category.py
@@ -0,0 +1,22 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class VideoCategoryTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_modelhub(self):
+        category_pipeline = pipeline(
+            Tasks.video_category, model='damo/cv_resnet50_video-category')
+        result = category_pipeline(
+            'data/test/videos/video_category_test_video.mp4')
+
+        print(f'video category output: {result}.')
+
+
+if __name__ == '__main__':
+    unittest.main()
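
Note for reviewers trying this locally: VideoCategoryPipeline.__init__ reads ten keys from the model's configuration.json (frame_num, level_1_num, level_2_num, resize, crop, mean, std, cateproj_v3, class_name, subclass_name). The sketch below only illustrates that contract; the key names come from the code in this patch, while every value is an assumption for illustration. The real file ships with damo/cv_resnet50_video-category on the hub.

# Sketch of the configuration.json contract read by VideoCategoryPipeline.__init__.
# Key names come from this patch; every value below is an illustrative assumption.
config = {
    'frame_num': 16,  # frames sampled per video (assumed; also the NeXtVLAD cluster count)
    'level_1_num': 28,  # size of the coarse (level-1) label set (assumed)
    'level_2_num': 180,  # size of the fine (level-2) label set (assumed)
    'resize': 256,  # short-side size used by VRescale (assumed)
    'crop': 224,  # VCenterCrop size; matches x.view(-1, 3, 224, 224) in ModelWrapper
    'mean': [0.485, 0.456, 0.406],  # same ImageNet statistics as the VNormalize defaults
    'std': [0.229, 0.224, 0.225],
    'cateproj_v3': [0, 0, 1],  # level-2 index -> parent level-1 index (truncated example)
    'class_name': ['生活'],  # level-1 label names (truncated example)
    'subclass_name': ['好物推荐'],  # level-2 label names (truncated example)
}

cateproj_v3 maps each level-2 index to its parent level-1 index; forward() uses it to recover the coarse label when the fine prediction is confident (top level-2 score >= 0.3), and otherwise falls back to the level-1 head with '其他' (Other) as the fine label.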