@@ -0,0 +1,397 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
from typing import Any, Dict

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms.functional as TF
from PIL import Image

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
    Tasks.video_category, module_name=Pipelines.video_category)
class VideoCategoryPipeline(Pipeline):

    def __init__(self, model: str, **kwargs):
        """Use `model` to create a video-category pipeline for prediction.

        Args:
            model: model id on the ModelScope hub.
        """
        super().__init__(model=model, **kwargs)
        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
        logger.info(f'loading configuration from {config_path}')
        with open(config_path, 'r') as f:
            config = json.load(f)
        self.frame_num = config['frame_num']
        self.level_1_num = config['level_1_num']
        self.level_2_num = config['level_2_num']
        self.resize = config['resize']
        self.crop = config['crop']
        self.mean = config['mean']
        self.std = config['std']
        self.cateproj_v3 = config['cateproj_v3']
        self.class_name = config['class_name']
        self.subclass_name = config['subclass_name']
        logger.info('load configuration done')

        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
        logger.info(f'loading model from {model_path}')
        self.infer_model = ModelWrapper(self.level_1_num, self.level_2_num,
                                        self.frame_num)
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.infer_model = self.infer_model.to(self.device).eval()
        self.infer_model.load_state_dict(
            torch.load(model_path, map_location=self.device))
        logger.info('load model done')
        self.transforms = VCompose([
            VRescale(size=self.resize),
            VCenterCrop(size=self.crop),
            VToTensor(),
            VNormalize(mean=self.mean, std=self.std)
        ])

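    # The configuration file read above is expected to provide the fields used
    # by this pipeline. The values below are purely illustrative (a sketch, not
    # taken from any real model configuration):
    #
    #   {
    #       "frame_num": 8,
    #       "level_1_num": 40,
    #       "level_2_num": 180,
    #       "resize": 256,
    #       "crop": 224,
    #       "mean": [0.485, 0.456, 0.406],
    #       "std": [0.229, 0.224, 0.225],
    #       "cateproj_v3": [0, 0, 1, ...],   # level-2 index -> level-1 index
    #       "class_name": ["...", ...],      # level-1 tag names
    #       "subclass_name": ["...", ...]    # level-2 tag names
    #   }
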
    def preprocess(self, input: Input) -> Dict[str, Any]:
        if isinstance(input, str):
            import decord
            from decord import VideoReader, cpu
            decord.bridge.set_bridge('native')
            vr = VideoReader(input, ctx=cpu(0))
            # Sample `frame_num` frames evenly spaced over the whole video,
            # matching the clip length the model was built with.
            indices = np.linspace(0, len(vr) - 1,
                                  self.frame_num).astype(int)
            frames = vr.get_batch(indices).asnumpy()
            video_input_data = self.transforms(
                [Image.fromarray(f) for f in frames])
        else:
            raise TypeError(f'input should be a str,'
                            f' but got {type(input)}')
        result = {'video_data': video_input_data}
        return result

    @torch.no_grad()
    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        pred1, pred2 = self.infer_model(input['video_data'].to(self.device))

        pred1 = F.softmax(pred1, dim=1)
        pred2 = F.softmax(pred2, dim=1)

        vals_2, preds_2 = pred2.cpu().topk(10, 1, True, True)
        vals_2 = vals_2.detach().numpy()
        preds_2 = preds_2.detach().numpy()

        if vals_2[0][0] >= 0.3:
            # Confident level-2 prediction: map it to its level-1 parent
            # category via the cateproj_v3 projection table.
            c2 = int(preds_2[0][0])
            c1 = self.cateproj_v3[c2]

            tag1 = self.class_name[c1]
            tag2 = self.subclass_name[c2]

            prob = float(vals_2[0][0])
        else:
            # Otherwise fall back to the level-1 head and report the level-2
            # tag as '其他' ('Other').
            vals_1, preds_1 = pred1.cpu().topk(10, 1, True, True)
            vals_1 = vals_1.detach().numpy()
            preds_1 = preds_1.detach().numpy()

            c1 = int(preds_1[0][0])

            tag1 = self.class_name[c1]
            tag2 = '其他'

            prob = float(vals_1[0][0])

        return {
            OutputKeys.SCORES: [prob],
            OutputKeys.LABELS: [tag1 + '>>' + tag2]
        }

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        return inputs

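
# A minimal usage sketch for the pipeline above (the model id and the video
# path are illustrative placeholders, not values taken from this file):
#
#   from modelscope.pipelines import pipeline
#   from modelscope.utils.constant import Tasks
#
#   video_category = pipeline(Tasks.video_category, model='<video-category-model-id>')
#   result = video_category('<path/to/video.mp4>')
#   # result holds OutputKeys.SCORES ([prob]) and OutputKeys.LABELS (['tag1>>tag2'])
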
class TimeFirstBatchNorm1d(nn.Module):

    def __init__(self, dim, groups=None):
        super().__init__()
        self.groups = groups
        self.bn = nn.BatchNorm1d(dim)

    def forward(self, tensor):
        _, length, dim = tensor.size()
        if self.groups:
            dim = dim // self.groups
        tensor = tensor.view(-1, dim)
        tensor = self.bn(tensor)
        if self.groups:
            return tensor.view(-1, length, self.groups, dim)
        else:
            return tensor.view(-1, length, dim)


class NeXtVLAD(nn.Module):
    """NeXtVLAD layer implementation.

    Adapted from https://github.com/linrongc/youtube-8m/blob/master/nextvlad.py
    """

    def __init__(self,
                 num_clusters=64,
                 dim=128,
                 alpha=100.0,
                 groups=8,
                 expansion=2,
                 normalize_input=True,
                 p_drop=0.25,
                 add_batchnorm=False):
        """
        Args:
            num_clusters : int
                The number of clusters.
            dim : int
                Dimension of descriptors.
            alpha : float
                Initialization parameter. A larger value yields a harder assignment.
            normalize_input : bool
                If true, descriptor-wise L2 normalization is applied to the input.
        """
        super(NeXtVLAD, self).__init__()
        assert dim % groups == 0, '`dim` must be divisible by `groups`'
        assert expansion > 1
        self.p_drop = p_drop
        self.cluster_dropout = nn.Dropout2d(p_drop)
        self.num_clusters = num_clusters
        self.dim = dim
        self.expansion = expansion
        self.grouped_dim = dim * expansion // groups
        self.groups = groups
        self.alpha = alpha
        self.normalize_input = normalize_input
        self.add_batchnorm = add_batchnorm
        self.expansion_mapper = nn.Linear(dim, dim * expansion)
        if add_batchnorm:
            self.soft_assignment_mapper = nn.Sequential(
                nn.Linear(dim * expansion, num_clusters * groups, bias=False),
                TimeFirstBatchNorm1d(num_clusters, groups=groups))
        else:
            self.soft_assignment_mapper = nn.Linear(
                dim * expansion, num_clusters * groups, bias=True)
        self.attention_mapper = nn.Linear(dim * expansion, groups)
        self.centroids = nn.Parameter(
            torch.rand(num_clusters, self.grouped_dim))
        self.final_bn = nn.BatchNorm1d(num_clusters * self.grouped_dim)
        self._init_params()

    def _init_params(self):
        for component in (self.soft_assignment_mapper, self.attention_mapper,
                          self.expansion_mapper):
            for module in component.modules():
                self.general_weight_initialization(module)
        if self.add_batchnorm:
            self.soft_assignment_mapper[0].weight = nn.Parameter(
                (2.0 * self.alpha * self.centroids).repeat(
                    (self.groups, self.groups)))
            nn.init.constant_(self.soft_assignment_mapper[1].bn.weight, 1)
            nn.init.constant_(self.soft_assignment_mapper[1].bn.bias, 0)
        else:
            self.soft_assignment_mapper.weight = nn.Parameter(
                (2.0 * self.alpha * self.centroids).repeat(
                    (self.groups, self.groups)))
            self.soft_assignment_mapper.bias = nn.Parameter(
                (-self.alpha * self.centroids.norm(dim=1)).repeat(
                    (self.groups, )))

    def general_weight_initialization(self, module):
        if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
            if module.weight is not None:
                nn.init.uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, x, masks=None):
        """NeXtVLAD adaptive pooling.

        Arguments:
            x {torch.Tensor} -- shape: (n_batch, len, dim)

        Returns:
            torch.Tensor -- shape: (n_batch, n_cluster * dim * expansion / groups)
        """
        if self.normalize_input:
            x = F.normalize(x, p=2, dim=2)  # across descriptor dim

        # expansion
        # shape: (n_batch, len, dim * expansion)
        x = self.expansion_mapper(x)

        # soft-assignment
        # shape: (n_batch, len, n_cluster, groups)
        soft_assign = self.soft_assignment_mapper(x).view(
            x.size(0), x.size(1), self.num_clusters, self.groups)
        soft_assign = F.softmax(soft_assign, dim=2)

        # attention
        # shape: (n_batch, len, groups)
        attention = torch.sigmoid(self.attention_mapper(x))
        if masks is not None:
            # shape: (n_batch, len, groups)
            attention = attention * masks[:, :, None]

        # combined attention/assignment weights
        # shape: (n_batch, len, n_cluster, groups, 1)
        activation = (
            attention[:, :, None, :, None] * soft_assign[:, :, :, :, None])

        # calculate residuals to each cluster
        # (n_batch, n_cluster, dim * expansion / groups)
        second_term = (
            activation.sum(dim=3).sum(dim=1) * self.centroids[None, :, :])
        # (n_batch, n_cluster, dim * expansion / groups)
        first_term = (
            # (n_batch, len, n_cluster, groups, dim * expansion / groups)
            activation
            * x.view(x.size(0), x.size(1), 1, self.groups,
                     self.grouped_dim)).sum(dim=3).sum(dim=1)

        # vlad shape: (n_batch, n_cluster, dim * expansion / groups)
        vlad = first_term - second_term
        vlad = F.normalize(vlad, p=2, dim=2)  # intra-normalization
        # flattened shape: (n_batch, n_cluster * dim * expansion / groups)
        vlad = vlad.view(x.size(0), -1)  # flatten
        # vlad = F.normalize(vlad, p=2, dim=1)  # L2 normalize
        vlad = self.final_bn(vlad)
        if self.p_drop:
            vlad = self.cluster_dropout(
                vlad.view(x.size(0), self.num_clusters, self.grouped_dim,
                          1)).view(x.size(0), -1)
        return vlad
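
# Shape sanity check for the NeXtVLAD layer above (a minimal sketch; the
# 8-frame clip length and 2048-dim features are illustrative, chosen to match
# the defaults used by ModelWrapper below):
#
#   layer = NeXtVLAD(num_clusters=8, dim=2048, groups=2, expansion=2).eval()
#   out = layer(torch.randn(1, 8, 2048))
#   # grouped_dim = 2048 * 2 // 2 = 2048, so out.shape == (1, 8 * 2048) == (1, 16384)
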
class ModelWrapper(nn.Module):

    def __init__(self, class_num, subclass_num, frame_num):
        super(ModelWrapper, self).__init__()
        # ResNet-50 backbone used as a per-frame feature extractor; its fc
        # layer is replaced by an identity so it emits 2048-dim features.
        cnn = models.resnet50(pretrained=False)
        cnn.fc = nn.Sequential()
        self.model = cnn
        # Use NeXtVLAD for temporal pooling.
        # output size: (n_batch, n_cluster * dim * expansion / groups)
        nv_group = 2
        expand = int(2 * frame_num / nv_group)
        self.nextvlad = NeXtVLAD(
            num_clusters=frame_num, dim=2048, groups=nv_group)
        self.fc = nn.Linear(2048 * expand, 2048)
        self.head1_p1 = nn.Sequential(
            nn.Linear(2048, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
        )
        self.head1_p2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, class_num),
        )
        self.head2_p1 = nn.Sequential(
            nn.Linear(2048, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
        )
        self.head2_p2 = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, subclass_num),
        )
        self.fn = frame_num

    def forward(self, x):
        # per-frame features: (n_frames, 3, 224, 224) -> (n_frames, 2048)
        x = x.view(-1, 3, 224, 224)
        x = self.model(x)

        # temporal pooling over each clip of `fn` frames
        x = x.view(-1, self.fn, 2048)
        x = self.nextvlad(x)

        x = self.fc(x)

        # level-1 (class) head
        x1 = self.head1_p1(x)
        c1 = self.head1_p2(x1)

        # level-2 (subclass) head, conditioned on the level-1 features
        x2 = self.head2_p1(x)
        c2 = self.head2_p2(torch.cat((x1, x2), dim=1))

        return c1, c2
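
# End-to-end shape walk-through for ModelWrapper (a sketch with illustrative
# numbers: class_num=10, subclass_num=100, frame_num=8; the real values come
# from the pipeline configuration file loaded above):
#
#   wrapper = ModelWrapper(10, 100, 8).eval()
#   clip = torch.randn(8, 3, 224, 224)   # one clip of 8 center-cropped frames
#   c1, c2 = wrapper(clip)
#   # c1.shape == (1, 10)    level-1 (class) logits
#   # c2.shape == (1, 100)   level-2 (subclass) logits
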
class VCompose(object):

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, item):
        for t in self.transforms:
            item = t(item)
        return item


class VRescale(object):

    def __init__(self, size=128):
        self.size = size

    def __call__(self, vclip):
        w, h = vclip[0].size
        scale = self.size / min(w, h)
        out_w, out_h = int(round(w * scale)), int(round(h * scale))
        vclip = [u.resize((out_w, out_h), Image.BILINEAR) for u in vclip]
        return vclip


class VCenterCrop(object):

    def __init__(self, size=112):
        self.size = size

    def __call__(self, vclip):
        w, h = vclip[0].size
        assert min(w, h) >= self.size
        x1 = (w - self.size) // 2
        y1 = (h - self.size) // 2
        vclip = [
            u.crop((x1, y1, x1 + self.size, y1 + self.size)) for u in vclip
        ]
        return vclip


class VToTensor(object):

    def __call__(self, vclip):
        vclip = torch.stack([TF.to_tensor(u) for u in vclip], dim=0)
        return vclip


class VNormalize(object):

    def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
        self.mean = mean
        self.std = std

    def __call__(self, vclip):
        assert vclip.min() > -0.1 and vclip.max() < 1.1, \
            'vclip values should be in [0, 1]'
        vclip = vclip.clone()
        if not isinstance(self.mean, torch.Tensor):
            self.mean = vclip.new_tensor(self.mean).view(1, -1, 1, 1)
        if not isinstance(self.std, torch.Tensor):
            self.std = vclip.new_tensor(self.std).view(1, -1, 1, 1)
        vclip.sub_(self.mean).div_(self.std)
        return vclip
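
# Example of the clip transform chain built in VideoCategoryPipeline.__init__
# (a sketch with illustrative sizes; the real `resize`, `crop`, `mean` and
# `std` values come from the model configuration file):
#
#   transforms = VCompose([
#       VRescale(size=256),
#       VCenterCrop(size=224),
#       VToTensor(),
#       VNormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
#   ])
#   frames = [Image.new('RGB', (640, 360)) for _ in range(8)]
#   clip = transforms(frames)
#   # clip.shape == (8, 3, 224, 224), ready to be fed to the model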