
[to #42322933] add cv_resnet50_video-category to maas lib

Add video content classification model.
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9479334
Branch: master
Authors: qianming.lm, yingda.chen
Commit: 0b16dbd66c
8 changed files with 435 additions and 0 deletions

  1. data/test/videos/video_category_test_video.mp4 (+3, -0)
  2. modelscope/metainfo.py (+1, -0)
  3. modelscope/outputs.py (+7, -0)
  4. modelscope/pipelines/builder.py (+2, -0)
  5. modelscope/pipelines/cv/__init__.py (+2, -0)
  6. modelscope/pipelines/cv/video_category_pipeline.py (+397, -0)
  7. modelscope/utils/constant.py (+1, -0)
  8. tests/pipelines/test_video_category.py (+22, -0)

data/test/videos/video_category_test_video.mp4 (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc935328ecace53338050a6789250e08b9d17a52efa2339b0e133edc1fae9d4
size 3943349

modelscope/metainfo.py (+1, -0)

@@ -76,6 +76,7 @@ class Pipelines(object):
    face_image_generation = 'gan-face-image-generation'
    style_transfer = 'AAMS-style-transfer'
    image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
    video_category = 'video-category'

    # nlp tasks
    sentence_similarity = 'sentence-similarity'


modelscope/outputs.py (+7, -0)

@@ -85,6 +85,13 @@ TASK_OUTPUTS = {
    # }
    Tasks.action_recognition: [OutputKeys.LABELS],

    # video category recognition result for single video
    # {
    #   "scores": [0.7716429233551025],
    #   "labels": ['生活>>好物推荐'],
    # }
    Tasks.video_category: [OutputKeys.SCORES, OutputKeys.LABELS],

    # pose estimation result for single sample
    # {
    #   "poses": np.array with shape [num_pose, num_keypoint, 3],
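For reference, a minimal sketch of consuming the video-category output documented above; the dict below just mirrors the documented shape (keys are the string values behind OutputKeys.SCORES and OutputKeys.LABELS), not real model output:

# A result dict shaped like the TASK_OUTPUTS contract above (values are the
# documented example, not an actual inference result).
result = {'scores': [0.7716429233551025], 'labels': ['生活>>好物推荐']}

# scores and labels are parallel lists; index 0 is the top prediction.
print(f"{result['labels'][0]}: {result['scores'][0]:.4f}")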


modelscope/pipelines/builder.py (+2, -0)

@@ -61,6 +61,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
    Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
    Tasks.action_recognition: (Pipelines.action_recognition,
                               'damo/cv_TAdaConv_action-recognition'),
    Tasks.video_category: (Pipelines.video_category,
                           'damo/cv_resnet50_video-category'),
    Tasks.multi_modal_embedding:
    (Pipelines.multi_modal_embedding,
     'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding'),
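With the video-category default registered above, the task name alone is enough to build the pipeline; a short usage sketch (assumes the model can be fetched from the hub):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Omitting `model` resolves to 'damo/cv_resnet50_video-category'
# through DEFAULT_MODEL_FOR_PIPELINE.
category_pipeline = pipeline(Tasks.video_category)
result = category_pipeline('data/test/videos/video_category_test_video.mp4')
print(result)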


modelscope/pipelines/cv/__init__.py (+2, -0)

@@ -14,6 +14,7 @@ if TYPE_CHECKING:
    from .image_color_enhance_pipeline import ImageColorEnhancePipeline
    from .image_colorization_pipeline import ImageColorizationPipeline
    from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline
    from .video_category_pipeline import VideoCategoryPipeline
    from .image_matting_pipeline import ImageMattingPipeline
    from .image_super_resolution_pipeline import ImageSuperResolutionPipeline
    from .style_transfer_pipeline import StyleTransferPipeline
@@ -38,6 +39,7 @@ else:
        'ocr_detection_pipeline': ['OCRDetectionPipeline'],
        'image_instance_segmentation_pipeline':
        ['ImageInstanceSegmentationPipeline'],
        'video_category_pipeline': ['VideoCategoryPipeline'],
    }

    import sys


modelscope/pipelines/cv/video_category_pipeline.py (+397, -0)

@@ -0,0 +1,397 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
from typing import Any, Dict

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms.functional as TF
from PIL import Image

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
    Tasks.video_category, module_name=Pipelines.video_category)
class VideoCategoryPipeline(Pipeline):

    def __init__(self, model: str, **kwargs):
        """
        use `model` to create a video-category pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)
        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
        logger.info(f'loading configuration from {config_path}')
        with open(config_path, 'r') as f:
            config = json.load(f)
        self.frame_num = config['frame_num']
        self.level_1_num = config['level_1_num']
        self.level_2_num = config['level_2_num']
        self.resize = config['resize']
        self.crop = config['crop']
        self.mean = config['mean']
        self.std = config['std']
        self.cateproj_v3 = config['cateproj_v3']
        self.class_name = config['class_name']
        self.subclass_name = config['subclass_name']
        logger.info('load configuration done')

        model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
        logger.info(f'loading model from {model_path}')
        self.infer_model = ModelWrapper(self.level_1_num, self.level_2_num,
                                        self.frame_num)
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.infer_model = self.infer_model.to(self.device).eval()
        self.infer_model.load_state_dict(
            torch.load(model_path, map_location=self.device))
        logger.info('load model done')
        self.transforms = VCompose([
            VRescale(size=self.resize),
            VCenterCrop(size=self.crop),
            VToTensor(),
            VNormalize(mean=self.mean, std=self.std)
        ])
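The constructor above reads these keys from the model's configuration.json; the values below are placeholders for illustration only (the real taxonomy sizes, label lists, and statistics ship with the model on the hub):

# Illustrative configuration.json content -- field names match what __init__
# reads; the numbers and empty lists are placeholders, not the shipped values.
example_config = {
    'frame_num': 16,                 # frames sampled per video
    'level_1_num': 32,               # number of level-1 categories
    'level_2_num': 256,              # number of level-2 categories
    'resize': 256,                   # short-side rescale target
    'crop': 224,                     # center-crop size fed to ResNet-50
    'mean': [0.485, 0.456, 0.406],   # per-channel normalization mean
    'std': [0.229, 0.224, 0.225],    # per-channel normalization std
    'cateproj_v3': [],               # level-2 index -> level-1 parent index
    'class_name': [],                # level-1 label strings
    'subclass_name': [],             # level-2 label strings
}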

    def preprocess(self, input: Input) -> Dict[str, Any]:
        if isinstance(input, str):
            import decord
            from decord import VideoReader, cpu
            decord.bridge.set_bridge('native')
            vr = VideoReader(input, ctx=cpu(0))
            # Sample self.frame_num frames uniformly over the whole video.
            indices = np.linspace(0, len(vr) - 1,
                                  self.frame_num).astype(int)
            frames = vr.get_batch(indices).asnumpy()
            video_input_data = self.transforms(
                [Image.fromarray(f) for f in frames])
        else:
            raise TypeError(f'input should be a str,'
                            f' but got {type(input)}')
        result = {'video_data': video_input_data}
        return result
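Frames are sampled uniformly over the whole clip rather than from a fixed window; for example, a 480-frame video with frame_num=16 is read at these indices:

import numpy as np

num_total, frame_num = 480, 16
indices = np.linspace(0, num_total - 1, frame_num).astype(int)
print(indices)
# [  0  31  63  95 127 159 191 223 255 287 319 351 383 415 447 479]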

    @torch.no_grad()
    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        pred1, pred2 = self.infer_model(input['video_data'].to(self.device))

        pred1 = F.softmax(pred1, dim=1)
        pred2 = F.softmax(pred2, dim=1)

        vals_2, preds_2 = pred2.cpu().topk(10, 1, True, True)
        vals_2 = vals_2.detach().numpy()
        preds_2 = preds_2.detach().numpy()

        if vals_2[0][0] >= 0.3:
            # Confident level-2 prediction: report it with its level-1 parent.
            c2 = int(preds_2[0][0])
            c1 = self.cateproj_v3[c2]

            tag1 = self.class_name[c1]
            tag2 = self.subclass_name[c2]

            prob = float(vals_2[0][0])
        else:
            # Otherwise fall back to the level-1 head; the subclass is '其他'.
            vals_1, preds_1 = pred1.cpu().topk(10, 1, True, True)
            vals_1 = vals_1.detach().numpy()
            preds_1 = preds_1.detach().numpy()

            c1 = int(preds_1[0][0])

            tag1 = self.class_name[c1]
            tag2 = '其他'

            prob = float(vals_1[0][0])

        return {
            OutputKeys.SCORES: [prob],
            OutputKeys.LABELS: [tag1 + '>>' + tag2]
        }

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        return inputs
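The 0.3 threshold decides whether the fine-grained head is trusted: at or above it, the level-2 label is reported together with its level-1 parent looked up through cateproj_v3; below it, only the level-1 head is used and the subclass falls back to '其他' ('Other'). A toy illustration of the label assembly (indices and names made up):

# Toy taxonomy: cateproj_v3[c2] maps a level-2 index to its level-1 parent.
class_name = ['生活', '美食']            # level-1 labels (examples only)
subclass_name = ['好物推荐', '家常菜']    # level-2 labels (examples only)
cateproj_v3 = [0, 1]

c2 = 0
label = class_name[cateproj_v3[c2]] + '>>' + subclass_name[c2]
print(label)  # 生活>>好物推荐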


class TimeFirstBatchNorm1d(nn.Module):

    def __init__(self, dim, groups=None):
        super().__init__()
        self.groups = groups
        self.bn = nn.BatchNorm1d(dim)

    def forward(self, tensor):
        _, length, dim = tensor.size()
        if self.groups:
            dim = dim // self.groups
        tensor = tensor.view(-1, dim)
        tensor = self.bn(tensor)
        if self.groups:
            return tensor.view(-1, length, self.groups, dim)
        else:
            return tensor.view(-1, length, dim)


class NeXtVLAD(nn.Module):
    """NeXtVLAD layer implementation
    Adapted from https://github.com/linrongc/youtube-8m/blob/master/nextvlad.py
    """

    def __init__(self,
                 num_clusters=64,
                 dim=128,
                 alpha=100.0,
                 groups=8,
                 expansion=2,
                 normalize_input=True,
                 p_drop=0.25,
                 add_batchnorm=False):
        """
        Args:
            num_clusters : int
                The number of clusters
            dim : int
                Dimension of descriptors
            alpha : float
                Parameter of initialization. Larger value is harder assignment.
            normalize_input : bool
                If true, descriptor-wise L2 normalization is applied to input.
        """
        super(NeXtVLAD, self).__init__()
        assert dim % groups == 0, '`dim` must be divisible by `groups`'
        assert expansion > 1
        self.p_drop = p_drop
        self.cluster_dropout = nn.Dropout2d(p_drop)
        self.num_clusters = num_clusters
        self.dim = dim
        self.expansion = expansion
        self.grouped_dim = dim * expansion // groups
        self.groups = groups
        self.alpha = alpha
        self.normalize_input = normalize_input
        self.add_batchnorm = add_batchnorm
        self.expansion_mapper = nn.Linear(dim, dim * expansion)
        if add_batchnorm:
            self.soft_assignment_mapper = nn.Sequential(
                nn.Linear(dim * expansion, num_clusters * groups, bias=False),
                TimeFirstBatchNorm1d(num_clusters, groups=groups))
        else:
            self.soft_assignment_mapper = nn.Linear(
                dim * expansion, num_clusters * groups, bias=True)
        self.attention_mapper = nn.Linear(dim * expansion, groups)
        self.centroids = nn.Parameter(
            torch.rand(num_clusters, self.grouped_dim))
        self.final_bn = nn.BatchNorm1d(num_clusters * self.grouped_dim)
        self._init_params()

    def _init_params(self):
        for component in (self.soft_assignment_mapper, self.attention_mapper,
                          self.expansion_mapper):
            for module in component.modules():
                self.general_weight_initialization(module)
        if self.add_batchnorm:
            self.soft_assignment_mapper[0].weight = nn.Parameter(
                (2.0 * self.alpha * self.centroids).repeat(
                    (self.groups, self.groups)))
            nn.init.constant_(self.soft_assignment_mapper[1].bn.weight, 1)
            nn.init.constant_(self.soft_assignment_mapper[1].bn.bias, 0)
        else:
            self.soft_assignment_mapper.weight = nn.Parameter(
                (2.0 * self.alpha * self.centroids).repeat(
                    (self.groups, self.groups)))
            self.soft_assignment_mapper.bias = nn.Parameter(
                (-self.alpha * self.centroids.norm(dim=1)).repeat(
                    (self.groups, )))

    def general_weight_initialization(self, module):
        if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
            if module.weight is not None:
                nn.init.uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, x, masks=None):
        """NeXtVlad Adaptive Pooling
        Arguments:
            x {torch.Tensor} -- shape: (n_batch, len, dim)
        Returns:
            torch.Tensor -- shape (n_batch, n_cluster * dim * expansion / groups)
        """
        if self.normalize_input:
            x = F.normalize(x, p=2, dim=2)  # across descriptor dim

        # expansion
        # shape: (n_batch, len, dim * expansion)
        x = self.expansion_mapper(x)

        # soft-assignment
        # shape: (n_batch, len, n_cluster, groups)
        soft_assign = self.soft_assignment_mapper(x).view(
            x.size(0), x.size(1), self.num_clusters, self.groups)
        soft_assign = F.softmax(soft_assign, dim=2)

        # attention
        # shape: (n_batch, len, groups)
        attention = torch.sigmoid(self.attention_mapper(x))
        if masks is not None:
            # shape: (n_batch, len, groups)
            attention = attention * masks[:, :, None]

        # (n_batch, len, n_cluster, groups, dim / groups)
        activation = (
            attention[:, :, None, :, None] * soft_assign[:, :, :, :, None])

        # calculate residuals to each clusters
        # (n_batch, n_cluster, dim / groups)
        second_term = (
            activation.sum(dim=3).sum(dim=1) * self.centroids[None, :, :])
        # (n_batch, n_cluster, dim / groups)
        first_term = (
            # (n_batch, len, n_cluster, groups, dim / groups)
            activation
            * x.view(x.size(0), x.size(1), 1, self.groups,
                     self.grouped_dim)).sum(dim=3).sum(dim=1)

        # vlad shape (n_batch, n_cluster, dim / groups)
        vlad = first_term - second_term
        vlad = F.normalize(vlad, p=2, dim=2)  # intra-normalization
        # flatten shape (n_batch, n_cluster * dim / groups)
        vlad = vlad.view(x.size(0), -1)  # flatten
        # vlad = F.normalize(vlad, p=2, dim=1)  # L2 normalize
        vlad = self.final_bn(vlad)
        if self.p_drop:
            vlad = self.cluster_dropout(
                vlad.view(x.size(0), self.num_clusters, self.grouped_dim,
                          1)).view(x.size(0), -1)
        return vlad
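With the hyper-parameters ModelWrapper uses below (num_clusters=frame_num=16, dim=2048, groups=2, expansion=2 by default), grouped_dim is 2048 and each clip is pooled to 16 * 2048 = 32768 features. A standalone shape check with random inputs and random weights:

import torch

pool = NeXtVLAD(num_clusters=16, dim=2048, groups=2).eval()
features = torch.randn(1, 16, 2048)   # (n_batch, n_frames, cnn_dim)
with torch.no_grad():
    pooled = pool(features)
print(pooled.shape)  # torch.Size([1, 32768]) == (1, num_clusters * grouped_dim)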


class ModelWrapper(nn.Module):

    def __init__(self, class_num, subclass_num, frame_num):
        super(ModelWrapper, self).__init__()
        cnn = models.resnet50(pretrained=False)
        cnn.fc = nn.Sequential()
        self.model = cnn
        # Use NextVlad
        # output size: (n_batch, n_cluster * dim / groups)
        nv_group = 2
        expand = int(2 * frame_num / nv_group)
        self.nextvlad = NeXtVLAD(
            num_clusters=frame_num, dim=2048, groups=nv_group)
        self.fc = nn.Linear(2048 * expand, 2048)
        self.head1_p1 = nn.Sequential(
            nn.Linear(2048, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
        )
        self.head1_p2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, class_num),
        )
        self.head2_p1 = nn.Sequential(
            nn.Linear(2048, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
        )
        self.head2_p2 = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, subclass_num),
        )
        self.fn = frame_num

    def forward(self, x):
        x = x.view(-1, 3, 224, 224)
        x = self.model(x)

        x = x.view(-1, self.fn, 2048)
        x = self.nextvlad(x)

        x = self.fc(x)

        x1 = self.head1_p1(x)
        c1 = self.head1_p2(x1)

        x2 = self.head2_p1(x)
        c2 = self.head2_p2(torch.cat((x1, x2), dim=1))

        return c1, c2
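End to end, one clip enters as frame_num stacked frames and leaves as a pair of logit vectors, one per taxonomy level; a quick smoke test with dummy sizes (class_num and subclass_num below are placeholders, not the shipped values):

import torch

net = ModelWrapper(class_num=32, subclass_num=256, frame_num=16).eval()
clip = torch.randn(16, 3, 224, 224)   # one video: 16 preprocessed frames
with torch.no_grad():
    level1_logits, level2_logits = net(clip)
print(level1_logits.shape, level2_logits.shape)
# torch.Size([1, 32]) torch.Size([1, 256])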


class VCompose(object):

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, item):
        for t in self.transforms:
            item = t(item)
        return item


class VRescale(object):

    def __init__(self, size=128):
        self.size = size

    def __call__(self, vclip):
        w, h = vclip[0].size
        scale = self.size / min(w, h)
        out_w, out_h = int(round(w * scale)), int(round(h * scale))
        vclip = [u.resize((out_w, out_h), Image.BILINEAR) for u in vclip]
        return vclip


class VCenterCrop(object):

    def __init__(self, size=112):
        self.size = size

    def __call__(self, vclip):
        w, h = vclip[0].size
        assert min(w, h) >= self.size
        x1 = (w - self.size) // 2
        y1 = (h - self.size) // 2
        vclip = [
            u.crop((x1, y1, x1 + self.size, y1 + self.size)) for u in vclip
        ]
        return vclip


class VToTensor(object):

    def __call__(self, vclip):
        vclip = torch.stack([TF.to_tensor(u) for u in vclip], dim=0)
        return vclip


class VNormalize(object):

    def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
        self.mean = mean
        self.std = std

    def __call__(self, vclip):
        assert vclip.min() > -0.1 and vclip.max() < 1.1, \
            'vclip values should be in [0, 1]'
        vclip = vclip.clone()
        if not isinstance(self.mean, torch.Tensor):
            self.mean = vclip.new_tensor(self.mean).view(1, -1, 1, 1)
        if not isinstance(self.std, torch.Tensor):
            self.std = vclip.new_tensor(self.std).view(1, -1, 1, 1)
        vclip.sub_(self.mean).div_(self.std)
        return vclip
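The transform chain mirrors standard ImageNet-style preprocessing, applied per frame and then stacked; a quick check with dummy frames (assuming resize=256 and crop=224 in the configuration):

import numpy as np
from PIL import Image

# 16 dummy RGB frames, as preprocess() would pass to the transforms.
frames = [Image.fromarray(np.zeros((360, 640, 3), dtype=np.uint8))] * 16
clip = VCompose([VRescale(size=256), VCenterCrop(size=224), VToTensor(),
                 VNormalize()])(frames)
print(clip.shape)  # torch.Size([16, 3, 224, 224])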

modelscope/utils/constant.py (+1, -0)

@@ -34,6 +34,7 @@ class CVTasks(object):
    face_image_generation = 'face-image-generation'
    image_super_resolution = 'image-super-resolution'
    style_transfer = 'style-transfer'
    video_category = 'video-category'
    image_classification_imagenet = 'image-classification-imagenet'
    image_classification_dailylife = 'image-classification-dailylife'



tests/pipelines/test_video_category.py (+22, -0)

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class VideoCategoryTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_modelhub(self):
        category_pipeline = pipeline(
            Tasks.video_category, model='damo/cv_resnet50_video-category')
        result = category_pipeline(
            'data/test/videos/video_category_test_video.mp4')

        print(f'video category output: {result}.')


if __name__ == '__main__':
unittest.main()
