
Merge remote-tracking branch 'origin/master' into ofa/finetune

# Conflicts:
#	modelscope/models/multi_modal/ofa/utils/__init__.py
branch: master
author: 行嗔, 3 years ago
commit: fcbcd8e1b6
100 changed files with 4495 additions and 42 deletions
  1. +3 -0  data/test/images/face_emotion.jpg
  2. +3 -0  data/test/images/face_human_hand_detection.jpg
  3. +3 -0  data/test/images/product_segmentation.jpg
  4. +10 -0  modelscope/metainfo.py
  5. +2 -0  modelscope/metrics/image_portrait_enhancement_metric.py
  6. +2 -0  modelscope/models/cv/action_recognition/__init__.py
  7. +1198 -0  modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py
  8. +2 -0  modelscope/models/cv/body_2d_keypoints/hrnet_v2.py
  9. +2 -0  modelscope/models/cv/body_2d_keypoints/w48.py
  10. +2 -0  modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
  11. +1 -1  modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
  12. +2 -0  modelscope/models/cv/cartoon/facelib/LK/lk.py
  13. +2 -0  modelscope/models/cv/cartoon/facelib/config.py
  14. +2 -0  modelscope/models/cv/cartoon/facelib/face_detector.py
  15. +2 -0  modelscope/models/cv/cartoon/facelib/face_landmark.py
  16. +2 -0  modelscope/models/cv/cartoon/facelib/facer.py
  17. +2 -4  modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py
  18. +1 -5  modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py
  19. +2 -0  modelscope/models/cv/cartoon/utils.py
  20. +1 -0  modelscope/models/cv/face_detection/mogface/__init__.py
  21. +1 -0  modelscope/models/cv/face_detection/mtcnn/__init__.py
  22. +1 -0  modelscope/models/cv/face_detection/retinaface/__init__.py
  23. +1 -0  modelscope/models/cv/face_detection/ulfd_slim/__init__.py
  24. +20 -0  modelscope/models/cv/face_emotion/__init__.py
  25. +6 -0  modelscope/models/cv/face_emotion/efficient/__init__.py
  26. +380 -0  modelscope/models/cv/face_emotion/efficient/model.py
  27. +559 -0  modelscope/models/cv/face_emotion/efficient/utils.py
  28. +67 -0  modelscope/models/cv/face_emotion/emotion_infer.py
  29. +96 -0  modelscope/models/cv/face_emotion/emotion_model.py
  30. +0 -0  modelscope/models/cv/face_emotion/face_alignment/__init__.py
  31. +79 -0  modelscope/models/cv/face_emotion/face_alignment/face.py
  32. +59 -0  modelscope/models/cv/face_emotion/face_alignment/face_align.py
  33. +2 -0  modelscope/models/cv/face_generation/op/conv2d_gradfix.py
  34. +2 -0  modelscope/models/cv/face_generation/op/fused_act.py
  35. +2 -0  modelscope/models/cv/face_generation/op/upfirdn2d.py
  36. +2 -0  modelscope/models/cv/face_generation/stylegan2.py
  37. +20 -0  modelscope/models/cv/face_human_hand_detection/__init__.py
  38. +133 -0  modelscope/models/cv/face_human_hand_detection/det_infer.py
  39. +395 -0  modelscope/models/cv/face_human_hand_detection/ghost_pan.py
  40. +427 -0  modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py
  41. +64 -0  modelscope/models/cv/face_human_hand_detection/one_stage_detector.py
  42. +182 -0  modelscope/models/cv/face_human_hand_detection/shufflenetv2.py
  43. +277 -0  modelscope/models/cv/face_human_hand_detection/utils.py
  44. +2 -0  modelscope/models/cv/image_colorization/unet.py
  45. +2 -0  modelscope/models/cv/image_colorization/utils.py
  46. +2 -0  modelscope/models/cv/image_portrait_enhancement/align_faces.py
  47. +1 -0  modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py
  48. +2 -0  modelscope/models/cv/image_portrait_enhancement/eqface/model_resnet.py
  49. +2 -0  modelscope/models/cv/image_portrait_enhancement/gpen.py
  50. +1 -0  modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py
  51. +2 -0  modelscope/models/cv/image_portrait_enhancement/losses/helpers.py
  52. +2 -0  modelscope/models/cv/image_portrait_enhancement/losses/losses.py
  53. +2 -0  modelscope/models/cv/image_portrait_enhancement/losses/model_irse.py
  54. +2 -0  modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py
  55. +2 -0  modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py
  56. +2 -0  modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py
  57. +1 -0  modelscope/models/cv/image_to_image_generation/model.py
  58. +1 -0  modelscope/models/cv/image_to_image_generation/models/autoencoder.py
  59. +2 -0  modelscope/models/cv/image_to_image_generation/models/clip.py
  60. +1 -0  modelscope/models/cv/image_to_image_generation/ops/diffusion.py
  61. +1 -0  modelscope/models/cv/image_to_image_generation/ops/losses.py
  62. +1 -0  modelscope/models/cv/image_to_image_translation/data/transforms.py
  63. +1 -0  modelscope/models/cv/image_to_image_translation/model_translation.py
  64. +1 -0  modelscope/models/cv/image_to_image_translation/models/autoencoder.py
  65. +2 -0  modelscope/models/cv/image_to_image_translation/models/clip.py
  66. +1 -0  modelscope/models/cv/image_to_image_translation/ops/apps.py
  67. +1 -0  modelscope/models/cv/image_to_image_translation/ops/degradation.py
  68. +3 -0  modelscope/models/cv/image_to_image_translation/ops/diffusion.py
  69. +1 -0  modelscope/models/cv/image_to_image_translation/ops/losses.py
  70. +1 -0  modelscope/models/cv/image_to_image_translation/ops/metrics.py
  71. +1 -0  modelscope/models/cv/image_to_image_translation/ops/random_color.py
  72. +1 -0  modelscope/models/cv/image_to_image_translation/ops/random_mask.py
  73. +1 -0  modelscope/models/cv/image_to_image_translation/ops/svd.py
  74. +1 -0  modelscope/models/cv/image_to_image_translation/ops/utils.py
  75. +12 -9  modelscope/models/cv/movie_scene_segmentation/model.py
  76. +10 -2  modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
  77. +20 -0  modelscope/models/cv/product_segmentation/__init__.py
  78. +197 -0  modelscope/models/cv/product_segmentation/net.py
  79. +77 -0  modelscope/models/cv/product_segmentation/seg_infer.py
  80. +1 -0  modelscope/models/cv/skin_retouching/detection_model/detection_module.py
  81. +1 -0  modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py
  82. +1 -0  modelscope/models/cv/skin_retouching/inpainting_model/gconv.py
  83. +1 -0  modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py
  84. +1 -0  modelscope/models/cv/skin_retouching/unet_deploy.py
  85. +1 -0  modelscope/models/cv/skin_retouching/utils.py
  86. +1 -0  modelscope/models/cv/skin_retouching/weights_init.py
  87. +2 -0  modelscope/models/cv/super_resolution/arch_util.py
  88. +2 -0  modelscope/models/cv/super_resolution/rrdbnet_arch.py
  89. +2 -0  modelscope/models/multi_modal/clip/__init__.py
  90. +15 -0  modelscope/models/multi_modal/clip/model.py
  91. +1 -0  modelscope/models/multi_modal/gemm/gemm_base.py
  92. +7 -4  modelscope/models/multi_modal/gemm/gemm_model.py
  93. +1 -1  modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
  94. +1 -1  modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py
  95. +1 -1  modelscope/models/multi_modal/mmr/models/tokenization_clip.py
  96. +2 -0  modelscope/models/multi_modal/ofa/__init__.py
  97. +14 -0  modelscope/models/multi_modal/ofa/resnet.py
  98. +1 -0  modelscope/models/multi_modal/ofa/utils/__init__.py
  99. +2 -0  modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
  100. +56 -14  modelscope/outputs.py

+3 -0  data/test/images/face_emotion.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:712b5525e37080d33f62d6657609dbef20e843ccc04ee5c788ea11aa7c08545e
size 123341

+3 -0  data/test/images/face_human_hand_detection.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8fddc7be8381eb244cd692601f1c1e6cf3484b44bb4e73df0bc7de29352eb487
size 23889

+3 -0  data/test/images/product_segmentation.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a16038f7809127eb3e03cbae049592d193707e095309daca78f7d108d67fe4ec
size 108357

+10 -0  modelscope/metainfo.py

@@ -40,6 +40,9 @@ class Models(object):
ulfd = 'ulfd'
video_inpainting = 'video-inpainting'
hand_static = 'hand-static'
face_human_hand_detection = 'face-human-hand-detection'
face_emotion = 'face-emotion'
product_segmentation = 'product-segmentation'

# EasyCV models
yolox = 'YOLOX'
@@ -179,9 +182,16 @@ class Pipelines(object):
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
shop_segmentation = 'shop-segmentation'
video_inpainting = 'video-inpainting'
pst_action_recognition = 'patchshift-action-recognition'
hand_static = 'hand-static'
face_human_hand_detection = 'face-human-hand-detection'
face_emotion = 'face-emotion'
product_segmentation = 'product-segmentation'

# nlp tasks
automatic_post_editing = 'automatic-post-editing'
translation_quality_estimation = 'translation-quality-estimation'
domain_classification = 'domain-classification'
sentence_similarity = 'sentence-similarity'
word_segmentation = 'word-segmentation'
part_of_speech = 'part-of-speech'
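
The identifiers added above tie the new model and pipeline registrations to tasks of the same name. As a minimal sketch of how such a registration is typically exercised, assuming a 'face-emotion' pipeline class and a default model are registered for that task (an assumption, not confirmed by this diff):

from modelscope.pipelines import pipeline

# assumed task name; the default model is resolved by the registry if one is configured
face_emotion = pipeline('face-emotion')
print(face_emotion('data/test/images/face_emotion.jpg'))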


+2 -0  modelscope/metrics/image_portrait_enhancement_metric.py

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from BasicSR, publicly available at
# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py
from typing import Dict

import numpy as np


+2 -0  modelscope/models/cv/action_recognition/__init__.py

@@ -7,11 +7,13 @@ if TYPE_CHECKING:

from .models import BaseVideoModel
from .tada_convnext import TadaConvNeXt
from .temporal_patch_shift_transformer import PatchShiftTransformer

else:
_import_structure = {
'models': ['BaseVideoModel'],
'tada_convnext': ['TadaConvNeXt'],
'temporal_patch_shift_transformer': ['PatchShiftTransformer']
}

import sys


+1198 -0  modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py
File diff suppressed because it is too large


+2 -0  modelscope/models/cv/body_2d_keypoints/hrnet_v2.py

@@ -1,3 +1,5 @@
# The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation.

import os

import numpy as np


+2 -0  modelscope/models/cv/body_2d_keypoints/w48.py

@@ -1,3 +1,5 @@
# The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation.

cfg_128x128_15 = {
'DATASET': {
'TYPE': 'DAMO',


+2 -0  modelscope/models/cv/body_3d_keypoints/body_3d_pose.py

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import logging
import os.path as osp
from typing import Any, Dict, List, Union


+1 -1  modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py

@@ -1,4 +1,4 @@
- # The implementation is based on OSTrack, available at https://github.com/facebookresearch/VideoPose3D
+ # The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D
import torch
import torch.nn as nn



+2 -0  modelscope/models/cv/cartoon/facelib/LK/lk.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import numpy as np

from modelscope.models.cv.cartoon.facelib.config import config as cfg


+2 -0  modelscope/models/cv/cartoon/facelib/config.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import os

import numpy as np


+2 -0  modelscope/models/cv/cartoon/facelib/face_detector.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import time

import cv2


+2 -0  modelscope/models/cv/cartoon/facelib/face_landmark.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import cv2
import numpy as np
import tensorflow as tf


+2 -0  modelscope/models/cv/cartoon/facelib/facer.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import time

import cv2


+2 -4  modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py

@@ -1,7 +1,5 @@
"""
Created on Mon Apr 24 15:43:29 2017
@author: zhaoy
"""
# The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch

import cv2
import numpy as np



+1 -5  modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py

@@ -1,8 +1,4 @@
"""
Created on Tue Jul 11 06:54:28 2017

@author: zhaoyafei
"""
# The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch

import numpy as np
from numpy.linalg import inv, lstsq


+2 -0  modelscope/models/cv/cartoon/utils.py

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

import cv2


+1 -0  modelscope/models/cv/face_detection/mogface/__init__.py

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .models.detectors import MogFaceDetector

+1 -0  modelscope/models/cv/face_detection/mtcnn/__init__.py

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .models.detector import MtcnnFaceDetector

+1 -0  modelscope/models/cv/face_detection/retinaface/__init__.py

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .detection import RetinaFaceDetection

+1 -0  modelscope/models/cv/face_detection/ulfd_slim/__init__.py

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .detection import UlfdFaceDetector

+20 -0  modelscope/models/cv/face_emotion/__init__.py

@@ -0,0 +1,20 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .emotion_model import EfficientNetForFaceEmotion

else:
_import_structure = {'emotion_model': ['EfficientNetForFaceEmotion']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+6 -0  modelscope/models/cv/face_emotion/efficient/__init__.py

@@ -0,0 +1,6 @@
# The implementation here is modified based on EfficientNet,
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch

from .model import VALID_MODELS, EfficientNet
from .utils import (BlockArgs, BlockDecoder, GlobalParams, efficientnet,
get_model_params)

+380 -0  modelscope/models/cv/face_emotion/efficient/model.py

@@ -0,0 +1,380 @@
# The implementation here is modified based on EfficientNet,
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch

import torch
from torch import nn
from torch.nn import functional as F

from .utils import (MemoryEfficientSwish, Swish, calculate_output_image_size,
drop_connect, efficientnet_params, get_model_params,
get_same_padding_conv2d, load_pretrained_weights,
round_filters, round_repeats)

VALID_MODELS = ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2',
'efficientnet-b3', 'efficientnet-b4', 'efficientnet-b5',
'efficientnet-b6', 'efficientnet-b7', 'efficientnet-b8',
'efficientnet-l2')


class MBConvBlock(nn.Module):

def __init__(self, block_args, global_params, image_size=None):
super().__init__()
self._block_args = block_args
self._bn_mom = 1 - global_params.batch_norm_momentum
self._bn_eps = global_params.batch_norm_epsilon
self.has_se = (self._block_args.se_ratio
is not None) and (0 < self._block_args.se_ratio <= 1)
self.id_skip = block_args.id_skip

inp = self._block_args.input_filters
oup = self._block_args.input_filters * self._block_args.expand_ratio
if self._block_args.expand_ratio != 1:
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._expand_conv = Conv2d(
in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
self._bn0 = nn.BatchNorm2d(
num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)

k = self._block_args.kernel_size
s = self._block_args.stride
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._depthwise_conv = Conv2d(
in_channels=oup,
out_channels=oup,
groups=oup,
kernel_size=k,
stride=s,
bias=False)
self._bn1 = nn.BatchNorm2d(
num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
image_size = calculate_output_image_size(image_size, s)

if self.has_se:
Conv2d = get_same_padding_conv2d(image_size=(1, 1))
num_squeezed_channels = max(
1,
int(self._block_args.input_filters
* self._block_args.se_ratio))
self._se_reduce = Conv2d(
in_channels=oup,
out_channels=num_squeezed_channels,
kernel_size=1)
self._se_expand = Conv2d(
in_channels=num_squeezed_channels,
out_channels=oup,
kernel_size=1)

final_oup = self._block_args.output_filters
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._project_conv = Conv2d(
in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
self._bn2 = nn.BatchNorm2d(
num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
self._swish = MemoryEfficientSwish()

def forward(self, inputs, drop_connect_rate=None):
"""MBConvBlock's forward function.
Args:
inputs (tensor): Input tensor.
drop_connect_rate (float): Drop connect rate, between 0 and 1.
Returns:
Output of this block after processing.
"""

x = inputs
if self._block_args.expand_ratio != 1:
x = self._expand_conv(inputs)
x = self._bn0(x)
x = self._swish(x)

x = self._depthwise_conv(x)
x = self._bn1(x)
x = self._swish(x)

if self.has_se:
x_squeezed = F.adaptive_avg_pool2d(x, 1)
x_squeezed = self._se_reduce(x_squeezed)
x_squeezed = self._swish(x_squeezed)
x_squeezed = self._se_expand(x_squeezed)
x = torch.sigmoid(x_squeezed) * x

x = self._project_conv(x)
x = self._bn2(x)

input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
if drop_connect_rate:
x = drop_connect(
x, p=drop_connect_rate, training=self.training)
x = x + inputs
return x

def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export).
Args:
memory_efficient (bool): Whether to use memory-efficient version of swish.
"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()


class EfficientNet(nn.Module):
"""EfficientNet model.
Most easily loaded with the .from_name or .from_pretrained methods.
Args:
blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
global_params (namedtuple): A set of GlobalParams shared between blocks.
References:
[1] https://arxiv.org/abs/1905.11946 (EfficientNet)
Example:
>>> import torch
>>> from efficientnet.model import EfficientNet
>>> inputs = torch.rand(1, 3, 224, 224)
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
>>> model.eval()
>>> outputs = model(inputs)
"""

def __init__(self, blocks_args=None, global_params=None):
super().__init__()
assert isinstance(blocks_args, list), 'blocks_args should be a list'
assert len(blocks_args) > 0, 'block args must be greater than 0'
self._global_params = global_params
self._blocks_args = blocks_args

bn_mom = 1 - self._global_params.batch_norm_momentum
bn_eps = self._global_params.batch_norm_epsilon
image_size = global_params.image_size
Conv2d = get_same_padding_conv2d(image_size=image_size)

in_channels = 3
out_channels = round_filters(32, self._global_params)
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias=False)
self._bn0 = nn.BatchNorm2d(
num_features=out_channels, momentum=bn_mom, eps=bn_eps)
image_size = calculate_output_image_size(image_size, 2)

self._blocks = nn.ModuleList([])
for block_args in self._blocks_args:

block_args = block_args._replace(
input_filters=round_filters(block_args.input_filters,
self._global_params),
output_filters=round_filters(block_args.output_filters,
self._global_params),
num_repeat=round_repeats(block_args.num_repeat,
self._global_params))

self._blocks.append(
MBConvBlock(
block_args, self._global_params, image_size=image_size))
image_size = calculate_output_image_size(image_size,
block_args.stride)
if block_args.num_repeat > 1:
block_args = block_args._replace(
input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
self._blocks.append(
MBConvBlock(
block_args, self._global_params,
image_size=image_size))

in_channels = block_args.output_filters
out_channels = round_filters(1280, self._global_params)
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._conv_head = Conv2d(
in_channels, out_channels, kernel_size=1, bias=False)
self._bn1 = nn.BatchNorm2d(
num_features=out_channels, momentum=bn_mom, eps=bn_eps)

self._avg_pooling = nn.AdaptiveAvgPool2d(1)
if self._global_params.include_top:
self._dropout = nn.Dropout(self._global_params.dropout_rate)
self._fc = nn.Linear(out_channels, self._global_params.num_classes)

self._swish = MemoryEfficientSwish()

def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export).
Args:
memory_efficient (bool): Whether to use memory-efficient version of swish.
"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
for block in self._blocks:
block.set_swish(memory_efficient)

def extract_endpoints(self, inputs):
"""Use convolution layer to extract features
from reduction levels i in [1, 2, 3, 4, 5].
Args:
inputs (tensor): Input tensor.
Returns:
Dictionary of last intermediate features
with reduction levels i in [1, 2, 3, 4, 5].
Example:
>>> import torch
>>> from efficientnet.model import EfficientNet
>>> inputs = torch.rand(1, 3, 224, 224)
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
>>> endpoints = model.extract_endpoints(inputs)
>>> print(endpoints['reduction_1'].shape) # torch.Size([1, 16, 112, 112])
>>> print(endpoints['reduction_2'].shape) # torch.Size([1, 24, 56, 56])
>>> print(endpoints['reduction_3'].shape) # torch.Size([1, 40, 28, 28])
>>> print(endpoints['reduction_4'].shape) # torch.Size([1, 112, 14, 14])
>>> print(endpoints['reduction_5'].shape) # torch.Size([1, 320, 7, 7])
>>> print(endpoints['reduction_6'].shape) # torch.Size([1, 1280, 7, 7])
"""
endpoints = dict()

x = self._swish(self._bn0(self._conv_stem(inputs)))
prev_x = x

for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / len(
self._blocks) # scale drop connect_rate
x = block(x, drop_connect_rate=drop_connect_rate)
if prev_x.size(2) > x.size(2):
endpoints['reduction_{}'.format(len(endpoints) + 1)] = prev_x
elif idx == len(self._blocks) - 1:
endpoints['reduction_{}'.format(len(endpoints) + 1)] = x
prev_x = x

x = self._swish(self._bn1(self._conv_head(x)))
endpoints['reduction_{}'.format(len(endpoints) + 1)] = x

return endpoints

def extract_features(self, inputs):
"""use convolution layer to extract feature .
Args:
inputs (tensor): Input tensor.
Returns:
Output of the final convolution
layer in the efficientnet model.
"""
x = self._swish(self._bn0(self._conv_stem(inputs)))

for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / len(self._blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
x = self._swish(self._bn1(self._conv_head(x)))

return x

def forward(self, inputs):
"""EfficientNet's forward function.
Calls extract_features to extract features, applies final linear layer, and returns logits.
Args:
inputs (tensor): Input tensor.
Returns:
Output of this model after processing.
"""
x = self.extract_features(inputs)
x = self._avg_pooling(x)
if self._global_params.include_top:
x = x.flatten(start_dim=1)
x = self._dropout(x)
x = self._fc(x)
return x

@classmethod
def from_name(cls, model_name, in_channels=3, **override_params):
"""Create an efficientnet model according to name.
Args:
model_name (str): Name for efficientnet.
in_channels (int): Input data's channel number.
override_params (other key word params):
Params to override model's global_params.
Optional key:
'width_coefficient', 'depth_coefficient',
'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum',
'batch_norm_epsilon', 'drop_connect_rate',
'depth_divisor', 'min_depth'
Returns:
An efficientnet model.
"""
cls._check_model_name_is_valid(model_name)
blocks_args, global_params = get_model_params(model_name,
override_params)
model = cls(blocks_args, global_params)
model._change_in_channels(in_channels)
return model

@classmethod
def from_pretrained(cls,
model_name,
weights_path=None,
advprop=False,
in_channels=3,
num_classes=1000,
**override_params):
"""Create an efficientnet model according to name.
Args:
model_name (str): Name for efficientnet.
weights_path (None or str):
str: path to pretrained weights file on the local disk.
None: use pretrained weights downloaded from the Internet.
advprop (bool):
Whether to load pretrained weights
trained with advprop (valid when weights_path is None).
in_channels (int): Input data's channel number.
num_classes (int):
Number of categories for classification.
It controls the output size for final linear layer.
override_params (other key word params):
Params to override model's global_params.
Optional key:
'width_coefficient', 'depth_coefficient',
'image_size', 'dropout_rate',
'batch_norm_momentum',
'batch_norm_epsilon', 'drop_connect_rate',
'depth_divisor', 'min_depth'
Returns:
A pretrained efficientnet model.
"""
model = cls.from_name(
model_name, num_classes=num_classes, **override_params)
model._change_in_channels(in_channels)
return model

@classmethod
def get_image_size(cls, model_name):
"""Get the input image size for a given efficientnet model.
Args:
model_name (str): Name for efficientnet.
Returns:
Input image size (resolution).
"""
cls._check_model_name_is_valid(model_name)
_, _, res, _ = efficientnet_params(model_name)
return res

@classmethod
def _check_model_name_is_valid(cls, model_name):
"""Validates model name.
Args:
model_name (str): Name for efficientnet.
Returns:
bool: Is a valid name or not.
"""
if model_name not in VALID_MODELS:
raise ValueError('model_name should be one of: '
+ ', '.join(VALID_MODELS))

def _change_in_channels(self, in_channels):
"""Adjust model's first convolution layer to in_channels, if in_channels not equals 3.
Args:
in_channels (int): Input data's channel number.
"""
if in_channels != 3:
Conv2d = get_same_padding_conv2d(
image_size=self._global_params.image_size)
out_channels = round_filters(32, self._global_params)
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias=False)
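
A quick smoke test of the EfficientNet port added above, as a minimal sketch (the input resolution and the 7-class head are illustrative assumptions; the accompanying utils.py maps 'efficientnet-b0' to a 112-pixel resolution):

import torch
from modelscope.models.cv.face_emotion.efficient import EfficientNet

model = EfficientNet.from_name('efficientnet-b0', num_classes=7)  # 7 classes is illustrative
model.eval()
with torch.no_grad():
    feats = model.extract_features(torch.rand(1, 3, 112, 112))  # final conv feature map
    logits = model(torch.rand(1, 3, 112, 112))                  # pooled + dropout + fc head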

+559 -0  modelscope/models/cv/face_emotion/efficient/utils.py

@@ -0,0 +1,559 @@
# The implementation here is modified based on EfficientNet,
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch

import collections
import math
import re
from functools import partial

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import model_zoo

GlobalParams = collections.namedtuple('GlobalParams', [
'width_coefficient', 'depth_coefficient', 'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum', 'batch_norm_epsilon',
'drop_connect_rate', 'depth_divisor', 'min_depth', 'include_top'
])

BlockArgs = collections.namedtuple('BlockArgs', [
'num_repeat', 'kernel_size', 'stride', 'expand_ratio', 'input_filters',
'output_filters', 'se_ratio', 'id_skip'
])

GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields)

if hasattr(nn, 'SiLU'):
Swish = nn.SiLU
else:

class Swish(nn.Module):

def forward(self, x):
return x * torch.sigmoid(x)


class SwishImplementation(torch.autograd.Function):

@staticmethod
def forward(ctx, i):
result = i * torch.sigmoid(i)
ctx.save_for_backward(i)
return result

@staticmethod
def backward(ctx, grad_output):
i = ctx.saved_tensors[0]
sigmoid_i = torch.sigmoid(i)
return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))


class MemoryEfficientSwish(nn.Module):

def forward(self, x):
return SwishImplementation.apply(x)


def round_filters(filters, global_params):
"""Calculate and round number of filters based on width multiplier.
Use width_coefficient, depth_divisor and min_depth of global_params.
Args:
filters (int): Filters number to be calculated.
global_params (namedtuple): Global params of the model.
Returns:
new_filters: New filters number after calculating.
"""
multiplier = global_params.width_coefficient
if not multiplier:
return filters

divisor = global_params.depth_divisor
min_depth = global_params.min_depth
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth,
int(filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters:
new_filters += divisor
return int(new_filters)


def round_repeats(repeats, global_params):
"""Calculate module's repeat number of a block based on depth multiplier.
Use depth_coefficient of global_params.
Args:
repeats (int): num_repeat to be calculated.
global_params (namedtuple): Global params of the model.
Returns:
new repeat: New repeat number after calculating.
"""
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))


def drop_connect(inputs, p, training):
"""Drop connect.
Args:
input (tensor: BCWH): Input of this structure.
p (float: 0.0~1.0): Probability of drop connection.
training (bool): The running mode.
Returns:
output: Output after drop connection.
"""
assert 0 <= p <= 1, 'p must be in range of [0,1]'

if not training:
return inputs

batch_size = inputs.shape[0]
keep_prob = 1 - p

random_tensor = keep_prob
random_tensor += torch.rand([batch_size, 1, 1, 1],
dtype=inputs.dtype,
device=inputs.device)
binary_tensor = torch.floor(random_tensor)

output = inputs / keep_prob * binary_tensor
return output


def get_width_and_height_from_size(x):
"""Obtain height and width from x.
Args:
x (int, tuple or list): Data size.
Returns:
size: A tuple or list (H,W).
"""
if isinstance(x, int):
return x, x
if isinstance(x, list) or isinstance(x, tuple):
return x
else:
raise TypeError()


def calculate_output_image_size(input_image_size, stride):
"""Calculates the output image size when using Conv2dSamePadding with a stride.
Necessary for static padding. Thanks to mannatsingh for pointing this out.
Args:
input_image_size (int, tuple or list): Size of input image.
stride (int, tuple or list): Conv2d operation's stride.
Returns:
output_image_size: A list [H,W].
"""
if input_image_size is None:
return None
image_height, image_width = get_width_and_height_from_size(
input_image_size)
stride = stride if isinstance(stride, int) else stride[0]
image_height = int(math.ceil(image_height / stride))
image_width = int(math.ceil(image_width / stride))
return [image_height, image_width]


def get_same_padding_conv2d(image_size=None):
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models.
Args:
image_size (int or tuple): Size of the image.
Returns:
Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
"""
if image_size is None:
return Conv2dDynamicSamePadding
else:
return partial(Conv2dStaticSamePadding, image_size=image_size)


class Conv2dDynamicSamePadding(nn.Conv2d):
"""2D Convolutions like TensorFlow, for a dynamic image size.
The padding is operated in forward function by calculating dynamically.
"""

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
dilation=1,
groups=1,
bias=True):
super().__init__(in_channels, out_channels, kernel_size, stride, 0,
dilation, groups, bias)
self.stride = self.stride if len(
self.stride) == 2 else [self.stride[0]] * 2

def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
a1 = (oh - 1) * self.stride[0]
pad_h = max(a1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
a2 = (ow - 1) * self.stride[1]
pad_w = max(a2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
])
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups)


class Conv2dStaticSamePadding(nn.Conv2d):
"""2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
The padding module is computed in the constructor and then applied in forward.
"""

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
image_size=None,
**kwargs):
super().__init__(in_channels, out_channels, kernel_size, stride,
**kwargs)
self.stride = self.stride if len(
self.stride) == 2 else [self.stride[0]] * 2

assert image_size is not None
ih, iw = (image_size,
image_size) if isinstance(image_size, int) else image_size
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
b1 = (oh - 1) * self.stride[0]
pad_h = max(b1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
b2 = (ow - 1) * self.stride[1]
pad_w = max(b2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d(
(pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2))
else:
self.static_padding = nn.Identity()

def forward(self, x):
x = self.static_padding(x)
x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups)
return x


def get_same_padding_maxPool2d(image_size=None):
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models.
Args:
image_size (int or tuple): Size of the image.
Returns:
MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding.
"""
if image_size is None:
return MaxPool2dDynamicSamePadding
else:
return partial(MaxPool2dStaticSamePadding, image_size=image_size)


class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
"""2D MaxPooling like TensorFlow's 'SAME' mode, with a dynamic image size.
The padding is operated in forward function by calculating dynamically.
"""

def __init__(self,
kernel_size,
stride,
padding=0,
dilation=1,
return_indices=False,
ceil_mode=False):
super().__init__(kernel_size, stride, padding, dilation,
return_indices, ceil_mode)
self.stride = [self.stride] * 2 if isinstance(self.stride,
int) else self.stride
self.kernel_size = [self.kernel_size] * 2 if isinstance(
self.kernel_size, int) else self.kernel_size
self.dilation = [self.dilation] * 2 if isinstance(
self.dilation, int) else self.dilation

def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.kernel_size
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
c1 = (oh - 1) * self.stride[0]
pad_h = max(c1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
c2 = (ow - 1) * self.stride[1]
pad_w = max(c2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
])
return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
self.dilation, self.ceil_mode, self.return_indices)


class MaxPool2dStaticSamePadding(nn.MaxPool2d):
"""2D MaxPooling like TensorFlow's 'SAME' mode, with the given input image size.
The padding module is computed in the constructor and then applied in forward.
"""

def __init__(self, kernel_size, stride, image_size=None, **kwargs):
super().__init__(kernel_size, stride, **kwargs)
self.stride = [self.stride] * 2 if isinstance(self.stride,
int) else self.stride
self.kernel_size = [self.kernel_size] * 2 if isinstance(
self.kernel_size, int) else self.kernel_size
self.dilation = [self.dilation] * 2 if isinstance(
self.dilation, int) else self.dilation

assert image_size is not None
ih, iw = (image_size,
image_size) if isinstance(image_size, int) else image_size
kh, kw = self.kernel_size
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
d1 = (oh - 1) * self.stride[0]
pad_h = max(d1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
d2 = (ow - 1) * self.stride[1]
pad_w = max(d2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d(
(pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2))
else:
self.static_padding = nn.Identity()

def forward(self, x):
x = self.static_padding(x)
x = F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
self.dilation, self.ceil_mode, self.return_indices)
return x


class BlockDecoder(object):
"""Block Decoder for readability,
straight from the official TensorFlow repository.
"""

@staticmethod
def _decode_block_string(block_string):
"""Get a block through a string notation of arguments.
Args:
block_string (str): A string notation of arguments.
Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
Returns:
BlockArgs: The namedtuple defined at the top of this file.
"""
assert isinstance(block_string, str)

ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value

# Check stride
assert (('s' in options and len(options['s']) == 1)
or (len(options['s']) == 2
and options['s'][0] == options['s'][1]))

return BlockArgs(
num_repeat=int(options['r']),
kernel_size=int(options['k']),
stride=[int(options['s'][0])],
expand_ratio=int(options['e']),
input_filters=int(options['i']),
output_filters=int(options['o']),
se_ratio=float(options['se']) if 'se' in options else None,
id_skip=('noskip' not in block_string))

@staticmethod
def _encode_block_string(block):
"""Encode a block to a string.
Args:
block (namedtuple): A BlockArgs type argument.
Returns:
block_string: A String form of BlockArgs.
"""
args = [
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
's%d%d' % (block.strides[0], block.strides[1]),
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
if 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
if block.id_skip is False:
args.append('noskip')
return '_'.join(args)

@staticmethod
def decode(string_list):
"""Decode a list of string notations to specify blocks inside the network.
Args:
string_list (list[str]): A list of strings, each string is a notation of block.
Returns:
blocks_args: A list of BlockArgs namedtuples of block args.
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(BlockDecoder._decode_block_string(block_string))
return blocks_args

@staticmethod
def encode(blocks_args):
"""Encode a list of BlockArgs to a list of strings.
Args:
blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args.
Returns:
block_strings: A list of strings, each string is a notation of block.
"""
block_strings = []
for block in blocks_args:
block_strings.append(BlockDecoder._encode_block_string(block))
return block_strings


def efficientnet_params(model_name):
"""Map EfficientNet model name to parameter coefficients.
Args:
model_name (str): Model name to be queried.
Returns:
params_dict[model_name]: A (width,depth,res,dropout) tuple.
"""
params_dict = {
'efficientnet-b0': (1.0, 1.0, 112, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
'efficientnet-b8': (2.2, 3.6, 672, 0.5),
'efficientnet-l2': (4.3, 5.3, 800, 0.5),
}
return params_dict[model_name]


def efficientnet(width_coefficient=None,
depth_coefficient=None,
image_size=None,
dropout_rate=0.2,
drop_connect_rate=0.2,
num_classes=1000,
include_top=True):
"""Create BlockArgs and GlobalParams for efficientnet model.
Args:
width_coefficient (float)
depth_coefficient (float)
image_size (int)
dropout_rate (float)
drop_connect_rate (float)
num_classes (int)
Meaning as the name suggests.
Returns:
blocks_args, global_params.
"""

blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25',
'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25',
'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s11_e6_i80_o112_se0.25',
'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s11_e6_i192_o320_se0.25',
]
blocks_args = BlockDecoder.decode(blocks_args)

global_params = GlobalParams(
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
image_size=image_size,
dropout_rate=dropout_rate,
num_classes=num_classes,
batch_norm_momentum=0.99,
batch_norm_epsilon=1e-3,
drop_connect_rate=drop_connect_rate,
depth_divisor=8,
min_depth=None,
include_top=include_top,
)
return blocks_args, global_params


def get_model_params(model_name, override_params):
"""Get the block args and global params for a given model name.
Args:
model_name (str): Model's name.
override_params (dict): A dict to modify global_params.
Returns:
blocks_args, global_params
"""
if model_name.startswith('efficientnet'):
w, d, s, p = efficientnet_params(model_name)
blocks_args, global_params = efficientnet(
width_coefficient=w,
depth_coefficient=d,
dropout_rate=p,
image_size=s)
else:
raise NotImplementedError(
'model name is not pre-defined: {}'.format(model_name))
if override_params:
global_params = global_params._replace(**override_params)
return blocks_args, global_params


def load_pretrained_weights(model,
model_name,
weights_path=None,
load_fc=True,
advprop=False,
verbose=True):
"""Loads pretrained weights from weights path or download using url.
Args:
model (Module): The whole model of efficientnet.
model_name (str): Model name of efficientnet.
weights_path (None or str):
str: path to pretrained weights file on the local disk.
None: use pretrained weights downloaded from the Internet.
load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model.
advprop (bool): Whether to load pretrained weights
trained with advprop (valid when weights_path is None).
"""
if isinstance(weights_path, str):
state_dict = torch.load(weights_path)
else:
url_map_ = url_map_advprop if advprop else url_map
state_dict = model_zoo.load_url(url_map_[model_name])

if load_fc:
ret = model.load_state_dict(state_dict, strict=False)
assert not ret.missing_keys, 'Missing keys when loading pretrained weights: {}'.format(
ret.missing_keys)
else:
state_dict.pop('_fc.weight')
state_dict.pop('_fc.bias')
ret = model.load_state_dict(state_dict, strict=False)
assert set(ret.missing_keys) == set([
'_fc.weight', '_fc.bias'
]), 'Missing keys when loading pretrained weights: {}'.format(
ret.missing_keys)
assert not ret.unexpected_keys, 'Missing keys when loading pretrained weights: {}'.format(
ret.unexpected_keys)

if verbose:
print('Loaded pretrained weights for {}'.format(model_name))
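
For reference, the block-string notation consumed by BlockDecoder above decodes as follows; this is a small illustration added for clarity, not part of the commit:

from modelscope.models.cv.face_emotion.efficient.utils import BlockDecoder

blocks = BlockDecoder.decode(['r1_k3_s11_e1_i32_o16_se0.25'])
# blocks[0] == BlockArgs(num_repeat=1, kernel_size=3, stride=[1], expand_ratio=1,
#                        input_filters=32, output_filters=16, se_ratio=0.25, id_skip=True)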

+67 -0  modelscope/models/cv/face_emotion/emotion_infer.py

@@ -0,0 +1,67 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import torch
from PIL import Image
from torch import nn
from torchvision import transforms

from modelscope.utils.logger import get_logger
from .face_alignment.face_align import face_detection_PIL_v2

logger = get_logger()


def transform_PIL(img_pil):
val_transforms = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
return val_transforms(img_pil)


index2AU = [1, 2, 4, 6, 7, 10, 12, 15, 23, 24, 25, 26]
emotion_list = [
'Neutral', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise'
]


def inference(image_path, model, face_model, score_thre=0.5, GPU=0):
image = Image.open(image_path).convert('RGB')

face, bbox = face_detection_PIL_v2(image, face_model)
if bbox is None:
logger.warn('no face detected!')
result = {'emotion_result': None, 'box': None}
return result

face = transform_PIL(face)
face = face.unsqueeze(0)
if torch.cuda.is_available():
face = face.cuda(GPU)
logits_AU, logits_emotion = model(face)
logits_AU = torch.sigmoid(logits_AU)
logits_emotion = nn.functional.softmax(logits_emotion, 1)

_, index_list = logits_emotion.max(1)
emotion_index = index_list[0].data.item()
prob = logits_emotion[0][emotion_index]
if prob > score_thre and emotion_index != 3:
cur_emotion = emotion_list[emotion_index]
else:
cur_emotion = 'Neutral'

logits_AU = logits_AU[0]
au_ouput = torch.zeros_like(logits_AU)
au_ouput[logits_AU >= score_thre] = 1
au_ouput[logits_AU < score_thre] = 0

au_ouput = au_ouput.int()

cur_au_list = []
for idx in range(au_ouput.shape[0]):
if au_ouput[idx] == 1:
au = index2AU[idx]
cur_au_list.append(au)
cur_au_list.sort()
result = (cur_emotion, bbox)
return result
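
A hedged usage sketch of the inference helper above; the model directory and the face-detector checkpoint path are placeholders (the checkpoint is the TensorFlow frozen graph expected by FaceDetector in face_alignment/face.py):

from modelscope.models.cv.face_emotion.emotion_model import EfficientNetForFaceEmotion
from modelscope.models.cv.face_emotion.emotion_infer import inference

model = EfficientNetForFaceEmotion('/path/to/face_emotion_model_dir')   # placeholder path
emotion, bbox = inference('data/test/images/face_emotion.jpg', model,
                          '/path/to/face_detector_frozen_graph.pb')     # placeholder path
# when a face is found, returns the predicted emotion label and the detected face box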

+96 -0  modelscope/models/cv/face_emotion/emotion_model.py

@@ -0,0 +1,96 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import os
import sys

import torch
import torch.nn.functional as F
from torch import nn

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.face_emotion.efficient import EfficientNet
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@MODELS.register_module(Tasks.face_emotion, module_name=Models.face_emotion)
class EfficientNetForFaceEmotion(TorchModel):

def __init__(self, model_dir, device_id=0, *args, **kwargs):

super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)
self.model = FaceEmotionModel(
name='efficientnet-b0', num_embed=512, num_au=12, num_emotion=7)

if torch.cuda.is_available():
self.device = 'cuda'
logger.info('Use GPU')
else:
self.device = 'cpu'
logger.info('Use CPU')
pretrained_params = torch.load(
'{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
map_location=self.device)

state_dict = pretrained_params['model']
new_state = {}
for k, v in state_dict.items():
if k.startswith('module.'):
k = k[7:]
new_state[k] = v

self.model.load_state_dict(new_state)
self.model.eval()
self.model.to(self.device)

def forward(self, x):
logits_au, logits_emotion = self.model(x)
return logits_au, logits_emotion


class FaceEmotionModel(nn.Module):

def __init__(self,
name='efficientnet-b0',
num_embed=512,
num_au=12,
num_emotion=7):
super(FaceEmotionModel, self).__init__()
self.backbone = EfficientNet.from_pretrained(
name, weights_path=None, advprop=True)
self.average_pool = nn.AdaptiveAvgPool2d(1)
self.embed = nn.Linear(self.backbone._fc.weight.data.shape[1],
num_embed)
self.features = nn.BatchNorm1d(num_embed)
nn.init.constant_(self.features.weight, 1.0)
self.features.weight.requires_grad = False
self.fc_au = nn.Sequential(
nn.Dropout(0.6),
nn.Linear(num_embed, num_au),
)
self.fc_emotion = nn.Sequential(
nn.Dropout(0.6),
nn.Linear(num_embed, num_emotion),
)

def feat_single_img(self, x):
x = self.backbone.extract_features(x)
x = self.average_pool(x)
x = x.flatten(1)
x = self.embed(x)
x = self.features(x)
return x

def forward(self, x):
x = self.feat_single_img(x)
logits_au = self.fc_au(x)
att_au = torch.sigmoid(logits_au).unsqueeze(-1)
x = x.unsqueeze(1)
emotion_vec_list = torch.matmul(att_au, x)
emotion_vec = emotion_vec_list.sum(1)
logits_emotion = self.fc_emotion(emotion_vec)
return logits_au, logits_emotion

+0 -0  modelscope/models/cv/face_emotion/face_alignment/__init__.py


+79 -0  modelscope/models/cv/face_emotion/face_alignment/face.py

@@ -0,0 +1,79 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import os

import cv2
import numpy as np
import tensorflow as tf


def init(mod):
PATH_TO_CKPT = mod
net = tf.Graph()
with net.as_default():
od_graph_def = tf.GraphDef()
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.6
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=net, config=config)
return sess, net


def filter_bboxes_confs(shape,
imgsBboxes,
imgsConfs,
single=False,
thresh=0.5):
[w, h] = shape
if single:
bboxes, confs = [], []
for y in range(len(imgsBboxes)):
if imgsConfs[y] >= thresh:
[x1, y1, x2, y2] = list(imgsBboxes[y])
x1, y1, x2, y2 = int(w * x1), int(h * y1), int(w * x2), int(
h * y2)
bboxes.append([y1, x1, y2, x2])
confs.append(imgsConfs[y])
return bboxes, confs
else:
retImgsBboxes, retImgsConfs = [], []
for x in range(len(imgsBboxes)):
bboxes, confs = [], []
for y in range(len(imgsBboxes[x])):
if imgsConfs[x][y] >= thresh:
[x1, y1, x2, y2] = list(imgsBboxes[x][y])
x1, y1, x2, y2 = int(w * x1), int(h * y1), int(
w * x2), int(h * y2)
bboxes.append([y1, x1, y2, x2])
confs.append(imgsConfs[x][y])
retImgsBboxes.append(bboxes)
retImgsConfs.append(confs)
return retImgsBboxes, retImgsConfs


def detect(im, sess, net):
image_np = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
image_np_expanded = np.expand_dims(image_np, axis=0)
image_tensor = net.get_tensor_by_name('image_tensor:0')
bboxes = net.get_tensor_by_name('detection_boxes:0')
dConfs = net.get_tensor_by_name('detection_scores:0')
classes = net.get_tensor_by_name('detection_classes:0')
num_detections = net.get_tensor_by_name('num_detections:0')
(bboxes, dConfs, classes,
num_detections) = sess.run([bboxes, dConfs, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
w, h, _ = im.shape
bboxes, confs = filter_bboxes_confs([w, h], bboxes[0], dConfs[0], True)
return bboxes, confs


class FaceDetector:

def __init__(self, mod):
self.sess, self.net = init(mod)

def do_detect(self, im):
bboxes, confs = detect(im, self.sess, self.net)
return bboxes, confs

+59 -0  modelscope/models/cv/face_emotion/face_alignment/face_align.py

@@ -0,0 +1,59 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import os
import sys

import cv2
import numpy as np
from PIL import Image, ImageFile

from .face import FaceDetector

ImageFile.LOAD_TRUNCATED_IMAGES = True


def adjust_bx_v2(box, w, h):
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
box_w = x2 - x1
box_h = y2 - y1
delta = abs(box_w - box_h)
if box_w > box_h:
if y1 >= delta:
y1 = y1 - delta
else:
delta_y1 = y1
y1 = 0
delta_y2 = delta - delta_y1
y2 = y2 + delta_y2 if y2 < h - delta_y2 else h - 1
else:
if x1 >= delta / 2 and x2 <= w - delta / 2:
x1 = x1 - delta / 2
x2 = x2 + delta / 2
elif x1 < delta / 2 and x2 <= w - delta / 2:
delta_x1 = x1
x1 = 0
delta_x2 = delta - delta_x1
x2 = x2 + delta_x2 if x2 < w - delta_x2 else w - 1
elif x1 >= delta / 2 and x2 > w - delta / 2:
delta_x2 = w - x2
x2 = w - 1
delta_x1 = delta - x1
x1 = x1 - delta_x1 if x1 >= delta_x1 else 0

x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
return [x1, y1, x2, y2]


def face_detection_PIL_v2(image, face_model):
crop_size = 112
face_detector = FaceDetector(face_model)
img = np.array(image)
h, w = img.shape[0:2]
bxs, conf = face_detector.do_detect(img)
bx = bxs[0]
bx = adjust_bx_v2(bx, w, h)
x1, y1, x2, y2 = bx
image = img[y1:y2, x1:x2, :]
img = Image.fromarray(image)
img = img.resize((crop_size, crop_size))
bx = tuple(bx)
return img, bx

+2 -0  modelscope/models/cv/face_generation/op/conv2d_gradfix.py

@@ -1,3 +1,5 @@
# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License
# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/conv2d_gradfix.py
import contextlib
import warnings



+2 -0  modelscope/models/cv/face_generation/op/fused_act.py

@@ -1,3 +1,5 @@
# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License
# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py
import os

import torch


+2 -0  modelscope/models/cv/face_generation/op/upfirdn2d.py

@@ -1,3 +1,5 @@
# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License
# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py
import os
from collections import abc



+2 -0  modelscope/models/cv/face_generation/stylegan2.py

@@ -1,3 +1,5 @@
# The implementation is adopted from stylegan2-pytorch,
# made public available under the MIT License at https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py
import functools
import math
import operator


+20 -0  modelscope/models/cv/face_human_hand_detection/__init__.py

@@ -0,0 +1,20 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .det_infer import NanoDetForFaceHumanHandDetection

else:
_import_structure = {'det_infer': ['NanoDetForFaceHumanHandDetection']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+133 -0  modelscope/models/cv/face_human_hand_detection/det_infer.py

@@ -0,0 +1,133 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.

import cv2
import numpy as np
import torch

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .one_stage_detector import OneStageDetector

logger = get_logger()


def load_model_weight(model_dir, device):
checkpoint = torch.load(
'{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
map_location=device)
state_dict = checkpoint['state_dict'].copy()
for k in checkpoint['state_dict']:
if k.startswith('avg_model.'):
v = state_dict.pop(k)
state_dict[k[4:]] = v

return state_dict


@MODELS.register_module(
Tasks.face_human_hand_detection,
module_name=Models.face_human_hand_detection)
class NanoDetForFaceHumanHandDetection(TorchModel):

def __init__(self, model_dir, device_id=0, *args, **kwargs):

super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)

self.model = OneStageDetector()
if torch.cuda.is_available():
self.device = 'cuda'
logger.info('Use GPU ')
else:
self.device = 'cpu'
logger.info('Use CPU')

self.state_dict = load_model_weight(model_dir, self.device)
self.model.load_state_dict(self.state_dict, strict=False)
self.model.eval()
self.model.to(self.device)

def forward(self, x):
pred_result = self.model.inference(x)
return pred_result


def naive_collate(batch):
elem = batch[0]
if isinstance(elem, dict):
return {key: naive_collate([d[key] for d in batch]) for key in elem}
else:
return batch


def get_resize_matrix(raw_shape, dst_shape):

r_w, r_h = raw_shape
d_w, d_h = dst_shape
Rs = np.eye(3)

Rs[0, 0] *= d_w / r_w
Rs[1, 1] *= d_h / r_h
return Rs


def color_aug_and_norm(meta, mean, std):
img = meta['img'].astype(np.float32) / 255
mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3) / 255
std = np.array(std, dtype=np.float32).reshape(1, 1, 3) / 255
img = (img - mean) / std
meta['img'] = img
return meta


def img_process(meta, mean, std):
raw_img = meta['img']
height = raw_img.shape[0]
width = raw_img.shape[1]
dst_shape = [320, 320]
M = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
ResizeM = get_resize_matrix((width, height), dst_shape)
M = ResizeM @ M
img = cv2.warpPerspective(raw_img, M, dsize=tuple(dst_shape))
meta['img'] = img
meta['warp_matrix'] = M
meta = color_aug_and_norm(meta, mean, std)
return meta


def overlay_bbox_cv(dets, class_names, score_thresh):
all_box = []
for label in dets:
for bbox in dets[label]:
score = bbox[-1]
if score > score_thresh:
x0, y0, x1, y1 = [int(i) for i in bbox[:4]]
all_box.append([label, x0, y0, x1, y1, score])
all_box.sort(key=lambda v: v[5])
return all_box


mean = [103.53, 116.28, 123.675]
std = [57.375, 57.12, 58.395]
class_names = ['person', 'face', 'hand']


def inference(model, device, img_path):
img_info = {'id': 0}
img = cv2.imread(img_path)
height, width = img.shape[:2]
img_info['height'] = height
img_info['width'] = width
meta = dict(img_info=img_info, raw_img=img, img=img)

meta = img_process(meta, mean, std)
meta['img'] = torch.from_numpy(meta['img'].transpose(2, 0, 1)).to(device)
meta = naive_collate([meta])
meta['img'] = (meta['img'][0]).reshape(1, 3, 320, 320)
with torch.no_grad():
res = model(meta)
result = overlay_bbox_cv(res[0], class_names, score_thresh=0.35)
return result
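
For context, a minimal usage sketch of the inference helpers above (not part of the diff); the model directory and image path are hypothetical placeholders for any directory holding the expected pytorch_model.bin checkpoint and any test image.

from modelscope.models.cv.face_human_hand_detection.det_infer import (
    NanoDetForFaceHumanHandDetection, class_names, inference)

# 'path/to/model_dir' and 'test.jpg' are placeholders, not files added by this PR.
detector = NanoDetForFaceHumanHandDetection(model_dir='path/to/model_dir')
boxes = inference(detector, detector.device, 'test.jpg')
# Each entry is [label, x0, y0, x1, y1, score], sorted by ascending score.
for label, x0, y0, x1, y1, score in boxes:
    print(class_names[label], (x0, y0, x1, y1), round(float(score), 3))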

+ 395
- 0
modelscope/models/cv/face_human_hand_detection/ghost_pan.py View File

@@ -0,0 +1,395 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from .utils import ConvModule, DepthwiseConvModule, act_layers


def _make_divisible(v, divisor, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v


def hard_sigmoid(x, inplace: bool = False):
if inplace:
return x.add_(3.0).clamp_(0.0, 6.0).div_(6.0)
else:
return F.relu6(x + 3.0) / 6.0


class SqueezeExcite(nn.Module):

def __init__(self,
in_chs,
se_ratio=0.25,
reduced_base_chs=None,
activation='ReLU',
gate_fn=hard_sigmoid,
divisor=4,
**_):
super(SqueezeExcite, self).__init__()
self.gate_fn = gate_fn
reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio,
divisor)
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
self.act1 = act_layers(activation)
self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)

def forward(self, x):
x_se = self.avg_pool(x)
x_se = self.conv_reduce(x_se)
x_se = self.act1(x_se)
x_se = self.conv_expand(x_se)
x = x * self.gate_fn(x_se)
return x


class GhostModule(nn.Module):

def __init__(self,
inp,
oup,
kernel_size=1,
ratio=2,
dw_size=3,
stride=1,
activation='ReLU'):
super(GhostModule, self).__init__()
self.oup = oup
init_channels = math.ceil(oup / ratio)
new_channels = init_channels * (ratio - 1)

self.primary_conv = nn.Sequential(
nn.Conv2d(
inp,
init_channels,
kernel_size,
stride,
kernel_size // 2,
bias=False),
nn.BatchNorm2d(init_channels),
act_layers(activation) if activation else nn.Sequential(),
)

self.cheap_operation = nn.Sequential(
nn.Conv2d(
init_channels,
new_channels,
dw_size,
1,
dw_size // 2,
groups=init_channels,
bias=False,
),
nn.BatchNorm2d(new_channels),
act_layers(activation) if activation else nn.Sequential(),
)

def forward(self, x):
x1 = self.primary_conv(x)
x2 = self.cheap_operation(x1)
out = torch.cat([x1, x2], dim=1)
return out


class GhostBottleneck(nn.Module):
"""Ghost bottleneck w/ optional SE"""

def __init__(
self,
in_chs,
mid_chs,
out_chs,
dw_kernel_size=3,
stride=1,
activation='ReLU',
se_ratio=0.0,
):
super(GhostBottleneck, self).__init__()
has_se = se_ratio is not None and se_ratio > 0.0
self.stride = stride

# Point-wise expansion
self.ghost1 = GhostModule(in_chs, mid_chs, activation=activation)

# Depth-wise convolution
if self.stride > 1:
self.conv_dw = nn.Conv2d(
mid_chs,
mid_chs,
dw_kernel_size,
stride=stride,
padding=(dw_kernel_size - 1) // 2,
groups=mid_chs,
bias=False,
)
self.bn_dw = nn.BatchNorm2d(mid_chs)

if has_se:
self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio)
else:
self.se = None

self.ghost2 = GhostModule(mid_chs, out_chs, activation=None)

if in_chs == out_chs and self.stride == 1:
self.shortcut = nn.Sequential()
else:
self.shortcut = nn.Sequential(
nn.Conv2d(
in_chs,
in_chs,
dw_kernel_size,
stride=stride,
padding=(dw_kernel_size - 1) // 2,
groups=in_chs,
bias=False,
),
nn.BatchNorm2d(in_chs),
nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(out_chs),
)

def forward(self, x):
residual = x

x = self.ghost1(x)

if self.stride > 1:
x = self.conv_dw(x)
x = self.bn_dw(x)

if self.se is not None:
x = self.se(x)

x = self.ghost2(x)

x += self.shortcut(residual)
return x


class GhostBlocks(nn.Module):
"""Stack of GhostBottleneck used in GhostPAN.

Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
expand (int): Expand ratio of GhostBottleneck. Default: 1.
kernel_size (int): Kernel size of depthwise convolution. Default: 5.
num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
use_res (bool): Whether to use residual connection. Default: False.
activation (str): Name of activation function. Default: LeakyReLU.
"""

def __init__(
self,
in_channels,
out_channels,
expand=1,
kernel_size=5,
num_blocks=1,
use_res=False,
activation='LeakyReLU',
):
super(GhostBlocks, self).__init__()
self.use_res = use_res
if use_res:
self.reduce_conv = ConvModule(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
activation=activation,
)
blocks = []
for _ in range(num_blocks):
blocks.append(
GhostBottleneck(
in_channels,
int(out_channels * expand),
out_channels,
dw_kernel_size=kernel_size,
activation=activation,
))
self.blocks = nn.Sequential(*blocks)

def forward(self, x):
out = self.blocks(x)
if self.use_res:
out = out + self.reduce_conv(x)
return out


class GhostPAN(nn.Module):
"""Path Aggregation Network with Ghost block.

Args:
in_channels (List[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale)
num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3
use_depthwise (bool): Whether to use depthwise separable convolution in
blocks. Default: False
kernel_size (int): Kernel size of depthwise convolution. Default: 5.
expand (int): Expand ratio of GhostBottleneck. Default: 1.
num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
use_res (bool): Whether to use residual connection. Default: False.
num_extra_level (int): Number of extra conv layers for more feature levels.
Default: 0.
upsample_cfg (dict): Config dict for interpolate layer.
Default: `dict(scale_factor=2, mode='bilinear')`
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='BN')
activation (str): Activation layer name.
Default: LeakyReLU.
"""

def __init__(
self,
in_channels,
out_channels,
use_depthwise=False,
kernel_size=5,
expand=1,
num_blocks=1,
use_res=False,
num_extra_level=0,
upsample_cfg=dict(scale_factor=2, mode='bilinear'),
norm_cfg=dict(type='BN'),
activation='LeakyReLU',
):
super(GhostPAN, self).__init__()
assert num_extra_level >= 0
assert num_blocks >= 1
self.in_channels = in_channels
self.out_channels = out_channels

conv = DepthwiseConvModule if use_depthwise else ConvModule

# build top-down blocks
self.upsample = nn.Upsample(**upsample_cfg)
self.reduce_layers = nn.ModuleList()
for idx in range(len(in_channels)):
self.reduce_layers.append(
ConvModule(
in_channels[idx],
out_channels,
1,
norm_cfg=norm_cfg,
activation=activation,
))
self.top_down_blocks = nn.ModuleList()
for idx in range(len(in_channels) - 1, 0, -1):
self.top_down_blocks.append(
GhostBlocks(
out_channels * 2,
out_channels,
expand,
kernel_size=kernel_size,
num_blocks=num_blocks,
use_res=use_res,
activation=activation,
))

# build bottom-up blocks
self.downsamples = nn.ModuleList()
self.bottom_up_blocks = nn.ModuleList()
for idx in range(len(in_channels) - 1):
self.downsamples.append(
conv(
out_channels,
out_channels,
kernel_size,
stride=2,
padding=kernel_size // 2,
norm_cfg=norm_cfg,
activation=activation,
))
self.bottom_up_blocks.append(
GhostBlocks(
out_channels * 2,
out_channels,
expand,
kernel_size=kernel_size,
num_blocks=num_blocks,
use_res=use_res,
activation=activation,
))

# extra layers
self.extra_lvl_in_conv = nn.ModuleList()
self.extra_lvl_out_conv = nn.ModuleList()
for i in range(num_extra_level):
self.extra_lvl_in_conv.append(
conv(
out_channels,
out_channels,
kernel_size,
stride=2,
padding=kernel_size // 2,
norm_cfg=norm_cfg,
activation=activation,
))
self.extra_lvl_out_conv.append(
conv(
out_channels,
out_channels,
kernel_size,
stride=2,
padding=kernel_size // 2,
norm_cfg=norm_cfg,
activation=activation,
))

def forward(self, inputs):
"""
Args:
inputs (tuple[Tensor]): input features.
Returns:
tuple[Tensor]: multi level features.
"""
assert len(inputs) == len(self.in_channels)
inputs = [
reduce(input_x)
for input_x, reduce in zip(inputs, self.reduce_layers)
]
# top-down path
inner_outs = [inputs[-1]]
for idx in range(len(self.in_channels) - 1, 0, -1):
feat_heigh = inner_outs[0]
feat_low = inputs[idx - 1]

inner_outs[0] = feat_heigh

upsample_feat = self.upsample(feat_heigh)

inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
torch.cat([upsample_feat, feat_low], 1))
inner_outs.insert(0, inner_out)

# bottom-up path
outs = [inner_outs[0]]
for idx in range(len(self.in_channels) - 1):
feat_low = outs[-1]
feat_height = inner_outs[idx + 1]
downsample_feat = self.downsamples[idx](feat_low)
out = self.bottom_up_blocks[idx](
torch.cat([downsample_feat, feat_height], 1))
outs.append(out)

# extra layers
for extra_in_layer, extra_out_layer in zip(self.extra_lvl_in_conv,
self.extra_lvl_out_conv):
outs.append(extra_in_layer(inputs[-1]) + extra_out_layer(outs[-1]))

return tuple(outs)
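
A quick shape walk-through of the neck above (a sketch, not part of the change); the channel list and the 320x320-derived feature sizes simply mirror the configuration used by OneStageDetector later in this PR.

import torch
from modelscope.models.cv.face_human_hand_detection.ghost_pan import GhostPAN

neck = GhostPAN(
    in_channels=[116, 232, 464],
    out_channels=96,
    use_depthwise=True,
    num_extra_level=1)
feats = (torch.randn(1, 116, 40, 40), torch.randn(1, 232, 20, 20),
         torch.randn(1, 464, 10, 10))
outs = neck(feats)
# Four 96-channel levels: strides 8/16/32 from the PAN plus one extra stride-64 level.
print([tuple(o.shape) for o in outs])
# [(1, 96, 40, 40), (1, 96, 20, 20), (1, 96, 10, 10), (1, 96, 5, 5)]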

+ 427
- 0
modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py View File

@@ -0,0 +1,427 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import math

import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.ops import nms

from .utils import ConvModule, DepthwiseConvModule


class Integral(nn.Module):
"""A fixed layer for calculating integral result from distribution.
This layer calculates the target location by :math:`sum{P(y_i) * y_i}`, where
P(y_i) denotes the softmax vector that represents the discrete distribution and
y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}.
Args:
reg_max (int): The maximal value of the discrete set. Default: 16. You
may want to reset it according to your new dataset or related
settings.
"""

def __init__(self, reg_max=16):
super(Integral, self).__init__()
self.reg_max = reg_max
self.register_buffer('project',
torch.linspace(0, self.reg_max, self.reg_max + 1))

def forward(self, x):
"""Forward feature from the regression head to get integral result of
bounding box location.
Args:
x (Tensor): Features of the regression head, shape (N, 4*(n+1)),
n is self.reg_max.
Returns:
x (Tensor): Integral result of box locations, i.e., distance
offsets from the box center in four directions, shape (N, 4).
"""
shape = x.size()
x = F.softmax(x.reshape(*shape[:-1], 4, self.reg_max + 1), dim=-1)
x = F.linear(x, self.project.type_as(x)).reshape(*shape[:-1], 4)
return x


def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False):
"""Performs non-maximum suppression in a batched fashion.
Modified from https://github.com/pytorch/vision/blob
/505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
In order to perform NMS independently per class, we add an offset to all
the boxes. The offset is dependent only on the class idx, and is large
enough so that boxes from different classes do not overlap.
Arguments:
boxes (torch.Tensor): boxes in shape (N, 4).
scores (torch.Tensor): scores in shape (N, ).
idxs (torch.Tensor): each index value corresponds to a bbox cluster,
and NMS will not be applied between elements of different idxs,
shape (N, ).
nms_cfg (dict): specify nms type and other parameters like iou_thr.
Possible keys include the following.
- iou_threshold (float): IoU threshold used for NMS.
- split_thr (float): threshold number of boxes. In some cases the
number of boxes is large (e.g., 200k). To avoid OOM during
training, the users could set `split_thr` to a small value.
If the number of boxes is greater than the threshold, it will
perform NMS on each group of boxes separately and sequentially.
Defaults to 10000.
class_agnostic (bool): if true, nms is class agnostic,
i.e. IoU thresholding happens over all boxes,
regardless of the predicted class.
Returns:
tuple: kept dets and indices.
"""
nms_cfg_ = nms_cfg.copy()
class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic)
if class_agnostic:
boxes_for_nms = boxes
else:
max_coordinate = boxes.max()
offsets = idxs.to(boxes) * (max_coordinate + 1)
boxes_for_nms = boxes + offsets[:, None]
nms_cfg_.pop('type', 'nms')
split_thr = nms_cfg_.pop('split_thr', 10000)
if len(boxes_for_nms) < split_thr:
keep = nms(boxes_for_nms, scores, **nms_cfg_)
boxes = boxes[keep]
scores = scores[keep]
else:
total_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
for cls_id in torch.unique(idxs):
mask = (idxs == cls_id).nonzero(as_tuple=False).view(-1)
keep = nms(boxes_for_nms[mask], scores[mask], **nms_cfg_)
total_mask[mask[keep]] = True

keep = total_mask.nonzero(as_tuple=False).view(-1)
keep = keep[scores[keep].argsort(descending=True)]
boxes = boxes[keep]
scores = scores[keep]

return torch.cat([boxes, scores[:, None]], -1), keep


def multiclass_nms(multi_bboxes,
multi_scores,
score_thr,
nms_cfg,
max_num=-1,
score_factors=None):
"""NMS for multi-class bboxes.

Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_scores (Tensor): shape (n, #class), where the last column
contains scores of the background class, but this will be ignored.
score_thr (float): bbox threshold, bboxes with scores lower than it
will not be considered.
nms_cfg (dict): NMS config, e.g. dict(type='nms', iou_threshold=0.6)
max_num (int): if there are more than max_num bboxes after NMS,
only top max_num will be kept.
score_factors (Tensor): The factors multiplied to scores before
applying NMS

Returns:
tuple: (bboxes, labels), tensors of shape (k, 5) and (k,). Labels \
are 0-based.
"""
num_classes = multi_scores.size(1) - 1
if multi_bboxes.shape[1] > 4:
bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
else:
bboxes = multi_bboxes[:, None].expand(
multi_scores.size(0), num_classes, 4)
scores = multi_scores[:, :-1]

valid_mask = scores > score_thr

bboxes = torch.masked_select(
bboxes,
torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
-1)).view(-1, 4)
if score_factors is not None:
scores = scores * score_factors[:, None]
scores = torch.masked_select(scores, valid_mask)
labels = valid_mask.nonzero(as_tuple=False)[:, 1]

if bboxes.numel() == 0:
bboxes = multi_bboxes.new_zeros((0, 5))
labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)

if torch.onnx.is_in_onnx_export():
raise RuntimeError('[ONNX Error] Can not record NMS '
'as it has not been executed this time')
return bboxes, labels

dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)

if max_num > 0:
dets = dets[:max_num]
keep = keep[:max_num]

return dets, labels[keep]


def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.

Args:
points (Tensor): Shape (n, 2), [x, y].
distance (Tensor): Distance from the given point to 4
boundaries (left, top, right, bottom).
max_shape (tuple): Shape of the image.

Returns:
Tensor: Decoded bboxes.
"""
x1 = points[..., 0] - distance[..., 0]
y1 = points[..., 1] - distance[..., 1]
x2 = points[..., 0] + distance[..., 2]
y2 = points[..., 1] + distance[..., 3]
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1])
y1 = y1.clamp(min=0, max=max_shape[0])
x2 = x2.clamp(min=0, max=max_shape[1])
y2 = y2.clamp(min=0, max=max_shape[0])
return torch.stack([x1, y1, x2, y2], -1)


def warp_boxes(boxes, M, width, height):
n = len(boxes)
if n:
xy = np.ones((n * 4, 3))
xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)
xy = xy @ M.T
xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
xy = np.concatenate(
(x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
return xy.astype(np.float32)
else:
return boxes


class NanoDetPlusHead(nn.Module):
"""Detection head used in NanoDet-Plus.

Args:
num_classes (int): Number of categories excluding the background
category.
input_channel (int): Number of channels of the input feature.
feat_channels (int): Number of channels of the feature.
Default: 96.
stacked_convs (int): Number of conv layers in the stacked convs.
Default: 2.
kernel_size (int): Size of the convolving kernel. Default: 5.
strides (list[int]): Strides of input multi-level feature maps.
Default: [8, 16, 32].
conv_type (str): Type of the convolution.
Default: "DWConv".
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN').
reg_max (int): The maximal value of the discrete set. Default: 7.
activation (str): Type of activation function. Default: "LeakyReLU".
assigner_cfg (dict): Config dict of the assigner. Default: dict(topk=13).
"""

def __init__(self,
num_classes,
input_channel,
feat_channels=96,
stacked_convs=2,
kernel_size=5,
strides=[8, 16, 32],
conv_type='DWConv',
norm_cfg=dict(type='BN'),
reg_max=7,
activation='LeakyReLU',
assigner_cfg=dict(topk=13),
**kwargs):
super(NanoDetPlusHead, self).__init__()
self.num_classes = num_classes
self.in_channels = input_channel
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.kernel_size = kernel_size
self.strides = strides
self.reg_max = reg_max
self.activation = activation
self.ConvModule = ConvModule if conv_type == 'Conv' else DepthwiseConvModule

self.norm_cfg = norm_cfg
self.distribution_project = Integral(self.reg_max)

self._init_layers()

def _init_layers(self):
self.cls_convs = nn.ModuleList()
for _ in self.strides:
cls_convs = self._build_not_shared_head()
self.cls_convs.append(cls_convs)

self.gfl_cls = nn.ModuleList([
nn.Conv2d(
self.feat_channels,
self.num_classes + 4 * (self.reg_max + 1),
1,
padding=0,
) for _ in self.strides
])

def _build_not_shared_head(self):
cls_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
cls_convs.append(
self.ConvModule(
chn,
self.feat_channels,
self.kernel_size,
stride=1,
padding=self.kernel_size // 2,
norm_cfg=self.norm_cfg,
bias=self.norm_cfg is None,
activation=self.activation,
))
return cls_convs

def forward(self, feats):
if torch.onnx.is_in_onnx_export():
return self._forward_onnx(feats)
outputs = []
for feat, cls_convs, gfl_cls in zip(
feats,
self.cls_convs,
self.gfl_cls,
):
for conv in cls_convs:
feat = conv(feat)
output = gfl_cls(feat)
outputs.append(output.flatten(start_dim=2))
outputs = torch.cat(outputs, dim=2).permute(0, 2, 1)
return outputs

def post_process(self, preds, meta):
"""Prediction results post processing. Decode bboxes and rescale
to original image size.
Args:
preds (Tensor): Prediction output.
meta (dict): Meta info.
"""
cls_scores, bbox_preds = preds.split(
[self.num_classes, 4 * (self.reg_max + 1)], dim=-1)
result_list = self.get_bboxes(cls_scores, bbox_preds, meta)
det_results = {}
warp_matrixes = (
meta['warp_matrix']
if isinstance(meta['warp_matrix'], list) else meta['warp_matrix'])
img_heights = (
meta['img_info']['height'].cpu().numpy() if isinstance(
meta['img_info']['height'], torch.Tensor) else
meta['img_info']['height'])
img_widths = (
meta['img_info']['width'].cpu().numpy() if isinstance(
meta['img_info']['width'], torch.Tensor) else
meta['img_info']['width'])
img_ids = (
meta['img_info']['id'].cpu().numpy() if isinstance(
meta['img_info']['id'], torch.Tensor) else
meta['img_info']['id'])

for result, img_width, img_height, img_id, warp_matrix in zip(
result_list, img_widths, img_heights, img_ids, warp_matrixes):
det_result = {}
det_bboxes, det_labels = result
det_bboxes = det_bboxes.detach().cpu().numpy()
det_bboxes[:, :4] = warp_boxes(det_bboxes[:, :4],
np.linalg.inv(warp_matrix),
img_width, img_height)
classes = det_labels.detach().cpu().numpy()
for i in range(self.num_classes):
inds = classes == i
det_result[i] = np.concatenate(
[
det_bboxes[inds, :4].astype(np.float32),
det_bboxes[inds, 4:5].astype(np.float32),
],
axis=1,
).tolist()
det_results[img_id] = det_result
return det_results

def get_bboxes(self, cls_preds, reg_preds, img_metas):
"""Decode the outputs to bboxes.
Args:
cls_preds (Tensor): Shape (num_imgs, num_points, num_classes).
reg_preds (Tensor): Shape (num_imgs, num_points, 4 * (regmax + 1)).
img_metas (dict): Dict of image info.

Returns:
results_list (list[tuple]): List of detection bboxes and labels.
"""
device = cls_preds.device
b = cls_preds.shape[0]
input_height, input_width = img_metas['img'].shape[2:]
input_shape = (input_height, input_width)

featmap_sizes = [(math.ceil(input_height / stride),
math.ceil(input_width / stride))
for stride in self.strides]
mlvl_center_priors = [
self.get_single_level_center_priors(
b,
featmap_sizes[i],
stride,
dtype=torch.float32,
device=device,
) for i, stride in enumerate(self.strides)
]
center_priors = torch.cat(mlvl_center_priors, dim=1)
dis_preds = self.distribution_project(reg_preds) * center_priors[...,
2,
None]
bboxes = distance2bbox(
center_priors[..., :2], dis_preds, max_shape=input_shape)
scores = cls_preds.sigmoid()
result_list = []
for i in range(b):
score, bbox = scores[i], bboxes[i]
padding = score.new_zeros(score.shape[0], 1)
score = torch.cat([score, padding], dim=1)
results = multiclass_nms(
bbox,
score,
score_thr=0.05,
nms_cfg=dict(type='nms', iou_threshold=0.6),
max_num=100,
)
result_list.append(results)
return result_list

def get_single_level_center_priors(self, batch_size, featmap_size, stride,
dtype, device):
"""Generate centers of a single stage feature map.
Args:
batch_size (int): Number of images in one batch.
featmap_size (tuple[int]): height and width of the feature map
stride (int): down sample stride of the feature map
dtype (obj:`torch.dtype`): data type of the tensors
device (obj:`torch.device`): device of the tensors
Return:
priors (Tensor): center priors of a single level feature map.
"""
h, w = featmap_size
x_range = (torch.arange(w, dtype=dtype, device=device)) * stride
y_range = (torch.arange(h, dtype=dtype, device=device)) * stride
y, x = torch.meshgrid(y_range, x_range)
y = y.flatten()
x = x.flatten()
strides = x.new_full((x.shape[0], ), stride)
priors = torch.stack([x, y, strides, strides], dim=-1)
return priors.unsqueeze(0).repeat(batch_size, 1, 1)
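
The decoding performed in get_bboxes is easiest to see on a toy input; the numbers below are made up purely for illustration.

import torch
from modelscope.models.cv.face_human_hand_detection.nanodet_plus_head import distance2bbox

# Two prior centers and their predicted (left, top, right, bottom) distances.
points = torch.tensor([[8.0, 8.0], [16.0, 8.0]])
distance = torch.tensor([[4.0, 4.0, 4.0, 4.0], [2.0, 3.0, 5.0, 6.0]])
print(distance2bbox(points, distance, max_shape=(320, 320)))
# tensor([[ 4.,  4., 12., 12.],
#         [14.,  5., 21., 14.]])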

+ 64
- 0
modelscope/models/cv/face_human_hand_detection/one_stage_detector.py View File

@@ -0,0 +1,64 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import torch
import torch.nn as nn

from .ghost_pan import GhostPAN
from .nanodet_plus_head import NanoDetPlusHead
from .shufflenetv2 import ShuffleNetV2


class OneStageDetector(nn.Module):

def __init__(self):
super(OneStageDetector, self).__init__()
self.backbone = ShuffleNetV2(
model_size='1.0x',
out_stages=(2, 3, 4),
with_last_conv=False,
kernel_size=3,
activation='LeakyReLU',
pretrain=False)
self.fpn = GhostPAN(
in_channels=[116, 232, 464],
out_channels=96,
use_depthwise=True,
kernel_size=5,
expand=1,
num_blocks=1,
use_res=False,
num_extra_level=1,
upsample_cfg=dict(scale_factor=2, mode='bilinear'),
norm_cfg=dict(type='BN'),
activation='LeakyReLU')
self.head = NanoDetPlusHead(
num_classes=3,
input_channel=96,
feat_channels=96,
stacked_convs=2,
kernel_size=5,
strides=[8, 16, 32, 64],
conv_type='DWConv',
norm_cfg=dict(type='BN'),
reg_max=7,
activation='LeakyReLU',
assigner_cfg=dict(topk=13))
self.epoch = 0

def forward(self, x):
x = self.backbone(x)
if hasattr(self, 'fpn'):
x = self.fpn(x)
if hasattr(self, 'head'):
x = self.head(x)
return x

def inference(self, meta):
    with torch.no_grad():
        # Synchronize only when CUDA is in use; calling it on a CPU-only setup fails.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        preds = self(meta['img'])
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        results = self.head.post_process(preds, meta)
    return results
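
A dummy forward pass (sketch only, random weights) shows the shape contract between the backbone, neck and head configured above.

import torch
from modelscope.models.cv.face_human_hand_detection.one_stage_detector import OneStageDetector

detector = OneStageDetector().eval()
with torch.no_grad():
    preds = detector(torch.randn(1, 3, 320, 320))
# 2125 priors = 40*40 + 20*20 + 10*10 + 5*5; 35 = 3 classes + 4 * (reg_max + 1).
print(preds.shape)  # torch.Size([1, 2125, 35])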

+ 182
- 0
modelscope/models/cv/face_human_hand_detection/shufflenetv2.py View File

@@ -0,0 +1,182 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import torch
import torch.nn as nn

from .utils import act_layers


def channel_shuffle(x, groups):
batchsize, num_channels, height, width = x.data.size()
channels_per_group = num_channels // groups

x = x.view(batchsize, groups, channels_per_group, height, width)

x = torch.transpose(x, 1, 2).contiguous()

x = x.view(batchsize, -1, height, width)

return x


class ShuffleV2Block(nn.Module):

def __init__(self, inp, oup, stride, activation='ReLU'):
super(ShuffleV2Block, self).__init__()

if not (1 <= stride <= 3):
raise ValueError('illegal stride value')
self.stride = stride

branch_features = oup // 2
assert (self.stride != 1) or (inp == branch_features << 1)

if self.stride > 1:
self.branch1 = nn.Sequential(
self.depthwise_conv(
inp, inp, kernel_size=3, stride=self.stride, padding=1),
nn.BatchNorm2d(inp),
nn.Conv2d(
inp,
branch_features,
kernel_size=1,
stride=1,
padding=0,
bias=False),
nn.BatchNorm2d(branch_features),
act_layers(activation),
)
else:
self.branch1 = nn.Sequential()

self.branch2 = nn.Sequential(
nn.Conv2d(
inp if (self.stride > 1) else branch_features,
branch_features,
kernel_size=1,
stride=1,
padding=0,
bias=False,
),
nn.BatchNorm2d(branch_features),
act_layers(activation),
self.depthwise_conv(
branch_features,
branch_features,
kernel_size=3,
stride=self.stride,
padding=1,
),
nn.BatchNorm2d(branch_features),
nn.Conv2d(
branch_features,
branch_features,
kernel_size=1,
stride=1,
padding=0,
bias=False,
),
nn.BatchNorm2d(branch_features),
act_layers(activation),
)

@staticmethod
def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False):
return nn.Conv2d(
i, o, kernel_size, stride, padding, bias=bias, groups=i)

def forward(self, x):
if self.stride == 1:
x1, x2 = x.chunk(2, dim=1)
out = torch.cat((x1, self.branch2(x2)), dim=1)
else:
out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)

out = channel_shuffle(out, 2)

return out


class ShuffleNetV2(nn.Module):

def __init__(
self,
model_size='1.5x',
out_stages=(2, 3, 4),
with_last_conv=False,
kernel_size=3,
activation='ReLU',
pretrain=True,
):
super(ShuffleNetV2, self).__init__()
assert set(out_stages).issubset((2, 3, 4))

print('model size is ', model_size)

self.stage_repeats = [4, 8, 4]
self.model_size = model_size
self.out_stages = out_stages
self.with_last_conv = with_last_conv
self.kernel_size = kernel_size
self.activation = activation
if model_size == '0.5x':
self._stage_out_channels = [24, 48, 96, 192, 1024]
elif model_size == '1.0x':
self._stage_out_channels = [24, 116, 232, 464, 1024]
elif model_size == '1.5x':
self._stage_out_channels = [24, 176, 352, 704, 1024]
elif model_size == '2.0x':
self._stage_out_channels = [24, 244, 488, 976, 2048]
else:
raise NotImplementedError

# building first layer
input_channels = 3
output_channels = self._stage_out_channels[0]
self.conv1 = nn.Sequential(
nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False),
nn.BatchNorm2d(output_channels),
act_layers(activation),
)
input_channels = output_channels

self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

stage_names = ['stage{}'.format(i) for i in [2, 3, 4]]
for name, repeats, output_channels in zip(
stage_names, self.stage_repeats, self._stage_out_channels[1:]):
seq = [
ShuffleV2Block(
input_channels, output_channels, 2, activation=activation)
]
for i in range(repeats - 1):
seq.append(
ShuffleV2Block(
output_channels,
output_channels,
1,
activation=activation))
setattr(self, name, nn.Sequential(*seq))
input_channels = output_channels
output_channels = self._stage_out_channels[-1]
if self.with_last_conv:
conv5 = nn.Sequential(
nn.Conv2d(
input_channels, output_channels, 1, 1, 0, bias=False),
nn.BatchNorm2d(output_channels),
act_layers(activation),
)
self.stage4.add_module('conv5', conv5)

def forward(self, x):
x = self.conv1(x)
x = self.maxpool(x)
output = []

for i in range(2, 5):
stage = getattr(self, 'stage{}'.format(i))
x = stage(x)
if i in self.out_stages:
output.append(x)
return tuple(output)
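
As a rough sanity check (not part of the diff), the backbone returns the three feature maps consumed by GhostPAN above.

import torch
from modelscope.models.cv.face_human_hand_detection.shufflenetv2 import ShuffleNetV2

backbone = ShuffleNetV2(model_size='1.0x', out_stages=(2, 3, 4), pretrain=False).eval()
with torch.no_grad():
    c3, c4, c5 = backbone(torch.randn(1, 3, 320, 320))
# Strides 8/16/32 with 116/232/464 channels for the '1.0x' width.
print(c3.shape, c4.shape, c5.shape)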

+ 277
- 0
modelscope/models/cv/face_human_hand_detection/utils.py View File

@@ -0,0 +1,277 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import warnings

import torch
import torch.nn as nn

activations = {
'ReLU': nn.ReLU,
'LeakyReLU': nn.LeakyReLU,
'ReLU6': nn.ReLU6,
'SELU': nn.SELU,
'ELU': nn.ELU,
'GELU': nn.GELU,
'PReLU': nn.PReLU,
'SiLU': nn.SiLU,
'HardSwish': nn.Hardswish,
'Hardswish': nn.Hardswish,
None: nn.Identity,
}


def act_layers(name):
assert name in activations.keys()
if name == 'LeakyReLU':
return nn.LeakyReLU(negative_slope=0.1, inplace=True)
elif name == 'GELU':
return nn.GELU()
elif name == 'PReLU':
return nn.PReLU()
else:
return activations[name](inplace=True)


norm_cfg = {
'BN': ('bn', nn.BatchNorm2d),
'SyncBN': ('bn', nn.SyncBatchNorm),
'GN': ('gn', nn.GroupNorm),
}


def build_norm_layer(cfg, num_features, postfix=''):
"""Build normalization layer

Args:
cfg (dict): cfg should contain:
type (str): identify norm layer type.
layer args: args needed to instantiate a norm layer.
requires_grad (bool): [optional] whether to stop gradient updates
num_features (int): number of channels from input.
postfix (int, str): appended into norm abbreviation to
create named layer.

Returns:
name (str): abbreviation + postfix
layer (nn.Module): created norm layer
"""
assert isinstance(cfg, dict) and 'type' in cfg
cfg_ = cfg.copy()

layer_type = cfg_.pop('type')
if layer_type not in norm_cfg:
raise KeyError('Unrecognized norm type {}'.format(layer_type))
else:
abbr, norm_layer = norm_cfg[layer_type]
if norm_layer is None:
raise NotImplementedError

assert isinstance(postfix, (int, str))
name = abbr + str(postfix)

requires_grad = cfg_.pop('requires_grad', True)
cfg_.setdefault('eps', 1e-5)
if layer_type != 'GN':
layer = norm_layer(num_features, **cfg_)
if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
layer._specify_ddp_gpu_num(1)
else:
assert 'num_groups' in cfg_
layer = norm_layer(num_channels=num_features, **cfg_)

for param in layer.parameters():
param.requires_grad = requires_grad

return name, layer


class ConvModule(nn.Module):
"""A conv block that contains conv/norm/activation layers.

Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
conv_cfg (dict): Config dict for convolution layer.
norm_cfg (dict): Config dict for normalization layer.
activation (str): activation layer, "ReLU" by default.
inplace (bool): Whether to use inplace mode for activation.
order (tuple[str]): The order of conv/norm/activation layers. It is a
sequence of "conv", "norm" and "act". Examples are
("conv", "norm", "act") and ("act", "conv", "norm").
"""

def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias='auto',
conv_cfg=None,
norm_cfg=None,
activation='ReLU',
inplace=True,
order=('conv', 'norm', 'act'),
):
super(ConvModule, self).__init__()
assert conv_cfg is None or isinstance(conv_cfg, dict)
assert norm_cfg is None or isinstance(norm_cfg, dict)
assert activation is None or isinstance(activation, str)
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.activation = activation
self.inplace = inplace
self.order = order
assert isinstance(self.order, tuple) and len(self.order) == 3
assert set(order) == {'conv', 'norm', 'act'}

self.with_norm = norm_cfg is not None
if bias == 'auto':
bias = False if self.with_norm else True
self.with_bias = bias

if self.with_norm and self.with_bias:
warnings.warn('ConvModule has norm and bias at the same time')

self.conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
self.in_channels = self.conv.in_channels
self.out_channels = self.conv.out_channels
self.kernel_size = self.conv.kernel_size
self.stride = self.conv.stride
self.padding = self.conv.padding
self.dilation = self.conv.dilation
self.transposed = self.conv.transposed
self.output_padding = self.conv.output_padding
self.groups = self.conv.groups

if self.with_norm:
if order.index('norm') > order.index('conv'):
norm_channels = out_channels
else:
norm_channels = in_channels
self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
self.add_module(self.norm_name, norm)
else:
self.norm_name = None

if self.activation:
self.act = act_layers(self.activation)

@property
def norm(self):
if self.norm_name:
return getattr(self, self.norm_name)
else:
return None

def forward(self, x, norm=True):
for layer in self.order:
if layer == 'conv':
x = self.conv(x)
elif layer == 'norm' and norm and self.with_norm:
x = self.norm(x)
elif layer == 'act' and self.activation:
x = self.act(x)
return x


class DepthwiseConvModule(nn.Module):

def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
bias='auto',
norm_cfg=dict(type='BN'),
activation='ReLU',
inplace=True,
order=('depthwise', 'dwnorm', 'act', 'pointwise', 'pwnorm', 'act'),
):
super(DepthwiseConvModule, self).__init__()
assert activation is None or isinstance(activation, str)
self.activation = activation
self.inplace = inplace
self.order = order
assert isinstance(self.order, tuple) and len(self.order) == 6
assert set(order) == {
'depthwise',
'dwnorm',
'act',
'pointwise',
'pwnorm',
'act',
}

self.with_norm = norm_cfg is not None
if bias == 'auto':
bias = False if self.with_norm else True
self.with_bias = bias

if self.with_norm and self.with_bias:
warnings.warn('DepthwiseConvModule has norm and bias at the same time')

self.depthwise = nn.Conv2d(
in_channels,
in_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=in_channels,
bias=bias,
)
self.pointwise = nn.Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
bias=bias)

self.in_channels = self.depthwise.in_channels
self.out_channels = self.pointwise.out_channels
self.kernel_size = self.depthwise.kernel_size
self.stride = self.depthwise.stride
self.padding = self.depthwise.padding
self.dilation = self.depthwise.dilation
self.transposed = self.depthwise.transposed
self.output_padding = self.depthwise.output_padding

if self.with_norm:
_, self.dwnorm = build_norm_layer(norm_cfg, in_channels)
_, self.pwnorm = build_norm_layer(norm_cfg, out_channels)

if self.activation:
self.act = act_layers(self.activation)

def forward(self, x, norm=True):
for layer_name in self.order:
if layer_name != 'act':
layer = self.__getattr__(layer_name)
x = layer(x)
elif layer_name == 'act' and self.activation:
x = self.act(x)
return x
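
A short sketch of how these building blocks compose (illustrative channel sizes only).

import torch
from modelscope.models.cv.face_human_hand_detection.utils import (
    ConvModule, DepthwiseConvModule, act_layers)

conv = ConvModule(16, 32, 3, padding=1, norm_cfg=dict(type='BN'), activation='LeakyReLU')
dw = DepthwiseConvModule(32, 64, 5, stride=2, padding=2)
x = torch.randn(1, 16, 64, 64)
y = conv(x)  # (1, 32, 64, 64): conv -> bn -> LeakyReLU
z = dw(y)    # (1, 64, 32, 32): depthwise -> bn -> act -> pointwise -> bn -> act
print(y.shape, z.shape, act_layers('ReLU6'))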

+ 2
- 0
modelscope/models/cv/image_colorization/unet.py View File

@@ -1,3 +1,5 @@
# The implementation here is modified based on DeOldify, originally MIT License
# and publicly available at https://github.com/jantic/DeOldify/blob/master/deoldify/unet.py
import numpy as np
import torch
import torch.nn as nn


+ 2
- 0
modelscope/models/cv/image_colorization/utils.py View File

@@ -1,3 +1,5 @@
# The implementation here is modified based on DeOldify, originally MIT License and
# publicly available at https://github.com/jantic/DeOldify/blob/master/fastai/callbacks/hooks.py
import functools
from enum import Enum



+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/align_faces.py View File

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from Face-Alignment,
# publicly available at https://github.com/foamliu/Face-Alignment/blob/master/align_faces.py
import cv2
import numpy as np
from skimage import transform as trans


+ 1
- 0
modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

import cv2


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/eqface/model_resnet.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from FaceQuality, made publicly available under the MIT License
# at https://github.com/deepcam-cn/FaceQuality/blob/master/models/model_resnet.py
import torch
from torch import nn



+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/gpen.py View File

@@ -1,3 +1,5 @@
# The GPEN implementation is also open-sourced by the authors,
# and available at https://github.com/yangxy/GPEN/blob/main/face_model/gpen_model.py
import functools
import itertools
import math


+ 1
- 0
modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import math
import os.path as osp
from copy import deepcopy


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/losses/helpers.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from InsightFace_Pytorch,
# made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py
from collections import namedtuple

import torch


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/losses/losses.py View File

@@ -1,3 +1,5 @@
# The GPEN implementation is also open-sourced by the authors,
# and available at https://github.com/yangxy/GPEN/tree/main/training/loss/id_loss.py
import torch
import torch.nn as nn
import torch.nn.functional as F


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/losses/model_irse.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from InsightFace_Pytorch,
# made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py
from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear,
Module, PReLU, Sequential)



+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py View File

@@ -1,3 +1,5 @@
# The GPEN implementation is also open-sourced by the authors,
# and available at https://github.com/yangxy/GPEN/blob/main/face_detect/retinaface_detection.py
import os

import cv2


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License
# at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/net.py
import time

import torch


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License
# at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/retinaface.py
from collections import OrderedDict

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_generation/model.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_generation/models/autoencoder.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 2
- 0
modelscope/models/cv/image_to_image_generation/models/clip.py View File

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_generation/ops/diffusion.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_generation/ops/losses.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/data/transforms.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math
import random



+ 1
- 0
modelscope/models/cv/image_to_image_translation/model_translation.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/models/autoencoder.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 2
- 0
modelscope/models/cv/image_to_image_translation/models/clip.py View File

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/apps.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
# APPs that facilitate the use of pretrained neural networks.

import os.path as osp


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/degradation.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math
import os
import random


+ 3
- 0
modelscope/models/cv/image_to_image_translation/ops/diffusion.py View File

@@ -1,3 +1,6 @@
# Part of the implementation is borrowed and modified from latent-diffusion,
# publicly available at https://github.com/CompVis/latent-diffusion.
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/losses.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/metrics.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import numpy as np
import scipy.linalg as linalg
import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/random_color.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import colorsys
import random



+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/random_mask.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import cv2
import numpy as np



+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/svd.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
r"""SVD of linear degradation matrices described in the paper
``Denoising Diffusion Restoration Models.''
@article{kawar2022denoising,


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/utils.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import base64
import binascii
import hashlib


+ 12
- 9
modelscope/models/cv/movie_scene_segmentation/model.py View File

@@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel):
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

self.infer_result = {'vid': [], 'sid': [], 'pred': []}
sampling_method = self.cfg.dataset.sampling_method.name
self.neighbor_size = self.cfg.dataset.sampling_method.params[
sampling_method].neighbor_size
@@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel):
shot_num = len(sids)
cnt = shot_num // bs + 1

infer_sid, infer_pred = [], []
infer_result = {}
for i in range(cnt):
start = i * bs
end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
@@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel):
input_ = torch.stack(input_)
outputs = self.shared_step(input_) # shape [b,2]
prob = F.softmax(outputs, dim=1)
self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
self.infer_result['pred'] = np.stack(self.infer_result['pred'])
infer_sid.extend(sid_.cpu().detach().numpy())
infer_pred.extend(prob[:, 1].cpu().detach().numpy())
infer_result.update({'pred': np.stack(infer_pred)})
infer_result.update({'sid': infer_sid})

assert len(self.infer_result['sid']) == len(sids)
assert len(self.infer_result['pred']) == len(inputs)
return self.infer_result
assert len(infer_result['sid']) == len(sids)
assert len(infer_result['pred']) == len(inputs)
return infer_result

def shared_step(self, inputs):
with torch.no_grad():
@@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel):
thres = self.cfg.pipeline.save_threshold

anno_dict = get_pred_boundary(pred_dict, thres)
scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict)
scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
self.shot2keyf, anno_dict)
if self.cfg.pipeline.save_split_scene:
re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
print(f'Split scene video saved to {re_dir}')
return len(scene_list), scene_dict_lst
return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst

def preprocess(self, inputs):
logger.info('Begin shot detect......')


+ 10
- 2
modelscope/models/cv/movie_scene_segmentation/utils/save_op.py View File

@@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict):
scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)

scene_dict_lst = []
shot_num = len(shot2keyf)
shot_dict_lst = []
for item in shot2keyf:
tmp = item.split(' ')
shot_dict_lst.append({
'frame': [tmp[0], tmp[1]],
'timestamps': [tmp[-2], tmp[-1]]
})
assert len(scene_list) == len(pair_list)
for scene_ind, scene_item in enumerate(scene_list):
scene_dict_lst.append({
'shot': pair_list[scene_ind],
'frame': scene_item[0],
'timestamp': scene_item[1]
'timestamps': scene_item[1]
})

return scene_dict_lst, scene_list
return scene_dict_lst, scene_list, shot_num, shot_dict_lst


def scene2video(source_movie_fn, scene_list, thres):


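To make the new return values concrete: each shot2keyf entry is assumed here to be a space-separated record whose first two fields are frame indices and last two are timestamps (inferred from the split(' ') indexing above), so a per-shot entry would look like this.

item = '0 119 keyframe_0000.jpg 0.000 4.963'  # hypothetical shot2keyf record
tmp = item.split(' ')
shot_entry = {'frame': [tmp[0], tmp[1]], 'timestamps': [tmp[-2], tmp[-1]]}
print(shot_entry)  # {'frame': ['0', '119'], 'timestamps': ['0.000', '4.963']}
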
+ 20
- 0
modelscope/models/cv/product_segmentation/__init__.py View File

@@ -0,0 +1,20 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .seg_infer import F3NetForProductSegmentation

else:
_import_structure = {'seg_infer': ['F3NetForProductSegmentation']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 197
- 0
modelscope/models/cv/product_segmentation/net.py View File

@@ -0,0 +1,197 @@
# The implementation here is modified based on F3Net,
# originally Apache 2.0 License and publicly available at https://github.com/weijun88/F3Net

import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):

def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
dilation=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(
planes,
planes,
kernel_size=3,
stride=stride,
padding=(3 * dilation - 1) // 2,
bias=False,
dilation=dilation)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.downsample = downsample

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)), inplace=True)
out = F.relu(self.bn2(self.conv2(out)), inplace=True)
out = self.bn3(self.conv3(out))
if self.downsample is not None:
x = self.downsample(x)
return F.relu(out + x, inplace=True)


class ResNet(nn.Module):

def __init__(self):
super(ResNet, self).__init__()
self.inplanes = 64
self.conv1 = nn.Conv2d(
3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.layer1 = self.make_layer(64, 3, stride=1, dilation=1)
self.layer2 = self.make_layer(128, 4, stride=2, dilation=1)
self.layer3 = self.make_layer(256, 6, stride=2, dilation=1)
self.layer4 = self.make_layer(512, 3, stride=2, dilation=1)

def make_layer(self, planes, blocks, stride, dilation):
downsample = nn.Sequential(
nn.Conv2d(
self.inplanes,
planes * 4,
kernel_size=1,
stride=stride,
bias=False), nn.BatchNorm2d(planes * 4))
layers = [
Bottleneck(
self.inplanes, planes, stride, downsample, dilation=dilation)
]
self.inplanes = planes * 4
for _ in range(1, blocks):
layers.append(Bottleneck(self.inplanes, planes, dilation=dilation))
return nn.Sequential(*layers)

def forward(self, x):
x = x.reshape(1, 3, 448, 448)
out1 = F.relu(self.bn1(self.conv1(x)), inplace=True)
out1 = F.max_pool2d(out1, kernel_size=3, stride=2, padding=1)
out2 = self.layer1(out1)
out3 = self.layer2(out2)
out4 = self.layer3(out3)
out5 = self.layer4(out4)
return out2, out3, out4, out5


class CFM(nn.Module):

def __init__(self):
super(CFM, self).__init__()
self.conv1h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn1h = nn.BatchNorm2d(64)
self.conv2h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn2h = nn.BatchNorm2d(64)
self.conv3h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn3h = nn.BatchNorm2d(64)
self.conv4h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn4h = nn.BatchNorm2d(64)

self.conv1v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn1v = nn.BatchNorm2d(64)
self.conv2v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn2v = nn.BatchNorm2d(64)
self.conv3v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn3v = nn.BatchNorm2d(64)
self.conv4v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn4v = nn.BatchNorm2d(64)

def forward(self, left, down):
if down.size()[2:] != left.size()[2:]:
down = F.interpolate(down, size=left.size()[2:], mode='bilinear')
out1h = F.relu(self.bn1h(self.conv1h(left)), inplace=True)
out2h = F.relu(self.bn2h(self.conv2h(out1h)), inplace=True)
out1v = F.relu(self.bn1v(self.conv1v(down)), inplace=True)
out2v = F.relu(self.bn2v(self.conv2v(out1v)), inplace=True)
fuse = out2h * out2v
out3h = F.relu(self.bn3h(self.conv3h(fuse)), inplace=True) + out1h
out4h = F.relu(self.bn4h(self.conv4h(out3h)), inplace=True)
out3v = F.relu(self.bn3v(self.conv3v(fuse)), inplace=True) + out1v
out4v = F.relu(self.bn4v(self.conv4v(out3v)), inplace=True)
return out4h, out4v


class Decoder(nn.Module):

def __init__(self):
super(Decoder, self).__init__()
self.cfm45 = CFM()
self.cfm34 = CFM()
self.cfm23 = CFM()

def forward(self, out2h, out3h, out4h, out5v, fback=None):
if fback is not None:
refine5 = F.interpolate(
fback, size=out5v.size()[2:], mode='bilinear')
refine4 = F.interpolate(
fback, size=out4h.size()[2:], mode='bilinear')
refine3 = F.interpolate(
fback, size=out3h.size()[2:], mode='bilinear')
refine2 = F.interpolate(
fback, size=out2h.size()[2:], mode='bilinear')
out5v = out5v + refine5
out4h, out4v = self.cfm45(out4h + refine4, out5v)
out3h, out3v = self.cfm34(out3h + refine3, out4v)
out2h, pred = self.cfm23(out2h + refine2, out3v)
else:
out4h, out4v = self.cfm45(out4h, out5v)
out3h, out3v = self.cfm34(out3h, out4v)
out2h, pred = self.cfm23(out2h, out3v)
return out2h, out3h, out4h, out5v, pred


class F3Net(nn.Module):

def __init__(self):
super(F3Net, self).__init__()
self.bkbone = ResNet()
self.squeeze5 = nn.Sequential(
nn.Conv2d(2048, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
self.squeeze4 = nn.Sequential(
nn.Conv2d(1024, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
self.squeeze3 = nn.Sequential(
nn.Conv2d(512, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
self.squeeze2 = nn.Sequential(
nn.Conv2d(256, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))

self.decoder1 = Decoder()
self.decoder2 = Decoder()
self.linearp1 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
self.linearp2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)

self.linearr2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
self.linearr3 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
self.linearr4 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
self.linearr5 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)

def forward(self, x, shape=None):
x = x.reshape(1, 3, 448, 448)
out2h, out3h, out4h, out5v = self.bkbone(x)
out2h, out3h, out4h, out5v = self.squeeze2(out2h), self.squeeze3(
out3h), self.squeeze4(out4h), self.squeeze5(out5v)
out2h, out3h, out4h, out5v, pred1 = self.decoder1(
out2h, out3h, out4h, out5v)
out2h, out3h, out4h, out5v, pred2 = self.decoder2(
out2h, out3h, out4h, out5v, pred1)

shape = x.size()[2:] if shape is None else shape
pred1 = F.interpolate(
self.linearp1(pred1), size=shape, mode='bilinear')
pred2 = F.interpolate(
self.linearp2(pred2), size=shape, mode='bilinear')

out2h = F.interpolate(
self.linearr2(out2h), size=shape, mode='bilinear')
out3h = F.interpolate(
self.linearr3(out3h), size=shape, mode='bilinear')
out4h = F.interpolate(
self.linearr4(out4h), size=shape, mode='bilinear')
out5h = F.interpolate(
self.linearr5(out5v), size=shape, mode='bilinear')
return pred1, pred2, out2h, out3h, out4h, out5h
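
A dummy forward pass (sketch, random weights) illustrates the output contract of the two-decoder design above.

import torch
from modelscope.models.cv.product_segmentation.net import F3Net

net = F3Net().eval()
with torch.no_grad():
    pred1, pred2, out2h, out3h, out4h, out5h = net(torch.randn(1, 3, 448, 448))
# All six outputs are single-channel maps resized back to the 448x448 input.
print(pred2.shape)  # torch.Size([1, 1, 448, 448])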

+ 77
- 0
modelscope/models/cv/product_segmentation/seg_infer.py View File

@@ -0,0 +1,77 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.

import cv2
import numpy as np
import torch
from PIL import Image

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .net import F3Net

logger = get_logger()


def load_state_dict(model_dir, device):
_dict = torch.load(
'{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
map_location=device)
state_dict = {}
for k, v in _dict.items():
if k.startswith('module'):
k = k[7:]
state_dict[k] = v
return state_dict


@MODELS.register_module(
Tasks.product_segmentation, module_name=Models.product_segmentation)
class F3NetForProductSegmentation(TorchModel):

def __init__(self, model_dir, device_id=0, *args, **kwargs):

super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)

self.model = F3Net()
if torch.cuda.is_available():
self.device = 'cuda'
logger.info('Use GPU')
else:
self.device = 'cpu'
logger.info('Use CPU')

self.params = load_state_dict(model_dir, self.device)
self.model.load_state_dict(self.params)
self.model.to(self.device)
self.model.eval()

def forward(self, x):
pred_result = self.model(x)
return pred_result


mean, std = np.array([[[124.55, 118.90,
102.94]]]), np.array([[[56.77, 55.97, 57.50]]])


def inference(model, device, input_path):
img = Image.open(input_path)
img = np.array(img.convert('RGB')).astype(np.float32)
img = (img - mean) / std
img = cv2.resize(img, dsize=(448, 448), interpolation=cv2.INTER_LINEAR)
img = torch.from_numpy(img)
img = img.permute(2, 0, 1)
img = img.to(device).float()
outputs = model(img)
out = outputs[0]
pred = (torch.sigmoid(out[0, 0]) * 255).cpu().numpy()
pred[pred < 20] = 0
pred = pred[:, :, np.newaxis]
pred = np.round(pred)
logger.info('Inference Done')
return pred
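
For context, a minimal usage sketch of the segmentation entry points above; the model directory and image path are hypothetical placeholders.

from modelscope.models.cv.product_segmentation.seg_infer import (
    F3NetForProductSegmentation, inference)

# 'path/to/model_dir' and 'product.jpg' are placeholders, not files added by this PR.
model = F3NetForProductSegmentation(model_dir='path/to/model_dir')
mask = inference(model, model.device, 'product.jpg')
# mask is a 448x448x1 float array in [0, 255] with weak responses (<20) zeroed.
print(mask.shape, float(mask.max()))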

+ 1
- 0
modelscope/models/cv/skin_retouching/detection_model/detection_module.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn



+ 1
- 0
modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F


+ 1
- 0
modelscope/models/cv/skin_retouching/inpainting_model/gconv.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn



+ 1
- 0
modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F


+ 1
- 0
modelscope/models/cv/skin_retouching/unet_deploy.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import warnings

import torch


+ 1
- 0
modelscope/models/cv/skin_retouching/utils.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import time
from typing import Dict, List, Optional, Tuple, Union



+ 1
- 0
modelscope/models/cv/skin_retouching/weights_init.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn



+ 2
- 0
modelscope/models/cv/super_resolution/arch_util.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from BasicSR, made publicly available under the Apache 2.0 License
# at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/arch_util.py
import collections.abc
import math
import warnings


+ 2
- 0
modelscope/models/cv/super_resolution/rrdbnet_arch.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from BasicSR, made publicly available under the Apache 2.0 License
# at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/rrdbnet_arch.py
import torch
from torch import nn as nn
from torch.nn import functional as F


+ 2
- 0
modelscope/models/multi_modal/clip/__init__.py View File

@@ -1 +1,3 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .model import CLIPForMultiModalEmbedding

+ 15
- 0
modelscope/models/multi_modal/clip/model.py View File

@@ -1,3 +1,18 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from collections import OrderedDict
from typing import Any, Dict, Iterable, List, Tuple, Union


+ 1
- 0
modelscope/models/multi_modal/gemm/gemm_base.py View File

@@ -543,6 +543,7 @@ class GEMMModel(nn.Module):
img_feature, text_feature, caption = None, None, None
if captioning and image is not None:
img_feature, caption = self.model.image_to_text(image)
img_feature = self.parse_feat(img_feature)
elif image is not None:
img_feature = self.parse_feat(self.model.encode_image(image))
if text is not None:


+ 7
- 4
modelscope/models/multi_modal/gemm/gemm_model.py View File

@@ -67,7 +67,7 @@ class GEMMForMultiModalEmbedding(TorchModel):
return img_tensor

def parse_text(self, text_str):
if text_str is None:
if text_str is None or len(text_str) == 0:
return None
if isinstance(text_str, str):
text_ids_tensor = self.gemm_model.tokenize(text_str)
@@ -79,9 +79,12 @@ class GEMMForMultiModalEmbedding(TorchModel):
return text_ids_tensor.view(1, -1)

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
image = self.parse_image(input.get('image', input.get('img', None)))
text = self.parse_text(input.get('text', input.get('txt', None)))
captioning = input.get('captioning', False) is True
image_input = input.get('image', input.get('img', None))
text_input = input.get('text', input.get('txt', None))
captioning_input = input.get('captioning', None)
image = self.parse_image(image_input)
text = self.parse_text(text_input)
captioning = captioning_input is True or text_input == ''
out = self.gemm_model(image, text, captioning)
output = {
OutputKeys.IMG_EMBEDDING: out.get('image_feature', None),
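
A hedged sketch of the reworked captioning switch above, assuming gemm is an already constructed GEMMForMultiModalEmbedding instance and img is an image in whatever form parse_image accepts (both names are placeholders, not part of the diff):

from modelscope.outputs import OutputKeys

out = gemm.forward({'image': img, 'captioning': True})  # explicit captioning request
out = gemm.forward({'image': img, 'text': ''})          # empty text now also triggers captioning
img_embedding = out[OutputKeys.IMG_EMBEDDING]           # None if no image was provided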


+ 1
- 1
modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py View File

@@ -1,4 +1,4 @@
# The implementation is adopated from the CLIP4Clip implementation,
# The implementation is adopted from the CLIP4Clip implementation,
# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip

import random


+ 1
- 1
modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py View File

@@ -1,4 +1,4 @@
# The implementation is adopated from the CLIP4Clip implementation,
# The implementation is adopted from the CLIP4Clip implementation,
# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip

import numpy as np


+ 1
- 1
modelscope/models/multi_modal/mmr/models/tokenization_clip.py View File

@@ -1,4 +1,4 @@
# The implementation is adopated from the CLIP4Clip implementation,
# The implementation is adopted from the CLIP4Clip implementation,
# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip

import gzip


+ 2
- 0
modelscope/models/multi_modal/ofa/__init__.py View File

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .modeling_ofa import OFADecoder, OFAEncoder, OFAModel, OFAPreTrainedModel
from .tokenization_ofa import OFATokenizer, OFATokenizerZH
from .tokenization_ofa_fast import OFATokenizerFast, OFATokenizerZHFast

+ 14
- 0
modelscope/models/multi_modal/ofa/resnet.py View File

@@ -1,3 +1,17 @@
# Copyright 2022 OFA-Sys Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn



+ 1
- 0
modelscope/models/multi_modal/ofa/utils/__init__.py View File

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .constant import OFA_TASK_KEY_MAPPING

+ 2
- 0
modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py View File

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict



+ 56
- 14
modelscope/outputs.py View File

@@ -21,6 +21,7 @@ class OutputKeys(object):
POLYGONS = 'polygons'
OUTPUT = 'output'
OUTPUT_IMG = 'output_img'
OUTPUT_VIDEO = 'output_video'
OUTPUT_PCM = 'output_pcm'
IMG_EMBEDDING = 'img_embedding'
SPO_LIST = 'spo_list'
@@ -37,8 +38,10 @@ class OutputKeys(object):
KWS_LIST = 'kws_list'
HISTORY = 'history'
TIMESTAMPS = 'timestamps'
SPLIT_VIDEO_NUM = 'split_video_num'
SPLIT_META_LIST = 'split_meta_list'
SHOT_NUM = 'shot_num'
SCENE_NUM = 'scene_num'
SCENE_META_LIST = 'scene_meta_list'
SHOT_META_LIST = 'shot_meta_list'


TASK_OUTPUTS = {
@@ -218,13 +221,21 @@ TASK_OUTPUTS = {

# 3D human body keypoints detection result for single sample
# {
# "poses": [
# [[x, y, z]*17],
# [[x, y, z]*17],
# [[x, y, z]*17]
# ]
# "poses": [ # 3d pose coordinate in camera coordinate
# [[x, y, z]*17], # joints of per image
# [[x, y, z]*17],
# ...
# ],
# "timestamps": [ # timestamps of all frames
# "00:00:0.230",
# "00:00:0.560",
# "00:00:0.690",
# ],
# "output_video": "path_to_rendered_video" , this is optional
# and is only avaialbe when the "render" option is enabled.
# }
Tasks.body_3d_keypoints: [OutputKeys.POSES],
Tasks.body_3d_keypoints:
[OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO],

# 2D hand keypoints result for single sample
# {
@@ -300,19 +311,30 @@ TASK_OUTPUTS = {
Tasks.shop_segmentation: [OutputKeys.MASKS],
# movie scene segmentation result for a single video
# {
# "split_video_num":3,
# "split_meta_list":
# "shot_num":15,
# "shot_meta_list":
# [
# {
# "frame": [start_frame, end_frame],
# "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
#
# }
# ]
# "scene_num":3,
# "scene_meta_list":
# [
# {
# "shot": [0,1,2],
# "frame": [start_frame, end_frame],
# "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
# "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
# }
# ]
#
# }
Tasks.movie_scene_segmentation:
[OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST],
Tasks.movie_scene_segmentation: [
OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM,
OutputKeys.SCENE_META_LIST
],

# ============ nlp tasks ===================

@@ -649,8 +671,28 @@ TASK_OUTPUTS = {
# 'output': ['Done' / 'Decode_Error']
# }
Tasks.video_inpainting: [OutputKeys.OUTPUT],

# {
# 'output': ['bixin']
# }
Tasks.hand_static: [OutputKeys.OUTPUT]
Tasks.hand_static: [OutputKeys.OUTPUT],

# {
# 'output': [
# [2, 75, 287, 240, 510, 0.8335018754005432],
# [1, 127, 83, 332, 366, 0.9175254702568054],
# [0, 0, 0, 367, 639, 0.9693422317504883]]
# }
Tasks.face_human_hand_detection: [OutputKeys.OUTPUT],

# {
#     'output': 'Happiness',
#     'boxes': (203, 104, 663, 564)
# }
Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES],

# {
# "masks": [
# np.array # 2D array containing only 0, 255
# ]
# }
Tasks.product_segmentation: [OutputKeys.MASKS],
}
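
A hedged sketch of consuming the renamed movie-scene-segmentation keys, assuming result is the output dict returned by the corresponding pipeline (obtained elsewhere; the loop body is illustrative only):

from modelscope.outputs import OutputKeys

# result = movie_scene_segmentation_pipeline(video_path)  # obtained elsewhere, placeholder name
print(result[OutputKeys.SHOT_NUM], result[OutputKeys.SCENE_NUM])
for scene in result[OutputKeys.SCENE_META_LIST]:
    # per the comment block above: shot indices, frame range and timestamp range
    print(scene['shot'], scene['frame'], scene['timestamps'])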

Some files were not shown because too many files changed in this diff
