
Merge remote-tracking branch 'origin/master' into ofa/finetune

master
行嗔 3 years ago
parent commit 5cf6910bed
100 changed files with 11814 additions and 142 deletions
  1. +5 -8 .dev_scripts/ci_container_test.sh
  2. +0 -19 .dev_scripts/citest.sh
  3. +4 -1 .dev_scripts/dockerci.sh
  4. +1 -1 .readthedocs.yaml
  5. +3 -0 data/test/images/facial_expression_recognition.jpg
  6. +3 -0 data/test/images/hand_keypoints.jpg
  7. +3 -0 data/test/images/keypoints_detect/test_img_face_2d_keypoints.png
  8. +3 -0 data/test/images/retina_face_detection.jpg
  9. +3 -0 data/test/images/shop_segmentation.jpg
  10. +3 -0 data/test/images/text_driven_segmentation.jpg
  11. +3 -0 data/test/videos/action_detection_test_video.mp4
  12. +2 -2 data/test/videos/movie_scene_segmentation_test_video.mp4
  13. +1 -1 docker/Dockerfile.ubuntu
  14. +17 -0 modelscope/metainfo.py
  15. +2 -2 modelscope/models/audio/ans/__init__.py
  16. +6 -0 modelscope/models/audio/ans/complex_nn.py
  17. +1 -0 modelscope/models/audio/ans/conv_stft.py
  18. +10 -52 modelscope/models/audio/ans/frcrn.py
  19. +1 -0 modelscope/models/audio/ans/se_module_complex.py
  20. +5 -0 modelscope/models/audio/ans/unet.py
  21. +4 -4 modelscope/models/cv/__init__.py
  22. +21 -0 modelscope/models/cv/action_detection/__init__.py
  23. +177 -0 modelscope/models/cv/action_detection/action_detection_onnx.py
  24. +20 -0 modelscope/models/cv/face_2d_keypoints/__init__.py
  25. +16 -0 modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py
  26. +22 -0 modelscope/models/cv/face_detection/__init__.py
  27. +1 -0 modelscope/models/cv/face_detection/retinaface/__init__.py
  28. +137 -0 modelscope/models/cv/face_detection/retinaface/detection.py
  29. +0 -0 modelscope/models/cv/face_detection/retinaface/models/__init__.py
  30. +149 -0 modelscope/models/cv/face_detection/retinaface/models/net.py
  31. +145 -0 modelscope/models/cv/face_detection/retinaface/models/retinaface.py
  32. +123 -0 modelscope/models/cv/face_detection/retinaface/utils.py
  33. +20 -0 modelscope/models/cv/facial_expression_recognition/__init__.py
  34. +2 -0 modelscope/models/cv/facial_expression_recognition/fer/__init__.py
  35. +72 -0 modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
  36. +118 -0 modelscope/models/cv/facial_expression_recognition/fer/transforms.py
  37. +40 -0 modelscope/models/cv/facial_expression_recognition/fer/vgg.py
  38. +20 -0 modelscope/models/cv/shop_segmentation/__init__.py
  39. +59 -0 modelscope/models/cv/shop_segmentation/common.py
  40. +122 -0 modelscope/models/cv/shop_segmentation/head_fpn.py
  41. +901 -0 modelscope/models/cv/shop_segmentation/models.py
  42. +217 -0 modelscope/models/cv/shop_segmentation/neck_fpn.py
  43. +157 -0 modelscope/models/cv/shop_segmentation/shop_seg_base.py
  44. +115 -0 modelscope/models/cv/shop_segmentation/shop_seg_model.py
  45. +199 -0 modelscope/models/cv/shop_segmentation/utils.py
  46. +1 -0 modelscope/models/cv/text_driven_segmentation/__init__.py
  47. +170 -0 modelscope/models/cv/text_driven_segmentation/clip.py
  48. +28 -0 modelscope/models/cv/text_driven_segmentation/lseg_base.py
  49. +334 -0 modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
  50. +107 -0 modelscope/models/cv/text_driven_segmentation/lseg_model.py
  51. +197 -0 modelscope/models/cv/text_driven_segmentation/lseg_net.py
  52. +543 -0 modelscope/models/cv/text_driven_segmentation/lseg_vit.py
  53. +458 -0 modelscope/models/cv/text_driven_segmentation/model.py
  54. +156 -0 modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
  55. +24 -0 modelscope/models/cv/tinynas_detection/__init__.py
  56. +16 -0 modelscope/models/cv/tinynas_detection/backbone/__init__.py
  57. +126 -0 modelscope/models/cv/tinynas_detection/backbone/darknet.py
  58. +347 -0 modelscope/models/cv/tinynas_detection/backbone/tinynas.py
  59. +2 -0 modelscope/models/cv/tinynas_detection/core/__init__.py
  60. +474 -0 modelscope/models/cv/tinynas_detection/core/base_ops.py
  61. +324 -0 modelscope/models/cv/tinynas_detection/core/neck_ops.py
  62. +205 -0 modelscope/models/cv/tinynas_detection/core/repvgg_block.py
  63. +196 -0 modelscope/models/cv/tinynas_detection/core/utils.py
  64. +181 -0 modelscope/models/cv/tinynas_detection/detector.py
  65. +16 -0 modelscope/models/cv/tinynas_detection/head/__init__.py
  66. +361 -0 modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py
  67. +16 -0 modelscope/models/cv/tinynas_detection/neck/__init__.py
  68. +235 -0 modelscope/models/cv/tinynas_detection/neck/giraffe_config.py
  69. +661 -0 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py
  70. +203 -0 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
  71. +16 -0 modelscope/models/cv/tinynas_detection/tinynas_detector.py
  72. +30 -0 modelscope/models/cv/tinynas_detection/utils.py
  73. +6 -4 modelscope/models/multi_modal/mplug/modeling_mplug.py
  74. +10 -6 modelscope/models/nlp/__init__.py
  75. +73 -0 modelscope/models/nlp/deberta_v2/__init__.py
  76. +130 -0 modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py
  77. +1789 -0 modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
  78. +546 -0 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py
  79. +241 -0 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
  80. +4 -0 modelscope/models/nlp/gpt3/modeling_gpt3.py
  81. +39 -0 modelscope/models/nlp/masked_language.py
  82. +8 -8 modelscope/models/nlp/palm_v2/modeling_palm.py
  83. +20 -0 modelscope/msdatasets/cv/face_2d_keypoins/__init__.py
  84. +13 -0 modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
  85. +3 -9 modelscope/msdatasets/ms_dataset.py
  86. +18 -6 modelscope/msdatasets/utils/oss_utils.py
  87. +10 -12 modelscope/msdatasets/utils/upload_utils.py
  88. +61 -1 modelscope/outputs.py
  89. +0 -1 modelscope/pipelines/base.py
  90. +16 -1 modelscope/pipelines/builder.py
  91. +19 -3 modelscope/pipelines/cv/__init__.py
  92. +63 -0 modelscope/pipelines/cv/action_detection_pipeline.py
  93. +3 -1 modelscope/pipelines/cv/easycv_pipelines/__init__.py
  94. +41 -0 modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
  95. +128 -0 modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
  96. +51 -0 modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
  97. +2 -0 modelscope/pipelines/cv/ocr_detection_pipeline.py
  98. +58 -0 modelscope/pipelines/cv/retina_face_detection_pipeline.py
  99. +51 -0 modelscope/pipelines/cv/shop_segmentation_pipleline.py
  100. +51 -0 modelscope/pipelines/cv/text_driven_segmentation_pipleline.py

+5 -8 .dev_scripts/ci_container_test.sh

@@ -1,11 +1,9 @@
pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/tests.txt
# install numpy<=1.18 for tensorflow==1.15.x
pip install "numpy<=1.18"

git config --global --add safe.directory /Maas-lib

@@ -26,4 +24,3 @@ else
fi
echo "Running case with command: $ci_command"
$ci_command
#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py

+0 -19 .dev_scripts/citest.sh

@@ -1,19 +0,0 @@
pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html

pip install -r requirements/tests.txt
# install numpy<=1.18 for tensorflow==1.15.x
pip install "numpy<=1.18"

# linter test
# use internal project for pre-commit due to the network problem
pre-commit run --all-files
if [ $? -ne 0 ]; then
echo "linter test failed, please run 'pre-commit run --all-files' to check"
exit -1
fi

PYTHONPATH=. python tests/run.py

+4 -1 .dev_scripts/dockerci.sh

@@ -7,7 +7,8 @@ gpus='7 6 5 4 3 2 1 0'
cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
cpu_sets_arr=($cpu_sets)
is_get_file_lock=false
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND}
# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
echo "ci command: $CI_COMMAND"
for gpu in $gpus
do
@@ -16,6 +17,7 @@ do
echo "get gpu lock $gpu"
CONTAINER_NAME="modelscope-ci-$gpu"
let is_get_file_lock=true

# pull image if there are update
docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
@@ -38,6 +40,7 @@ do
--net host \
${IMAGE_NAME}:${IMAGE_VERSION} \
$CI_COMMAND

if [ $? -ne 0 ]; then
echo "Running test case failed, please check the log!"
exit -1


+1 -1 .readthedocs.yaml

@@ -25,4 +25,4 @@ python:
install:
- requirements: requirements/docs.txt
- requirements: requirements/readthedocs.txt
- requirements: requirements/runtime.txt
- requirements: requirements/framework.txt

+3 -0 data/test/images/facial_expression_recognition.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bdb1cef5a5fd5f938a856311011c4820ddc45946a470b9929c61e59b6a065633
size 161535

+3 -0 data/test/images/hand_keypoints.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c05d58edee7398de37b8e479410676d6b97cfde69cc003e8356a348067e71988
size 7750

+3 -0 data/test/images/keypoints_detect/test_img_face_2d_keypoints.png

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:331ead75033fa2f01f6be72a2f8e34d581fcb593308067815d4bb136bb13b766
size 54390

+3 -0 data/test/images/retina_face_detection.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
size 87228

+3 -0 data/test/images/shop_segmentation.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f5ecc371c8b0ca09d0e11df89bc549000937eafc451929586426fe657ade25a0
size 238607

+3 -0 data/test/images/text_driven_segmentation.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2c7d2f279e3b317f1d0de18410a0585e122166fa2464c17b88a0c813f6c58bd4
size 67861

+3 -0 data/test/videos/action_detection_test_video.mp4

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b7c3bc7c82ea5fee9d83130041df01046d89143ff77058b04577455ff6fdc92
size 3191059

+2 -2 data/test/videos/movie_scene_segmentation_test_video.mp4

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f
size 126815483
oid sha256:03002807dc2aa180c3ae104e764c7a4d6c421d186a5d552f97d338467ae6c443
size 12722029

+1 -1 docker/Dockerfile.ubuntu

@@ -64,7 +64,7 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
# install modelscope
COPY requirements /var/modelscope
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \


+17 -0 modelscope/metainfo.py

@@ -9,6 +9,8 @@ class Models(object):

Model name should only contain model info but not task info.
"""
tinynas_detection = 'tinynas-detection'

# vision models
detection = 'detection'
realtime_object_detection = 'realtime-object-detection'
@@ -22,12 +24,17 @@ class Models(object):
body_2d_keypoints = 'body-2d-keypoints'
body_3d_keypoints = 'body-3d-keypoints'
crowd_counting = 'HRNetCrowdCounting'
face_2d_keypoints = 'face-2d-keypoints'
panoptic_segmentation = 'swinL-panoptic-segmentation'
image_reid_person = 'passvitb'
video_summarization = 'pgl-video-summarization'
swinL_semantic_segmentation = 'swinL-semantic-segmentation'
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
text_driven_segmentation = 'text-driven-segmentation'
resnet50_bert = 'resnet50-bert'
fer = 'fer'
retinaface = 'retinaface'
shop_segmentation = 'shop-segmentation'

# EasyCV models
yolox = 'YOLOX'
@@ -37,6 +44,7 @@ class Models(object):
bert = 'bert'
palm = 'palm-v2'
structbert = 'structbert'
deberta_v2 = 'deberta_v2'
veco = 'veco'
translation = 'csanmt-translation'
space_dst = 'space-dst'
@@ -104,13 +112,17 @@ class Pipelines(object):
hicossl_video_embedding = 'hicossl-s3dg-video_embedding'
body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
body_3d_keypoints = 'canonical_body-3d-keypoints_video'
hand_2d_keypoints = 'hrnetv2w18_hand-2d-keypoints_image'
human_detection = 'resnet18-human-detection'
object_detection = 'vit-object-detection'
easycv_detection = 'easycv-detection'
easycv_segmentation = 'easycv-segmentation'
face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
salient_detection = 'u2net-salient-detection'
image_classification = 'image-classification'
face_detection = 'resnet-face-detection-scrfd10gkps'
facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
retina_face_detection = 'resnet50-face-detection-retinaface'
live_category = 'live-category'
general_image_classification = 'vit-base_image-classification_ImageNet-labels'
daily_image_classification = 'vit-base_image-classification_Dailylife-labels'
@@ -132,13 +144,17 @@ class Pipelines(object):
image_to_image_generation = 'image-to-image-generation'
skin_retouching = 'unet-skin-retouching'
tinynas_classification = 'tinynas-classification'
tinynas_detection = 'tinynas-detection'
crowd_counting = 'hrnet-crowd-counting'
action_detection = 'ResNetC3D-action-detection'
video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
image_panoptic_segmentation = 'image-panoptic-segmentation'
video_summarization = 'googlenet_pgl_video_summarization'
image_semantic_segmentation = 'image-semantic-segmentation'
image_reid_person = 'passvitb-image-reid-person'
text_driven_segmentation = 'text-driven-segmentation'
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
shop_segmentation = 'shop-segmentation'

# nlp tasks
sentence_similarity = 'sentence-similarity'
@@ -347,6 +363,7 @@ class Datasets(object):
""" Names for different datasets.
"""
ClsDataset = 'ClsDataset'
Face2dKeypointsDataset = 'Face2dKeypointsDataset'
SegDataset = 'SegDataset'
DetDataset = 'DetDataset'
DetImagesMixDataset = 'DetImagesMixDataset'
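
A minimal usage sketch for the registry keys added above, assuming the standard modelscope pipeline factory; the model ID below is a placeholder, not a model shipped with this commit.

# Hedged sketch: resolve a task through the registered pipeline/model names.
# '<retinaface-model-id>' is a placeholder; substitute a real ModelScope model ID.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

face_detector = pipeline(Tasks.face_detection, model='<retinaface-model-id>')
result = face_detector('data/test/images/retina_face_detection.jpg')
print(result)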

+2 -2 modelscope/models/audio/ans/__init__.py

@@ -4,11 +4,11 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .frcrn import FRCRNModel
from .frcrn import FRCRNDecorator

else:
_import_structure = {
'frcrn': ['FRCRNModel'],
'frcrn': ['FRCRNDecorator'],
}

import sys


+6 -0 modelscope/models/audio/ans/complex_nn.py

@@ -1,3 +1,9 @@
"""
The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d
here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch

"""
import torch
import torch.nn as nn
import torch.nn.functional as F


+1 -0 modelscope/models/audio/ans/conv_stft.py

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import numpy as np
import torch
import torch.nn as nn


+10 -52 modelscope/models/audio/ans/frcrn.py

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Dict

@@ -14,54 +15,10 @@ from .conv_stft import ConviSTFT, ConvSTFT
from .unet import UNet


class FTB(nn.Module):

def __init__(self, input_dim=257, in_channel=9, r_channel=5):

super(FTB, self).__init__()
self.in_channel = in_channel
self.conv1 = nn.Sequential(
nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]),
nn.BatchNorm2d(r_channel), nn.ReLU())

self.conv1d = nn.Sequential(
nn.Conv1d(
r_channel * input_dim, in_channel, kernel_size=9, padding=4),
nn.BatchNorm1d(in_channel), nn.ReLU())
self.freq_fc = nn.Linear(input_dim, input_dim, bias=False)

self.conv2 = nn.Sequential(
nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]),
nn.BatchNorm2d(in_channel), nn.ReLU())

def forward(self, inputs):
'''
inputs should be [Batch, Ca, Dim, Time]
'''
# T-F attention
conv1_out = self.conv1(inputs)
B, C, D, T = conv1_out.size()
reshape1_out = torch.reshape(conv1_out, [B, C * D, T])
conv1d_out = self.conv1d(reshape1_out)
conv1d_out = torch.reshape(conv1d_out, [B, self.in_channel, 1, T])

# now is also [B,C,D,T]
att_out = conv1d_out * inputs

# tranpose to [B,C,T,D]
att_out = torch.transpose(att_out, 2, 3)
freqfc_out = self.freq_fc(att_out)
att_out = torch.transpose(freqfc_out, 2, 3)

cat_out = torch.cat([att_out, inputs], 1)
outputs = self.conv2(cat_out)
return outputs


@MODELS.register_module(
Tasks.acoustic_noise_suppression,
module_name=Models.speech_frcrn_ans_cirm_16k)
class FRCRNModel(TorchModel):
class FRCRNDecorator(TorchModel):
r""" A decorator of FRCRN for integrating into modelscope framework """

def __init__(self, model_dir: str, *args, **kwargs):
@@ -78,13 +35,14 @@ class FRCRNModel(TorchModel):
checkpoint = torch.load(
model_bin_file, map_location=torch.device('cpu'))
if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
self.model.load_state_dict(
checkpoint['state_dict'], strict=False)
# the new trained model by user is based on FRCRNDecorator
self.load_state_dict(checkpoint['state_dict'])
else:
# The released model on Modelscope is based on FRCRN
self.model.load_state_dict(checkpoint, strict=False)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
result_list = self.model.forward(input['noisy'])
def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
result_list = self.model.forward(inputs['noisy'])
output = {
'spec_l1': result_list[0],
'wav_l1': result_list[1],
@@ -93,12 +51,12 @@ class FRCRNModel(TorchModel):
'wav_l2': result_list[4],
'mask_l2': result_list[5]
}
if 'clean' in input:
if 'clean' in inputs:
mix_result = self.model.loss(
input['noisy'], input['clean'], result_list, mode='Mix')
inputs['noisy'], inputs['clean'], result_list, mode='Mix')
output.update(mix_result)
sisnr_result = self.model.loss(
input['noisy'], input['clean'], result_list, mode='SiSNR')
inputs['noisy'], inputs['clean'], result_list, mode='SiSNR')
output.update(sisnr_result)
# logger hooker will use items under 'log_vars'
output['log_vars'] = {k: mix_result[k].item() for k in mix_result}


+1 -0 modelscope/models/audio/ans/se_module_complex.py

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
from torch import nn



+5 -0 modelscope/models/audio/ans/unet.py

@@ -1,3 +1,8 @@
"""
The implementation here is modified based on
Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch
"""
import torch
import torch.nn as nn



+4 -4 modelscope/models/cv/__init__.py

@@ -3,15 +3,15 @@
# yapf: disable
from . import (action_recognition, animal_recognition, body_2d_keypoints,
body_3d_keypoints, cartoon, cmdssl_video_embedding,
crowd_counting, face_detection, face_generation,
image_classification, image_color_enhance, image_colorization,
image_denoise, image_instance_segmentation,
crowd_counting, face_2d_keypoints, face_detection,
face_generation, image_classification, image_color_enhance,
image_colorization, image_denoise, image_instance_segmentation,
image_panoptic_segmentation, image_portrait_enhancement,
image_reid_person, image_semantic_segmentation,
image_to_image_generation, image_to_image_translation,
movie_scene_segmentation, object_detection,
product_retrieval_embedding, realtime_object_detection,
salient_detection, super_resolution,
salient_detection, shop_segmentation, super_resolution,
video_single_object_tracking, video_summarization, virual_tryon)

# yapf: enable

+21 -0 modelscope/models/cv/action_detection/__init__.py

@@ -0,0 +1,21 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:

from .action_detection_onnx import ActionDetONNX

else:
_import_structure = {'action_detection_onnx': ['ActionDetONNX']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+177 -0 modelscope/models/cv/action_detection/action_detection_onnx.py

@@ -0,0 +1,177 @@
import os
import os.path as osp
import shutil
import subprocess

import cv2
import numpy as np
import onnxruntime as rt

from modelscope.models import Model
from modelscope.utils.constant import Devices
from modelscope.utils.device import verify_device


class ActionDetONNX(Model):

def __init__(self, model_dir, config, *args, **kwargs):
super().__init__(self, model_dir, *args, **kwargs)
model_file = osp.join(config['model_file'])
device_type, device_id = verify_device(self._device_name)
options = rt.SessionOptions()
options.intra_op_num_threads = 1
options.inter_op_num_threads = 1
if device_type == Devices.gpu:
sess = rt.InferenceSession(
model_file,
providers=['CUDAExecutionProvider'],
sess_options=options,
provider_options=[{
'device_id': device_id
}])
else:
sess = rt.InferenceSession(
model_file,
providers=['CPUExecutionProvider'],
sess_options=options)
self.input_name = sess.get_inputs()[0].name
self.sess = sess
self.num_stride = len(config['fpn_strides'])
self.score_thresh = np.asarray(
config['pre_nms_thresh'], dtype='float32').reshape((1, -1))
self.size_divisibility = config['size_divisibility']
self.nms_threshold = config['nms_thresh']
self.tmp_dir = config['tmp_dir']
self.temporal_stride = config['step']
self.input_data_type = config['input_type']
self.action_names = config['action_names']
self.video_length_limit = config['video_length_limit']

def resize_box(self, det, height, width, scale_h, scale_w):
bboxs = det[0]
bboxs[:, [0, 2]] *= scale_w
bboxs[:, [1, 3]] *= scale_h
bboxs[:, [0, 2]] = bboxs[:, [0, 2]].clip(0, width - 1)
bboxs[:, [1, 3]] = bboxs[:, [1, 3]].clip(0, height - 1)
result = {
'boxes': bboxs.round().astype('int32').tolist(),
'scores': det[1].tolist(),
'labels': [self.action_names[i] for i in det[2].tolist()]
}
return result

def parse_frames(self, frame_names):
imgs = [cv2.imread(name)[:, :, ::-1] for name in frame_names]
imgs = np.stack(imgs).astype(self.input_data_type).transpose(
(3, 0, 1, 2)) # c,t,h,w
imgs = imgs[None]
return imgs

def forward_img(self, imgs, h, w):
pred = self.sess.run(None, {
self.input_name: imgs,
'height': np.asarray(h),
'width': np.asarray(w)
})
dets = self.post_nms(
pred,
score_threshold=self.score_thresh,
nms_threshold=self.nms_threshold)
return dets

def forward_video(self, video_name, scale):
min_size, max_size = self._get_sizes(scale)

tmp_dir = osp.join(self.tmp_dir, osp.basename(video_name)[:-4])
if osp.exists(tmp_dir):
shutil.rmtree(tmp_dir)
os.makedirs(tmp_dir)
frame_rate = 2
cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \
f' -i {video_name} -r {frame_rate} -f image2 {tmp_dir}/%06d.jpg'

cmd = cmd.split(' ')
subprocess.call(cmd)

frame_names = [
osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir))
if name.endswith('.jpg')
]
frame_names = [
frame_names[i:i + frame_rate * 2]
for i in range(0,
len(frame_names) - frame_rate * 2 + 1, frame_rate
* self.temporal_stride)
]
timestamp = list(
range(1,
len(frame_names) * self.temporal_stride,
self.temporal_stride))
batch_imgs = [self.parse_frames(names) for names in frame_names]

N, _, T, H, W = batch_imgs[0].shape
scale_min = min_size / min(H, W)
h, w = min(int(scale_min * H),
max_size), min(int(scale_min * W), max_size)
h = round(h / self.size_divisibility) * self.size_divisibility
w = round(w / self.size_divisibility) * self.size_divisibility
scale_h, scale_w = H / h, W / w

results = []
for imgs in batch_imgs:
det = self.forward_img(imgs, h, w)
det = self.resize_box(det[0], H, W, scale_h, scale_w)
results.append(det)
results = [{
'timestamp': t,
'actions': res
} for t, res in zip(timestamp, results)]
shutil.rmtree(tmp_dir)
return results

def forward(self, video_name):
return self.forward_video(video_name, scale=1)

def post_nms(self, pred, score_threshold, nms_threshold=0.3):
pred_bboxes, pred_scores = pred
N = len(pred_bboxes)
dets = []
for i in range(N):
bboxes, scores = pred_bboxes[i], pred_scores[i]
candidate_inds = scores > score_threshold
scores = scores[candidate_inds]
candidate_nonzeros = candidate_inds.nonzero()
bboxes = bboxes[candidate_nonzeros[0]]
labels = candidate_nonzeros[1]
keep = self._nms(bboxes, scores, labels, nms_threshold)
bbox = bboxes[keep]
score = scores[keep]
label = labels[keep]
dets.append((bbox, score, label))
return dets

def _nms(self, boxes, scores, idxs, nms_threshold):
if len(boxes) == 0:
return []
max_coordinate = boxes.max()
offsets = idxs * (max_coordinate + 1)
boxes_for_nms = boxes + offsets[:, None].astype('float32')
boxes_for_nms[:, 2] = boxes_for_nms[:, 2] - boxes_for_nms[:, 0]
boxes_for_nms[:, 3] = boxes_for_nms[:, 3] - boxes_for_nms[:, 1]
keep = cv2.dnn.NMSBoxes(
boxes_for_nms.tolist(),
scores.tolist(),
score_threshold=0,
nms_threshold=nms_threshold)
if len(keep.shape) == 2:
keep = np.squeeze(keep, 1)
return keep

def _get_sizes(self, scale):
if scale == 1:
min_size, max_size = 512, 896
elif scale == 2:
min_size, max_size = 768, 1280
else:
min_size, max_size = 1024, 1792
return min_size, max_size
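
A rough usage sketch for ActionDetONNX, assuming onnxruntime and ffmpeg are available; the config dict only mirrors the keys read in __init__ above, and every value in it is an illustrative guess rather than the model's shipped configuration.

# Hedged sketch: config keys follow __init__ above; values are placeholders.
cfg = {
    'model_file': '/path/to/model.onnx',
    'fpn_strides': [8, 16, 32],
    'pre_nms_thresh': [0.4, 0.4, 0.4],
    'size_divisibility': 32,
    'nms_thresh': 0.5,
    'tmp_dir': '/tmp/action_det',
    'step': 2,
    'input_type': 'float32',
    'action_names': ['action_a', 'action_b', 'action_c'],
    'video_length_limit': 30,
}
detector = ActionDetONNX('/path/to/model_dir', cfg)
results = detector.forward('data/test/videos/action_detection_test_video.mp4')
# results: [{'timestamp': t, 'actions': {'boxes': ..., 'scores': ..., 'labels': ...}}, ...]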

+20 -0 modelscope/models/cv/face_2d_keypoints/__init__.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .face_2d_keypoints_align import Face2DKeypoints

else:
_import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+16 -0 modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.face.face_keypoint import FaceKeypoint

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):

def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
FaceKeypoint.__init__(self, *args, **kwargs)

+22 -0 modelscope/models/cv/face_detection/__init__.py

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .retinaface import RetinaFaceDetection

else:
_import_structure = {
'retinaface': ['RetinaFaceDetection'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+1 -0 modelscope/models/cv/face_detection/retinaface/__init__.py

@@ -0,0 +1 @@
from .detection import RetinaFaceDetection

+137 -0 modelscope/models/cv/face_detection/retinaface/detection.py

@@ -0,0 +1,137 @@
# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface
import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .models.retinaface import RetinaFace
from .utils import PriorBox, decode, decode_landm, py_cpu_nms


@MODELS.register_module(Tasks.face_detection, module_name=Models.retinaface)
class RetinaFaceDetection(TorchModel):

def __init__(self, model_path, device='cuda'):
super().__init__(model_path)
torch.set_grad_enabled(False)
cudnn.benchmark = True
self.model_path = model_path
self.cfg = Config.from_file(
model_path.replace(ModelFile.TORCH_MODEL_FILE,
ModelFile.CONFIGURATION))['models']
self.net = RetinaFace(cfg=self.cfg)
self.load_model()
self.device = device
self.net = self.net.to(self.device)

self.mean = torch.tensor([[[[104]], [[117]], [[123]]]]).to(device)

def check_keys(self, pretrained_state_dict):
ckpt_keys = set(pretrained_state_dict.keys())
model_keys = set(self.net.state_dict().keys())
used_pretrained_keys = model_keys & ckpt_keys
assert len(
used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint'
return True

def remove_prefix(self, state_dict, prefix):
new_state_dict = dict()
for k, v in state_dict.items():
if k.startswith(prefix):
new_state_dict[k[len(prefix):]] = v
else:
new_state_dict[k] = v
return new_state_dict

def load_model(self, load_to_cpu=False):
pretrained_dict = torch.load(
self.model_path, map_location=torch.device('cpu'))
if 'state_dict' in pretrained_dict.keys():
pretrained_dict = self.remove_prefix(pretrained_dict['state_dict'],
'module.')
else:
pretrained_dict = self.remove_prefix(pretrained_dict, 'module.')
self.check_keys(pretrained_dict)
self.net.load_state_dict(pretrained_dict, strict=False)
self.net.eval()

def forward(self, input):
img_raw = input['img'].cpu().numpy()
img = np.float32(img_raw)

im_height, im_width = img.shape[:2]
ss = 1.0
# tricky
if max(im_height, im_width) > 1500:
ss = 1000.0 / max(im_height, im_width)
img = cv2.resize(img, (0, 0), fx=ss, fy=ss)
im_height, im_width = img.shape[:2]

scale = torch.Tensor(
[img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
img -= (104, 117, 123)
img = img.transpose(2, 0, 1)
img = torch.from_numpy(img).unsqueeze(0)
img = img.to(self.device)
scale = scale.to(self.device)

loc, conf, landms = self.net(img) # forward pass
del img

confidence_threshold = 0.9
nms_threshold = 0.4
top_k = 5000
keep_top_k = 750

priorbox = PriorBox(self.cfg, image_size=(im_height, im_width))
priors = priorbox.forward()
priors = priors.to(self.device)
prior_data = priors.data
boxes = decode(loc.data.squeeze(0), prior_data, self.cfg['variance'])
boxes = boxes * scale
boxes = boxes.cpu().numpy()
scores = conf.squeeze(0).data.cpu().numpy()[:, 1]
landms = decode_landm(
landms.data.squeeze(0), prior_data, self.cfg['variance'])
scale1 = torch.Tensor([
im_width, im_height, im_width, im_height, im_width, im_height,
im_width, im_height, im_width, im_height
])
scale1 = scale1.to(self.device)
landms = landms * scale1
landms = landms.cpu().numpy()

# ignore low scores
inds = np.where(scores > confidence_threshold)[0]
boxes = boxes[inds]
landms = landms[inds]
scores = scores[inds]

# keep top-K before NMS
order = scores.argsort()[::-1][:top_k]
boxes = boxes[order]
landms = landms[order]
scores = scores[order]

# do NMS
dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
np.float32, copy=False)
keep = py_cpu_nms(dets, nms_threshold)
dets = dets[keep, :]
landms = landms[keep]

# keep top-K faster NMS
dets = dets[:keep_top_k, :]
landms = landms[:keep_top_k, :]

landms = landms.reshape((-1, 5, 2))
landms = landms.reshape(
-1,
10,
)
return dets / ss, landms / ss

+0 -0 modelscope/models/cv/face_detection/retinaface/models/__init__.py


+149 -0 modelscope/models/cv/face_detection/retinaface/models/net.py

@@ -0,0 +1,149 @@
# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.models._utils as _utils
from torch.autograd import Variable


def conv_bn(inp, oup, stride=1, leaky=0):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False), nn.BatchNorm2d(oup),
nn.LeakyReLU(negative_slope=leaky, inplace=True))


def conv_bn_no_relu(inp, oup, stride):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
nn.BatchNorm2d(oup),
)


def conv_bn1X1(inp, oup, stride, leaky=0):
return nn.Sequential(
nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False),
nn.BatchNorm2d(oup), nn.LeakyReLU(negative_slope=leaky, inplace=True))


def conv_dw(inp, oup, stride, leaky=0.1):
return nn.Sequential(
nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
nn.BatchNorm2d(inp),
nn.LeakyReLU(negative_slope=leaky, inplace=True),
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
nn.LeakyReLU(negative_slope=leaky, inplace=True),
)


class SSH(nn.Module):

def __init__(self, in_channel, out_channel):
super(SSH, self).__init__()
assert out_channel % 4 == 0
leaky = 0
if (out_channel <= 64):
leaky = 0.1
self.conv3X3 = conv_bn_no_relu(in_channel, out_channel // 2, stride=1)

self.conv5X5_1 = conv_bn(
in_channel, out_channel // 4, stride=1, leaky=leaky)
self.conv5X5_2 = conv_bn_no_relu(
out_channel // 4, out_channel // 4, stride=1)

self.conv7X7_2 = conv_bn(
out_channel // 4, out_channel // 4, stride=1, leaky=leaky)
self.conv7x7_3 = conv_bn_no_relu(
out_channel // 4, out_channel // 4, stride=1)

def forward(self, input):
conv3X3 = self.conv3X3(input)

conv5X5_1 = self.conv5X5_1(input)
conv5X5 = self.conv5X5_2(conv5X5_1)

conv7X7_2 = self.conv7X7_2(conv5X5_1)
conv7X7 = self.conv7x7_3(conv7X7_2)

out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1)
out = F.relu(out)
return out


class FPN(nn.Module):

def __init__(self, in_channels_list, out_channels):
super(FPN, self).__init__()
leaky = 0
if (out_channels <= 64):
leaky = 0.1
self.output1 = conv_bn1X1(
in_channels_list[0], out_channels, stride=1, leaky=leaky)
self.output2 = conv_bn1X1(
in_channels_list[1], out_channels, stride=1, leaky=leaky)
self.output3 = conv_bn1X1(
in_channels_list[2], out_channels, stride=1, leaky=leaky)

self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky)
self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky)

def forward(self, input):
# names = list(input.keys())
input = list(input.values())

output1 = self.output1(input[0])
output2 = self.output2(input[1])
output3 = self.output3(input[2])

up3 = F.interpolate(
output3, size=[output2.size(2), output2.size(3)], mode='nearest')
output2 = output2 + up3
output2 = self.merge2(output2)

up2 = F.interpolate(
output2, size=[output1.size(2), output1.size(3)], mode='nearest')
output1 = output1 + up2
output1 = self.merge1(output1)

out = [output1, output2, output3]
return out


class MobileNetV1(nn.Module):

def __init__(self):
super(MobileNetV1, self).__init__()
self.stage1 = nn.Sequential(
conv_bn(3, 8, 2, leaky=0.1), # 3
conv_dw(8, 16, 1), # 7
conv_dw(16, 32, 2), # 11
conv_dw(32, 32, 1), # 19
conv_dw(32, 64, 2), # 27
conv_dw(64, 64, 1), # 43
)
self.stage2 = nn.Sequential(
conv_dw(64, 128, 2), # 43 + 16 = 59
conv_dw(128, 128, 1), # 59 + 32 = 91
conv_dw(128, 128, 1), # 91 + 32 = 123
conv_dw(128, 128, 1), # 123 + 32 = 155
conv_dw(128, 128, 1), # 155 + 32 = 187
conv_dw(128, 128, 1), # 187 + 32 = 219
)
self.stage3 = nn.Sequential(
conv_dw(128, 256, 2), # 219 +3 2 = 241
conv_dw(256, 256, 1), # 241 + 64 = 301
)
self.avg = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(256, 1000)

def forward(self, x):
x = self.stage1(x)
x = self.stage2(x)
x = self.stage3(x)
x = self.avg(x)
x = x.view(-1, 256)
x = self.fc(x)
return x

+145 -0 modelscope/models/cv/face_detection/retinaface/models/retinaface.py

@@ -0,0 +1,145 @@
# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.models._utils as _utils
import torchvision.models.detection.backbone_utils as backbone_utils

from .net import FPN, SSH, MobileNetV1


class ClassHead(nn.Module):

def __init__(self, inchannels=512, num_anchors=3):
super(ClassHead, self).__init__()
self.num_anchors = num_anchors
self.conv1x1 = nn.Conv2d(
inchannels,
self.num_anchors * 2,
kernel_size=(1, 1),
stride=1,
padding=0)

def forward(self, x):
out = self.conv1x1(x)
out = out.permute(0, 2, 3, 1).contiguous()

return out.view(out.shape[0], -1, 2)


class BboxHead(nn.Module):

def __init__(self, inchannels=512, num_anchors=3):
super(BboxHead, self).__init__()
self.conv1x1 = nn.Conv2d(
inchannels,
num_anchors * 4,
kernel_size=(1, 1),
stride=1,
padding=0)

def forward(self, x):
out = self.conv1x1(x)
out = out.permute(0, 2, 3, 1).contiguous()

return out.view(out.shape[0], -1, 4)


class LandmarkHead(nn.Module):

def __init__(self, inchannels=512, num_anchors=3):
super(LandmarkHead, self).__init__()
self.conv1x1 = nn.Conv2d(
inchannels,
num_anchors * 10,
kernel_size=(1, 1),
stride=1,
padding=0)

def forward(self, x):
out = self.conv1x1(x)
out = out.permute(0, 2, 3, 1).contiguous()

return out.view(out.shape[0], -1, 10)


class RetinaFace(nn.Module):

def __init__(self, cfg=None):
"""
:param cfg: Network related settings.
"""
super(RetinaFace, self).__init__()
backbone = None
if cfg['name'] == 'Resnet50':
backbone = models.resnet50(pretrained=cfg['pretrain'])
else:
raise Exception('Invalid name')

self.body = _utils.IntermediateLayerGetter(backbone,
cfg['return_layers'])
in_channels_stage2 = cfg['in_channel']
in_channels_list = [
in_channels_stage2 * 2,
in_channels_stage2 * 4,
in_channels_stage2 * 8,
]
out_channels = cfg['out_channel']
self.fpn = FPN(in_channels_list, out_channels)
self.ssh1 = SSH(out_channels, out_channels)
self.ssh2 = SSH(out_channels, out_channels)
self.ssh3 = SSH(out_channels, out_channels)

self.ClassHead = self._make_class_head(
fpn_num=3, inchannels=cfg['out_channel'])
self.BboxHead = self._make_bbox_head(
fpn_num=3, inchannels=cfg['out_channel'])
self.LandmarkHead = self._make_landmark_head(
fpn_num=3, inchannels=cfg['out_channel'])

def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2):
classhead = nn.ModuleList()
for i in range(fpn_num):
classhead.append(ClassHead(inchannels, anchor_num))
return classhead

def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2):
bboxhead = nn.ModuleList()
for i in range(fpn_num):
bboxhead.append(BboxHead(inchannels, anchor_num))
return bboxhead

def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2):
landmarkhead = nn.ModuleList()
for i in range(fpn_num):
landmarkhead.append(LandmarkHead(inchannels, anchor_num))
return landmarkhead

def forward(self, inputs):
out = self.body(inputs)

# FPN
fpn = self.fpn(out)

# SSH
feature1 = self.ssh1(fpn[0])
feature2 = self.ssh2(fpn[1])
feature3 = self.ssh3(fpn[2])
features = [feature1, feature2, feature3]

bbox_regressions = torch.cat(
[self.BboxHead[i](feature) for i, feature in enumerate(features)],
dim=1)
classifications = torch.cat(
[self.ClassHead[i](feature) for i, feature in enumerate(features)],
dim=1)
ldm_regressions = torch.cat(
[self.LandmarkHead[i](feat) for i, feat in enumerate(features)],
dim=1)

output = (bbox_regressions, F.softmax(classifications,
dim=-1), ldm_regressions)
return output

+123 -0 modelscope/models/cv/face_detection/retinaface/utils.py

@@ -0,0 +1,123 @@
# --------------------------------------------------------
# Modified from https://github.com/biubug6/Pytorch_Retinaface
# --------------------------------------------------------

from itertools import product as product
from math import ceil

import numpy as np
import torch


class PriorBox(object):

def __init__(self, cfg, image_size=None, phase='train'):
super(PriorBox, self).__init__()
self.min_sizes = cfg['min_sizes']
self.steps = cfg['steps']
self.clip = cfg['clip']
self.image_size = image_size
self.feature_maps = [[
ceil(self.image_size[0] / step),
ceil(self.image_size[1] / step)
] for step in self.steps]
self.name = 's'

def forward(self):
anchors = []
for k, f in enumerate(self.feature_maps):
min_sizes = self.min_sizes[k]
for i, j in product(range(f[0]), range(f[1])):
for min_size in min_sizes:
s_kx = min_size / self.image_size[1]
s_ky = min_size / self.image_size[0]
dense_cx = [
x * self.steps[k] / self.image_size[1]
for x in [j + 0.5]
]
dense_cy = [
y * self.steps[k] / self.image_size[0]
for y in [i + 0.5]
]
for cy, cx in product(dense_cy, dense_cx):
anchors += [cx, cy, s_kx, s_ky]

# back to torch land
output = torch.Tensor(anchors).view(-1, 4)
if self.clip:
output.clamp_(max=1, min=0)
return output


def py_cpu_nms(dets, thresh):
"""Pure Python NMS baseline."""
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]

areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]

keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])

w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)

inds = np.where(ovr <= thresh)[0]
order = order[inds + 1]

return keep


# Adapted from https://github.com/Hakuyume/chainer-ssd
def decode(loc, priors, variances):
"""Decode locations from predictions using priors to undo
the encoding we did for offset regression at train time.
Args:
loc (tensor): location predictions for loc layers,
Shape: [num_priors,4]
priors (tensor): Prior boxes in center-offset form.
Shape: [num_priors,4].
variances: (list[float]) Variances of priorboxes
Return:
decoded bounding box predictions
"""

boxes = torch.cat(
(priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
boxes[:, :2] -= boxes[:, 2:] / 2
boxes[:, 2:] += boxes[:, :2]
return boxes


def decode_landm(pre, priors, variances):
"""Decode landm from predictions using priors to undo
the encoding we did for offset regression at train time.
Args:
pre (tensor): landm predictions for loc layers,
Shape: [num_priors,10]
priors (tensor): Prior boxes in center-offset form.
Shape: [num_priors,4].
variances: (list[float]) Variances of priorboxes
Return:
decoded landm predictions
"""
a = priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:]
b = priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:]
c = priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:]
d = priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:]
e = priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:]
landms = torch.cat((a, b, c, d, e), dim=1)
return landms
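
A minimal sketch of the prior-box / decode flow above on dummy tensors; the cfg values (min_sizes, steps, clip, variance) are illustrative placeholders, not the values from this commit's configuration file.

# Hedged sketch: with zero offsets, decode() returns the priors themselves in corner form.
import torch

cfg = {
    'min_sizes': [[16, 32], [64, 128], [256, 512]],
    'steps': [8, 16, 32],
    'clip': False,
    'variance': [0.1, 0.2],
}
priors = PriorBox(cfg, image_size=(640, 640)).forward()  # [num_priors, 4], (cx, cy, w, h)
loc = torch.zeros_like(priors)                           # dummy regression output
boxes = decode(loc, priors, cfg['variance'])             # (x1, y1, x2, y2), normalized coords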

+20 -0 modelscope/models/cv/facial_expression_recognition/__init__.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .fer import FacialExpressionRecognition

else:
_import_structure = {'fer': ['FacialExpressionRecognition']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+2 -0 modelscope/models/cv/facial_expression_recognition/fer/__init__.py

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .facial_expression_recognition import FacialExpressionRecognition

+72 -0 modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py

@@ -0,0 +1,72 @@
# The implementation is based on Facial-Expression-Recognition, available at
# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
import os

import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from PIL import Image
from torch.autograd import Variable

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from . import transforms
from .vgg import VGG


@MODELS.register_module(
Tasks.facial_expression_recognition, module_name=Models.fer)
class FacialExpressionRecognition(TorchModel):

def __init__(self, model_path, device='cuda'):
super().__init__(model_path)
torch.set_grad_enabled(False)
cudnn.benchmark = True
self.model_path = model_path
self.device = device
self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE,
ModelFile.CONFIGURATION)
self.net = VGG('VGG19', cfg_path=self.cfg_path)
self.load_model()
self.net = self.net.to(device)
self.transform_test = transforms.Compose([
transforms.TenCrop(44),
transforms.Lambda(lambda crops: torch.stack(
[transforms.ToTensor()(crop) for crop in crops])),
])

self.mean = np.array([[104, 117, 123]])

def load_model(self, load_to_cpu=False):
pretrained_dict = torch.load(
self.model_path, map_location=torch.device('cpu'))
self.net.load_state_dict(pretrained_dict['net'], strict=True)
self.net.eval()

def forward(self, input):
img = input['img']
img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2GRAY)
img = cv2.resize(img, (48, 48))
img = img[:, :, np.newaxis]
img = np.concatenate((img, img, img), axis=2)

img = Image.fromarray(np.uint8(img))
inputs = self.transform_test(img)

ncrops, c, h, w = inputs.shape

inputs = inputs.view(-1, c, h, w)
inputs = inputs.to(self.device)
inputs = Variable(inputs, volatile=True)
outputs = self.net(inputs)

outputs_avg = outputs.view(ncrops, -1).mean(0) # avg over crops

score = F.softmax(outputs_avg)
_, predicted = torch.max(outputs_avg.data, 0)

return score, predicted

+118 -0 modelscope/models/cv/facial_expression_recognition/fer/transforms.py

@@ -0,0 +1,118 @@
# The implementation is based on Facial-Expression-Recognition, available at
# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
import numbers
import types

import numpy as np
import torch
from PIL import Image


def to_tensor(pic):

# handle PIL Image
if pic.mode == 'I':
img = torch.from_numpy(np.array(pic, np.int32, copy=False))
elif pic.mode == 'I;16':
img = torch.from_numpy(np.array(pic, np.int16, copy=False))
else:
img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
# PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
if pic.mode == 'YCbCr':
nchannel = 3
elif pic.mode == 'I;16':
nchannel = 1
else:
nchannel = len(pic.mode)
img = img.view(pic.size[1], pic.size[0], nchannel)
# put it from HWC to CHW format
# yikes, this transpose takes 80% of the loading time/CPU
img = img.transpose(0, 1).transpose(0, 2).contiguous()
if isinstance(img, torch.ByteTensor):
return img.float().div(255)
else:
return img


def center_crop(img, output_size):
if isinstance(output_size, numbers.Number):
output_size = (int(output_size), int(output_size))
w, h = img.size
th, tw = output_size
i = int(round((h - th) / 2.))
j = int(round((w - tw) / 2.))
return img.crop((j, i, j + tw, i + th))


def five_crop(img, size):
if isinstance(size, numbers.Number):
size = (int(size), int(size))
else:
assert len(
size) == 2, 'Please provide only two dimensions (h, w) for size.'

w, h = img.size
crop_h, crop_w = size
if crop_w > w or crop_h > h:
raise ValueError(
'Requested crop size {} is bigger than input size {}'.format(
size, (h, w)))
tl = img.crop((0, 0, crop_w, crop_h))
tr = img.crop((w - crop_w, 0, w, crop_h))
bl = img.crop((0, h - crop_h, crop_w, h))
br = img.crop((w - crop_w, h - crop_h, w, h))
center = center_crop(img, (crop_h, crop_w))
return (tl, tr, bl, br, center)


class TenCrop(object):

def __init__(self, size, vertical_flip=False):
self.size = size
if isinstance(size, numbers.Number):
self.size = (int(size), int(size))
else:
assert len(
size
) == 2, 'Please provide only two dimensions (h, w) for size.'
self.size = size
self.vertical_flip = vertical_flip

def __call__(self, img):
first_five = five_crop(img, self.size)

if self.vertical_flip:
img = img.transpose(Image.FLIP_TOP_BOTTOM)
else:
img = img.transpose(Image.FLIP_LEFT_RIGHT)

second_five = five_crop(img, self.size)

return first_five + second_five


class Compose(object):

def __init__(self, transforms):
self.transforms = transforms

def __call__(self, img):
for t in self.transforms:
img = t(img)
return img


class ToTensor(object):

def __call__(self, pic):
return to_tensor(pic)


class Lambda(object):

def __init__(self, lambd):
assert isinstance(lambd, types.LambdaType)
self.lambd = lambd

def __call__(self, img):
return self.lambd(img)

+40 -0 modelscope/models/cv/facial_expression_recognition/fer/vgg.py

@@ -0,0 +1,40 @@
# The implementation is based on Facial-Expression-Recognition, available at
# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from modelscope.utils.config import Config


class VGG(nn.Module):

def __init__(self, vgg_name, cfg_path):
super(VGG, self).__init__()
model_cfg = Config.from_file(cfg_path)['models']
self.features = self._make_layers(model_cfg[vgg_name])
self.classifier = nn.Linear(512, 7)

def forward(self, x):
out = self.features(x)
out = out.view(out.size(0), -1)
out = F.dropout(out, p=0.5, training=self.training)
out = self.classifier(out)
return out

def _make_layers(self, cfg):
layers = []
in_channels = 3
for x in cfg:
if x == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
else:
layers += [
nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
nn.BatchNorm2d(x),
nn.ReLU(inplace=True)
]
in_channels = x
layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
return nn.Sequential(*layers)

+20 -0 modelscope/models/cv/shop_segmentation/__init__.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .shop_seg_base import SHOPSEG

else:
_import_structure = {'shop_seg_base': ['SHOPSEG']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+59 -0 modelscope/models/cv/shop_segmentation/common.py

@@ -0,0 +1,59 @@
"""
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import warnings

import torch.nn as nn
import torch.nn.functional as F


def resize(input,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None,
warning=True):
if warning:
if size is not None and align_corners:
input_h, input_w = tuple(int(x) for x in input.shape[2:])
output_h, output_w = tuple(int(x) for x in size)
if output_h > input_h or output_w > input_w:
if ((output_h > 1 and output_w > 1 and input_h > 1
and input_w > 1) and (output_h - 1) % (input_h - 1)
and (output_w - 1) % (input_w - 1)):
warnings.warn(
f'When align_corners={align_corners}, '
'the output would more aligned if '
f'input size {(input_h, input_w)} is `x+1` and '
f'out size {(output_h, output_w)} is `nx+1`')
return F.interpolate(input, size, scale_factor, mode, align_corners)


class Upsample(nn.Module):

def __init__(self,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None):
super(Upsample, self).__init__()
self.size = size
if isinstance(scale_factor, tuple):
self.scale_factor = tuple(float(factor) for factor in scale_factor)
else:
self.scale_factor = float(scale_factor) if scale_factor else None
self.mode = mode
self.align_corners = align_corners

def forward(self, x):
if not self.size:
size = [int(t * self.scale_factor) for t in x.shape[-2:]]
else:
size = self.size
return resize(x, size, None, self.mode, self.align_corners)
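
A tiny sketch of the Upsample wrapper above on a dummy feature map; the shapes are arbitrary.

# Hedged sketch: Upsample defers to resize()/F.interpolate at forward time.
import torch

feat = torch.randn(1, 256, 32, 32)
up = Upsample(scale_factor=2, mode='bilinear', align_corners=False)
print(up(feat).shape)  # torch.Size([1, 256, 64, 64])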

+122 -0 modelscope/models/cv/shop_segmentation/head_fpn.py

@@ -0,0 +1,122 @@
""" FPNHead
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from timm.models.layers import drop, drop_path, trunc_normal_

from .common import Upsample, resize


class FPNHead(nn.Module):
"""Panoptic Feature Pyramid Networks.
This head is the implementation of `Semantic FPN
<https://arxiv.org/abs/1901.02446>`_.
Args:
feature_strides (tuple[int]): The strides for input feature maps.
stack_lateral. All strides suppose to be power of 2. The first
one is of largest resolution.
"""

def __init__(self,
channels,
num_classes,
dropout_ratio=0.1,
feature_strides=[4, 8, 16, 32],
align_corners=False,
**kwargs):
super(FPNHead, self).__init__()
self.act_cfg = dict(type='ReLU')
self.channels = channels
self.conv_cfg = None
self.norm_cfg = None
self.norm_cfg = dict(type='BN2d', requires_grad=True)
self.align_corners = align_corners
self.dropout_ratio = dropout_ratio
self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
if dropout_ratio > 0:
self.dropout = nn.Dropout2d(dropout_ratio)
else:
self.dropout = None
self.in_index = [0, 1, 2, 3]
assert min(feature_strides) == feature_strides[0]
self.feature_strides = feature_strides
self.scale_heads = nn.ModuleList()
for i in range(len(feature_strides)):
head_length = max(
1,
int(np.log2(feature_strides[i]) - np.log2(feature_strides[0])))
scale_head = []
for k in range(head_length):
scale_head.append(
ConvModule(
self.channels,
self.channels,
3,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg))
if feature_strides[i] != feature_strides[0]:
scale_head.append(
Upsample(
scale_factor=2,
mode='bilinear',
align_corners=self.align_corners))
self.scale_heads.append(nn.Sequential(*scale_head))

self.apply(self._init_weights)

def _transform_inputs(self, inputs):
"""Transform inputs for decoder.

Args:
inputs (list[Tensor]): List of multi-level img features.

Returns:
Tensor: The transformed inputs
"""
inputs = [inputs[i] for i in self.in_index]
return inputs

def cls_seg(self, feat):
"""Classify each pixel."""
if self.dropout is not None:
feat = self.dropout(feat)
output = self.conv_seg(feat)
return output

def forward(self, inputs):
x = self._transform_inputs(inputs)
output = self.scale_heads[0](x[0])
for i in range(1, len(self.feature_strides)):
# non inplace
output = output + resize(
self.scale_heads[i](x[i]),
size=output.shape[2:],
mode='bilinear',
align_corners=self.align_corners)

output = self.cls_seg(output)
return output

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
if m.bias is not None:
nn.init.constant_(m.bias.data, 0)
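
A hedged shape check for FPNHead, assuming four feature maps at strides 4/8/16/32 that already carry the head's channel count (mmcv must be installed for ConvModule):

import torch

head = FPNHead(channels=256, num_classes=2)
feats = [torch.randn(1, 256, s, s) for s in (128, 64, 32, 16)]  # strides 4, 8, 16, 32 of a 512x512 input
print(head(feats).shape)  # torch.Size([1, 2, 128, 128]); logits at the stride-4 resolution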

+ 901
- 0
modelscope/models/cv/shop_segmentation/models.py View File

@@ -0,0 +1,901 @@
"""
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import math
from collections import OrderedDict

import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop, drop_path, trunc_normal_
from torch import nn


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)

self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)

self.relu = nn.ReLU(inplace=True)
self.downsample = None
self.stride = stride

if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict([('-1', nn.AvgPool2d(stride)),
('0',
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False)),
('1', nn.BatchNorm2d(planes * self.expansion))]))

def forward(self, x: torch.Tensor):
identity = x

out = self.relu(self.bn1(self.conv1(x)))
out = self.relu(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)
return out


class AttentionPool2d(nn.Module):

def __init__(self,
spacial_dim: int,
embed_dim: int,
num_heads: int,
output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(
torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads
self.embed_dim = embed_dim
self.spacial_dim = spacial_dim

def forward(self, x):
B, C, H, W = x.shape
x = x.reshape(x.shape[0], x.shape[1],
x.shape[2] * x.shape[3]).permute(2, 0,
1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC

cls_pos = self.positional_embedding[0:1, :]
spatial_pos = F.interpolate(
self.positional_embedding[1:, ].reshape(1, self.spacial_dim,
self.spacial_dim,
self.embed_dim).permute(
0, 3, 1, 2),
size=(H, W),
mode='bilinear')
spatial_pos = spatial_pos.reshape(self.embed_dim, H * W).permute(1, 0)
positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0)

x = x + positional_embedding[:, None, :]
x, _ = F.multi_head_attention_forward(
query=x,
key=x,
value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat(
[self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False)

x = x.permute(1, 2, 0)
global_feat = x[:, :, 0]
feature_map = x[:, :, 1:].reshape(B, -1, H, W)
return global_feat, feature_map


class CLIPResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim=512,
input_resolution=224,
width=64,
pretrained=None,
**kwargs):
super().__init__()
self.pretrained = pretrained
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(2)
self.relu = nn.ReLU(inplace=True)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('visual.'):
new_k = k.replace('visual.', '')
state_dict[new_k] = checkpoint[k]

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in CLIPResNet')

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
(self.conv3, self.bn3)]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)

outs = []
x = self.layer1(x)
outs.append(x)
x = self.layer2(x)
outs.append(x)
x = self.layer3(x)
outs.append(x)
x = self.layer4(x)
outs.append(x)

return tuple(outs)


class CLIPResNetWithAttention(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim=1024,
input_resolution=224,
width=64,
pretrained=None,
**kwargs):
super().__init__()
self.pretrained = pretrained
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(2)
self.relu = nn.ReLU(inplace=True)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

embed_dim = width * 32 # the ResNet feature dimension
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32,
output_dim)

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('visual.'):
new_k = k.replace('visual.', '')
state_dict[new_k] = checkpoint[k]

if 'positional_embedding' in new_k:
if self.attnpool.positional_embedding.shape != state_dict[
new_k].shape:
print(
f'Resize the pos_embed shape from {state_dict[new_k].shape}'
f' to {self.attnpool.positional_embedding.shape}'
)
cls_pos = state_dict[new_k][0:1, :]
H = W = self.input_resolution // 32
old_h = int(
math.sqrt(state_dict[new_k][1:, ].shape[0]))
spatial_pos = F.interpolate(
state_dict[new_k][1:, ].reshape(
1, old_h, old_h,
cls_pos.shape[1]).permute(0, 3, 1, 2),
size=(H, W),
mode='bilinear')
spatial_pos = spatial_pos.reshape(
cls_pos.shape[1], H * W).permute(1, 0)
positional_embedding = torch.cat(
[cls_pos, spatial_pos], dim=0)
state_dict[new_k] = positional_embedding
assert self.attnpool.positional_embedding.shape == state_dict[
new_k].shape

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in CLIPResNet')

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
(self.conv3, self.bn3)]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)

outs = []
x = self.layer1(x)
outs.append(x)
x = self.layer2(x)
outs.append(x)
x = self.layer3(x)
outs.append(x)
x = self.layer4(x)
outs.append(x)

x_global, x_local = self.attnpool(x)
outs.append([x_global, x_local])

return tuple(outs)


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""

def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob

def forward(self, x):
return drop_path(x, self.drop_prob, self.training)

def extra_repr(self) -> str:
return 'p={}'.format(self.drop_prob)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None,
drop_path=0.):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.drop_path(self.attention(self.ln_1(x)))
x = x + self.drop_path(self.mlp(self.ln_2(x)))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
drop_path_rate=0.):
super().__init__()
self.width = width
self.layers = layers
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, layers)
] # stochastic depth decay rule
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask, dpr[i])
for i in range(layers)
])

def forward(self, x: torch.Tensor):
return self.resblocks(x)


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE: the scale factor was wrong in an earlier version; qk_scale can be set manually to stay compatible with previous weights
self.scale = qk_scale or head_dim**-0.5

self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)

self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

def forward(self, q, k, v):
B, N, C = q.shape
assert k.shape == v.shape
B, M, C = k.shape
q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads)
k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads)
v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads)

attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale

attn = attn.softmax(dim=-1)

x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C)

x = self.proj(x)
x = self.proj_drop(x)
return x


class TransformerDecoderLayer(nn.Module):

def __init__(
self,
d_model,
nhead,
dropout=0.1,
):
super().__init__()
self.self_attn = Attention(d_model, nhead, proj_drop=dropout)
self.cross_attn = Attention(d_model, nhead, proj_drop=dropout)

self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)

self.mlp = nn.Sequential(
nn.Linear(d_model, d_model * 4), nn.GELU(), nn.Dropout(dropout),
nn.Linear(d_model * 4, d_model))

def forward(self, x, mem):
q = k = v = self.norm1(x)
x = x + self.self_attn(q, k, v)
q = self.norm2(x)
x = x + self.cross_attn(q, mem, mem)
x = x + self.dropout(self.mlp(self.norm3(x)))
return x


class CLIPVisionTransformer(nn.Module):

def __init__(self,
input_resolution=224,
patch_size=32,
width=768,
layers=12,
heads=12,
output_dim=512,
drop_path_rate=0.0,
out_indices=[3, 5, 7, 11],
pretrained=None,
get_embeddings=False,
**kwargs):
super().__init__()
self.pretrained = pretrained
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.spatial_size = input_resolution // patch_size
self.ln_pre = LayerNorm(width)
self.get_embeddings = get_embeddings

self.transformer = Transformer(
width, layers, heads, drop_path_rate=drop_path_rate)

self.out_indices = out_indices

if get_embeddings:
self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

embed_dim = width

if patch_size == 16:
self.fpn1 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.ConvTranspose2d(
embed_dim, embed_dim, kernel_size=2, stride=2),
nn.SyncBatchNorm(embed_dim),
nn.GELU(),
nn.ConvTranspose2d(
embed_dim, embed_dim, kernel_size=2, stride=2),
)

self.fpn2 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.ConvTranspose2d(
embed_dim, embed_dim, kernel_size=2, stride=2),
)

self.fpn3 = nn.GroupNorm(1, embed_dim)

self.fpn4 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.MaxPool2d(kernel_size=2, stride=2))

elif patch_size == 8:
self.fpn1 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.ConvTranspose2d(
embed_dim, embed_dim, kernel_size=2, stride=2),
)

self.fpn2 = nn.GroupNorm(1, embed_dim)

self.fpn3 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.MaxPool2d(kernel_size=2, stride=2),
)

self.fpn4 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.MaxPool2d(kernel_size=4, stride=4),
)

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('visual.'):
new_k = k.replace('visual.', '')
state_dict[new_k] = checkpoint[k]

if 'positional_embedding' in state_dict.keys():
if self.positional_embedding.shape != state_dict[
'positional_embedding'].shape:
print(
f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to'
f' {self.positional_embedding.shape}')
cls_pos = state_dict['positional_embedding'][0:1, :]
spatial_pos = F.interpolate(
state_dict['positional_embedding'][1:, ].reshape(
1, 14, 14, 768).permute(0, 3, 1, 2),
size=(self.spatial_size, self.spatial_size),
mode='bilinear')
spatial_pos = spatial_pos.reshape(
768,
self.spatial_size * self.spatial_size).permute(1, 0)
positional_embedding = torch.cat([cls_pos, spatial_pos],
dim=0)
state_dict['positional_embedding'] = positional_embedding
assert self.positional_embedding.shape == state_dict[
'positional_embedding'].shape

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in vision transformer')

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
B, C, H, W = x.shape
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
x1 = self.class_embedding.to(x.dtype)
x2 = torch.zeros(
x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
x = torch.cat([x1 + x2, x], dim=1)
pos = self.positional_embedding.to(x.dtype)
cls_pos = pos[0, :] + self.class_embedding.to(x.dtype)
spatial_pos = F.interpolate(
pos[1:, ].reshape(1, self.spatial_size, self.spatial_size,
C).permute(0, 3, 1, 2),
size=(H, W),
mode='bilinear')
spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1)
pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)
x = x + pos
x = self.ln_pre(x)
x = x.permute(1, 0, 2) # NLD -> LND

gradientcheckpoint = False  # flip to True to trade compute for memory via torch.utils.checkpoint

features = []
for i, blk in enumerate(self.transformer.resblocks):
if gradientcheckpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)

if i in self.out_indices:
xp = x.permute(1, 0, 2)[:,
1:, :].permute(0, 2,
1).reshape(B, -1, H, W)
features.append(xp.contiguous())

ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
for i in range(len(features)):
features[i] = ops[i](features[i])

if self.get_embeddings:
x = x.permute(1, 0, 2)
x = self.ln_post(x)
x = x @ self.proj

global_embedding = x[:, 0]
visual_embedding = x[:, 1:].reshape(B, H, W,
-1).permute(0, 3, 1,
2) # B C H W

features.append([global_embedding, visual_embedding])

return tuple(features)


class CLIPTextEncoder(nn.Module):

def __init__(self,
context_length=77,
vocab_size=49408,
transformer_width=512,
transformer_heads=8,
transformer_layers=12,
embed_dim=1024,
out_dim=256,
pretrained=None,
**kwargs):
super().__init__()

self.pretrained = pretrained

self.context_length = context_length

self.transformer = Transformer(
width=transformer_width,
layers=transformer_layers,
heads=transformer_heads,
attn_mask=self.build_attention_mask())

self.vocab_size = vocab_size
self.token_embedding = nn.Embedding(vocab_size, transformer_width)
self.positional_embedding = nn.Parameter(
torch.empty(self.context_length, transformer_width))
self.ln_final = LayerNorm(transformer_width)
self.text_projection = nn.Parameter(
torch.empty(transformer_width, embed_dim))

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('transformer.'):
state_dict[k] = checkpoint[k]

if k == 'positional_embedding' or k == 'text_projection' or k.startswith(
'token_embedding') or k.startswith('ln_final'):
if k == 'positional_embedding' and checkpoint[k].size(
0) > self.context_length:
checkpoint[k] = checkpoint[k][:self.context_length]
print('positional_embedding is truncated from 77 to',
self.context_length)
state_dict[k] = checkpoint[k]

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in text encoder')

def build_attention_mask(self):
# lazily create the causal attention mask over the text tokens
# pytorch uses an additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float('-inf'))
mask.triu_(1)  # zero the diagonal and lower triangle; -inf above blocks attention to future tokens
return mask

def forward(self, text):
x = self.token_embedding(text)
x = x + self.positional_embedding
x = x.permute(1, 0, 2)
x = self.transformer(x)
x = x.permute(1, 0, 2)
x = self.ln_final(x)
x = x[torch.arange(x.shape[0]),
text.argmax(dim=-1), ...] @ self.text_projection
return x


class CLIPTextContextEncoder(nn.Module):

def __init__(self,
context_length=22,
vocab_size=49408,
transformer_width=512,
transformer_heads=8,
transformer_layers=12,
embed_dim=1024,
out_dim=256,
pretrained=None,
**kwargs):
super().__init__()

self.pretrained = pretrained

self.context_length = context_length

self.transformer = Transformer(
width=transformer_width,
layers=transformer_layers,
heads=transformer_heads,
attn_mask=self.build_attention_mask())

self.embed_dim = embed_dim

self.vocab_size = vocab_size
self.token_embedding = nn.Embedding(vocab_size, transformer_width)
self.positional_embedding = nn.Parameter(
torch.empty(self.context_length, transformer_width))
self.ln_final = LayerNorm(transformer_width)
self.text_projection = nn.Parameter(
torch.empty(transformer_width, embed_dim))

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('transformer.'):
state_dict[k] = checkpoint[k]

if k == 'positional_embedding' or k == 'text_projection' or k.startswith(
'token_embedding') or k.startswith('ln_final'):
if k == 'positional_embedding' and checkpoint[k].size(
0) > self.context_length:
checkpoint[k] = checkpoint[k][:self.context_length]
print('positional_embedding is truncated from 77 to',
self.context_length)
state_dict[k] = checkpoint[k]

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in text encoder')

def build_attention_mask(self):
# lazily create the causal attention mask over the text tokens
# pytorch uses an additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float('-inf'))
mask.triu_(1)  # zero the diagonal and lower triangle; -inf above blocks attention to future tokens
return mask

def forward(self, text, context=None):
x_text = self.token_embedding(text)  # (n_class, n_text, C)
K, N1, C = x_text.shape  # e.g. 150 classes * 5 tokens * 512
B, N2, C = context.shape  # e.g. 1 * 8 learnable context tokens * 512

eos_indx = text.argmax(dim=-1) + N2
eos_indx = eos_indx.reshape(1, K).expand(B, K).reshape(-1)

x_text = x_text.reshape(1, K, N1, C).expand(B, K, N1, C)
context = context.reshape(B, 1, N2, C).expand(B, K, N2, C)

x = torch.cat([x_text[:, :, 0:1], context, x_text[:, :, 1:]],
dim=2).reshape(B * K, N1 + N2, C)
x = x + self.positional_embedding
x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD
x = self.ln_final(x)
x = x[torch.arange(x.shape[0]), eos_indx] @ self.text_projection
x = x.reshape(B, K, self.embed_dim)
return x


class ContextDecoder(nn.Module):

def __init__(self,
transformer_width=256,
transformer_heads=4,
transformer_layers=6,
visual_dim=1024,
dropout=0.1,
**kwargs):
super().__init__()

self.memory_proj = nn.Sequential(
nn.LayerNorm(visual_dim),
nn.Linear(visual_dim, transformer_width),
nn.LayerNorm(transformer_width),
)

self.text_proj = nn.Sequential(
nn.LayerNorm(visual_dim),
nn.Linear(visual_dim, transformer_width),
)

self.decoder = nn.ModuleList([
TransformerDecoderLayer(transformer_width, transformer_heads,
dropout) for _ in range(transformer_layers)
])

self.out_proj = nn.Sequential(
nn.LayerNorm(transformer_width),
nn.Linear(transformer_width, visual_dim))

self.apply(self._init_weights)

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)

def forward(self, text, visual):
B, N, C = visual.shape
visual = self.memory_proj(visual)
x = self.text_proj(text)

for layer in self.decoder:
x = layer(x, visual)

return self.out_proj(x)
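
A minimal shape check for AttentionPool2d with random weights; it illustrates how the mean-pooled class token and the interpolated positional embedding yield a global feature plus a dense feature map (values are meaningless without pretrained CLIP weights):

import torch

pool = AttentionPool2d(spacial_dim=7, embed_dim=64, num_heads=8, output_dim=32)
feat = torch.randn(2, 64, 14, 14)  # H, W may differ from spacial_dim; the pos-embed is interpolated
global_feat, feature_map = pool(feat)
print(global_feat.shape)   # torch.Size([2, 32])
print(feature_map.shape)   # torch.Size([2, 32, 14, 14])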

+ 217
- 0
modelscope/models/cv/shop_segmentation/neck_fpn.py View File

@@ -0,0 +1,217 @@
""" FPNneck
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from timm.models.layers import drop, drop_path, trunc_normal_

from .common import resize


class FPN(nn.Module):
"""Feature Pyramid Network.

This neck is the implementation of `Feature Pyramid Networks for Object
Detection <https://arxiv.org/abs/1612.03144>`_.

Args:
in_channels (list[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale).
num_outs (int): Number of output scales.
start_level (int): Index of the start input backbone level used to
build the feature pyramid. Default: 0.
end_level (int): Index of the end input backbone level (exclusive) to
build the feature pyramid. Default: -1, which means the last level.
add_extra_convs (bool | str): If bool, it decides whether to add conv
layers on top of the original feature maps. Default to False.
If True, its actual mode is specified by `extra_convs_on_inputs`.
If str, it specifies the source feature map of the extra convs.
Only the following options are allowed

- 'on_input': Last feat map of neck inputs (i.e. backbone feature).
- 'on_lateral': Last feature map after lateral convs.
- 'on_output': The last output feature map after fpn convs.
extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs
on the original feature from the backbone. If True,
it is equivalent to `add_extra_convs='on_input'`. If False, it is
equivalent to set `add_extra_convs='on_output'`. Default to True.
relu_before_extra_convs (bool): Whether to apply relu before the extra
conv. Default: False.
no_norm_on_lateral (bool): Whether to apply norm on lateral.
Default: False.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Config dict for normalization layer. Default: None.
act_cfg (dict): Config dict for activation layer in ConvModule.
Default: None.
upsample_cfg (dict): Config dict for interpolate layer.
Default: dict(mode='nearest').
init_cfg (dict or list[dict], optional): Initialization config dict.

"""

def __init__(self,
in_channels,
out_channels,
num_outs,
start_level=0,
end_level=-1,
add_extra_convs=False,
extra_convs_on_inputs=False,
relu_before_extra_convs=False,
no_norm_on_lateral=False,
conv_cfg=None,
norm_cfg=None,
act_cfg=None,
upsample_cfg=dict(mode='nearest')):
super(FPN, self).__init__()
assert isinstance(in_channels, list)
self.in_channels = in_channels
self.out_channels = out_channels
self.num_ins = len(in_channels)
self.num_outs = num_outs
self.relu_before_extra_convs = relu_before_extra_convs
self.no_norm_on_lateral = no_norm_on_lateral
self.fp16_enabled = False
self.upsample_cfg = upsample_cfg.copy()

if end_level == -1:
self.backbone_end_level = self.num_ins
assert num_outs >= self.num_ins - start_level
else:
# if end_level < inputs, no extra level is allowed
self.backbone_end_level = end_level
assert end_level <= len(in_channels)
assert num_outs == end_level - start_level
self.start_level = start_level
self.end_level = end_level
self.add_extra_convs = add_extra_convs
assert isinstance(add_extra_convs, (str, bool))
if isinstance(add_extra_convs, str):
# Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
elif add_extra_convs: # True
if extra_convs_on_inputs:
# For compatibility with previous release
# TODO: deprecate `extra_convs_on_inputs`
self.add_extra_convs = 'on_input'
else:
self.add_extra_convs = 'on_output'

self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()

for i in range(self.start_level, self.backbone_end_level):
l_conv = ConvModule(
in_channels[i],
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
act_cfg=act_cfg,
inplace=False)
fpn_conv = ConvModule(
out_channels,
out_channels,
3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)

self.lateral_convs.append(l_conv)
self.fpn_convs.append(fpn_conv)

# add extra conv layers (e.g., RetinaNet)
extra_levels = num_outs - self.backbone_end_level + self.start_level
if self.add_extra_convs and extra_levels >= 1:
for i in range(extra_levels):
if i == 0 and self.add_extra_convs == 'on_input':
in_channels = self.in_channels[self.backbone_end_level - 1]
else:
in_channels = out_channels
extra_fpn_conv = ConvModule(
in_channels,
out_channels,
3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)
self.fpn_convs.append(extra_fpn_conv)

self.apply(self._init_weights)

def forward(self, inputs):
assert len(inputs) == len(self.in_channels)

# build laterals
laterals = [
lateral_conv(inputs[i + self.start_level])
for i, lateral_conv in enumerate(self.lateral_convs)
]

# build top-down path
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
# In some cases, fixing `scale factor` (e.g. 2) is preferred, but
# it cannot co-exist with `size` in `F.interpolate`.
if 'scale_factor' in self.upsample_cfg:
laterals[i - 1] = laterals[i - 1] + resize(
laterals[i], **self.upsample_cfg)
else:
prev_shape = laterals[i - 1].shape[2:]
laterals[i - 1] = laterals[i - 1] + resize(
laterals[i], size=prev_shape, **self.upsample_cfg)

# build outputs
# part 1: from original levels
outs = [
self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
]
# part 2: add extra levels
if self.num_outs > len(outs):
# use max pool to get more levels on top of outputs
# (e.g., Faster R-CNN, Mask R-CNN)
if not self.add_extra_convs:
for i in range(self.num_outs - used_backbone_levels):
outs.append(F.max_pool2d(outs[-1], 1, stride=2))
# add conv layers on top of original feature maps (RetinaNet)
else:
if self.add_extra_convs == 'on_input':
extra_source = inputs[self.backbone_end_level - 1]
elif self.add_extra_convs == 'on_lateral':
extra_source = laterals[-1]
elif self.add_extra_convs == 'on_output':
extra_source = outs[-1]
else:
raise NotImplementedError
outs.append(self.fpn_convs[used_backbone_levels](extra_source))
for i in range(used_backbone_levels + 1, self.num_outs):
if self.relu_before_extra_convs:
outs.append(self.fpn_convs[i](F.relu(outs[-1])))
else:
outs.append(self.fpn_convs[i](outs[-1]))
return tuple(outs)

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
if m.bias is not None:
nn.init.constant_(m.bias.data, 0)
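
A hedged shape check for the FPN neck with four dummy backbone levels (the channel counts and spatial sizes below are arbitrary; mmcv is required):

import torch

neck = FPN(in_channels=[96, 192, 384, 768], out_channels=256, num_outs=4)
feats = [torch.randn(1, c, s, s) for c, s in zip([96, 192, 384, 768], [64, 32, 16, 8])]
outs = neck(feats)
print([tuple(o.shape) for o in outs])
# [(1, 256, 64, 64), (1, 256, 32, 32), (1, 256, 16, 16), (1, 256, 8, 8)]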

+ 157
- 0
modelscope/models/cv/shop_segmentation/shop_seg_base.py View File

@@ -0,0 +1,157 @@
"""
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

from .head_fpn import FPNHead
from .models import (CLIPTextContextEncoder, CLIPVisionTransformer,
ContextDecoder)
from .neck_fpn import FPN
from .utils import SimpleTokenizer, tokenize


class SHOPSEG(nn.Module):
"""Encoder Decoder segmentors.

EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
Note that auxiliary_head is only used for deep supervision during training,
which can be dropped during inference.
"""

def __init__(self,
model_dir,
context_length=22,
context_feature='attention',
score_concat_index=2,
tau=0.07,
token_embed_dim=512,
text_dim=512,
**args):
super(SHOPSEG, self).__init__()

self.model_dir = model_dir
self.tokenizer = SimpleTokenizer(model_dir
+ '/bpe_simple_vocab_16e6.txt.gz')

backbone = CLIPVisionTransformer(
input_resolution=1024,
patch_size=16,
width=768,
layers=12,
output_dim=512,
drop_path_rate=0.1,
pretrained=False,
get_embeddings=True)

text_encoder = CLIPTextContextEncoder(
context_length=30,
vocab_size=49408,
transformer_width=512,
transformer_heads=8,
transformer_layers=12,
embed_dim=512,
pretrained=False)

context_decoder = ContextDecoder(
transformer_width=256,
transformer_heads=4,
transformer_layers=3,
visual_dim=512,
dropout=0.1)
neck = FPN(
in_channels=[768, 768, 768 + 2, 768], out_channels=256, num_outs=4)
head_fpn = FPNHead(channels=256, num_classes=2)

self.backbone = backbone
self.text_encoder = text_encoder
self.context_decoder = context_decoder
self.context_length = context_length
self.score_concat_index = score_concat_index

self.context_feature = context_feature
self.tau = tau
context_length = self.text_encoder.context_length - self.context_length
self.contexts = nn.Parameter(
torch.randn(1, context_length, token_embed_dim))
nn.init.trunc_normal_(self.contexts)
self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4)

self.neck = neck
self.head_fpn = head_fpn

self.tau = 0.07

def encode_text(self, text, context_length):
output = tokenize(self.tokenizer, text, context_length, True)
return output

def extract_feat(self, img):
"""Extract features from images."""
x = self.backbone(img)
return x

def after_extract_feat(self, x, name_list):
x_orig = list(x[0:4])
global_feat, visual_embeddings = x[4]
B, C, H, W = visual_embeddings.shape
if self.context_feature == 'attention':
x1 = global_feat.reshape(B, C, 1)
x2 = visual_embeddings.reshape(B, C, H * W)
visual_context = torch.cat([x1, x2], dim=2).permute(0, 2, 1)
texts = torch.cat([
self.encode_text(c, context_length=self.context_length)
for c in name_list
])
x1 = texts.to(global_feat.device)
x1 = self.text_encoder(x1, self.contexts)
text_embeddings = x1.expand(B, -1, -1)
# update text_embeddings by visual_context!
# (B, 1, C)
text_diff = self.context_decoder(text_embeddings, visual_context)
# (B, K, C)
text_embeddings = text_embeddings + self.gamma * text_diff

# compute score map and concat
B, K, C = text_embeddings.shape
visual_embeddings = F.normalize(visual_embeddings, dim=1, p=2)
text = F.normalize(text_embeddings, dim=2, p=2)
score_map_list = []
bsz = B
for i in range(bsz):
ind = 2 * i
sub_text = torch.cat(
[text[i:i + 1, ind:ind + 1], text[i:i + 1, ind + 1:ind + 2]],
dim=1)  # 1 * 2 * C

sub_score_map = torch.einsum('bchw,bkc->bkhw',
visual_embeddings[i:i + 1],
sub_text) # 1 * 2 * h * w
score_map_list.append(sub_score_map)
score_map = torch.cat(score_map_list, dim=0) # b * 2 * h * w
x_orig[self.score_concat_index] = torch.cat(
[x_orig[self.score_concat_index], score_map], dim=1)
return x_orig, score_map

def forward(self, img, text_list=None):
if text_list is None:
bsz = img.size()[0]
text_list = ['foreground'] * bsz
x = self.extract_feat(img)
_x_orig = [x[i] for i in range(4)]
name_list = []
for name in text_list:
name_list.append('others')
name_list.append(name[0:20])
x_orig, score_map = self.after_extract_feat(x, name_list)
x_orig = list(self.neck(x_orig))
_x_orig = x_orig
pred = self.head_fpn(_x_orig)
return pred
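
The score map built in after_extract_feat is a cosine-similarity map between the L2-normalized visual embedding and the two text embeddings ('others' vs. the query). A small standalone sketch with dummy tensors:

import torch
import torch.nn.functional as F

B, C, H, W = 1, 512, 64, 64
visual = F.normalize(torch.randn(B, C, H, W), dim=1, p=2)
text = F.normalize(torch.randn(B, 2, C), dim=2, p=2)  # embeddings for ['others', 'red dress']
score_map = torch.einsum('bchw,bkc->bkhw', visual, text)
print(score_map.shape)  # torch.Size([1, 2, 64, 64]); one 2-channel map per image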

+ 115
- 0
modelscope/models/cv/shop_segmentation/shop_seg_model.py View File

@@ -0,0 +1,115 @@
import os.path as osp
from typing import Any, Dict

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.shop_segmentation import SHOPSEG
from modelscope.outputs import OutputKeys
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['ShopSegmentation']


@MODELS.register_module(
Tasks.shop_segmentation, module_name=Models.shop_segmentation)
class ShopSegmentation(TorchModel):
""" shop segmentation model.
"""

def __init__(self, model_dir, device_id=0, *args, **kwargs):
super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)

self.model = SHOPSEG(model_dir=model_dir)
pretrained_params = torch.load('{}/{}'.format(
model_dir, ModelFile.TORCH_MODEL_BIN_FILE))

self.model.load_state_dict(pretrained_params)
self.model.eval()
self.device_id = device_id
if self.device_id >= 0 and torch.cuda.is_available():
self.model.to('cuda:{}'.format(self.device_id))
logger.info('Use GPU: {}'.format(self.device_id))
else:
self.device_id = -1
logger.info('Use CPU for inference')

def preprocess(self, img, size=1024):
mean = [0.48145466, 0.4578275, 0.40821073]
std = [0.26862954, 0.26130258, 0.27577711]
h, w, c = img.shape
max_hw = max(h, w)
ratio = 1.0 * size / max_hw
crop_h, crop_w = int(ratio * h), int(ratio * w)
pil_img = Image.fromarray(img)
pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
np_img = np.array(pil_img, dtype=np.float32) / 255.

for j in range(3):
np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j]

img_pad = np.zeros((size, size, 3), dtype=np.float32)
img_pad[:crop_h, :crop_w] = np_img

img_pad = torch.from_numpy(img_pad).permute(2, 0,
1).unsqueeze(0).float()
return img_pad, h, w, crop_h, crop_w

def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
output = np.clip(tensors * 255., a_min=0, a_max=255.)
crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)

pil_output = Image.fromarray(crop_output)
pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
np_output = np.array(pil_output, dtype=np.uint8)

np_output[np_output < 128] = 0
np_output[np_output >= 128] = 255
np_output = np.uint8(np_output)
return np_output

def forward(self, image):
"""
image should be numpy array, dtype=np.uint8, shape: height*width*3
"""
image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
image, size=1024)
pred = self.inference(image_tensor)
msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)

outputs = {OutputKeys.MASKS: msk}
return outputs

def inference(self, image):
"""
image should be tensor, 1 * 3 * 1024 * 1024
"""
with torch.no_grad():
if self.device_id == -1:
output = self.model(image)
else:
device = torch.device('cuda', self.device_id)
output = self.model(image.to(device))
output = F.interpolate(output, size=(1024, 1024), mode='bilinear')
output = F.softmax(output, dim=1)
output = torch.argmax(output, dim=1)
output = output[0]
if self.device_id == -1:
pred = output.data.numpy()
else:
pred = output.data.cpu().numpy()

del output
return pred
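
A hedged end-to-end usage sketch; the model directory below is a placeholder and must contain the TorchModel weights plus the CLIP BPE vocab that SHOPSEG expects:

import numpy as np

model = ShopSegmentation(model_dir='/path/to/shop_segmentation_model', device_id=-1)  # -1 = CPU
image = np.zeros((600, 800, 3), dtype=np.uint8)  # H * W * 3, uint8
result = model.forward(image)
mask = result[OutputKeys.MASKS]  # uint8 mask with values in {0, 255}, shape (600, 800)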

+ 199
- 0
modelscope/models/cv/shop_segmentation/utils.py View File

@@ -0,0 +1,199 @@
""" CLIP Tokenizer
Adapted from https://github.com/openai/CLIP.
Originally MIT License, Copyright (c) 2021 OpenAI.
"""

import gzip
import html
import os
from functools import lru_cache
from typing import Any, List, Union

import ftfy
import regex as re
import torch


@lru_cache()
def default_bpe():
return os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'bpe_simple_vocab_16e6.txt.gz')


@lru_cache()
def bytes_to_unicode():
"""
Returns a list of utf-8 bytes and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
and we avoid mapping to whitespace/control characters that the bpe code barfs on.
"""
bs = list(range(ord('!'),
ord('~') + 1)) + list(range(
ord('¡'),
ord('¬') + 1)) + list(range(ord('®'),
ord('ÿ') + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))


def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs


def basic_clean(text):
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()


def whitespace_clean(text):
text = re.sub(r'\s+', ' ', text)
text = text.strip()
return text


class SimpleTokenizer(object):

def __init__(self, bpe_path: str = default_bpe()):
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')
merges = merges[1:49152 - 256 - 2 + 1]
merges = [tuple(merge.split()) for merge in merges]
vocab = list(bytes_to_unicode().values())
vocab = vocab + [v + '</w>' for v in vocab]
for merge in merges:
vocab.append(''.join(merge))
vocab.extend(['<|startoftext|>', '<|endoftext|>'])
self.encoder = dict(zip(vocab, range(len(vocab))))
self.decoder = {v: k for k, v in self.encoder.items()}
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {
'<|startoftext|>': '<|startoftext|>',
'<|endoftext|>': '<|endoftext|>'
}
self.pat = re.compile(
r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
re.IGNORECASE)

def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (token[-1] + '</w>', )
pairs = get_pairs(word)

if not pairs:
return token + '</w>'

error_list = []
while True:
bigram = min(
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except Exception as err:
error_list.append(err)
new_word.extend(word[i:])
break

if word[i] == first and i < len(word) - 1 and word[
i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word

def encode(self, text):
bpe_tokens = []
text = whitespace_clean(basic_clean(text)).lower()
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[b]
for b in token.encode('utf-8'))
bpe_tokens.extend(self.encoder[bpe_token]
for bpe_token in self.bpe(token).split(' '))
return bpe_tokens

def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode(
'utf-8', errors='replace').replace('</w>', ' ')
return text


def tokenize(tokenizer,
texts,
context_length: int = 77,
truncate: bool = False) -> torch.LongTensor:
"""
Returns the tokenized representation of given input string(s)
Parameters
----------
texts : Union[str, List[str]]
An input string or a list of input strings to tokenize
context_length : int
The context length to use; all CLIP models use 77 as the context length
truncate: bool
Whether to truncate the text in case its encoding is longer than the context length
Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
"""
if isinstance(texts, str):
texts = [texts]

sot_token = tokenizer.encoder['<|startoftext|>']
eot_token = tokenizer.encoder['<|endoftext|>']
all_tokens = [[sot_token] + tokenizer.encode(text) + [eot_token]
for text in texts]
result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

for i, tokens in enumerate(all_tokens):
if len(tokens) > context_length:
if truncate:
tokens = tokens[:context_length]
tokens[-1] = eot_token
else:
raise RuntimeError(
f'Input {texts[i]} is too long for context length {context_length}'
)
result[i, :len(tokens)] = torch.tensor(tokens)

return result
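
A small sketch of the tokenizer; the BPE vocab path is a placeholder for the archive shipped with the model:

tok = SimpleTokenizer(bpe_path='/path/to/bpe_simple_vocab_16e6.txt.gz')
ids = tokenize(tok, ['a photo of a dress', 'others'], context_length=16, truncate=True)
print(ids.shape)  # torch.Size([2, 16]); each row is [SOT, BPE ids..., EOT, zero padding]
print(tok.decode(ids[0][ids[0] > 0].tolist()))  # decodes back to the prompt wrapped in SOT/EOT markers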

+ 1
- 0
modelscope/models/cv/text_driven_segmentation/__init__.py View File

@@ -0,0 +1 @@
from .lseg_base import TextDrivenSegmentation

+ 170
- 0
modelscope/models/cv/text_driven_segmentation/clip.py View File

@@ -0,0 +1,170 @@
""" CLIP
Adapted from https://github.com/openai/CLIP.
Originally MIT License, Copyright (c) 2021 OpenAI.
"""

import hashlib
import os
import urllib
import warnings
from typing import Any, List, Union

import torch
from PIL import Image
from pkg_resources import packaging
from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
ToTensor)
from tqdm import tqdm

from .model import build_model
from .simple_tokenizer import SimpleTokenizer as _Tokenizer

try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
BICUBIC = Image.BICUBIC

if packaging.version.parse(
torch.__version__) < packaging.version.parse('1.7.1'):
warnings.warn('PyTorch version 1.7.1 or higher is recommended')
__all__ = ['load', 'tokenize']


def _convert_image_to_rgb(image):
return image.convert('RGB')


def _transform(n_px):
return Compose([
Resize(n_px, interpolation=BICUBIC),
CenterCrop(n_px),
_convert_image_to_rgb,
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711)),
])


def load(name: str,
device: Union[str, torch.device] = 'cuda'
if torch.cuda.is_available() else 'cpu',
jit: bool = False,
root: str = None):

if not jit:
model = build_model().to(device)
if str(device) == 'cpu':
model.float()
return model, _transform(model.visual.input_resolution)

# patch the device names
device_holder = torch.jit.trace(
lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
device_node = [
n for n in device_holder.graph.findAllNodes('prim::Constant')
if 'Device' in repr(n)
][-1]

def patch_device(module):
try:
graphs = [module.graph] if hasattr(module, 'graph') else []
except RuntimeError:
graphs = []

if hasattr(module, 'forward1'):
graphs.append(module.forward1.graph)

for graph in graphs:
for node in graph.findAllNodes('prim::Constant'):
if 'value' in node.attributeNames() and str(
node['value']).startswith('cuda'):
node.copyAttributes(device_node)

model.apply(patch_device)
patch_device(model.encode_image)
patch_device(model.encode_text)

# patch dtype to float32 on CPU
if str(device) == 'cpu':
float_holder = torch.jit.trace(
lambda: torch.ones([]).float(), example_inputs=[])
float_input = list(float_holder.graph.findNode('aten::to').inputs())[1]
float_node = float_input.node()

def patch_float(module):
try:
graphs = [module.graph] if hasattr(module, 'graph') else []
except RuntimeError:
graphs = []

if hasattr(module, 'forward1'):
graphs.append(module.forward1.graph)

for graph in graphs:
for node in graph.findAllNodes('aten::to'):
inputs = list(node.inputs())
for i in [
1, 2
]: # dtype can be the second or third argument to aten::to()
if inputs[i].node()['value'] == 5:
inputs[i].node().copyAttributes(float_node)

model.apply(patch_float)
patch_float(model.encode_image)
patch_float(model.encode_text)

model.float()

return model, _transform(model.input_resolution.item())


def tokenize(
_tokenizer,
texts: Union[str, List[str]],
context_length: int = 77,
truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
"""
Returns the tokenized representation of given input string(s)

Parameters
----------
texts : Union[str, List[str]]
An input string or a list of input strings to tokenize

context_length : int
The context length to use; all CLIP models use 77 as the context length

truncate: bool
Whether to truncate the text in case its encoding is longer than the context length

Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
"""
if isinstance(texts, str):
texts = [texts]

sot_token = _tokenizer.encoder['<|startoftext|>']
eot_token = _tokenizer.encoder['<|endoftext|>']
all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token]
for text in texts]
if packaging.version.parse(
torch.__version__) < packaging.version.parse('1.8.0'):
result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
else:
result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)

for i, tokens in enumerate(all_tokens):
if len(tokens) > context_length:
if truncate:
tokens = tokens[:context_length]
tokens[-1] = eot_token
else:
raise RuntimeError(
f'Input {texts[i]} is too long for context length {context_length}'
)
result[i, :len(tokens)] = torch.tensor(tokens)

return result
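
A small sketch of the preprocessing transform returned alongside the model: resize, center-crop and normalize to CLIP statistics (the gray image is just a stand-in):

from PIL import Image

preprocess = _transform(224)
tensor = preprocess(Image.new('RGB', (640, 480), color=(128, 128, 128)))
print(tensor.shape)  # torch.Size([3, 224, 224])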

+ 28
- 0
modelscope/models/cv/text_driven_segmentation/lseg_base.py View File

@@ -0,0 +1,28 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

import torch
import torch.nn as nn

from .lseg_net import LSeg


class TextDrivenSegmentation(nn.Module):

def __init__(self, model_dir):
super(TextDrivenSegmentation, self).__init__()
self.net = LSeg(model_dir=model_dir)
self.model_dir = model_dir

def forward(self, img, txt_list):
b = img.size()[0]
batch_name_list = txt_list
xout_list = []
for i in range(b):
labelset = ['others', batch_name_list[i]]
xout = self.net(img[i:i + 1], labelset=labelset)
xout_list.append(xout)
score_map = torch.cat(xout_list, dim=0)
return score_map

+ 334
- 0
modelscope/models/cv/text_driven_segmentation/lseg_blocks.py View File

@@ -0,0 +1,334 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

import torch
import torch.nn as nn

from .lseg_vit import _make_pretrained_clip_vitl16_384, forward_vit


def _make_encoder(
backbone,
features,
use_pretrained=True,
groups=1,
expand=False,
exportable=True,
hooks=None,
use_vit_only=False,
use_readout='ignore',
enable_attention_hooks=False,
):
if backbone == 'clip_vitl16_384':
clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384(
use_pretrained,
hooks=hooks,
use_readout=use_readout,
enable_attention_hooks=enable_attention_hooks,
)
scratch = _make_scratch([256, 512, 1024, 1024],
features,
groups=groups,
expand=expand)
else:
raise NotImplementedError(f"Backbone '{backbone}' not implemented")

return clip_pretrained, pretrained, scratch


def _make_scratch(in_shape, out_shape, groups=1, expand=False):
scratch = nn.Module()

out_shape1 = out_shape
out_shape2 = out_shape
out_shape3 = out_shape
out_shape4 = out_shape
if expand is True:
out_shape1 = out_shape
out_shape2 = out_shape * 2
out_shape3 = out_shape * 4
out_shape4 = out_shape * 8

scratch.layer1_rn = nn.Conv2d(
in_shape[0],
out_shape1,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups,
)
scratch.layer2_rn = nn.Conv2d(
in_shape[1],
out_shape2,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups,
)
scratch.layer3_rn = nn.Conv2d(
in_shape[2],
out_shape3,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups,
)
scratch.layer4_rn = nn.Conv2d(
in_shape[3],
out_shape4,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups,
)

return scratch


class Interpolate(nn.Module):
"""Interpolation module."""

def __init__(self, scale_factor, mode, align_corners=False):
"""Init.

Args:
scale_factor (float): scaling
mode (str): interpolation mode
"""
super(Interpolate, self).__init__()

self.interp = nn.functional.interpolate
self.scale_factor = scale_factor
self.mode = mode
self.align_corners = align_corners

def forward(self, x):
"""Forward pass.

Args:
x (tensor): input

Returns:
tensor: interpolated data
"""

x = self.interp(
x,
scale_factor=self.scale_factor,
mode=self.mode,
align_corners=self.align_corners,
)

return x


class ResidualConvUnit(nn.Module):
"""Residual convolution module."""

def __init__(self, features):
"""Init.

Args:
features (int): number of features
"""
super().__init__()

self.conv1 = nn.Conv2d(
features, features, kernel_size=3, stride=1, padding=1, bias=True)

self.conv2 = nn.Conv2d(
features, features, kernel_size=3, stride=1, padding=1, bias=True)

self.relu = nn.ReLU(inplace=True)

def forward(self, x):
"""Forward pass.

Args:
x (tensor): input

Returns:
tensor: output
"""
out = self.relu(x)
out = self.conv1(out)
out = self.relu(out)
out = self.conv2(out)

return out + x


class FeatureFusionBlock(nn.Module):
"""Feature fusion block."""

def __init__(self, features):
"""Init.

Args:
features (int): number of features
"""
super(FeatureFusionBlock, self).__init__()

self.resConfUnit1 = ResidualConvUnit(features)
self.resConfUnit2 = ResidualConvUnit(features)

def forward(self, *xs):
"""Forward pass.

Returns:
tensor: output
"""
output = xs[0]

if len(xs) == 2:
output += self.resConfUnit1(xs[1])

output = self.resConfUnit2(output)

output = nn.functional.interpolate(
output, scale_factor=2, mode='bilinear', align_corners=True)

return output


class ResidualConvUnit_custom(nn.Module):
"""Residual convolution module."""

def __init__(self, features, activation, bn):
"""Init.

Args:
features (int): number of features
"""
super().__init__()

self.bn = bn

self.groups = 1

self.conv1 = nn.Conv2d(
features,
features,
kernel_size=3,
stride=1,
padding=1,
bias=not self.bn,
groups=self.groups,
)

self.conv2 = nn.Conv2d(
features,
features,
kernel_size=3,
stride=1,
padding=1,
bias=not self.bn,
groups=self.groups,
)

if self.bn is True:
self.bn1 = nn.BatchNorm2d(features)
self.bn2 = nn.BatchNorm2d(features)

self.activation = activation

self.skip_add = nn.quantized.FloatFunctional()

def forward(self, x):
"""Forward pass.

Args:
x (tensor): input

Returns:
tensor: output
"""

out = self.activation(x)
out = self.conv1(out)
if self.bn is True:
out = self.bn1(out)

out = self.activation(out)
out = self.conv2(out)
if self.bn is True:
out = self.bn2(out)

if self.groups > 1:
out = self.conv_merge(out)

return self.skip_add.add(out, x)


class FeatureFusionBlock_custom(nn.Module):
"""Feature fusion block."""

def __init__(
self,
features,
activation,
deconv=False,
bn=False,
expand=False,
align_corners=True,
):
"""Init.

Args:
features (int): number of features
"""
super(FeatureFusionBlock_custom, self).__init__()

self.deconv = deconv
self.align_corners = align_corners

self.groups = 1

self.expand = expand
out_features = features
if self.expand is True:
out_features = features // 2

self.out_conv = nn.Conv2d(
features,
out_features,
kernel_size=1,
stride=1,
padding=0,
bias=True,
groups=1,
)

self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)

self.skip_add = nn.quantized.FloatFunctional()

def forward(self, *xs):
"""Forward pass.

Returns:
tensor: output
"""
output = xs[0]

if len(xs) == 2:
res = self.resConfUnit1(xs[1])
output = self.skip_add.add(output, res)

output = self.resConfUnit2(output)

output = nn.functional.interpolate(
output,
scale_factor=2,
mode='bilinear',
align_corners=self.align_corners)

output = self.out_conv(output)
return output
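
A hedged shape check for the custom fusion block: fusing two same-sized feature maps doubles the spatial resolution on the way out (random weights, shapes only):

import torch
import torch.nn as nn

block = FeatureFusionBlock_custom(64, nn.ReLU(inplace=False), bn=True)
a = torch.randn(1, 64, 16, 16)
b = torch.randn(1, 64, 16, 16)
print(block(a, b).shape)  # torch.Size([1, 64, 32, 32])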

+ 107
- 0
modelscope/models/cv/text_driven_segmentation/lseg_model.py View File

@@ -0,0 +1,107 @@
import os.path as osp
from typing import Any, Dict

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.text_driven_segmentation import \
TextDrivenSegmentation
from modelscope.outputs import OutputKeys
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()
__all__ = ['TextDrivenSeg']


@MODELS.register_module(
Tasks.text_driven_segmentation,
module_name=Models.text_driven_segmentation)
class TextDrivenSeg(TorchModel):
""" text driven segmentation model.
"""

def __init__(self, model_dir, device_id=0, *args, **kwargs):
super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)
self.model = TextDrivenSegmentation(model_dir=model_dir)
pretrained_params = torch.load('{}/{}'.format(
model_dir, ModelFile.TORCH_MODEL_BIN_FILE))
self.model.load_state_dict(pretrained_params)
self.model.eval()
if device_id >= 0 and torch.cuda.is_available():
self.model.to('cuda:{}'.format(device_id))
logger.info('Use GPU: {}'.format(device_id))
else:
device_id = -1
logger.info('Use CPU for inference')
self.device_id = device_id

def preprocess(self, img, size=640):
mean = [0.48145466, 0.4578275, 0.40821073]
std = [0.26862954, 0.26130258, 0.27577711]
h, w, c = img.shape
max_hw = max(h, w)
ratio = 1.0 * size / max_hw
crop_h, crop_w = int(ratio * h), int(ratio * w)
pil_img = Image.fromarray(img)
pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
np_img = np.array(pil_img, dtype=np.float32) / 255.
for j in range(3):
np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j]
img_pad = np.zeros((size, size, 3), dtype=np.float32)
img_pad[:crop_h, :crop_w] = np_img
img_pad = torch.from_numpy(img_pad).permute(2, 0,
1).unsqueeze(0).float()
return img_pad, h, w, crop_h, crop_w

def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
output = np.clip(tensors * 255., a_min=0, a_max=255.)
crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)
pil_output = Image.fromarray(crop_output)
pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
np_output = np.array(pil_output, dtype=np.uint8)
np_output[np_output < 128] = 0
np_output[np_output >= 128] = 255
np_output = np.uint8(np_output)
return np_output

    def forward(self, image, text):
        """Segment the image region described by ``text``.

        Args:
            image (np.ndarray): uint8 array of shape (height, width, 3).
            text (str): description of the target region.

        Returns:
            dict: {OutputKeys.MASKS: np.ndarray}, a 0/255 mask of shape (height, width).
        """
        image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
            image, size=640)
        pred = self.inference(image_tensor, text)
        msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)
        outputs = {OutputKeys.MASKS: msk}
        return outputs

    def inference(self, image, text):
        """Run the network on a preprocessed image.

        Args:
            image (torch.Tensor): tensor of shape (1, 3, 640, 640).
            text (str): text prompt.

        Returns:
            np.ndarray: per-pixel argmax prediction of shape (640, 640).
        """
        with torch.no_grad():
            if self.device_id == -1:
                output = self.model(image, [text])
else:
device = torch.device('cuda', self.device_id)
output = self.model(image.to(device), [text])
output = F.interpolate(output, size=(640, 640), mode='bilinear')
output = F.softmax(output, dim=1)
output = torch.argmax(output, dim=1)
output = output[0]
if self.device_id == -1:
pred = output.data.numpy()
else:
pred = output.data.cpu().numpy()
del output
return pred
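
# Minimal usage sketch (illustrative only; assumes `model_dir` holds a downloaded copy of
# the model and `img` is an HxWx3 uint8 numpy array):
#
#     seg = TextDrivenSeg(model_dir, device_id=0)
#     result = seg.forward(img, 'the red dress')
#     mask = result[OutputKeys.MASKS]  # uint8 mask with values 0 or 255, same HxW as img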

+ 197
- 0
modelscope/models/cv/text_driven_segmentation/lseg_net.py View File

@@ -0,0 +1,197 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

import numpy as np
import torch
import torch.nn as nn

from . import clip
from .lseg_blocks import (FeatureFusionBlock, FeatureFusionBlock_custom,
Interpolate, _make_encoder, forward_vit)
from .simple_tokenizer import SimpleTokenizer


class depthwise_clipseg_conv(nn.Module):

def __init__(self):
super(depthwise_clipseg_conv, self).__init__()
self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1)

def depthwise_clipseg(self, x, channels):
x = torch.cat(
[self.depthwise(x[:, i].unsqueeze(1)) for i in range(channels)],
dim=1)
return x

def forward(self, x):
channels = x.shape[1]
out = self.depthwise_clipseg(x, channels)
return out


class depthwise_conv(nn.Module):

def __init__(self, kernel_size=3, stride=1, padding=1):
super(depthwise_conv, self).__init__()
self.depthwise = nn.Conv2d(
1, 1, kernel_size=kernel_size, stride=stride, padding=padding)

def forward(self, x):
# support for 4D tensor with NCHW
C, H, W = x.shape[1:]
x = x.reshape(-1, 1, H, W)
x = self.depthwise(x)
x = x.view(-1, C, H, W)
return x


class depthwise_block(nn.Module):

def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
super(depthwise_block, self).__init__()
self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
if activation == 'relu':
self.activation = nn.ReLU()
elif activation == 'lrelu':
self.activation = nn.LeakyReLU()
elif activation == 'tanh':
self.activation = nn.Tanh()

def forward(self, x, act=True):
x = self.depthwise(x)
if act:
x = self.activation(x)
return x


class bottleneck_block(nn.Module):

def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
super(bottleneck_block, self).__init__()
self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
if activation == 'relu':
self.activation = nn.ReLU()
elif activation == 'lrelu':
self.activation = nn.LeakyReLU()
elif activation == 'tanh':
self.activation = nn.Tanh()

def forward(self, x, act=True):
sum_layer = x.max(dim=1, keepdim=True)[0]
x = self.depthwise(x)
x = x + sum_layer
if act:
x = self.activation(x)
return x


class BaseModel(torch.nn.Module):

def load(self, path):
"""Load model from file.
Args:
path (str): file path
"""
parameters = torch.load(path, map_location=torch.device('cpu'))

if 'optimizer' in parameters:
parameters = parameters['model']

self.load_state_dict(parameters)


def _make_fusion_block(features, use_bn):
return FeatureFusionBlock_custom(
features,
activation=nn.ReLU(False),
deconv=False,
bn=use_bn,
expand=False,
align_corners=True,
)


class LSeg(BaseModel):

def __init__(
self,
features=256,
backbone='clip_vitl16_384',
readout='project',
use_bn=True,
model_dir=None,
):
super(LSeg, self).__init__()
hooks = {
'clip_vitl16_384': [5, 11, 17, 23],
}

# Instantiate backbone and reassemble blocks
self.clip_pretrained, self.pretrained, self.scratch = _make_encoder(
backbone,
features,
groups=1,
expand=False,
exportable=False,
hooks=hooks[backbone],
use_readout=readout,
)

self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
self.scratch.refinenet4 = _make_fusion_block(features, use_bn)

self.logit_scale = nn.Parameter(torch.ones([])
* np.log(1 / 0.07)).exp()
self.out_c = 512
self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1)

self.scratch.output_conv = nn.Sequential(
Interpolate(scale_factor=2, mode='bilinear', align_corners=True), )

self.tau = 0.07
self.model_dir = model_dir
self.tokenizer = SimpleTokenizer(model_dir
+ '/bpe_simple_vocab_16e6.txt.gz')

def forward(self, x, labelset=''):
text = clip.tokenize(self.tokenizer, labelset)

layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)

layer_1_rn = self.scratch.layer1_rn(layer_1)
layer_2_rn = self.scratch.layer2_rn(layer_2)
layer_3_rn = self.scratch.layer3_rn(layer_3)
layer_4_rn = self.scratch.layer4_rn(layer_4)

path_4 = self.scratch.refinenet4(layer_4_rn)
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

text = text.to(x.device)
text_features = self.clip_pretrained.encode_text(text)

image_features = self.scratch.head1(path_1)

imshape = image_features.shape
image_features = image_features.permute(0, 2, 3,
1).reshape(-1, self.out_c)

# normalized features
image_features = image_features / image_features.norm(
dim=-1, keepdim=True)
text_features = text_features / text_features.norm(
dim=-1, keepdim=True)

logits_per_image = image_features @ text_features.t() / self.tau

out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3],
-1).permute(0, 3, 1, 2)

out = self.scratch.output_conv(out)

return out
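
# Decoding sketch: scratch.head1 maps every pixel of the fused feature map to a 512-d
# embedding; both the pixel embeddings and the CLIP text embeddings are L2-normalized, so
# `image_features @ text_features.t() / self.tau` is a temperature-scaled cosine
# similarity. The result is reshaped to N x num_labels x H x W and upsampled 2x by
# scratch.output_conv.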

+ 543
- 0
modelscope/models/cv/text_driven_segmentation/lseg_vit.py View File

@@ -0,0 +1,543 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

import math
import types

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint

from . import clip

activations = {}


def get_activation(name):

def hook(model, input, output):
activations[name] = output

return hook


attention = {}


def get_attention(name):

def hook(module, input, output):
x = input[0]
B, N, C = x.shape
qkv = (
module.qkv(x).reshape(B, N, 3, module.num_heads,
C // module.num_heads).permute(
2, 0, 3, 1, 4))
q, k, _ = (
qkv[0],
qkv[1],
qkv[2],
) # make torchscript happy (cannot use tensor as tuple)

attn = (q @ k.transpose(-2, -1)) * module.scale

attn = attn.softmax(dim=-1) # [:,:,1,1:]
attention[name] = attn

return hook


def get_mean_attention_map(attn, token, shape):
attn = attn[:, :, token, 1:]
attn = attn.unflatten(2, torch.Size([shape[2] // 16,
shape[3] // 16])).float()
attn = torch.nn.functional.interpolate(
attn, size=shape[2:], mode='bicubic', align_corners=False).squeeze(0)

all_attn = torch.mean(attn, 0)

return all_attn


class Slice(nn.Module):

def __init__(self, start_index=1):
super(Slice, self).__init__()
self.start_index = start_index

def forward(self, x):
return x[:, self.start_index:]


class AddReadout(nn.Module):

def __init__(self, start_index=1):
super(AddReadout, self).__init__()
self.start_index = start_index

def forward(self, x):
if self.start_index == 2:
readout = (x[:, 0] + x[:, 1]) / 2
else:
readout = x[:, 0]
return x[:, self.start_index:] + readout.unsqueeze(1)


class ProjectReadout(nn.Module):

def __init__(self, in_features, start_index=1):
super(ProjectReadout, self).__init__()
self.start_index = start_index

self.project = nn.Sequential(
nn.Linear(2 * in_features, in_features), nn.GELU())

def forward(self, x):
readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:])
features = torch.cat((x[:, self.start_index:], readout), -1)

return self.project(features)


class Transpose(nn.Module):

def __init__(self, dim0, dim1):
super(Transpose, self).__init__()
self.dim0 = dim0
self.dim1 = dim1

def forward(self, x):
x = x.transpose(self.dim0, self.dim1)
return x


def forward_vit(pretrained, x):
b, c, h, w = x.shape

# encoder
_ = pretrained.model.forward_flex(x)

layer_1 = pretrained.activations['1']
layer_2 = pretrained.activations['2']
layer_3 = pretrained.activations['3']
layer_4 = pretrained.activations['4']

layer_1 = pretrained.act_postprocess1[0:2](layer_1)
layer_2 = pretrained.act_postprocess2[0:2](layer_2)
layer_3 = pretrained.act_postprocess3[0:2](layer_3)
layer_4 = pretrained.act_postprocess4[0:2](layer_4)

unflatten = nn.Sequential(
nn.Unflatten(
2,
torch.Size([
h // pretrained.model.patch_size[1],
w // pretrained.model.patch_size[0],
]),
))

if layer_1.ndim == 3:
layer_1 = unflatten(layer_1)
if layer_2.ndim == 3:
layer_2 = unflatten(layer_2)
if layer_3.ndim == 3:
layer_3 = unflatten(layer_3)
if layer_4.ndim == 3:
layer_4 = unflatten(layer_4)

layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)](
layer_1)
layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)](
layer_2)
layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)](
layer_3)
layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)](
layer_4)

return layer_1, layer_2, layer_3, layer_4
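
# forward_vit relies on the forward hooks registered in _make_vit_b16_backbone: the
# hooked transformer blocks write their token outputs into `activations`, and the
# act_postprocess modules turn those (B, N, C) token sequences back into 2D feature maps
# at four scales (unflattening to h/16 x w/16, then re-projecting and resampling).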


def _resize_pos_embed(self, posemb, gs_h, gs_w):
posemb_tok, posemb_grid = (
posemb[:, :self.start_index],
posemb[0, self.start_index:],
)

gs_old = int(math.sqrt(len(posemb_grid)))

posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
-1).permute(0, 3, 1, 2)
posemb_grid = F.interpolate(
posemb_grid, size=(gs_h, gs_w), mode='bilinear')
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)

posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

return posemb


def forward_flex(self, x):
b, c, h, w = x.shape

pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1],
w // self.patch_size[0])

B = x.shape[0]

if hasattr(self.patch_embed, 'backbone'):
x = self.patch_embed.backbone(x)
if isinstance(x, (list, tuple)):
x = x[
-1] # last feature if backbone outputs list/tuple of features
x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)

if getattr(self, 'dist_token', None) is not None:
cls_tokens = self.cls_token.expand(
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
dist_token = self.dist_token.expand(B, -1, -1)
x = torch.cat((cls_tokens, dist_token, x), dim=1)
else:
cls_tokens = self.cls_token.expand(
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
x = torch.cat((cls_tokens, x), dim=1)

x = x + pos_embed
x = self.pos_drop(x)

gradient_checkpoint = False
for blk in self.blocks:
if gradient_checkpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)

x = self.norm(x)

return x


def get_readout_oper(vit_features, features, use_readout, start_index=1):
if use_readout == 'ignore':
readout_oper = [Slice(start_index)] * len(features)
elif use_readout == 'add':
readout_oper = [AddReadout(start_index)] * len(features)
elif use_readout == 'project':
readout_oper = [
ProjectReadout(vit_features, start_index) for out_feat in features
]
else:
assert (
False
), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"

return readout_oper


def adapt_input_conv(in_chans, conv_weight):
conv_type = conv_weight.dtype
conv_weight = conv_weight.float(
) # Some weights are in torch.half, ensure it's float for sum on CPU
O, II, J, K = conv_weight.shape
if in_chans == 1:
if II > 3:
assert conv_weight.shape[1] % 3 == 0
# For models with space2depth stems
conv_weight = conv_weight.reshape(O, II // 3, 3, J, K)
conv_weight = conv_weight.sum(dim=2, keepdim=False)
else:
conv_weight = conv_weight.sum(dim=1, keepdim=True)
elif in_chans != 3:
if II != 3:
raise NotImplementedError(
'Weight format not supported by conversion.')
else:
# NOTE this strategy should be better than random init, but there could be other combinations of
# the original RGB input layer weights that'd work better for specific cases.
repeat = int(math.ceil(in_chans / 3))
conv_weight = conv_weight.repeat(1, repeat, 1,
1)[:, :in_chans, :, :]
conv_weight *= (3 / float(in_chans))
conv_weight = conv_weight.to(conv_type)
return conv_weight


@torch.no_grad()
def _load_weights(model, checkpoint_path, prefix=''):
""" Load weights from .npz checkpoints for official Google Brain Flax implementation
"""
import numpy as np

def _n2p(w, t=True):
if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
w = w.flatten()
if t:
if w.ndim == 4:
w = w.transpose([3, 2, 0, 1])
elif w.ndim == 3:
w = w.transpose([2, 0, 1])
elif w.ndim == 2:
w = w.transpose([1, 0])
return torch.from_numpy(w)

w = np.load(checkpoint_path)
if not prefix and 'opt/target/embedding/kernel' in w:
prefix = 'opt/target/'

if hasattr(model.patch_embed, 'backbone'):
# hybrid
backbone = model.patch_embed.backbone
stem_only = not hasattr(backbone, 'stem')
stem = backbone if stem_only else backbone.stem
stem.conv.weight.copy_(
adapt_input_conv(stem.conv.weight.shape[1],
_n2p(w[f'{prefix}conv_root/kernel'])))
stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
if not stem_only:
for i, stage in enumerate(backbone.stages):
for j, block in enumerate(stage.blocks):
bp = f'{prefix}block{i + 1}/unit{j + 1}/'
for r in range(3):
getattr(block, f'conv{r + 1}').weight.copy_(
_n2p(w[f'{bp}conv{r + 1}/kernel']))
getattr(block, f'norm{r + 1}').weight.copy_(
_n2p(w[f'{bp}gn{r + 1}/scale']))
getattr(block, f'norm{r + 1}').bias.copy_(
_n2p(w[f'{bp}gn{r + 1}/bias']))
if block.downsample is not None:
block.downsample.conv.weight.copy_(
_n2p(w[f'{bp}conv_proj/kernel']))
block.downsample.norm.weight.copy_(
_n2p(w[f'{bp}gn_proj/scale']))
block.downsample.norm.bias.copy_(
_n2p(w[f'{bp}gn_proj/bias']))
embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
else:
embed_conv_w = adapt_input_conv(model.patch_embed.proj.weight.shape[1],
_n2p(w[f'{prefix}embedding/kernel']))
model.patch_embed.proj.weight.copy_(embed_conv_w)
model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
pos_embed_w = _n2p(
w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
if pos_embed_w.shape != model.pos_embed.shape:
pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights
pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens',
1),
model.patch_embed.grid_size)
model.pos_embed.copy_(pos_embed_w)
model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
if isinstance(
model.head, nn.Linear
) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
# NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights
# if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
# model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
# model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
for i, block in enumerate(model.blocks.children()):
block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
block.attn.qkv.weight.copy_(
torch.cat([
_n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T
for n in ('query', 'key', 'value')
]))
block.attn.qkv.bias.copy_(
torch.cat([
_n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1)
for n in ('query', 'key', 'value')
]))
block.attn.proj.weight.copy_(
_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
for r in range(2):
getattr(block.mlp, f'fc{r + 1}').weight.copy_(
_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
getattr(block.mlp, f'fc{r + 1}').bias.copy_(
_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))


def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()):
# Rescale the grid of position embeddings when loading from state_dict. Adapted from
# https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
ntok_new = posemb_new.shape[1]
if num_prefix_tokens:
posemb_prefix, posemb_grid = posemb[:, :num_prefix_tokens], posemb[
0, num_prefix_tokens:]
ntok_new -= num_prefix_tokens
else:
posemb_prefix, posemb_grid = posemb[:, :0], posemb[0]
gs_old = int(math.sqrt(len(posemb_grid)))
if not len(gs_new): # backwards compatibility
gs_new = [int(math.sqrt(ntok_new))] * 2
assert len(gs_new) >= 2
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
-1).permute(0, 3, 1, 2)
posemb_grid = F.interpolate(
posemb_grid, size=gs_new, mode='bicubic', align_corners=False)
posemb_grid = posemb_grid.permute(0, 2, 3,
1).reshape(1, gs_new[0] * gs_new[1], -1)
posemb = torch.cat([posemb_prefix, posemb_grid], dim=1)
return posemb
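
# Shape sketch (illustrative numbers): a ViT-L/16 checkpoint trained at 224x224 stores a
# position embedding of shape (1, 1 + 14*14, 1024); loading it at 384x384 needs
# (1, 1 + 24*24, 1024), so the 14x14 grid is interpolated bicubically to 24x24 while the
# class-token embedding is kept as-is.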


def _make_pretrained_clip_vitl16_384(pretrained,
use_readout='ignore',
hooks=None,
enable_attention_hooks=False):
clip_pretrained, _ = clip.load('ViT-B/32', device='cpu', jit=False)

# model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
model = timm.create_model('vit_large_patch16_384', pretrained=False)
hooks = [5, 11, 17, 23] if hooks is None else hooks
pretrained = _make_vit_b16_backbone(
model,
features=[256, 512, 1024, 1024],
hooks=hooks,
vit_features=1024,
use_readout=use_readout,
enable_attention_hooks=enable_attention_hooks,
)
return clip_pretrained, pretrained


def _make_vit_b16_backbone(
model,
features=[96, 192, 384, 768],
size=[384, 384],
hooks=[2, 5, 8, 11],
vit_features=768,
use_readout='ignore',
start_index=1,
enable_attention_hooks=False,
):
pretrained = nn.Module()

pretrained.model = model
pretrained.model.blocks[hooks[0]].register_forward_hook(
get_activation('1'))
pretrained.model.blocks[hooks[1]].register_forward_hook(
get_activation('2'))
pretrained.model.blocks[hooks[2]].register_forward_hook(
get_activation('3'))
pretrained.model.blocks[hooks[3]].register_forward_hook(
get_activation('4'))

pretrained.activations = activations

if enable_attention_hooks:
pretrained.model.blocks[hooks[0]].attn.register_forward_hook(
get_attention('attn_1'))
pretrained.model.blocks[hooks[1]].attn.register_forward_hook(
get_attention('attn_2'))
pretrained.model.blocks[hooks[2]].attn.register_forward_hook(
get_attention('attn_3'))
pretrained.model.blocks[hooks[3]].attn.register_forward_hook(
get_attention('attn_4'))
pretrained.attention = attention

readout_oper = get_readout_oper(vit_features, features, use_readout,
start_index)

    # project ViT tokens back to 2D feature maps with `features` channels at 4 scales
pretrained.act_postprocess1 = nn.Sequential(
readout_oper[0],
Transpose(1, 2),
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
nn.Conv2d(
in_channels=vit_features,
out_channels=features[0],
kernel_size=1,
stride=1,
padding=0,
),
nn.ConvTranspose2d(
in_channels=features[0],
out_channels=features[0],
kernel_size=4,
stride=4,
padding=0,
bias=True,
dilation=1,
groups=1,
),
)

pretrained.act_postprocess2 = nn.Sequential(
readout_oper[1],
Transpose(1, 2),
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
nn.Conv2d(
in_channels=vit_features,
out_channels=features[1],
kernel_size=1,
stride=1,
padding=0,
),
nn.ConvTranspose2d(
in_channels=features[1],
out_channels=features[1],
kernel_size=2,
stride=2,
padding=0,
bias=True,
dilation=1,
groups=1,
),
)

pretrained.act_postprocess3 = nn.Sequential(
readout_oper[2],
Transpose(1, 2),
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
nn.Conv2d(
in_channels=vit_features,
out_channels=features[2],
kernel_size=1,
stride=1,
padding=0,
),
)

pretrained.act_postprocess4 = nn.Sequential(
readout_oper[3],
Transpose(1, 2),
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
nn.Conv2d(
in_channels=vit_features,
out_channels=features[3],
kernel_size=1,
stride=1,
padding=0,
),
nn.Conv2d(
in_channels=features[3],
out_channels=features[3],
kernel_size=3,
stride=2,
padding=1,
),
)

pretrained.model.start_index = start_index
pretrained.model.patch_size = [16, 16]

# We inject this function into the VisionTransformer instances so that
# we can use it with interpolated position embeddings without modifying the library source.
pretrained.model.forward_flex = types.MethodType(forward_flex,
pretrained.model)
pretrained.model._resize_pos_embed = types.MethodType(
_resize_pos_embed, pretrained.model)

return pretrained

+ 458
- 0
modelscope/models/cv/text_driven_segmentation/model.py View File

@@ -0,0 +1,458 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.relu1 = nn.ReLU(inplace=True)

self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.relu2 = nn.ReLU(inplace=True)

self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu3 = nn.ReLU(inplace=True)

self.downsample = None
self.stride = stride

if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict([('-1', nn.AvgPool2d(stride)),
('0',
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False)),
('1', nn.BatchNorm2d(planes * self.expansion))]))

def forward(self, x: torch.Tensor):
identity = x

out = self.relu1(self.bn1(self.conv1(x)))
out = self.relu2(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu3(out)
return out


class AttentionPool2d(nn.Module):

def __init__(self,
spacial_dim: int,
embed_dim: int,
num_heads: int,
output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(
torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads

def forward(self, x):
x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
x, _ = F.multi_head_attention_forward(
query=x[:1],
key=x,
value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat(
[self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False)
return x.squeeze(0)


class ModifiedResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim,
heads,
input_resolution=224,
width=64):
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.relu1 = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.relu2 = nn.ReLU(inplace=True)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.relu3 = nn.ReLU(inplace=True)
self.avgpool = nn.AvgPool2d(2)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

embed_dim = width * 32 # the ResNet feature dimension
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
x = self.relu3(self.bn3(self.conv3(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.attnpool(x)

return x
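
# Shape sketch for the default 224x224 input: the stem plus avgpool reduce the spatial
# size to 56x56 (stride 4); layer2-layer4 each halve it again, giving a 7x7 map with
# width*32 channels; attnpool then collapses that map into a single `output_dim`-d
# embedding per image.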


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self, width, layers, heads, attn_mask=None):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

def forward(self, x: torch.Tensor):
return self.resblocks(x)


class VisionTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(width, layers, heads)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
x1 = self.class_embedding.to(x.dtype)
x2 = torch.zeros(
x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
x = torch.cat([x1 + x2, x], dim=1) # shape = [*, grid ** 2 + 1, width]
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x


class CLIP(nn.Module):

def __init__(
self,
embed_dim: int,
# vision
image_resolution: int,
vision_layers: Union[Tuple[int, int, int, int], int],
vision_width: int,
vision_patch_size: int,
# text
context_length: int,
vocab_size: int,
transformer_width: int,
transformer_heads: int,
transformer_layers: int):
super().__init__()

self.context_length = context_length

if isinstance(vision_layers, (tuple, list)):
vision_heads = vision_width * 32 // 64
self.visual = ModifiedResNet(
layers=vision_layers,
output_dim=embed_dim,
heads=vision_heads,
input_resolution=image_resolution,
width=vision_width)
else:
vision_heads = vision_width // 64
self.visual = VisionTransformer(
input_resolution=image_resolution,
patch_size=vision_patch_size,
width=vision_width,
layers=vision_layers,
heads=vision_heads,
output_dim=embed_dim)

self.transformer = Transformer(
width=transformer_width,
layers=transformer_layers,
heads=transformer_heads,
attn_mask=self.build_attention_mask())

self.vocab_size = vocab_size
self.token_embedding = nn.Embedding(vocab_size, transformer_width)
self.positional_embedding = nn.Parameter(
torch.empty(self.context_length, transformer_width))
self.ln_final = LayerNorm(transformer_width)

self.text_projection = nn.Parameter(
torch.empty(transformer_width, embed_dim))
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

self.initialize_parameters()

def initialize_parameters(self):
nn.init.normal_(self.token_embedding.weight, std=0.02)
nn.init.normal_(self.positional_embedding, std=0.01)

if isinstance(self.visual, ModifiedResNet):
if self.visual.attnpool is not None:
std = self.visual.attnpool.c_proj.in_features**-0.5
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

for resnet_block in [
self.visual.layer1, self.visual.layer2, self.visual.layer3,
self.visual.layer4
]:
for name, param in resnet_block.named_parameters():
if name.endswith('bn3.weight'):
nn.init.zeros_(param)

proj_std = (self.transformer.width**-0.5) * (
(2 * self.transformer.layers)**-0.5)
attn_std = self.transformer.width**-0.5
fc_std = (2 * self.transformer.width)**-0.5
for block in self.transformer.resblocks:
nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

if self.text_projection is not None:
nn.init.normal_(
self.text_projection, std=self.transformer.width**-0.5)

def build_attention_mask(self):
# lazily create causal attention mask, with full attention between the vision tokens
# pytorch uses additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float('-inf'))
mask.triu_(1) # zero out the lower diagonal
return mask
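
    # For context_length == 3 the additive mask built above is
    #     [[0., -inf, -inf],
    #      [0.,   0., -inf],
    #      [0.,   0.,   0.]]
    # i.e. every text token may attend only to itself and to earlier tokens.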

@property
def dtype(self):
return self.visual.conv1.weight.dtype

def encode_image(self, image):
return self.visual(image.type(self.dtype))

def encode_text(self, text):
x = self.token_embedding(text).type(self.dtype)
x = x + self.positional_embedding.type(self.dtype)
x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD
x = self.ln_final(x).type(self.dtype)
x = x[torch.arange(x.shape[0]),
text.argmax(dim=-1)] @ self.text_projection
return x

def forward(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)

# normalized features
image_features = image_features / image_features.norm(
dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)

# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()

# shape = [global_batch_size, global_batch_size]
return logits_per_image, logits_per_text


def convert_weights(model: nn.Module):
"""Convert applicable model parameters to fp16"""

def _convert_weights_to_fp16(ll):
if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Linear)):
ll.weight.data = ll.weight.data.half()
if ll.bias is not None:
ll.bias.data = ll.bias.data.half()

if isinstance(ll, nn.MultiheadAttention):
for attr in [
*[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
'in_proj_bias', 'bias_k', 'bias_v'
]:
tensor = getattr(ll, attr)
if tensor is not None:
tensor.data = tensor.data.half()

for name in ['text_projection', 'proj']:
if hasattr(ll, name):
attr = getattr(ll, name)
if attr is not None:
attr.data = attr.data.half()

model.apply(_convert_weights_to_fp16)


def build_model():
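    # Positional arguments below map onto CLIP.__init__ as: embed_dim=512,
    # image_resolution=224, vision_layers=12, vision_width=768, vision_patch_size=32,
    # context_length=77, vocab_size=49408, transformer_width=512, transformer_heads=8,
    # transformer_layers=12 (i.e. a ViT-B/32 style image tower).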
model = CLIP(512, 224, 12, 768, 32, 77, 49408, 512, 8, 12)
convert_weights(model)
return model.eval()

+ 156
- 0
modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py View File

@@ -0,0 +1,156 @@
""" CLIP
Adapted from https://github.com/openai/CLIP.
Originally MIT License, Copyright (c) 2021 OpenAI.
"""

import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re


@lru_cache()
def default_bpe():
return os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'bpe_simple_vocab_16e6.txt.gz')


@lru_cache()
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord('!'),
ord('~') + 1)) + list(range(
ord('¡'),
ord('¬') + 1)) + list(range(ord('®'),
ord('ÿ') + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))


def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs


def basic_clean(text):
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()


def whitespace_clean(text):
text = re.sub(r'\s+', ' ', text)
text = text.strip()
return text


class SimpleTokenizer(object):

def __init__(self, bpe_path: str = default_bpe()):
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')
merges = merges[1:49152 - 256 - 2 + 1]
merges = [tuple(merge.split()) for merge in merges]
vocab = list(bytes_to_unicode().values())
vocab = vocab + [v + '</w>' for v in vocab]
for merge in merges:
vocab.append(''.join(merge))
vocab.extend(['<|startoftext|>', '<|endoftext|>'])
self.encoder = dict(zip(vocab, range(len(vocab))))
self.decoder = {v: k for k, v in self.encoder.items()}
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {
'<|startoftext|>': '<|startoftext|>',
'<|endoftext|>': '<|endoftext|>'
}
self.pat = re.compile(
r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
re.IGNORECASE)

def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (token[-1] + '</w>', )
pairs = get_pairs(word)

if not pairs:
return token + '</w>'

while True:
bigram = min(
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
error_list = []
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except Exception as err:
new_word.extend(word[i:])
error_list.append(err)
break

if word[i] == first and i < len(word) - 1 and word[
i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word

def encode(self, text):
bpe_tokens = []
text = whitespace_clean(basic_clean(text)).lower()
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[b]
for b in token.encode('utf-8'))
bpe_tokens.extend(self.encoder[bpe_token]
for bpe_token in self.bpe(token).split(' '))
return bpe_tokens

def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode(
'utf-8', errors='replace').replace('</w>', ' ')
return text
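
# Round-trip sketch (illustrative; assumes bpe_simple_vocab_16e6.txt.gz is available at
# the default_bpe() path or is passed in explicitly):
#
#     tok = SimpleTokenizer()
#     ids = tok.encode('a photo of a cat')
#     print(tok.decode(ids))  # -> 'a photo of a cat ' (decode re-expands '</w>' to spaces)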

+ 24
- 0
modelscope/models/cv/tinynas_detection/__init__.py View File

@@ -0,0 +1,24 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .tinynas_detector import TinynasDetector

else:
_import_structure = {
'tinynas_detector': ['TinynasDetector'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 16
- 0
modelscope/models/cv/tinynas_detection/backbone/__init__.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import copy

from .darknet import CSPDarknet
from .tinynas import load_tinynas_net


def build_backbone(cfg):
backbone_cfg = copy.deepcopy(cfg)
name = backbone_cfg.pop('name')
if name == 'CSPDarknet':
return CSPDarknet(**backbone_cfg)
elif name == 'TinyNAS':
return load_tinynas_net(backbone_cfg)

+ 126
- 0
modelscope/models/cv/tinynas_detection/backbone/darknet.py View File

@@ -0,0 +1,126 @@
# Copyright (c) Megvii Inc. All rights reserved.
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
from torch import nn

from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer,
SPPBottleneck)


class CSPDarknet(nn.Module):

def __init__(
self,
dep_mul,
wid_mul,
out_features=('dark3', 'dark4', 'dark5'),
depthwise=False,
act='silu',
reparam=False,
):
super(CSPDarknet, self).__init__()
assert out_features, 'please provide output features of Darknet'
self.out_features = out_features
Conv = DWConv if depthwise else BaseConv

base_channels = int(wid_mul * 64) # 64
base_depth = max(round(dep_mul * 3), 1) # 3

# stem
# self.stem = Focus(3, base_channels, ksize=3, act=act)
self.stem = Focus(3, base_channels, 3, act=act)

# dark2
self.dark2 = nn.Sequential(
Conv(base_channels, base_channels * 2, 3, 2, act=act),
CSPLayer(
base_channels * 2,
base_channels * 2,
n=base_depth,
depthwise=depthwise,
act=act,
reparam=reparam,
),
)

# dark3
self.dark3 = nn.Sequential(
Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
CSPLayer(
base_channels * 4,
base_channels * 4,
n=base_depth * 3,
depthwise=depthwise,
act=act,
reparam=reparam,
),
)

# dark4
self.dark4 = nn.Sequential(
Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
CSPLayer(
base_channels * 8,
base_channels * 8,
n=base_depth * 3,
depthwise=depthwise,
act=act,
reparam=reparam,
),
)

# dark5
self.dark5 = nn.Sequential(
Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
SPPBottleneck(
base_channels * 16, base_channels * 16, activation=act),
CSPLayer(
base_channels * 16,
base_channels * 16,
n=base_depth,
shortcut=False,
depthwise=depthwise,
act=act,
reparam=reparam,
),
)

def init_weights(self, pretrain=None):

if pretrain is None:
return
else:
pretrained_dict = torch.load(
pretrain, map_location='cpu')['state_dict']
new_params = self.state_dict().copy()
for k, v in pretrained_dict.items():
ks = k.split('.')
if ks[0] == 'fc' or ks[-1] == 'total_ops' or ks[
-1] == 'total_params':
continue
else:
new_params[k] = v

self.load_state_dict(new_params)
            print(f'loaded pretrained backbone from {pretrain}')

def forward(self, x):
outputs = {}
x = self.stem(x)
outputs['stem'] = x
x = self.dark2(x)
outputs['dark2'] = x
x = self.dark3(x)
outputs['dark3'] = x
x = self.dark4(x)
outputs['dark4'] = x
x = self.dark5(x)
outputs['dark5'] = x
features_out = [
outputs['stem'], outputs['dark2'], outputs['dark3'],
outputs['dark4'], outputs['dark5']
]

return features_out
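
# With the default settings the returned feature maps are [stem, dark2, dark3, dark4,
# dark5] at strides [2, 4, 8, 16, 32] relative to the input, with channel counts of
# [1, 2, 4, 8, 16] * base_channels where base_channels = int(wid_mul * 64).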

+ 347
- 0
modelscope/models/cv/tinynas_detection/backbone/tinynas.py View File

@@ -0,0 +1,347 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
import torch.nn as nn

from ..core.base_ops import Focus, SPPBottleneck, get_activation
from ..core.repvgg_block import RepVggBlock


class ConvKXBN(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride):
super(ConvKXBN, self).__init__()
self.conv1 = nn.Conv2d(
in_c,
out_c,
kernel_size,
stride, (kernel_size - 1) // 2,
groups=1,
bias=False)
self.bn1 = nn.BatchNorm2d(out_c)

def forward(self, x):
return self.bn1(self.conv1(x))


class ConvKXBNRELU(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride, act='silu'):
super(ConvKXBNRELU, self).__init__()
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride)
if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

def forward(self, x):
output = self.conv(x)
return self.activation_function(output)


class ResConvK1KX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
force_resproj=False,
act='silu'):
super(ResConvK1KX, self).__init__()
self.stride = stride
self.conv1 = ConvKXBN(in_c, btn_c, 1, 1)
self.conv2 = RepVggBlock(
btn_c, out_c, kernel_size, stride, act='identity')

if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

if stride == 2:
self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2)
else:
self.residual_downsample = nn.Identity()

if in_c != out_c or force_resproj:
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
else:
self.residual_proj = nn.Identity()

def forward(self, x):
if self.stride != 2:
reslink = self.residual_downsample(x)
reslink = self.residual_proj(reslink)

output = x
output = self.conv1(output)
output = self.activation_function(output)
output = self.conv2(output)
if self.stride != 2:
output = output + reslink
output = self.activation_function(output)

return output


class SuperResConvK1KX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
num_blocks,
with_spp=False,
act='silu'):
super(SuperResConvK1KX, self).__init__()
if act is None:
self.act = torch.relu
else:
self.act = get_activation(act)
self.block_list = nn.ModuleList()
for block_id in range(num_blocks):
if block_id == 0:
in_channels = in_c
out_channels = out_c
this_stride = stride
                force_resproj = False  # used inside a CSPLayer, so no forced residual projection is needed
this_kernel_size = kernel_size
else:
in_channels = out_c
out_channels = out_c
this_stride = 1
force_resproj = False
this_kernel_size = kernel_size
the_block = ResConvK1KX(
in_channels,
out_channels,
btn_c,
this_kernel_size,
this_stride,
force_resproj,
act=act)
self.block_list.append(the_block)
if block_id == 0 and with_spp:
self.block_list.append(
SPPBottleneck(out_channels, out_channels))

def forward(self, x):
output = x
for block in self.block_list:
output = block(output)
return output


class ResConvKXKX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
force_resproj=False,
act='silu'):
super(ResConvKXKX, self).__init__()
self.stride = stride
if self.stride == 2:
self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act)
else:
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1)
self.conv2 = RepVggBlock(
btn_c, out_c, kernel_size, stride, act='identity')

if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

if stride == 2:
self.residual_downsample = nn.AvgPool2d(
kernel_size=2, stride=2)
else:
self.residual_downsample = nn.Identity()

if in_c != out_c or force_resproj:
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
else:
self.residual_proj = nn.Identity()

def forward(self, x):
if self.stride == 2:
return self.downsampler(x)
reslink = self.residual_downsample(x)
reslink = self.residual_proj(reslink)

output = x
output = self.conv1(output)
output = self.activation_function(output)
output = self.conv2(output)

output = output + reslink
output = self.activation_function(output)

return output


class SuperResConvKXKX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
num_blocks,
with_spp=False,
act='silu'):
super(SuperResConvKXKX, self).__init__()
if act is None:
self.act = torch.relu
else:
self.act = get_activation(act)
self.block_list = nn.ModuleList()
for block_id in range(num_blocks):
if block_id == 0:
in_channels = in_c
out_channels = out_c
this_stride = stride
                force_resproj = False  # used inside a CSPLayer, so no forced residual projection is needed
this_kernel_size = kernel_size
else:
in_channels = out_c
out_channels = out_c
this_stride = 1
force_resproj = False
this_kernel_size = kernel_size
the_block = ResConvKXKX(
in_channels,
out_channels,
btn_c,
this_kernel_size,
this_stride,
force_resproj,
act=act)
self.block_list.append(the_block)
if block_id == 0 and with_spp:
self.block_list.append(
SPPBottleneck(out_channels, out_channels))

def forward(self, x):
output = x
for block in self.block_list:
output = block(output)
return output


class TinyNAS(nn.Module):

def __init__(self,
structure_info=None,
out_indices=[0, 1, 2, 4, 5],
out_channels=[None, None, 128, 256, 512],
with_spp=False,
use_focus=False,
need_conv1=True,
act='silu'):
super(TinyNAS, self).__init__()
assert len(out_indices) == len(out_channels)
self.out_indices = out_indices
self.need_conv1 = need_conv1

self.block_list = nn.ModuleList()
if need_conv1:
self.conv1_list = nn.ModuleList()
for idx, block_info in enumerate(structure_info):
the_block_class = block_info['class']
if the_block_class == 'ConvKXBNRELU':
if use_focus:
the_block = Focus(block_info['in'], block_info['out'],
block_info['k'])
else:
the_block = ConvKXBNRELU(
block_info['in'],
block_info['out'],
block_info['k'],
block_info['s'],
act=act)
self.block_list.append(the_block)
elif the_block_class == 'SuperResConvK1KX':
spp = with_spp if idx == len(structure_info) - 1 else False
the_block = SuperResConvK1KX(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['k'],
block_info['s'],
block_info['L'],
spp,
act=act)
self.block_list.append(the_block)
elif the_block_class == 'SuperResConvKXKX':
spp = with_spp if idx == len(structure_info) - 1 else False
the_block = SuperResConvKXKX(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['k'],
block_info['s'],
block_info['L'],
spp,
act=act)
self.block_list.append(the_block)
if need_conv1:
if idx in self.out_indices and out_channels[
self.out_indices.index(idx)] is not None:
self.conv1_list.append(
nn.Conv2d(block_info['out'],
out_channels[self.out_indices.index(idx)],
1))
else:
self.conv1_list.append(None)

def init_weights(self, pretrain=None):
pass

def forward(self, x):
output = x
stage_feature_list = []
for idx, block in enumerate(self.block_list):
output = block(output)
if idx in self.out_indices:
if self.need_conv1 and self.conv1_list[idx] is not None:
true_out = self.conv1_list[idx](output)
stage_feature_list.append(true_out)
else:
stage_feature_list.append(output)
return stage_feature_list


def load_tinynas_net(backbone_cfg):
    # build the TinyNAS backbone from its serialized structure string
import ast

struct_str = ''.join([x.strip() for x in backbone_cfg.net_structure_str])
struct_info = ast.literal_eval(struct_str)
for layer in struct_info:
if 'nbitsA' in layer:
del layer['nbitsA']
if 'nbitsW' in layer:
del layer['nbitsW']

model = TinyNAS(
structure_info=struct_info,
out_indices=backbone_cfg.out_indices,
out_channels=backbone_cfg.out_channels,
with_spp=backbone_cfg.with_spp,
use_focus=backbone_cfg.use_focus,
act=backbone_cfg.act,
need_conv1=backbone_cfg.need_conv1,
)

return model
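
# Expected fields on `backbone_cfg` (taken from the code above): net_structure_str (a
# string, or list of string fragments, that literal-evals to a list of block dicts with
# keys such as 'class', 'in', 'out', 'btn', 'k', 's', 'L'), out_indices, out_channels,
# with_spp, use_focus, act and need_conv1. Any quantization fields ('nbitsA', 'nbitsW')
# in the structure are stripped before the backbone is built.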

+ 2
- 0
modelscope/models/cv/tinynas_detection/core/__init__.py View File

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

+ 474
- 0
modelscope/models/cv/tinynas_detection/core/base_ops.py View File

@@ -0,0 +1,474 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from .repvgg_block import RepVggBlock


class SiLU(nn.Module):
"""export-friendly version of nn.SiLU()"""

@staticmethod
def forward(x):
return x * torch.sigmoid(x)


def get_activation(name='silu', inplace=True):
if name == 'silu':
module = nn.SiLU(inplace=inplace)
elif name == 'relu':
module = nn.ReLU(inplace=inplace)
elif name == 'lrelu':
module = nn.LeakyReLU(0.1, inplace=inplace)
else:
raise AttributeError('Unsupported act type: {}'.format(name))
return module


def get_norm(name, out_channels, inplace=True):
    if name == 'bn':
        module = nn.BatchNorm2d(out_channels)
    elif name == 'gn':
        module = nn.GroupNorm(num_channels=out_channels, num_groups=32)
    else:
        raise AttributeError('Unsupported norm type: {}'.format(name))
    return module


class BaseConv(nn.Module):
"""A Conv2d -> Batchnorm -> silu/leaky relu block"""

def __init__(self,
in_channels,
out_channels,
ksize,
stride=1,
groups=1,
bias=False,
act='silu',
norm='bn'):
super().__init__()
# same padding
pad = (ksize - 1) // 2
self.conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size=ksize,
stride=stride,
padding=pad,
groups=groups,
bias=bias,
)
if norm is not None:
self.bn = get_norm(norm, out_channels, inplace=True)
if act is not None:
self.act = get_activation(act, inplace=True)
self.with_norm = norm is not None
self.with_act = act is not None

def forward(self, x):
x = self.conv(x)
if self.with_norm:
# x = self.norm(x)
x = self.bn(x)
if self.with_act:
x = self.act(x)
return x

def fuseforward(self, x):
return self.act(self.conv(x))


class DepthWiseConv(nn.Module):

def __init__(self,
in_channels,
out_channels,
ksize,
stride=1,
groups=None,
bias=False,
act='silu',
norm='bn'):
super().__init__()
padding = (ksize - 1) // 2
self.depthwise = nn.Conv2d(
in_channels,
in_channels,
kernel_size=ksize,
stride=stride,
padding=padding,
groups=in_channels,
bias=bias,
)

self.pointwise = nn.Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
bias=bias)
if norm is not None:
self.dwnorm = get_norm(norm, in_channels, inplace=True)
self.pwnorm = get_norm(norm, out_channels, inplace=True)
if act is not None:
self.act = get_activation(act, inplace=True)

self.with_norm = norm is not None
self.with_act = act is not None
self.order = ['depthwise', 'dwnorm', 'pointwise', 'act']

def forward(self, x):

for layer_name in self.order:
layer = self.__getattr__(layer_name)
if layer is not None:
x = layer(x)
return x


class DWConv(nn.Module):
"""Depthwise Conv + Conv"""

def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'):
super().__init__()
self.dconv = BaseConv(
in_channels,
in_channels,
ksize=ksize,
stride=stride,
groups=in_channels,
act=act,
)
self.pconv = BaseConv(
in_channels, out_channels, ksize=1, stride=1, groups=1, act=act)

def forward(self, x):
x = self.dconv(x)
return self.pconv(x)


class Bottleneck(nn.Module):
# Standard bottleneck
def __init__(
self,
in_channels,
out_channels,
shortcut=True,
expansion=0.5,
depthwise=False,
act='silu',
reparam=False,
):
super().__init__()
hidden_channels = int(out_channels * expansion)
Conv = DWConv if depthwise else BaseConv
k_conv1 = 3 if reparam else 1
self.conv1 = BaseConv(
in_channels, hidden_channels, k_conv1, stride=1, act=act)
if reparam:
self.conv2 = RepVggBlock(
hidden_channels, out_channels, 3, stride=1, act=act)
else:
self.conv2 = Conv(
hidden_channels, out_channels, 3, stride=1, act=act)
self.use_add = shortcut and in_channels == out_channels

def forward(self, x):
y = self.conv2(self.conv1(x))
if self.use_add:
y = y + x
return y


class ResLayer(nn.Module):
    """Residual layer with `in_channels` inputs."""

def __init__(self, in_channels: int):
super().__init__()
mid_channels = in_channels // 2
self.layer1 = BaseConv(
in_channels, mid_channels, ksize=1, stride=1, act='lrelu')
self.layer2 = BaseConv(
mid_channels, in_channels, ksize=3, stride=1, act='lrelu')

def forward(self, x):
out = self.layer2(self.layer1(x))
return x + out


class SPPBottleneck(nn.Module):
"""Spatial pyramid pooling layer used in YOLOv3-SPP"""

def __init__(self,
in_channels,
out_channels,
kernel_sizes=(5, 9, 13),
activation='silu'):
super().__init__()
hidden_channels = in_channels // 2
self.conv1 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=activation)
self.m = nn.ModuleList([
nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
for ks in kernel_sizes
])
conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
self.conv2 = BaseConv(
conv2_channels, out_channels, 1, stride=1, act=activation)

def forward(self, x):
x = self.conv1(x)
x = torch.cat([x] + [m(x) for m in self.m], dim=1)
x = self.conv2(x)
return x
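

# A minimal sketch (not part of the original file): the stride-1, same-padded
# max-pool pyramid keeps the spatial resolution, so only the channel count
# grows before conv2 maps it back to out_channels.
def _spp_bottleneck_sketch():
    spp = SPPBottleneck(in_channels=256, out_channels=256)
    return spp(torch.randn(1, 256, 20, 20)).shape  # torch.Size([1, 256, 20, 20])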


class CSPLayer(nn.Module):
"""C3 in yolov5, CSP Bottleneck with 3 convolutions"""

def __init__(
self,
in_channels,
out_channels,
n=1,
shortcut=True,
expansion=0.5,
depthwise=False,
act='silu',
reparam=False,
):
"""
Args:
in_channels (int): input channels.
out_channels (int): output channels.
n (int): number of Bottlenecks. Default value: 1.
"""
# ch_in, ch_out, number, shortcut, groups, expansion
super().__init__()
hidden_channels = int(out_channels * expansion) # hidden channels
self.conv1 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=act)
self.conv2 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=act)
self.conv3 = BaseConv(
2 * hidden_channels, out_channels, 1, stride=1, act=act)
module_list = [
Bottleneck(
hidden_channels,
hidden_channels,
shortcut,
1.0,
depthwise,
act=act,
reparam=reparam) for _ in range(n)
]
self.m = nn.Sequential(*module_list)

def forward(self, x):
x_1 = self.conv1(x)
x_2 = self.conv2(x)
x_1 = self.m(x_1)
x = torch.cat((x_1, x_2), dim=1)
return self.conv3(x)
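

# A minimal sketch (not part of the original file): the two 1x1 branches each
# carry int(out_channels * expansion) channels; their concatenation is fused
# back to out_channels by conv3, leaving the spatial size unchanged.
def _csp_layer_sketch():
    csp = CSPLayer(in_channels=64, out_channels=128, n=2)
    return csp(torch.randn(1, 64, 32, 32)).shape  # torch.Size([1, 128, 32, 32])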


class Focus(nn.Module):
"""Focus width and height information into channel space."""

def __init__(self,
in_channels,
out_channels,
ksize=1,
stride=1,
act='silu'):
super().__init__()
self.conv = BaseConv(
in_channels * 4, out_channels, ksize, stride, act=act)

def forward(self, x):
# shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
patch_top_left = x[..., ::2, ::2]
patch_top_right = x[..., ::2, 1::2]
patch_bot_left = x[..., 1::2, ::2]
patch_bot_right = x[..., 1::2, 1::2]
x = torch.cat(
(
patch_top_left,
patch_bot_left,
patch_top_right,
patch_bot_right,
),
dim=1,
)
return self.conv(x)
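

# A minimal sketch (not part of the original file): the four interleaved
# slices turn a (b, c, h, w) tensor into (b, 4c, h/2, w/2) before the
# convolution, so the output spatial size is halved.
def _focus_sketch():
    focus = Focus(in_channels=3, out_channels=32, ksize=3)
    return focus(torch.randn(1, 3, 64, 64)).shape  # torch.Size([1, 32, 32, 32])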


class fast_Focus(nn.Module):

def __init__(self,
in_channels,
out_channels,
ksize=1,
stride=1,
act='silu'):
        super().__init__()
self.conv1 = self.focus_conv(w1=1.0)
self.conv2 = self.focus_conv(w3=1.0)
self.conv3 = self.focus_conv(w2=1.0)
self.conv4 = self.focus_conv(w4=1.0)

self.conv = BaseConv(
in_channels * 4, out_channels, ksize, stride, act=act)

def forward(self, x):
return self.conv(
torch.cat(
[self.conv1(x),
self.conv2(x),
self.conv3(x),
self.conv4(x)], 1))

def focus_conv(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0):
conv = nn.Conv2d(3, 3, 2, 2, groups=3, bias=False)
conv.weight = self.init_weights_constant(w1, w2, w3, w4)
conv.weight.requires_grad = False
return conv

def init_weights_constant(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0):
return nn.Parameter(
torch.tensor([[[[w1, w2], [w3, w4]]], [[[w1, w2], [w3, w4]]],
[[[w1, w2], [w3, w4]]]]))


# shufflenet block
def channel_shuffle(x, groups=2):
bat_size, channels, w, h = x.shape
group_c = channels // groups
x = x.view(bat_size, groups, group_c, w, h)
x = torch.transpose(x, 1, 2).contiguous()
x = x.view(bat_size, -1, w, h)
return x
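

# A tiny sketch (not part of the original file): with groups=2 the output
# interleaves the two channel halves, i.e. channel order 0..7 becomes
# [0, 4, 1, 5, 2, 6, 3, 7].
def _channel_shuffle_sketch():
    x = torch.arange(8, dtype=torch.float32).view(1, 8, 1, 1)
    return channel_shuffle(x, groups=2).flatten().tolist()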


def conv_1x1_bn(in_c, out_c, stride=1):
return nn.Sequential(
nn.Conv2d(in_c, out_c, 1, stride, 0, bias=False),
nn.BatchNorm2d(out_c), nn.ReLU(True))


def conv_bn(in_c, out_c, stride=2):
return nn.Sequential(
nn.Conv2d(in_c, out_c, 3, stride, 1, bias=False),
nn.BatchNorm2d(out_c), nn.ReLU(True))


class ShuffleBlock(nn.Module):

def __init__(self, in_c, out_c, downsample=False):
super(ShuffleBlock, self).__init__()
self.downsample = downsample
half_c = out_c // 2
if downsample:
self.branch1 = nn.Sequential(
# 3*3 dw conv, stride = 2
# nn.Conv2d(in_c, in_c, 3, 2, 1, groups=in_c, bias=False),
nn.Conv2d(in_c, in_c, 3, 1, 1, groups=in_c, bias=False),
nn.BatchNorm2d(in_c),
# 1*1 pw conv
nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True))

self.branch2 = nn.Sequential(
# 1*1 pw conv
nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True),
# 3*3 dw conv, stride = 2
# nn.Conv2d(half_c, half_c, 3, 2, 1, groups=half_c, bias=False),
nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False),
nn.BatchNorm2d(half_c),
# 1*1 pw conv
nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True))
else:
# in_c = out_c
assert in_c == out_c

self.branch2 = nn.Sequential(
# 1*1 pw conv
nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True),
# 3*3 dw conv, stride = 1
nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False),
nn.BatchNorm2d(half_c),
# 1*1 pw conv
nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True))

def forward(self, x):
out = None
if self.downsample:
# if it is downsampling, we don't need to do channel split
out = torch.cat((self.branch1(x), self.branch2(x)), 1)
else:
# channel split
channels = x.shape[1]
c = channels // 2
x1 = x[:, :c, :, :]
x2 = x[:, c:, :, :]
out = torch.cat((x1, self.branch2(x2)), 1)
return channel_shuffle(out, 2)


class ShuffleCSPLayer(nn.Module):
"""C3 in yolov5, CSP Bottleneck with 3 convolutions"""

def __init__(
self,
in_channels,
out_channels,
n=1,
shortcut=True,
expansion=0.5,
depthwise=False,
act='silu',
):
"""
Args:
in_channels (int): input channels.
out_channels (int): output channels.
n (int): number of Bottlenecks. Default value: 1.
"""
# ch_in, ch_out, number, shortcut, groups, expansion
super().__init__()
hidden_channels = int(out_channels * expansion) # hidden channels
self.conv1 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=act)
self.conv2 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=act)
module_list = [
Bottleneck(
hidden_channels,
hidden_channels,
shortcut,
1.0,
depthwise,
act=act) for _ in range(n)
]
self.m = nn.Sequential(*module_list)

def forward(self, x):
x_1 = self.conv1(x)
x_2 = self.conv2(x)
x_1 = self.m(x_1)
x = torch.cat((x_1, x_2), dim=1)
# add channel shuffle
return channel_shuffle(x, 2)

+ 324
- 0
modelscope/models/cv/tinynas_detection/core/neck_ops.py View File

@@ -0,0 +1,324 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class Swish(nn.Module):

def __init__(self, inplace=True):
super(Swish, self).__init__()
self.inplace = inplace

def forward(self, x):
if self.inplace:
            x.mul_(torch.sigmoid(x))
            return x
        else:
            return x * torch.sigmoid(x)


def get_activation(name='silu', inplace=True):
if name is None:
return nn.Identity()

if isinstance(name, str):
if name == 'silu':
module = nn.SiLU(inplace=inplace)
elif name == 'relu':
module = nn.ReLU(inplace=inplace)
elif name == 'lrelu':
module = nn.LeakyReLU(0.1, inplace=inplace)
elif name == 'swish':
module = Swish(inplace=inplace)
elif name == 'hardsigmoid':
module = nn.Hardsigmoid(inplace=inplace)
else:
raise AttributeError('Unsupported act type: {}'.format(name))
return module
elif isinstance(name, nn.Module):
return name
else:
raise AttributeError('Unsupported act type: {}'.format(name))


class ConvBNLayer(nn.Module):

def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2d(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
bias=False)
        self.bn = nn.BatchNorm2d(ch_out)
self.act = get_activation(act, inplace=True)

def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)

return x


class RepVGGBlock(nn.Module):

def __init__(self, ch_in, ch_out, act='relu', deploy=False):
super(RepVGGBlock, self).__init__()
self.ch_in = ch_in
self.ch_out = ch_out
self.deploy = deploy
self.in_channels = ch_in
self.groups = 1
if self.deploy is False:
self.rbr_dense = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=None)
self.rbr_1x1 = ConvBNLayer(
ch_in, ch_out, 1, stride=1, padding=0, act=None)
# self.rbr_identity = nn.BatchNorm2d(num_features=ch_in) if ch_out == ch_in else None
self.rbr_identity = None
else:
self.rbr_reparam = nn.Conv2d(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=3,
stride=1,
padding=1,
groups=1)
self.act = get_activation(act) if act is None or isinstance(
act, (str, dict)) else act

def forward(self, x):
if self.deploy:
print('----------deploy----------')
y = self.rbr_reparam(x)
else:
if self.rbr_identity is None:
y = self.rbr_dense(x) + self.rbr_1x1(x)
else:
y = self.rbr_dense(x) + self.rbr_1x1(x) + self.rbr_identity(x)

y = self.act(y)
return y

def switch_to_deploy(self):
print('switch')
if not hasattr(self, 'rbr_reparam'):
# return
self.rbr_reparam = nn.Conv2d(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=3,
stride=1,
padding=1,
groups=1)
print('switch')
kernel, bias = self.get_equivalent_kernel_bias()
self.rbr_reparam.weight.data = kernel
self.rbr_reparam.bias.data = bias
for para in self.parameters():
para.detach_()
# self.__delattr__(self.rbr_dense)
# self.__delattr__(self.rbr_1x1)
self.__delattr__('rbr_dense')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity'):
self.__delattr__('rbr_identity')
if hasattr(self, 'id_tensor'):
self.__delattr__('id_tensor')
self.deploy = True

def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
return kernel3x3 + self._pad_1x1_to_3x3_tensor(
kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
# if isinstance(branch, nn.Sequential):
if isinstance(branch, ConvBNLayer):
kernel = branch.conv.weight
running_mean = branch.bn.running_mean
running_var = branch.bn.running_var
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn.eps
else:
assert isinstance(branch, nn.BatchNorm2d)
if not hasattr(self, 'id_tensor'):
input_dim = self.in_channels // self.groups
kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
dtype=np.float32)
for i in range(self.in_channels):
kernel_value[i, i % input_dim, 1, 1] = 1
self.id_tensor = torch.from_numpy(kernel_value).to(
branch.weight.device)
kernel = self.id_tensor
running_mean = branch.running_mean
running_var = branch.running_var
gamma = branch.weight
beta = branch.bias
eps = branch.eps
std = (running_var + eps).sqrt()
t = (gamma / std).reshape(-1, 1, 1, 1)
return kernel * t, beta - running_mean * gamma / std


class BasicBlock(nn.Module):

def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
super(BasicBlock, self).__init__()
assert ch_in == ch_out
# self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
# self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act)
self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
self.shortcut = shortcut

def forward(self, x):
# y = self.conv1(x)
y = self.conv2(x)
if self.shortcut:
return x + y
else:
return y


class BasicBlock_3x3(nn.Module):

def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
super(BasicBlock_3x3, self).__init__()
assert ch_in == ch_out
self.conv1 = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=act)
# self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act)
self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
self.shortcut = shortcut

def forward(self, x):
y = self.conv1(x)
y = self.conv2(y)
if self.shortcut:
return x + y
else:
return y


class BasicBlock_3x3_Reverse(nn.Module):

def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
super(BasicBlock_3x3_Reverse, self).__init__()
assert ch_in == ch_out
self.conv1 = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=act)
# self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act)
self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
self.shortcut = shortcut

def forward(self, x):
y = self.conv2(x)
y = self.conv1(y)
if self.shortcut:
return x + y
else:
return y


class SPP(nn.Module):

def __init__(
self,
ch_in,
ch_out,
k,
pool_size,
act='swish',
):
super(SPP, self).__init__()
self.pool = []
for i, size in enumerate(pool_size):
pool = nn.MaxPool2d(
kernel_size=size, stride=1, padding=size // 2, ceil_mode=False)
self.add_module('pool{}'.format(i), pool)
self.pool.append(pool)
self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act)

def forward(self, x):
outs = [x]

for pool in self.pool:
outs.append(pool(x))
y = torch.cat(outs, axis=1)

y = self.conv(y)
return y


class CSPStage(nn.Module):

def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False):
super(CSPStage, self).__init__()

ch_mid = int(ch_out // 2)
self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
# self.conv2 = ConvBNLayer(ch_in, ch_mid, 3, stride=1, padding=1, act=act)
self.convs = nn.Sequential()

next_ch_in = ch_mid
for i in range(n):
if block_fn == 'BasicBlock':
self.convs.add_module(
str(i),
BasicBlock(next_ch_in, ch_mid, act=act, shortcut=False))
elif block_fn == 'BasicBlock_3x3':
self.convs.add_module(
str(i),
BasicBlock_3x3(next_ch_in, ch_mid, act=act, shortcut=True))
elif block_fn == 'BasicBlock_3x3_Reverse':
self.convs.add_module(
str(i),
BasicBlock_3x3_Reverse(
next_ch_in, ch_mid, act=act, shortcut=True))
else:
raise NotImplementedError
if i == (n - 1) // 2 and spp:
self.convs.add_module(
'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act))
next_ch_in = ch_mid
# self.convs = nn.Sequential(*convs)
self.conv3 = ConvBNLayer(ch_mid * (n + 1), ch_out, 1, act=act)

def forward(self, x):
y1 = self.conv1(x)
y2 = self.conv2(x)

mid_out = [y1]
for conv in self.convs:
y2 = conv(y2)
mid_out.append(y2)
y = torch.cat(mid_out, axis=1)
y = self.conv3(y)
return y

+ 205
- 0
modelscope/models/cv/tinynas_detection/core/repvgg_block.py View File

@@ -0,0 +1,205 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.parameter import Parameter


def get_activation(name='silu', inplace=True):
if name == 'silu':
module = nn.SiLU(inplace=inplace)
elif name == 'relu':
module = nn.ReLU(inplace=inplace)
elif name == 'lrelu':
module = nn.LeakyReLU(0.1, inplace=inplace)
elif name == 'identity':
module = nn.Identity()
else:
raise AttributeError('Unsupported act type: {}'.format(name))
return module


def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1):
'''Basic cell for rep-style block, including conv and bn'''
result = nn.Sequential()
result.add_module(
'conv',
nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
bias=False))
result.add_module('bn', nn.BatchNorm2d(num_features=out_channels))
return result


class RepVggBlock(nn.Module):
'''RepVggBlock is a basic rep-style block, including training and deploy status
This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
'''

def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
dilation=1,
groups=1,
padding_mode='zeros',
deploy=False,
use_se=False,
act='relu',
norm=None):
super(RepVggBlock, self).__init__()
""" Initialization of the class.
Args:
in_channels (int): Number of channels in the input image
out_channels (int): Number of channels produced by the convolution
kernel_size (int or tuple): Size of the convolving kernel
stride (int or tuple, optional): Stride of the convolution. Default: 1
padding (int or tuple, optional): Zero-padding added to both sides of
the input. Default: 1
dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
groups (int, optional): Number of blocked connections from input
channels to output channels. Default: 1
padding_mode (string, optional): Default: 'zeros'
deploy: Whether to be deploy status or training status. Default: False
use_se: Whether to use se. Default: False
"""
self.deploy = deploy
self.groups = groups
self.in_channels = in_channels
self.out_channels = out_channels

assert kernel_size == 3
assert padding == 1

padding_11 = padding - kernel_size // 2

if isinstance(act, str):
self.nonlinearity = get_activation(act)
else:
self.nonlinearity = act

if use_se:
raise NotImplementedError('se block not supported yet')
else:
self.se = nn.Identity()

if deploy:
self.rbr_reparam = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=True,
padding_mode=padding_mode)

else:
self.rbr_identity = None
self.rbr_dense = conv_bn(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups)
self.rbr_1x1 = conv_bn(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=padding_11,
groups=groups)

def forward(self, inputs):
'''Forward process'''
if hasattr(self, 'rbr_reparam'):
return self.nonlinearity(self.se(self.rbr_reparam(inputs)))

if self.rbr_identity is None:
id_out = 0
else:
id_out = self.rbr_identity(inputs)

return self.nonlinearity(
self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out))

def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
return kernel3x3 + self._pad_1x1_to_3x3_tensor(
kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
if isinstance(branch, nn.Sequential):
kernel = branch.conv.weight
running_mean = branch.bn.running_mean
running_var = branch.bn.running_var
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn.eps
else:
assert isinstance(branch, nn.BatchNorm2d)
if not hasattr(self, 'id_tensor'):
input_dim = self.in_channels // self.groups
kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
dtype=np.float32)
for i in range(self.in_channels):
kernel_value[i, i % input_dim, 1, 1] = 1
self.id_tensor = torch.from_numpy(kernel_value).to(
branch.weight.device)
kernel = self.id_tensor
running_mean = branch.running_mean
running_var = branch.running_var
gamma = branch.weight
beta = branch.bias
eps = branch.eps
std = (running_var + eps).sqrt()
t = (gamma / std).reshape(-1, 1, 1, 1)
return kernel * t, beta - running_mean * gamma / std

def switch_to_deploy(self):
if hasattr(self, 'rbr_reparam'):
return
kernel, bias = self.get_equivalent_kernel_bias()
self.rbr_reparam = nn.Conv2d(
in_channels=self.rbr_dense.conv.in_channels,
out_channels=self.rbr_dense.conv.out_channels,
kernel_size=self.rbr_dense.conv.kernel_size,
stride=self.rbr_dense.conv.stride,
padding=self.rbr_dense.conv.padding,
dilation=self.rbr_dense.conv.dilation,
groups=self.rbr_dense.conv.groups,
bias=True)
self.rbr_reparam.weight.data = kernel
self.rbr_reparam.bias.data = bias
for para in self.parameters():
para.detach_()
self.__delattr__('rbr_dense')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity'):
self.__delattr__('rbr_identity')
if hasattr(self, 'id_tensor'):
self.__delattr__('id_tensor')
self.deploy = True
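

# A minimal sketch (not part of the original file): after switch_to_deploy()
# the fused 3x3 convolution should reproduce the multi-branch output up to
# floating-point error; eval() makes BatchNorm use its running statistics so
# the fusion is exact.
def _repvgg_reparam_sketch():
    block = RepVggBlock(in_channels=8, out_channels=8, act='relu').eval()
    x = torch.randn(1, 8, 32, 32)
    y_multi_branch = block(x)
    block.switch_to_deploy()
    y_fused = block(x)
    return torch.allclose(y_multi_branch, y_fused, atol=1e-5)  # True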

+ 196
- 0
modelscope/models/cv/tinynas_detection/core/utils.py View File

@@ -0,0 +1,196 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import numpy as np
import torch
import torchvision

__all__ = [
'filter_box',
'postprocess_airdet',
'bboxes_iou',
'matrix_iou',
'adjust_box_anns',
'xyxy2xywh',
'xyxy2cxcywh',
]


def multiclass_nms(multi_bboxes,
multi_scores,
score_thr,
iou_thr,
max_num=100,
score_factors=None):
"""NMS for multi-class bboxes.

Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_scores (Tensor): shape (n, #class), where the last column
contains scores of the background class, but this will be ignored.
score_thr (float): bbox threshold, bboxes with scores lower than it
will not be considered.
        iou_thr (float): NMS IoU threshold.
        max_num (int): if there are more than max_num bboxes after NMS,
            only top max_num will be kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS.

    Returns:
        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k,) and
            (k,). Labels are 0-based.
"""
num_classes = multi_scores.size(1)
# exclude background category
if multi_bboxes.shape[1] > 4:
bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
else:
bboxes = multi_bboxes[:, None].expand(
multi_scores.size(0), num_classes, 4)
scores = multi_scores
# filter out boxes with low scores
valid_mask = scores > score_thr # 1000 * 80 bool

# We use masked_select for ONNX exporting purpose,
# which is equivalent to bboxes = bboxes[valid_mask]
# (TODO): as ONNX does not support repeat now,
# we have to use this ugly code
# bboxes -> 1000, 4
bboxes = torch.masked_select(
bboxes,
torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
-1)).view(-1, 4) # mask-> 1000*80*4, 80000*4
if score_factors is not None:
scores = scores * score_factors[:, None]
scores = torch.masked_select(scores, valid_mask)
labels = valid_mask.nonzero(as_tuple=False)[:, 1]

if bboxes.numel() == 0:
bboxes = multi_bboxes.new_zeros((0, 5))
labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
scores = multi_bboxes.new_zeros((0, ))

return bboxes, scores, labels

keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)

if max_num > 0:
keep = keep[:max_num]

return bboxes[keep], scores[keep], labels[keep]
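

# A minimal usage sketch (not part of the original file): class-aware NMS on a
# few dummy boxes with (n, 4) boxes and (n, num_classes) scores; the two
# heavily overlapping class-0 boxes collapse into a single detection.
def _multiclass_nms_sketch():
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],
                          [50., 50., 60., 60.]])
    scores = torch.tensor([[0.9, 0.1],
                           [0.8, 0.2],
                           [0.1, 0.95]])
    return multiclass_nms(boxes, scores, score_thr=0.3, iou_thr=0.5, max_num=10)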


def filter_box(output, scale_range):
"""
output: (N, 5+class) shape
"""
min_scale, max_scale = scale_range
w = output[:, 2] - output[:, 0]
h = output[:, 3] - output[:, 1]
keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale)
return output[keep]


def filter_results(boxlist, num_classes, nms_thre):
boxes = boxlist.bbox
scores = boxlist.get_field('scores')
cls = boxlist.get_field('labels')
nms_out_index = torchvision.ops.batched_nms(
boxes,
scores,
cls,
nms_thre,
)
boxlist = boxlist[nms_out_index]

return boxlist


def postprocess_airdet(prediction,
num_classes,
conf_thre=0.7,
nms_thre=0.45,
imgs=None):
box_corner = prediction.new(prediction.shape)
box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
prediction[:, :, :4] = box_corner[:, :, :4]
output = [None for _ in range(len(prediction))]
for i, image_pred in enumerate(prediction):
# If none are remaining => process next image
if not image_pred.size(0):
continue
multi_bboxes = image_pred[:, :4]
multi_scores = image_pred[:, 5:]
detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores,
conf_thre, nms_thre, 500)
detections = torch.cat(
(detections, scores[:, None], scores[:, None], labels[:, None]),
dim=1)

if output[i] is None:
output[i] = detections
else:
output[i] = torch.cat((output[i], detections))
return output


def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
raise IndexError

if xyxy:
tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
else:
tl = torch.max(
(bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
(bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
)
br = torch.min(
(bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
(bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
)

area_a = torch.prod(bboxes_a[:, 2:], 1)
area_b = torch.prod(bboxes_b[:, 2:], 1)
en = (tl < br).type(tl.type()).prod(dim=2)
area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all())
return area_i / (area_a[:, None] + area_b - area_i)


def matrix_iou(a, b):
"""
    Return IoU of a and b, numpy version for data augmentation.
"""
lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])

area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12)


def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max)
bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max)
return bbox


def xyxy2xywh(bboxes):
bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
return bboxes


def xyxy2cxcywh(bboxes):
bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5
bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5
return bboxes
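

# A quick sketch (not part of the original file): both converters modify the
# input in place, so pass a copy when the corner format is still needed.
def _xyxy2cxcywh_sketch():
    boxes = np.array([[10., 20., 30., 60.]])
    return xyxy2cxcywh(boxes.copy())  # [[20., 40., 20., 40.]] -> cx, cy, w, h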

+ 181
- 0
modelscope/models/cv/tinynas_detection/detector.py View File

@@ -0,0 +1,181 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import os.path as osp
import pickle

import cv2
import torch
import torchvision

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .backbone import build_backbone
from .head import build_head
from .neck import build_neck
from .utils import parse_config


class SingleStageDetector(TorchModel):
"""
The base class of single stage detector.
"""

def __init__(self, model_dir: str, *args, **kwargs):
"""
init model by cfg
"""
super().__init__(model_dir, *args, **kwargs)

config_path = osp.join(model_dir, 'airdet_s.py')
config = parse_config(config_path)
self.cfg = config
model_path = osp.join(model_dir, config.model.name)
label_map = osp.join(model_dir, config.model.class_map)
self.label_map = pickle.load(open(label_map, 'rb'))
self.size_divisible = config.dataset.size_divisibility
self.num_classes = config.model.head.num_classes
self.conf_thre = config.model.head.nms_conf_thre
self.nms_thre = config.model.head.nms_iou_thre

self.backbone = build_backbone(self.cfg.model.backbone)
self.neck = build_neck(self.cfg.model.neck)
self.head = build_head(self.cfg.model.head)

self.load_pretrain_model(model_path)

def load_pretrain_model(self, pretrain_model):

state_dict = torch.load(pretrain_model, map_location='cpu')['model']
new_state_dict = {}
for k, v in state_dict.items():
k = k.replace('module.', '')
new_state_dict[k] = v
self.load_state_dict(new_state_dict, strict=True)

def inference(self, x):

if self.training:
return self.forward_train(x)
else:
return self.forward_eval(x)

def forward_train(self, x):

pass

def forward_eval(self, x):

x = self.backbone(x)
x = self.neck(x)
prediction = self.head(x)

return prediction

def preprocess(self, image):
image = torch.from_numpy(image).type(torch.float32)
image = image.permute(2, 0, 1)
shape = image.shape # c, h, w
if self.size_divisible > 0:
import math
stride = self.size_divisible
shape = list(shape)
shape[1] = int(math.ceil(shape[1] / stride) * stride)
shape[2] = int(math.ceil(shape[2] / stride) * stride)
shape = tuple(shape)
pad_img = image.new(*shape).zero_()
pad_img[:, :image.shape[1], :image.shape[2]].copy_(image)
pad_img = pad_img.unsqueeze(0)

return pad_img

def postprocess(self, preds):
bboxes, scores, labels_idx = postprocess_gfocal(
preds, self.num_classes, self.conf_thre, self.nms_thre)
bboxes = bboxes.cpu().numpy()
scores = scores.cpu().numpy()
labels_idx = labels_idx.cpu().numpy()
labels = [self.label_map[idx + 1][0]['name'] for idx in labels_idx]

return (bboxes, scores, labels)
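

# A hypothetical end-to-end sketch (not part of the original file); the
# model_dir argument is assumed to contain 'airdet_s.py', the checkpoint and
# the pickled class map referenced by that config, exactly as __init__ expects.
def _single_stage_detector_sketch(model_dir, image_path):
    detector = SingleStageDetector(model_dir).eval()
    img = cv2.imread(image_path)  # HWC, BGR, uint8
    with torch.no_grad():
        batch = detector.preprocess(img)
        preds = detector.inference(batch)
    return detector.postprocess(preds)  # (bboxes, scores, labels)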


def multiclass_nms(multi_bboxes,
multi_scores,
score_thr,
iou_thr,
max_num=100,
score_factors=None):
"""NMS for multi-class bboxes.

Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_scores (Tensor): shape (n, #class), where the last column
contains scores of the background class, but this will be ignored.
score_thr (float): bbox threshold, bboxes with scores lower than it
will not be considered.
        iou_thr (float): NMS IoU threshold.
        max_num (int): if there are more than max_num bboxes after NMS,
            only top max_num will be kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS.

    Returns:
        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k,) and
            (k,). Labels are 0-based.
"""
num_classes = multi_scores.size(1)
# exclude background category
if multi_bboxes.shape[1] > 4:
bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
else:
bboxes = multi_bboxes[:, None].expand(
multi_scores.size(0), num_classes, 4)
scores = multi_scores
# filter out boxes with low scores
valid_mask = scores > score_thr # 1000 * 80 bool

# We use masked_select for ONNX exporting purpose,
# which is equivalent to bboxes = bboxes[valid_mask]
# (TODO): as ONNX does not support repeat now,
# we have to use this ugly code
# bboxes -> 1000, 4
bboxes = torch.masked_select(
bboxes,
torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
-1)).view(-1, 4) # mask-> 1000*80*4, 80000*4
if score_factors is not None:
scores = scores * score_factors[:, None]
scores = torch.masked_select(scores, valid_mask)
labels = valid_mask.nonzero(as_tuple=False)[:, 1]

if bboxes.numel() == 0:
bboxes = multi_bboxes.new_zeros((0, 5))
labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
scores = multi_bboxes.new_zeros((0, ))

return bboxes, scores, labels

keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)

if max_num > 0:
keep = keep[:max_num]

return bboxes[keep], scores[keep], labels[keep]


def postprocess_gfocal(prediction, num_classes, conf_thre=0.05, nms_thre=0.7):
assert prediction.shape[0] == 1
for i, image_pred in enumerate(prediction):
# If none are remaining => process next image
if not image_pred.size(0):
continue
multi_bboxes = image_pred[:, :4]
multi_scores = image_pred[:, 4:]
detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores,
conf_thre, nms_thre, 500)

return detections, scores, labels

+ 16
- 0
modelscope/models/cv/tinynas_detection/head/__init__.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import copy

from .gfocal_v2_tiny import GFocalHead_Tiny


def build_head(cfg):

head_cfg = copy.deepcopy(cfg)
name = head_cfg.pop('name')
if name == 'GFocalV2':
return GFocalHead_Tiny(**head_cfg)
else:
raise NotImplementedError

+ 361
- 0
modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py View File

@@ -0,0 +1,361 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import functools
from functools import partial

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..core.base_ops import BaseConv, DWConv


class Scale(nn.Module):

def __init__(self, scale=1.0):
super(Scale, self).__init__()
self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))

def forward(self, x):
return x * self.scale


def multi_apply(func, *args, **kwargs):

pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))


def xyxy2CxCywh(xyxy, size=None):
x1 = xyxy[..., 0]
y1 = xyxy[..., 1]
x2 = xyxy[..., 2]
y2 = xyxy[..., 3]

cx = (x1 + x2) / 2
cy = (y1 + y2) / 2

w = x2 - x1
h = y2 - y1
if size is not None:
w = w.clamp(min=0, max=size[1])
h = h.clamp(min=0, max=size[0])
return torch.stack([cx, cy, w, h], axis=-1)


def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
"""
x1 = points[..., 0] - distance[..., 0]
y1 = points[..., 1] - distance[..., 1]
x2 = points[..., 0] + distance[..., 2]
y2 = points[..., 1] + distance[..., 3]
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1])
y1 = y1.clamp(min=0, max=max_shape[0])
x2 = x2.clamp(min=0, max=max_shape[1])
y2 = y2.clamp(min=0, max=max_shape[0])
return torch.stack([x1, y1, x2, y2], -1)


def bbox2distance(points, bbox, max_dis=None, eps=0.1):
"""Decode bounding box based on distances.
"""
left = points[:, 0] - bbox[:, 0]
top = points[:, 1] - bbox[:, 1]
right = bbox[:, 2] - points[:, 0]
bottom = bbox[:, 3] - points[:, 1]
if max_dis is not None:
left = left.clamp(min=0, max=max_dis - eps)
top = top.clamp(min=0, max=max_dis - eps)
right = right.clamp(min=0, max=max_dis - eps)
bottom = bottom.clamp(min=0, max=max_dis - eps)
return torch.stack([left, top, right, bottom], -1)
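

# A small round-trip sketch (not part of the original file): encoding a box
# into per-side distances with bbox2distance and decoding with distance2bbox
# recovers the original corners exactly (no clamping applied).
def _distance_roundtrip_sketch():
    points = torch.tensor([[16., 16.]])
    boxes = torch.tensor([[4., 8., 28., 30.]])
    distances = bbox2distance(points, boxes)  # left, top, right, bottom
    return distance2bbox(points, distances)   # back to x1, y1, x2, y2 == boxes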


class Integral(nn.Module):
"""A fixed layer for calculating integral result from distribution.
"""

def __init__(self, reg_max=16):
super(Integral, self).__init__()
self.reg_max = reg_max
self.register_buffer('project',
torch.linspace(0, self.reg_max, self.reg_max + 1))

def forward(self, x):
"""Forward feature from the regression head to get integral result of
bounding box location.
"""
shape = x.size()
x = F.softmax(x.reshape(*shape[:-1], 4, self.reg_max + 1), dim=-1)
b, nb, ne, _ = x.size()
x = x.reshape(b * nb * ne, self.reg_max + 1)
y = self.project.type_as(x).unsqueeze(1)
x = torch.matmul(x, y).reshape(b, nb, 4)
return x
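

# A minimal sketch (not part of the original file): all-zero logits give a
# uniform softmax over the reg_max + 1 bins, so the expected bin index is
# reg_max / 2 = 8.0 for every one of the four box sides.
def _integral_sketch():
    integral = Integral(reg_max=16)
    logits = torch.zeros(1, 3, 4 * 17)  # (batch, num_priors, 4 * (reg_max + 1))
    return integral(logits)             # shape (1, 3, 4), every entry 8.0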


class GFocalHead_Tiny(nn.Module):
"""Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality
Estimation for Dense Object Detection.
"""

def __init__(
self,
num_classes,
in_channels,
stacked_convs=4, # 4
feat_channels=256,
reg_max=12,
reg_topk=4,
reg_channels=64,
strides=[8, 16, 32],
add_mean=True,
norm='gn',
act='relu',
start_kernel_size=3,
conv_groups=1,
conv_type='BaseConv',
simOTA_cls_weight=1.0,
simOTA_iou_weight=3.0,
octbase=8,
simlqe=False,
**kwargs):
self.simlqe = simlqe
self.num_classes = num_classes
self.in_channels = in_channels
self.strides = strides
self.feat_channels = feat_channels if isinstance(feat_channels, list) \
else [feat_channels] * len(self.strides)

        self.cls_out_channels = num_classes + 1  # add 1 to keep consistency with former models;
        # this extra channel will be deprecated in the future.
self.stacked_convs = stacked_convs
self.conv_groups = conv_groups
self.reg_max = reg_max
self.reg_topk = reg_topk
self.reg_channels = reg_channels
self.add_mean = add_mean
self.total_dim = reg_topk
self.start_kernel_size = start_kernel_size

self.norm = norm
self.act = act
self.conv_module = DWConv if conv_type == 'DWConv' else BaseConv

if add_mean:
self.total_dim += 1

super(GFocalHead_Tiny, self).__init__()
self.integral = Integral(self.reg_max)

self._init_layers()

def _build_not_shared_convs(self, in_channel, feat_channels):
self.relu = nn.ReLU(inplace=True)
cls_convs = nn.ModuleList()
reg_convs = nn.ModuleList()

for i in range(self.stacked_convs):
chn = feat_channels if i > 0 else in_channel
kernel_size = 3 if i > 0 else self.start_kernel_size
cls_convs.append(
self.conv_module(
chn,
feat_channels,
kernel_size,
stride=1,
groups=self.conv_groups,
norm=self.norm,
act=self.act))
reg_convs.append(
self.conv_module(
chn,
feat_channels,
kernel_size,
stride=1,
groups=self.conv_groups,
norm=self.norm,
act=self.act))
if not self.simlqe:
conf_vector = [nn.Conv2d(4 * self.total_dim, self.reg_channels, 1)]
else:
conf_vector = [
nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1)
]
conf_vector += [self.relu]
conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()]
reg_conf = nn.Sequential(*conf_vector)

return cls_convs, reg_convs, reg_conf

def _init_layers(self):
"""Initialize layers of the head."""
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
self.reg_confs = nn.ModuleList()

for i in range(len(self.strides)):
cls_convs, reg_convs, reg_conf = self._build_not_shared_convs(
self.in_channels[i], self.feat_channels[i])
self.cls_convs.append(cls_convs)
self.reg_convs.append(reg_convs)
self.reg_confs.append(reg_conf)

self.gfl_cls = nn.ModuleList([
nn.Conv2d(
self.feat_channels[i], self.cls_out_channels, 3, padding=1)
for i in range(len(self.strides))
])

self.gfl_reg = nn.ModuleList([
nn.Conv2d(
self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1)
for i in range(len(self.strides))
])

self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])

def forward(self,
xin,
labels=None,
imgs=None,
conf_thre=0.05,
nms_thre=0.7):

# prepare labels during training
b, c, h, w = xin[0].shape
if labels is not None:
gt_bbox_list = []
gt_cls_list = []
for label in labels:
gt_bbox_list.append(label.bbox)
gt_cls_list.append((label.get_field('labels')
- 1).long()) # labels starts from 1

# prepare priors for label assignment and bbox decode
mlvl_priors_list = [
self.get_single_level_center_priors(
xin[i].shape[0],
xin[i].shape[-2:],
stride,
dtype=torch.float32,
device=xin[0].device) for i, stride in enumerate(self.strides)
]
mlvl_priors = torch.cat(mlvl_priors_list, dim=1)

# forward for bboxes and classification prediction
cls_scores, bbox_preds = multi_apply(
self.forward_single,
xin,
self.cls_convs,
self.reg_convs,
self.gfl_cls,
self.gfl_reg,
self.reg_confs,
self.scales,
)
flatten_cls_scores = torch.cat(cls_scores, dim=1)
flatten_bbox_preds = torch.cat(bbox_preds, dim=1)

# calculating losses or bboxes decoded
if self.training:
loss = self.loss(flatten_cls_scores, flatten_bbox_preds,
gt_bbox_list, gt_cls_list, mlvl_priors)
return loss
else:
output = self.get_bboxes(flatten_cls_scores, flatten_bbox_preds,
mlvl_priors)
return output

def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg,
reg_conf, scale):
"""Forward feature of a single scale level.

"""
cls_feat = x
reg_feat = x

for cls_conv in cls_convs:
cls_feat = cls_conv(cls_feat)
for reg_conv in reg_convs:
reg_feat = reg_conv(reg_feat)

bbox_pred = scale(gfl_reg(reg_feat)).float()
N, C, H, W = bbox_pred.size()
prob = F.softmax(
bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2)
if not self.simlqe:
prob_topk, _ = prob.topk(self.reg_topk, dim=2)

if self.add_mean:
stat = torch.cat(
[prob_topk, prob_topk.mean(dim=2, keepdim=True)], dim=2)
else:
stat = prob_topk

quality_score = reg_conf(stat.reshape(N, 4 * self.total_dim, H, W))
else:
quality_score = reg_conf(
bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W))

cls_score = gfl_cls(cls_feat).sigmoid() * quality_score

flatten_cls_score = cls_score.flatten(start_dim=2).transpose(1, 2)
flatten_bbox_pred = bbox_pred.flatten(start_dim=2).transpose(1, 2)
return flatten_cls_score, flatten_bbox_pred

def get_single_level_center_priors(self, batch_size, featmap_size, stride,
dtype, device):

h, w = featmap_size
x_range = (torch.arange(0, int(w), dtype=dtype,
device=device)) * stride
y_range = (torch.arange(0, int(h), dtype=dtype,
device=device)) * stride

x = x_range.repeat(h, 1)
y = y_range.unsqueeze(-1).repeat(1, w)

y = y.flatten()
x = x.flatten()
strides = x.new_full((x.shape[0], ), stride)
priors = torch.stack([x, y, strides, strides], dim=-1)

return priors.unsqueeze(0).repeat(batch_size, 1, 1)

def sample(self, assign_result, gt_bboxes):
pos_inds = torch.nonzero(
assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
neg_inds = torch.nonzero(
assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1

if gt_bboxes.numel() == 0:
# hack for index error case
assert pos_assigned_gt_inds.numel() == 0
pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
else:
if len(gt_bboxes.shape) < 2:
gt_bboxes = gt_bboxes.view(-1, 4)
pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]

return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds

def get_bboxes(self,
cls_preds,
reg_preds,
mlvl_center_priors,
img_meta=None):

dis_preds = self.integral(reg_preds) * mlvl_center_priors[..., 2, None]
bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds)

res = torch.cat([bboxes, cls_preds[..., 0:self.num_classes]], dim=-1)

return res

+ 16
- 0
modelscope/models/cv/tinynas_detection/neck/__init__.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import copy

from .giraffe_fpn import GiraffeNeck
from .giraffe_fpn_v2 import GiraffeNeckV2


def build_neck(cfg):
neck_cfg = copy.deepcopy(cfg)
name = neck_cfg.pop('name')
if name == 'GiraffeNeck':
return GiraffeNeck(**neck_cfg)
    elif name == 'GiraffeNeckV2':
        return GiraffeNeckV2(**neck_cfg)
    else:
        raise NotImplementedError

+ 235
- 0
modelscope/models/cv/tinynas_detection/neck/giraffe_config.py View File

@@ -0,0 +1,235 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import collections
import itertools
import os

import networkx as nx
from omegaconf import OmegaConf

Node = collections.namedtuple('Node', ['id', 'inputs', 'type'])


def get_graph_info(graph):
input_nodes = []
output_nodes = []
Nodes = []
for node in range(graph.number_of_nodes()):
tmp = list(graph.neighbors(node))
tmp.sort()
type = -1
if node < tmp[0]:
input_nodes.append(node)
type = 0
if node > tmp[-1]:
output_nodes.append(node)
type = 1
Nodes.append(Node(node, [n for n in tmp if n < node], type))
return Nodes, input_nodes, output_nodes


def nodeid_trans(id, cur_level, num_levels):
if id % 2 == 1:
gap = int(((id + 1) // 2) * num_levels * 2)
else:
a = (num_levels - cur_level) * 2 - 1
b = ((id + 1) // 2) * num_levels * 2
gap = int(a + b)
return cur_level + gap


def gen_log2n_graph_file(log2n_graph_file, depth_multiplier):
f = open(log2n_graph_file, 'w')
for i in range(depth_multiplier):
for j in [1, 2, 4, 8, 16, 32]:
if i - j < 0:
break
else:
f.write('%d,%d\n' % (i - j, i))
f.close()


def get_log2n_graph(depth_multiplier):
nodes = []
connnections = []

for i in range(depth_multiplier):
nodes.append(i)
for j in [1, 2, 4, 8, 16, 32]:
if i - j < 0:
break
else:
connnections.append((i - j, i))
return nodes, connnections


def get_dense_graph(depth_multiplier):
nodes = []
connections = []

for i in range(depth_multiplier):
nodes.append(i)
for j in range(i):
connections.append((j, i))
return nodes, connections


def giraffeneck_config(min_level,
max_level,
weight_method=None,
depth_multiplier=5,
with_backslash=False,
with_slash=False,
with_skip_connect=False,
skip_connect_type='dense'):
"""Graph config with log2n merge and panet"""
if skip_connect_type == 'dense':
nodes, connections = get_dense_graph(depth_multiplier)
elif skip_connect_type == 'log2n':
nodes, connections = get_log2n_graph(depth_multiplier)
graph = nx.Graph()
graph.add_nodes_from(nodes)
graph.add_edges_from(connections)

drop_node = []
nodes, input_nodes, output_nodes = get_graph_info(graph)

weight_method = weight_method or 'fastattn'

num_levels = max_level - min_level + 1
node_ids = {min_level + i: [i] for i in range(num_levels)}
node_ids_per_layer = {}

pnodes = {}

def update_drop_node(new_id, input_offsets):
if new_id not in drop_node:
new_id = new_id
else:
while new_id in drop_node:
if new_id in pnodes:
for n in pnodes[new_id]['inputs_offsets']:
if n not in input_offsets and n not in drop_node:
input_offsets.append(n)
new_id = new_id - 1
if new_id not in input_offsets:
input_offsets.append(new_id)

# top-down layer
for i in range(max_level, min_level - 1, -1):
node_ids_per_layer[i] = []
for id, node in enumerate(nodes):
input_offsets = []
if id in input_nodes:
input_offsets.append(node_ids[i][0])
else:
if with_skip_connect:
for input_id in node.inputs:
new_id = nodeid_trans(input_id, i - min_level,
num_levels)
update_drop_node(new_id, input_offsets)

# add top2down
new_id = nodeid_trans(id, i - min_level, num_levels)

# add backslash node
def cal_backslash_node(id):
ind = id // num_levels
mod = id % num_levels
if ind % 2 == 0: # even
if mod == (num_levels - 1):
last = -1
else:
last = (ind - 1) * num_levels + (
num_levels - 1 - mod - 1)
else: # odd
if mod == 0:
last = -1
else:
last = (ind - 1) * num_levels + (
num_levels - 1 - mod + 1)

return last

# add slash node
def cal_slash_node(id):
ind = id // num_levels
mod = id % num_levels
if ind % 2 == 1: # odd
if mod == (num_levels - 1):
last = -1
else:
last = (ind - 1) * num_levels + (
num_levels - 1 - mod - 1)
else: # even
if mod == 0:
last = -1
else:
last = (ind - 1) * num_levels + (
num_levels - 1 - mod + 1)

return last

# add last node
last = new_id - 1
update_drop_node(last, input_offsets)

if with_backslash:
backslash = cal_backslash_node(new_id)
if backslash != -1 and backslash not in input_offsets:
input_offsets.append(backslash)

if with_slash:
slash = cal_slash_node(new_id)
if slash != -1 and slash not in input_offsets:
input_offsets.append(slash)

if new_id in drop_node:
input_offsets = []

pnodes[new_id] = {
'reduction': 1 << i,
'inputs_offsets': input_offsets,
'weight_method': weight_method,
'is_out': 0,
}

input_offsets = []
for out_id in output_nodes:
new_id = nodeid_trans(out_id, i - min_level, num_levels)
input_offsets.append(new_id)

pnodes[node_ids[i][0] + num_levels * (len(nodes) + 1)] = {
'reduction': 1 << i,
'inputs_offsets': input_offsets,
'weight_method': weight_method,
'is_out': 1,
}

pnodes = dict(sorted(pnodes.items(), key=lambda x: x[0]))
return pnodes


def get_graph_config(fpn_name,
min_level=3,
max_level=7,
weight_method='concat',
depth_multiplier=5,
with_backslash=False,
with_slash=False,
with_skip_connect=False,
skip_connect_type='dense'):
name_to_config = {
'giraffeneck':
giraffeneck_config(
min_level=min_level,
max_level=max_level,
weight_method=weight_method,
depth_multiplier=depth_multiplier,
with_backslash=with_backslash,
with_slash=with_slash,
with_skip_connect=with_skip_connect,
skip_connect_type=skip_connect_type),
}
return name_to_config[fpn_name]

+ 661
- 0
modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py View File

@@ -0,0 +1,661 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import logging
import math
from collections import OrderedDict
from functools import partial
from typing import Callable, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm import create_model
from timm.models.layers import (Swish, create_conv2d, create_pool2d,
get_act_layer)

from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer
from .giraffe_config import get_graph_config

_ACT_LAYER = Swish


class SequentialList(nn.Sequential):
""" This module exists to work around torchscript typing issues list -> list"""

def __init__(self, *args):
super(SequentialList, self).__init__(*args)

def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
for module in self:
x = module(x)
return x


class ConvBnAct2d(nn.Module):

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
dilation=1,
padding='',
bias=False,
norm_layer=nn.BatchNorm2d,
act_layer=_ACT_LAYER):
super(ConvBnAct2d, self).__init__()

self.conv = create_conv2d(
in_channels,
out_channels,
kernel_size,
stride=stride,
dilation=dilation,
padding=padding,
bias=bias)
self.bn = None if norm_layer is None else norm_layer(out_channels)
self.act = None if act_layer is None else act_layer(inplace=True)

def forward(self, x):
x = self.conv(x)
if self.bn is not None:
x = self.bn(x)
if self.act is not None:
x = self.act(x)
return x


class SeparableConv2d(nn.Module):
""" Separable Conv
"""

def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
dilation=1,
padding='',
bias=False,
channel_multiplier=1.0,
pw_kernel_size=1,
norm_layer=nn.BatchNorm2d,
act_layer=_ACT_LAYER):
super(SeparableConv2d, self).__init__()
self.conv_dw = create_conv2d(
in_channels,
int(in_channels * channel_multiplier),
kernel_size,
stride=stride,
dilation=dilation,
padding=padding,
depthwise=True)

self.conv_pw = create_conv2d(
int(in_channels * channel_multiplier),
out_channels,
pw_kernel_size,
padding=padding,
bias=bias)

self.bn = None if norm_layer is None else norm_layer(out_channels)
self.act = None if act_layer is None else act_layer(inplace=True)

def forward(self, x):
x = self.conv_dw(x)
x = self.conv_pw(x)
if self.bn is not None:
x = self.bn(x)
if self.act is not None:
x = self.act(x)
return x


def _init_weight(
m,
n='',
):
""" Weight initialization as per Tensorflow official implementations.
"""

def _fan_in_out(w, groups=1):
dimensions = w.dim()
if dimensions < 2:
raise ValueError(
'Fan in and fan out can not be computed for tensor with fewer than 2 dimensions'
)
num_input_fmaps = w.size(1)
num_output_fmaps = w.size(0)
receptive_field_size = 1
if w.dim() > 2:
receptive_field_size = w[0][0].numel()
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
fan_out //= groups
return fan_in, fan_out

def _glorot_uniform(w, gain=1, groups=1):
fan_in, fan_out = _fan_in_out(w, groups)
gain /= max(1., (fan_in + fan_out) / 2.) # fan avg
limit = math.sqrt(3.0 * gain)
w.data.uniform_(-limit, limit)

def _variance_scaling(w, gain=1, groups=1):
fan_in, fan_out = _fan_in_out(w, groups)
gain /= max(1., fan_in) # fan in
std = math.sqrt(gain)
w.data.normal_(std=std)

if isinstance(m, SeparableConv2d):
if 'box_net' in n or 'class_net' in n:
_variance_scaling(m.conv_dw.weight, groups=m.conv_dw.groups)
_variance_scaling(m.conv_pw.weight)
if m.conv_pw.bias is not None:
if 'class_net.predict' in n:
m.conv_pw.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
else:
m.conv_pw.bias.data.zero_()
else:
_glorot_uniform(m.conv_dw.weight, groups=m.conv_dw.groups)
_glorot_uniform(m.conv_pw.weight)
if m.conv_pw.bias is not None:
m.conv_pw.bias.data.zero_()
elif isinstance(m, ConvBnAct2d):
if 'box_net' in n or 'class_net' in n:
m.conv.weight.data.normal_(std=.01)
if m.conv.bias is not None:
if 'class_net.predict' in n:
m.conv.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
else:
m.conv.bias.data.zero_()
else:
_glorot_uniform(m.conv.weight)
if m.conv.bias is not None:
m.conv.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1.0)
m.bias.data.zero_()


def _init_weight_alt(
m,
n='',
):
""" Weight initialization alternative, based on EfficientNet bacbkone init w/ class bias addition
NOTE: this will likely be removed after some experimentation
"""
if isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
if 'class_net.predict' in n:
m.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
else:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1.0)
m.bias.data.zero_()


class Interpolate2d(nn.Module):
r"""Resamples a 2d Image

The input data is assumed to be of the form
`minibatch x channels x [optional depth] x [optional height] x width`.
Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor.

The algorithms available for upsampling are nearest neighbor and linear,
bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor,
respectively.

One can either give a :attr:`scale_factor` or the target output :attr:`size` to
calculate the output size. (You cannot give both, as it is ambiguous)

Args:
size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional):
output spatial sizes
scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional):
multiplier for spatial size. Has to match input size if it is a tuple.
mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
Default: ``'nearest'``
align_corners (bool, optional): if ``True``, the corner pixels of the input
and output tensors are aligned, and thus preserving the values at
those pixels. This only has effect when :attr:`mode` is
``'linear'``, ``'bilinear'``, or ``'trilinear'``. Default: ``False``
"""
__constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name']
name: str
size: Optional[Union[int, Tuple[int, int]]]
scale_factor: Optional[Union[float, Tuple[float, float]]]
mode: str
align_corners: Optional[bool]

def __init__(self,
size: Optional[Union[int, Tuple[int, int]]] = None,
scale_factor: Optional[Union[float, Tuple[float,
float]]] = None,
mode: str = 'nearest',
align_corners: bool = False) -> None:
super(Interpolate2d, self).__init__()
self.name = type(self).__name__
self.size = size
if isinstance(scale_factor, tuple):
self.scale_factor = tuple(float(factor) for factor in scale_factor)
else:
self.scale_factor = float(scale_factor) if scale_factor else None
self.mode = mode
self.align_corners = None if mode == 'nearest' else align_corners

def forward(self, input: torch.Tensor) -> torch.Tensor:
return F.interpolate(
input,
self.size,
self.scale_factor,
self.mode,
self.align_corners,
recompute_scale_factor=False)
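

# A minimal sketch (not part of the original file): Interpolate2d forwards to
# F.interpolate with recompute_scale_factor=False; nearest-neighbour
# upsampling by a factor of 2 doubles both spatial dimensions.
def _interpolate2d_sketch():
    up = Interpolate2d(scale_factor=2.0, mode='nearest')
    return up(torch.randn(1, 8, 16, 16)).shape  # torch.Size([1, 8, 32, 32])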


class ResampleFeatureMap(nn.Sequential):

def __init__(self,
in_channels,
out_channels,
reduction_ratio=1.,
pad_type='',
downsample=None,
upsample=None,
norm_layer=nn.BatchNorm2d,
apply_bn=False,
conv_after_downsample=False,
redundant_bias=False):
super(ResampleFeatureMap, self).__init__()
downsample = downsample or 'max'
upsample = upsample or 'nearest'
self.in_channels = in_channels
self.out_channels = out_channels
self.reduction_ratio = reduction_ratio
self.conv_after_downsample = conv_after_downsample

conv = None
if in_channels != out_channels:
conv = ConvBnAct2d(
in_channels,
out_channels,
kernel_size=1,
padding=pad_type,
norm_layer=norm_layer if apply_bn else None,
bias=not apply_bn or redundant_bias,
act_layer=None)

if reduction_ratio > 1:
if conv is not None and not self.conv_after_downsample:
self.add_module('conv', conv)
if downsample in ('max', 'avg'):
stride_size = int(reduction_ratio)
downsample = create_pool2d(
downsample,
kernel_size=stride_size + 1,
stride=stride_size,
padding=pad_type)
else:
downsample = Interpolate2d(
scale_factor=1. / reduction_ratio, mode=downsample)
self.add_module('downsample', downsample)
if conv is not None and self.conv_after_downsample:
self.add_module('conv', conv)
else:
if conv is not None:
self.add_module('conv', conv)
if reduction_ratio < 1:
scale = int(1 // reduction_ratio)
self.add_module(
'upsample',
Interpolate2d(scale_factor=scale, mode=upsample))


class GiraffeCombine(nn.Module):

def __init__(self,
feature_info,
fpn_config,
fpn_channels,
inputs_offsets,
target_reduction,
pad_type='',
downsample=None,
upsample=None,
norm_layer=nn.BatchNorm2d,
apply_resample_bn=False,
conv_after_downsample=False,
redundant_bias=False,
weight_method='attn'):
super(GiraffeCombine, self).__init__()
self.inputs_offsets = inputs_offsets
self.weight_method = weight_method

self.resample = nn.ModuleDict()
reduction_base = feature_info[0]['reduction']

target_channels_idx = int(
math.log(target_reduction // reduction_base, 2))
for idx, offset in enumerate(inputs_offsets):
if offset < len(feature_info):
in_channels = feature_info[offset]['num_chs']
input_reduction = feature_info[offset]['reduction']
else:
node_idx = offset
input_reduction = fpn_config[node_idx]['reduction']
# in_channels = fpn_config[node_idx]['num_chs']
input_channels_idx = int(
math.log(input_reduction // reduction_base, 2))
in_channels = feature_info[input_channels_idx]['num_chs']

reduction_ratio = target_reduction / input_reduction
if weight_method == 'concat':
self.resample[str(offset)] = ResampleFeatureMap(
in_channels,
in_channels,
reduction_ratio=reduction_ratio,
pad_type=pad_type,
downsample=downsample,
upsample=upsample,
norm_layer=norm_layer,
apply_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
redundant_bias=redundant_bias)
else:
self.resample[str(offset)] = ResampleFeatureMap(
in_channels,
fpn_channels[target_channels_idx],
reduction_ratio=reduction_ratio,
pad_type=pad_type,
downsample=downsample,
upsample=upsample,
norm_layer=norm_layer,
apply_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
redundant_bias=redundant_bias)

if weight_method == 'attn' or weight_method == 'fastattn':
self.edge_weights = nn.Parameter(
torch.ones(len(inputs_offsets)), requires_grad=True) # WSM
else:
self.edge_weights = None

def forward(self, x: List[torch.Tensor]):
dtype = x[0].dtype
nodes = []
if len(self.inputs_offsets) == 0:
return None
for offset, resample in zip(self.inputs_offsets,
self.resample.values()):
input_node = x[offset]
input_node = resample(input_node)
nodes.append(input_node)

if self.weight_method == 'attn':
normalized_weights = torch.softmax(
self.edge_weights.to(dtype=dtype), dim=0)
out = torch.stack(nodes, dim=-1) * normalized_weights
out = torch.sum(out, dim=-1)
elif self.weight_method == 'fastattn':
edge_weights = nn.functional.relu(
self.edge_weights.to(dtype=dtype))
weights_sum = torch.sum(edge_weights)
weights_norm = weights_sum + 0.0001
out = torch.stack([(nodes[i] * edge_weights[i]) / weights_norm
for i in range(len(nodes))],
dim=-1)

out = torch.sum(out, dim=-1)
elif self.weight_method == 'sum':
out = torch.stack(nodes, dim=-1)
out = torch.sum(out, dim=-1)
elif self.weight_method == 'concat':
out = torch.cat(nodes, dim=1)
else:
raise ValueError('unknown weight_method {}'.format(
self.weight_method))
return out


class GiraffeNode(nn.Module):
""" A simple wrapper used in place of nn.Sequential for torchscript typing
Handles input type List[Tensor] -> output type Tensor
"""

def __init__(self, combine: nn.Module, after_combine: nn.Module):
super(GiraffeNode, self).__init__()
self.combine = combine
self.after_combine = after_combine

def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
combine_feat = self.combine(x)
if combine_feat is None:
return None
else:
return self.after_combine(combine_feat)


class GiraffeLayer(nn.Module):

def __init__(self,
feature_info,
fpn_config,
inner_fpn_channels,
outer_fpn_channels,
num_levels=5,
pad_type='',
downsample=None,
upsample=None,
norm_layer=nn.BatchNorm2d,
act_layer=_ACT_LAYER,
apply_resample_bn=False,
conv_after_downsample=True,
conv_bn_relu_pattern=False,
separable_conv=True,
redundant_bias=False,
merge_type='conv'):
super(GiraffeLayer, self).__init__()
self.num_levels = num_levels
self.conv_bn_relu_pattern = False

self.feature_info = {}
for idx, feat in enumerate(feature_info):
self.feature_info[idx] = feat

self.fnode = nn.ModuleList()
reduction_base = feature_info[0]['reduction']
for i, fnode_cfg in fpn_config.items():
logging.debug('fnode {} : {}'.format(i, fnode_cfg))

if fnode_cfg['is_out'] == 1:
fpn_channels = outer_fpn_channels
else:
fpn_channels = inner_fpn_channels

reduction = fnode_cfg['reduction']
fpn_channels_idx = int(math.log(reduction // reduction_base, 2))
combine = GiraffeCombine(
self.feature_info,
fpn_config,
fpn_channels,
tuple(fnode_cfg['inputs_offsets']),
target_reduction=reduction,
pad_type=pad_type,
downsample=downsample,
upsample=upsample,
norm_layer=norm_layer,
apply_resample_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
redundant_bias=redundant_bias,
weight_method=fnode_cfg['weight_method'])

after_combine = nn.Sequential()

in_channels = 0
out_channels = 0
for input_offset in fnode_cfg['inputs_offsets']:
in_channels += self.feature_info[input_offset]['num_chs']

out_channels = fpn_channels[fpn_channels_idx]

if merge_type == 'csp':
after_combine.add_module(
'CspLayer',
CSPLayer(
in_channels,
out_channels,
2,
shortcut=True,
depthwise=False,
act='silu'))
elif merge_type == 'shuffle':
after_combine.add_module(
'shuffleBlock', ShuffleBlock(in_channels, in_channels))
after_combine.add_module(
'conv1x1',
create_conv2d(in_channels, out_channels, kernel_size=1))
elif merge_type == 'conv':
after_combine.add_module(
'conv1x1',
create_conv2d(in_channels, out_channels, kernel_size=1))
conv_kwargs = dict(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
padding=pad_type,
bias=False,
norm_layer=norm_layer,
act_layer=act_layer)
if not conv_bn_relu_pattern:
conv_kwargs['bias'] = redundant_bias
conv_kwargs['act_layer'] = None
after_combine.add_module('act', act_layer(inplace=True))
after_combine.add_module(
'conv',
SeparableConv2d(**conv_kwargs)
if separable_conv else ConvBnAct2d(**conv_kwargs))

self.fnode.append(
GiraffeNode(combine=combine, after_combine=after_combine))
self.feature_info[i] = dict(
num_chs=fpn_channels[fpn_channels_idx], reduction=reduction)

self.out_feature_info = []
out_node = list(self.feature_info.keys())[-num_levels::]
for i in out_node:
self.out_feature_info.append(self.feature_info[i])

self.feature_info = self.out_feature_info

def forward(self, x: List[torch.Tensor]):
for fn in self.fnode:
x.append(fn(x))
return x[-self.num_levels::]


class GiraffeNeck(nn.Module):

def __init__(self, min_level, max_level, num_levels, norm_layer,
norm_kwargs, act_type, fpn_config, fpn_name, fpn_channels,
out_fpn_channels, weight_method, depth_multiplier,
width_multiplier, with_backslash, with_slash,
with_skip_connect, skip_connect_type, separable_conv,
feature_info, merge_type, pad_type, downsample_type,
upsample_type, apply_resample_bn, conv_after_downsample,
redundant_bias, conv_bn_relu_pattern, alternate_init):
super(GiraffeNeck, self).__init__()

self.num_levels = num_levels
self.min_level = min_level
self.in_features = [0, 1, 2, 3, 4, 5,
6][self.min_level - 1:self.min_level - 1
+ num_levels]
self.alternate_init = alternate_init
norm_layer = norm_layer or nn.BatchNorm2d
if norm_kwargs:
norm_layer = partial(norm_layer, **norm_kwargs)
act_layer = get_act_layer(act_type) or _ACT_LAYER
fpn_config = fpn_config or get_graph_config(
fpn_name,
min_level=min_level,
max_level=max_level,
weight_method=weight_method,
depth_multiplier=depth_multiplier,
with_backslash=with_backslash,
with_slash=with_slash,
with_skip_connect=with_skip_connect,
skip_connect_type=skip_connect_type)

# width scale
for i in range(len(fpn_channels)):
fpn_channels[i] = int(fpn_channels[i] * width_multiplier)

self.resample = nn.ModuleDict()
for level in range(num_levels):
if level < len(feature_info):
in_chs = feature_info[level]['num_chs']
reduction = feature_info[level]['reduction']
else:
# Adds a coarser level by downsampling the last feature map
reduction_ratio = 2
self.resample[str(level)] = ResampleFeatureMap(
in_channels=in_chs,
out_channels=feature_info[level - 1]['num_chs'],
pad_type=pad_type,
downsample=downsample_type,
upsample=upsample_type,
norm_layer=norm_layer,
reduction_ratio=reduction_ratio,
apply_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
redundant_bias=redundant_bias,
)
in_chs = feature_info[level - 1]['num_chs']
reduction = int(reduction * reduction_ratio)
feature_info.append(dict(num_chs=in_chs, reduction=reduction))

self.cell = SequentialList()
logging.debug('building giraffeNeck')
giraffe_layer = GiraffeLayer(
feature_info=feature_info,
fpn_config=fpn_config,
inner_fpn_channels=fpn_channels,
outer_fpn_channels=out_fpn_channels,
num_levels=num_levels,
pad_type=pad_type,
downsample=downsample_type,
upsample=upsample_type,
norm_layer=norm_layer,
act_layer=act_layer,
separable_conv=separable_conv,
apply_resample_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
conv_bn_relu_pattern=conv_bn_relu_pattern,
redundant_bias=redundant_bias,
merge_type=merge_type)
self.cell.add_module('giraffeNeck', giraffe_layer)
feature_info = giraffe_layer.feature_info

def init_weights(self, pretrained=False):
for n, m in self.named_modules():
if 'backbone' not in n:
if self.alternate_init:
_init_weight_alt(m, n)
else:
_init_weight(m, n)

def forward(self, x: List[torch.Tensor]):
if type(x) is tuple:
x = list(x)
x = [x[f] for f in self.in_features]
for resample in self.resample.values():
x.append(resample(x[-1]))
x = self.cell(x)
return x
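
Not part of the diff above, but the resampling decisions made by ResampleFeatureMap boil down to pooling when moving to a coarser level and interpolating when moving to a finer one. A minimal standalone sketch of that choice (plain torch, made-up tensors and ratios; it does not use the actual class):

import torch
import torch.nn.functional as F

def resample_sketch(x: torch.Tensor, reduction_ratio: float) -> torch.Tensor:
    # Illustrative stand-in: pool to go coarser, interpolate to go finer,
    # pass through when the level already matches.
    if reduction_ratio > 1:
        stride = int(reduction_ratio)
        return F.max_pool2d(x, kernel_size=stride + 1, stride=stride, padding=stride // 2)
    if reduction_ratio < 1:
        return F.interpolate(x, scale_factor=int(1 / reduction_ratio), mode='nearest')
    return x

feat = torch.randn(1, 64, 40, 40)      # dummy feature map, stride 16
coarser = resample_sketch(feat, 2.0)   # -> 20x20, i.e. stride 32
finer = resample_sketch(feat, 0.5)     # -> 80x80, i.e. stride 8
print(coarser.shape, finer.shape)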

+ 203
- 0
modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py View File

@@ -0,0 +1,203 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
import torch.nn as nn

from ..core.base_ops import BaseConv, CSPLayer, DWConv
from ..core.neck_ops import CSPStage


class GiraffeNeckV2(nn.Module):

def __init__(
self,
depth=1.0,
width=1.0,
in_features=[2, 3, 4],
in_channels=[256, 512, 1024],
out_channels=[256, 512, 1024],
depthwise=False,
act='silu',
spp=True,
reparam_mode=True,
block_name='BasicBlock',
):
super().__init__()
self.in_features = in_features
self.in_channels = in_channels
Conv = DWConv if depthwise else BaseConv

reparam_mode = reparam_mode

self.upsample = nn.Upsample(scale_factor=2, mode='nearest')

# node x3: input x0, x1
self.bu_conv13 = Conv(
int(in_channels[1] * width),
int(in_channels[1] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_3 = CSPStage(
block_name,
int((in_channels[1] + in_channels[2]) * width),
int(in_channels[2] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_3 = CSPLayer(
int((in_channels[1] + in_channels[2]) * width),
int(in_channels[2] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x4: input x1, x2, x3
self.bu_conv24 = Conv(
int(in_channels[0] * width),
int(in_channels[0] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_4 = CSPStage(
block_name,
int((in_channels[0] + in_channels[1] + in_channels[2])
* width),
int(in_channels[1] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_4 = CSPLayer(
int((in_channels[0] + in_channels[1] + in_channels[2])
* width),
int(in_channels[1] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x5: input x2, x4
if reparam_mode:
self.merge_5 = CSPStage(
block_name,
int((in_channels[1] + in_channels[0]) * width),
int(out_channels[0] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_5 = CSPLayer(
int((in_channels[1] + in_channels[0]) * width),
int(out_channels[0] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x7: input x4, x5
self.bu_conv57 = Conv(
int(out_channels[0] * width),
int(out_channels[0] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_7 = CSPStage(
block_name,
int((out_channels[0] + in_channels[1]) * width),
int(out_channels[1] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_7 = CSPLayer(
int((out_channels[0] + in_channels[1]) * width),
int(out_channels[1] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x6: input x3, x4, x7
self.bu_conv46 = Conv(
int(in_channels[1] * width),
int(in_channels[1] * width),
3,
2,
act=act)
self.bu_conv76 = Conv(
int(out_channels[1] * width),
int(out_channels[1] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_6 = CSPStage(
block_name,
int((in_channels[1] + out_channels[1] + in_channels[2])
* width),
int(out_channels[2] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_6 = CSPLayer(
int((in_channels[1] + out_channels[1] + in_channels[2])
* width),
int(out_channels[2] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

def init_weights(self):
pass

def forward(self, out_features):
"""
Args:
inputs: input images.

Returns:
Tuple[Tensor]: FPN feature.
"""

# backbone
features = [out_features[f] for f in self.in_features]
[x2, x1, x0] = features

# node x3
x13 = self.bu_conv13(x1)
x3 = torch.cat([x0, x13], 1)
x3 = self.merge_3(x3)

# node x4
x34 = self.upsample(x3)
x24 = self.bu_conv24(x2)
x4 = torch.cat([x1, x24, x34], 1)
x4 = self.merge_4(x4)

# node x5
x45 = self.upsample(x4)
x5 = torch.cat([x2, x45], 1)
x5 = self.merge_5(x5)

# node x7
x57 = self.bu_conv57(x5)
x7 = torch.cat([x4, x57], 1)
x7 = self.merge_7(x7)

# node x6
x46 = self.bu_conv46(x4)
x76 = self.bu_conv76(x7)
x6 = torch.cat([x3, x46, x76], 1)
x6 = self.merge_6(x6)

outputs = (x5, x7, x6)
return outputs
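
A quick smoke test of the topology above, not part of the diff. This is only a sketch: it assumes the package layout follows the file path added in this PR and that the supporting ops (BaseConv, CSPStage) ship with the same PR; the dummy shapes correspond to strides 8/16/32 on a 640x640 input.

import torch

# Assumed import path, mirroring modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
from modelscope.models.cv.tinynas_detection.neck.giraffe_fpn_v2 import GiraffeNeckV2

neck = GiraffeNeckV2(depth=1.0, width=1.0,
                     in_features=[2, 3, 4],
                     in_channels=[256, 512, 1024],
                     out_channels=[256, 512, 1024])

# Indices 0 and 1 are never read because in_features=[2, 3, 4].
dummy = [None, None,
         torch.randn(1, 256, 80, 80),    # stride 8
         torch.randn(1, 512, 40, 40),    # stride 16
         torch.randn(1, 1024, 20, 20)]   # stride 32

x5, x7, x6 = neck(dummy)
print(x5.shape, x7.shape, x6.shape)
# Expected with these settings: (1, 256, 80, 80), (1, 512, 40, 40), (1, 1024, 20, 20)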

+ 16
- 0
modelscope/models/cv/tinynas_detection/tinynas_detector.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks
from .detector import SingleStageDetector


@MODELS.register_module(
Tasks.image_object_detection, module_name=Models.tinynas_detection)
class TinynasDetector(SingleStageDetector):

def __init__(self, model_dir, *args, **kwargs):

super(TinynasDetector, self).__init__(model_dir, *args, **kwargs)
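
The register_module call above is what lets the generic pipeline factory resolve this detector by task name. A hedged usage sketch (the model id and image path below are placeholders, not real hub entries; any model card used must declare Models.tinynas_detection in its configuration):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Placeholder model id for illustration only.
detector = pipeline(Tasks.image_object_detection,
                    model='damo/xxx_tinynas_detection')
result = detector('path/to/local_image.jpg')  # placeholder image path
print(result)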

+ 30
- 0
modelscope/models/cv/tinynas_detection/utils.py View File

@@ -0,0 +1,30 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import importlib
import os
import sys
from os.path import dirname, join


def get_config_by_file(config_file):
try:
sys.path.append(os.path.dirname(config_file))
current_config = importlib.import_module(
os.path.basename(config_file).split('.')[0])
exp = current_config.Config()
except Exception:
        raise ImportError(
            "{} doesn't contain a class named 'Config'".format(config_file))
return exp


def parse_config(config_file):
"""
get config object by file.
Args:
config_file (str): file path of config.
"""
assert (config_file is not None), 'plz provide config file'
if config_file is not None:
return get_config_by_file(config_file)
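
parse_config and get_config_by_file expect the config file itself to be a Python module exposing a Config class. A small self-contained sketch of that contract, not part of the diff (the temporary file and its num_classes attribute are invented for illustration; the import path is assumed from this PR's layout):

import os
import tempfile

from modelscope.models.cv.tinynas_detection.utils import parse_config  # assumed path

cfg_source = '''
class Config:
    def __init__(self):
        self.num_classes = 80   # illustrative field only
'''

with tempfile.TemporaryDirectory() as tmp_dir:
    cfg_path = os.path.join(tmp_dir, 'airdet_demo_cfg.py')
    with open(cfg_path, 'w') as f:
        f.write(cfg_source)
    cfg = parse_config(cfg_path)
    print(cfg.num_classes)  # -> 80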

+ 6
- 4
modelscope/models/multi_modal/mplug/modeling_mplug.py View File

@@ -1867,11 +1867,13 @@ class MPlug(PreTrainedModel):
ModelFile.TORCH_MODEL_BIN_FILE)
checkpoint = torch.load(checkpoint_path, map_location='cpu')
if 'model' in checkpoint:
state_dict = checkpoint['model']
else:
state_dict = checkpoint['module']
checkpoint = checkpoint['model']
checkpoint = {
k.replace('model.', ''): v
for k, v in checkpoint.items()
}

msg = model.load_state_dict(state_dict, strict=False)
msg = model.load_state_dict(checkpoint, strict=False)
print('load checkpoint from %s' % checkpoint_path)
print(msg)
return model
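
The change above normalises checkpoint keys by stripping a 'model.' prefix before load_state_dict. The renaming itself is plain dict manipulation; a tiny standalone illustration with toy keys standing in for the real MPlug state dict:

checkpoint = {
    'model.encoder.weight': 1,   # toy values in place of tensors
    'model.decoder.weight': 2,
    'decoder.bias': 3,
}
renamed = {k.replace('model.', ''): v for k, v in checkpoint.items()}
print(sorted(renamed))  # ['decoder.bias', 'decoder.weight', 'encoder.weight']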


+ 10
- 6
modelscope/models/nlp/__init__.py View File

@@ -9,12 +9,15 @@ if TYPE_CHECKING:
from .bert_for_sequence_classification import BertForSequenceClassification
from .bert_for_document_segmentation import BertForDocumentSegmentation
from .csanmt_for_translation import CsanmtForTranslation
from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
BertForMaskedLM)
from .masked_language import (
StructBertForMaskedLM,
VecoForMaskedLM,
BertForMaskedLM,
DebertaV2ForMaskedLM,
)
from .nncrf_for_named_entity_recognition import (
TransformerCRFForNamedEntityRecognition,
LSTMCRFForNamedEntityRecognition)
from .palm_v2 import PalmForTextGeneration
from .token_classification import SbertForTokenClassification
from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification
from .space import SpaceForDialogIntent
@@ -22,7 +25,6 @@ if TYPE_CHECKING:
from .space import SpaceForDialogStateTracking
from .star_text_to_sql import StarForTextToSql
from .task_models import (InformationExtractionModel,
SequenceClassificationModel,
SingleBackboneTaskModelBase)
from .bart_for_text_error_correction import BartForTextErrorCorrection
from .gpt3 import GPT3ForTextGeneration
@@ -36,8 +38,10 @@ else:
'csanmt_for_translation': ['CsanmtForTranslation'],
'bert_for_sequence_classification': ['BertForSequenceClassification'],
'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
'masked_language':
['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'],
'masked_language': [
'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM',
'DebertaV2ForMaskedLM'
],
'nncrf_for_named_entity_recognition': [
'TransformerCRFForNamedEntityRecognition',
'LSTMCRFForNamedEntityRecognition'


+ 73
- 0
modelscope/models/nlp/deberta_v2/__init__.py View File

@@ -0,0 +1,73 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

_import_structure = {
'configuration_deberta_v2': [
'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config',
'DebertaV2OnnxConfig'
],
'tokenization_deberta_v2': ['DebertaV2Tokenizer'],
}

if TYPE_CHECKING:
from .configuration_deberta_v2 import DebertaV2Config
from .tokenization_deberta_v2 import DebertaV2Tokenizer
from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast

from .modeling_deberta_v2 import (
DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
DebertaV2ForMaskedLM,
DebertaV2ForMultipleChoice,
DebertaV2ForQuestionAnswering,
DebertaV2ForSequenceClassification,
DebertaV2ForTokenClassification,
DebertaV2Model,
DebertaV2PreTrainedModel,
)

else:
_import_structure = {
'configuration_deberta_v2':
['DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config'],
'tokenization_deberta_v2': ['DebertaV2Tokenizer']
}
_import_structure['tokenization_deberta_v2_fast'] = [
'DebertaV2TokenizerFast'
]
_import_structure['modeling_deberta_v2'] = [
'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST',
'DebertaV2ForMaskedLM',
'DebertaV2ForMultipleChoice',
'DebertaV2ForQuestionAnswering',
'DebertaV2ForSequenceClassification',
'DebertaV2ForTokenClassification',
'DebertaV2Model',
'DebertaV2PreTrainedModel',
]
import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__)

+ 130
- 0
modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py View File

@@ -0,0 +1,130 @@
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2020, Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union

from transformers import PretrainedConfig

from modelscope.utils import logger as logging

logger = logging.get_logger(__name__)


class DebertaV2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the DeBERTa
[microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Arguments:
vocab_size (`int`, *optional*, defaults to 128100):
Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`DebertaV2Model`].
hidden_size (`int`, *optional*, defaults to 1536):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 24):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 6144):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"`, `"gelu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"`
are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 0):
The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-7):
The epsilon used by the layer normalization layers.
        relative_attention (`bool`, *optional*, defaults to `False`):
            Whether to use relative position encoding.
max_relative_positions (`int`, *optional*, defaults to -1):
The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value
as `max_position_embeddings`.
pad_token_id (`int`, *optional*, defaults to 0):
The value used to pad input_ids.
        position_biased_input (`bool`, *optional*, defaults to `True`):
            Whether to add absolute position embeddings to the content embeddings.
        pos_att_type (`List[str]`, *optional*):
            The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
            `["p2c", "c2p"]`.
"""
model_type = 'deberta_v2'

def __init__(self,
vocab_size=128100,
hidden_size=1536,
num_hidden_layers=24,
num_attention_heads=24,
intermediate_size=6144,
hidden_act='gelu',
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=0,
initializer_range=0.02,
layer_norm_eps=1e-7,
relative_attention=False,
max_relative_positions=-1,
pad_token_id=0,
position_biased_input=True,
pos_att_type=None,
pooler_dropout=0,
pooler_hidden_act='gelu',
**kwargs):
super().__init__(**kwargs)

self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.relative_attention = relative_attention
self.max_relative_positions = max_relative_positions
self.pad_token_id = pad_token_id
self.position_biased_input = position_biased_input

# Backwards compatibility
if type(pos_att_type) == str:
pos_att_type = [x.strip() for x in pos_att_type.lower().split('|')]

self.pos_att_type = pos_att_type
self.vocab_size = vocab_size
self.layer_norm_eps = layer_norm_eps

self.pooler_hidden_size = kwargs.get('pooler_hidden_size', hidden_size)
self.pooler_dropout = pooler_dropout
self.pooler_hidden_act = pooler_hidden_act
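
A small sketch of the config class in isolation (not part of the diff), including the backwards-compatibility branch that accepts pos_att_type as a '|'-separated string; the import path is assumed to follow the lazy-import module added in this PR:

from modelscope.models.nlp.deberta_v2 import DebertaV2Config  # assumed path

cfg = DebertaV2Config(
    vocab_size=128100,
    hidden_size=768,            # smaller than the 1536 default, just for illustration
    num_hidden_layers=12,
    num_attention_heads=12,
    relative_attention=True,
    pos_att_type='p2c|c2p',     # string form is normalised to ['p2c', 'c2p']
)
print(cfg.pos_att_type)         # -> ['p2c', 'c2p']
print(cfg.layer_norm_eps)       # -> 1e-07 (default)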

+ 1789
- 0
modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
File diff suppressed because it is too large
View File


+ 546
- 0
modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py View File

@@ -0,0 +1,546 @@
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DeBERTa. mainly copied from :module:`~transformers.tokenization_deberta`"""

import os
import unicodedata
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as sp
from transformers.tokenization_utils import PreTrainedTokenizer

PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}

PRETRAINED_INIT_CONFIGURATION = {}

VOCAB_FILES_NAMES = {'vocab_file': 'spm.model'}


class DebertaV2Tokenizer(PreTrainedTokenizer):
r"""
Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece)
and [jieba](https://github.com/fxsjy/jieba).

Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
bos_token (`string`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
eos_token (`string`, *optional*, defaults to `"[SEP]"`):
The end of sequence token. When building a sequence using special tokens, this is not the token that is
used for the end of sequence. The token used is the `sep_token`.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:

- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

def __init__(self,
vocab_file,
do_lower_case=False,
split_by_punct=False,
split_chinese=True,
bos_token='[CLS]',
eos_token='[SEP]',
unk_token='[UNK]',
sep_token='[SEP]',
pad_token='[PAD]',
cls_token='[CLS]',
mask_token='[MASK]',
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
do_lower_case=do_lower_case,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
split_by_punct=split_by_punct,
split_chinese=split_chinese,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
' model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
)
self.do_lower_case = do_lower_case
self.split_by_punct = split_by_punct
self.split_chinese = split_chinese
self.vocab_file = vocab_file
self._tokenizer = SPMTokenizer(
vocab_file,
split_by_punct=split_by_punct,
sp_model_kwargs=self.sp_model_kwargs)
self.jieba = None
if self.split_chinese:
try:
import jieba
except ImportError:
raise ImportError(
                    'You need to install jieba to split Chinese text and use DebertaV2Tokenizer. '
'See https://pypi.org/project/jieba/ for installation.')
self.jieba = jieba

@property
def vocab_size(self):
return len(self.vocab)

@property
def vocab(self):
return self._tokenizer.vocab

def get_vocab(self):
vocab = self.vocab.copy()
vocab.update(self.get_added_vocab())
return vocab

def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
if self.do_lower_case:
text = text.lower()
if self.split_chinese:
seg_list = [x for x in self.jieba.cut(text)]
text = ' '.join(seg_list)
return self._tokenizer.tokenize(text)

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self._tokenizer.spm.PieceToId(token)

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self._tokenizer.spm.IdToPiece(
index) if index < self.vocab_size else self.unk_token

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
return self._tokenizer.decode(tokens)

def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A DeBERTa sequence has the following format:

- single sequence: [CLS] X [SEP]
- pair of sequences: [CLS] A [SEP] B [SEP]

Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""

if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep

def get_special_tokens_mask(self,
token_ids_0,
token_ids_1=None,
already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.

Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""

if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)

if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + (
[0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]

def create_token_type_ids_from_sequences(self,
token_ids_0,
token_ids_1=None):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:

```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```

If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
+ sep) * [1]

def prepare_for_tokenization(self,
text,
is_split_into_words=False,
**kwargs):
add_prefix_space = kwargs.pop('add_prefix_space', False)
if is_split_into_words or add_prefix_space:
text = ' ' + text
return (text, kwargs)

def save_vocabulary(self,
save_directory: str,
filename_prefix: Optional[str] = None) -> Tuple[str]:
return self._tokenizer.save_pretrained(
save_directory, filename_prefix=filename_prefix)


class SPMTokenizer:
r"""
Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).

Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:

- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""

def __init__(self,
vocab_file,
split_by_punct=False,
sp_model_kwargs: Optional[Dict[str, Any]] = None):
self.split_by_punct = split_by_punct
self.vocab_file = vocab_file
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
if not os.path.exists(vocab_file):
raise FileNotFoundError(f'{vocab_file} does not exist!')
spm.load(vocab_file)
bpe_vocab_size = spm.GetPieceSize()
# Token map
# <unk> 0+1
# <s> 1+1
# </s> 2+1
self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
# self.vocab['[PAD]'] = 0
# self.vocab['[CLS]'] = 1
# self.vocab['[SEP]'] = 2
# self.vocab['[UNK]'] = 3

        self.spm = spm
        # Special tokens registered via `add_special_token`; consulted by
        # `part_of_whole_word` below.
        self.special_tokens = []

def __getstate__(self):
state = self.__dict__.copy()
state['spm'] = None
return state

def __setstate__(self, d):
self.__dict__ = d

# for backward compatibility
if not hasattr(self, 'sp_model_kwargs'):
self.sp_model_kwargs = {}

self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
self.spm.Load(self.vocab_file)

def tokenize(self, text):
return self._encode_as_pieces(text)

def convert_ids_to_tokens(self, ids):
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens

def decode(self, tokens, start=-1, end=-1, raw_text=None):
if raw_text is None:
return self.spm.decode_pieces([t for t in tokens])
else:
words = self.split_to_words(raw_text)
word_tokens = [self.tokenize(w) for w in words]
token2words = [0] * len(tokens)
tid = 0
for i, w in enumerate(word_tokens):
for k, t in enumerate(w):
token2words[tid] = i
tid += 1
word_start = token2words[start]
word_end = token2words[end] if end < len(tokens) else len(words)
text = ''.join(words[word_start:word_end])
return text

def add_special_token(self, token):
if token not in self.special_tokens:
self.special_tokens.append(token)
if token not in self.vocab:
self.vocab[token] = len(self.vocab) - 1
self.ids_to_tokens.append(token)
return self.id(token)

def part_of_whole_word(self, token, is_bos=False):
if is_bos:
return True
if (len(token) == 1 and (_is_whitespace(list(token)[0]))):
return False
if _is_control(list(token)[0]):
return False
if _is_punctuation(list(token)[0]):
return False
        if token in self.special_tokens:
            return False

word_start = b'\xe2\x96\x81'.decode('utf-8')
return not token.startswith(word_start)

def pad(self):
return '[PAD]'

def bos(self):
return '[CLS]'

def eos(self):
return '[SEP]'

def unk(self):
return '[UNK]'

def mask(self):
return '[MASK]'

def sym(self, id):
return self.ids_to_tokens[id]

def id(self, sym):
return self.vocab[sym] if sym in self.vocab else 1

def _encode_as_pieces(self, text):
text = convert_to_unicode(text)
if self.split_by_punct:
words = self._run_split_on_punc(text)
pieces = [self.spm.encode(w, out_type=str) for w in words]
return [p for w in pieces for p in w]
else:
return self.spm.encode(text, out_type=str)

def split_to_words(self, text):
pieces = self._encode_as_pieces(text)
word_start = b'\xe2\x96\x81'.decode('utf-8')
words = []
offset = 0
prev_end = 0
for i, p in enumerate(pieces):
if p.startswith(word_start):
if offset > prev_end:
words.append(text[prev_end:offset])
prev_end = offset
w = p.replace(word_start, '')
else:
w = p
try:
s = text.index(w, offset)
pn = ''
k = i + 1
while k < len(pieces):
pn = pieces[k].replace(word_start, '')
if len(pn) > 0:
break
k += 1

if len(pn) > 0 and pn in text[offset:s]:
offset = offset + 1
else:
offset = s + len(w)
except Exception:
offset = offset + 1

if prev_end < offset:
words.append(text[prev_end:offset])

return words

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return [''.join(x) for x in output]

def save_pretrained(self, path: str, filename_prefix: str = None):
filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
if filename_prefix is not None:
filename = filename_prefix + '-' + filename
full_path = os.path.join(path, filename)
with open(full_path, 'wb') as fs:
fs.write(self.spm.serialized_model_proto())
return (full_path, )


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat.startswith('C'):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError(f'Unsupported string type: {type(text)}')
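
The special-token layout documented in build_inputs_with_special_tokens and create_token_type_ids_from_sequences can be checked without a SentencePiece model at hand. The sketch below (not part of the diff) re-derives the two formulas on toy ids purely to illustrate the documented [CLS]/[SEP] format; it does not touch the tokenizer class itself.

# Toy ids standing in for real vocabulary entries.
CLS, SEP = 101, 102
seq_a = [11, 12, 13]
seq_b = [21, 22]

# Single sequence: [CLS] X [SEP]
single = [CLS] + seq_a + [SEP]
# Pair of sequences: [CLS] A [SEP] B [SEP]
pair = [CLS] + seq_a + [SEP] + seq_b + [SEP]

# Token type ids: 0 for '[CLS] A [SEP]', 1 for 'B [SEP]'
token_type_ids = [0] * (len(seq_a) + 2) + [1] * (len(seq_b) + 1)

# Special tokens mask for the pair: 1 at [CLS] and at both [SEP] positions
special_mask = [1] + [0] * len(seq_a) + [1] + [0] * len(seq_b) + [1]

assert len(pair) == len(token_type_ids) == len(special_mask)
print(pair, token_type_ids, special_mask, sep='\n')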

+ 241
- 0
modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py View File

@@ -0,0 +1,241 @@
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization class for model DeBERTa."""

import os
from shutil import copyfile
from typing import Optional, Tuple

from transformers.file_utils import is_sentencepiece_available
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

from modelscope.utils import logger as logging

if is_sentencepiece_available():
from .tokenization_deberta_v2 import DebertaV2Tokenizer
else:
DebertaV2Tokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
'vocab_file': 'spm.model',
'tokenizer_file': 'tokenizer.json'
}

PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}

PRETRAINED_INIT_CONFIGURATION = {}


class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
r"""
Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece)
and [rjieba-py](https://github.com/messense/rjieba-py).

Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
bos_token (`string`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
eos_token (`string`, *optional*, defaults to `"[SEP]"`):
The end of sequence token. When building a sequence using special tokens, this is not the token that is
used for the end of sequence. The token used is the `sep_token`.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:

- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = DebertaV2Tokenizer

def __init__(self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=False,
split_by_punct=False,
split_chinese=True,
bos_token='[CLS]',
eos_token='[SEP]',
unk_token='[UNK]',
sep_token='[SEP]',
pad_token='[PAD]',
cls_token='[CLS]',
mask_token='[MASK]',
**kwargs) -> None:
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
split_by_punct=split_by_punct,
split_chinese=split_chinese,
**kwargs,
)

self.do_lower_case = do_lower_case
self.split_by_punct = split_by_punct
self.split_chinese = split_chinese
self.vocab_file = vocab_file
        self.can_save_slow_tokenizer = bool(self.vocab_file)

def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A DeBERTa sequence has the following format:

- single sequence: [CLS] X [SEP]
- pair of sequences: [CLS] A [SEP] B [SEP]

Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""

if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep

def get_special_tokens_mask(self,
token_ids_0,
token_ids_1=None,
already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.

Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""

if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)

if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + (
[0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]

def create_token_type_ids_from_sequences(self,
token_ids_0,
token_ids_1=None):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:

```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```

If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
+ sep) * [1]

def save_vocabulary(self,
save_directory: str,
filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
'tokenizer.')

if not os.path.isdir(save_directory):
logger.error(
f'Vocabulary path ({save_directory}) should be a directory')
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + '-' if filename_prefix else '')
+ VOCAB_FILES_NAMES['vocab_file'])

if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)

return (out_vocab_file, )

+ 4
- 0
modelscope/models/nlp/gpt3/modeling_gpt3.py View File

@@ -339,5 +339,9 @@ class GPT3Model(PreTrainedModel):
state_dict_file = os.path.join(pretrained_model_name_or_path,
ModelFile.TORCH_MODEL_BIN_FILE)
state_dict = torch.load(state_dict_file)
state_dict = {
k.replace('model.language_model', 'language_model'): v
for k, v in state_dict.items()
}
model.load_state_dict(state_dict)
return model

+ 39
- 0
modelscope/models/nlp/masked_language.py View File

@@ -6,6 +6,8 @@ from transformers import BertForMaskedLM as BertForMaskedLMTransformer
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.nlp.deberta_v2 import \
DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer
from modelscope.models.nlp.structbert import SbertForMaskedLM
from modelscope.models.nlp.veco import \
VecoForMaskedLM as VecoForMaskedLMTransformer
@@ -125,3 +127,40 @@ class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer):
VecoForMaskedLM).from_pretrained(
pretrained_model_name_or_path=model_dir,
model_dir=model_dir)


@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer):
"""Deberta v2 for MLM model.

Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets.
"""

def __init__(self, config, model_dir):
super(TorchModel, self).__init__(model_dir)
DebertaV2ForMaskedLMTransformer.__init__(self, config)

def forward(self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
labels=None):
output = DebertaV2ForMaskedLMTransformer.forward(
self,
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
labels=labels)
output[OutputKeys.INPUT_IDS] = input_ids
return output

@classmethod
def _instantiate(cls, **kwargs):
model_dir = kwargs.get('model_dir')
return super(DebertaV2ForMaskedLMTransformer,
DebertaV2ForMaskedLM).from_pretrained(
pretrained_model_name_or_path=model_dir,
model_dir=model_dir)

+ 8
- 8
modelscope/models/nlp/palm_v2/modeling_palm.py View File

@@ -592,11 +592,11 @@ class AbsSummarizer(PalmPreTrainedModel): # Model
self.generator.dense.weight = self.decoder.embeddings.weight

if checkpoint is not None:
for key in list(checkpoint['model'].keys()):
checkpoint['model'][key.replace('module.',
'')] = checkpoint['model'][key]
msg = self.load_state_dict(checkpoint['model'], strict=False)
print(msg)
if 'model' in checkpoint:
checkpoint = checkpoint['model']
for key in list(checkpoint.keys()):
checkpoint[key.replace('model.palm.', '')] = checkpoint[key]
self.load_state_dict(checkpoint, strict=False)
else:
for module in self.decoder.modules():
if isinstance(module, (nn.Linear, nn.Embedding)):
@@ -734,7 +734,7 @@ class PalmForConditionalGeneration(PalmPreTrainedModel):
return addict.Dict(loss=loss)


class Translator(nn.Module):
class Translator(object):
"""
Uses a model to translate a batch of sentences.
"""
@@ -1298,8 +1298,8 @@ class Translator(nn.Module):

return results

def forward(self, input_ids: torch.Tensor,
attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
def __call__(self, input_ids: torch.Tensor,
attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
batch = self.Batch(
batch_size=input_ids.size()[0],
src=input_ids,


+ 20
- 0
modelscope/msdatasets/cv/face_2d_keypoins/__init__.py View File

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .face_2d_keypoints_dataset import FaceKeypointDataset

else:
_import_structure = {'face_2d_keypoints_dataset': ['FaceKeypointDataset']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 13
- 0
modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py View File

@@ -0,0 +1,13 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset

from modelscope.metainfo import Datasets
from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
from modelscope.utils.constant import Tasks


@TASK_DATASETS.register_module(
group_key=Tasks.face_2d_keypoints,
module_name=Datasets.Face2dKeypointsDataset)
class FaceKeypointDataset(_FaceKeypointDataset):
"""EasyCV dataset for face 2d keypoints."""

+ 3
- 9
modelscope/msdatasets/ms_dataset.py View File

@@ -70,12 +70,12 @@ class MsIterableDataset(torch.utils.data.IterableDataset):
for idx in range(iter_start, iter_end):
item_dict = self.dataset[idx]
res = {
k: np.array(item_dict[k])
k: torch.tensor(item_dict[k])
for k in self.columns if k in self.retained_columns
}
for preprocessor in self.preprocessor_list:
res.update({
k: np.array(v)
k: torch.tensor(v)
for k, v in preprocessor(item_dict).items()
if k in self.retained_columns
})
@@ -574,14 +574,8 @@ class MsDataset:
None

"""
from modelscope.hub.api import HubApi
_hub_api = HubApi()
cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
_upload_manager = DatasetUploadManager(
dataset_name=dataset_name,
namespace=namespace,
version=version,
cookies=cookies)
dataset_name=dataset_name, namespace=namespace, version=version)
_upload_manager.upload(object_name, local_file_path)

@staticmethod


+ 18
- 6
modelscope/msdatasets/utils/oss_utils.py View File

@@ -18,6 +18,12 @@ class OssUtilities:
self.oss_dir = oss_config['Dir']
self.oss_backup_dir = oss_config['BackupDir']

self.upload_resumable_tmp_store = '/tmp/modelscope/tmp_dataset'
self.upload_multipart_threshold = 50 * 1024 * 1024
self.upload_part_size = 1 * 1024 * 1024
self.upload_num_threads = 4
self.upload_max_retries = 3

@staticmethod
def _percentage(consumed_bytes, total_bytes):
if total_bytes:
@@ -42,21 +48,27 @@ class OssUtilities:
progress_callback=self._percentage)
return local_path

def upload(self, oss_file_name: str, local_file_path: str) -> str:
max_retries = 3
def upload(self, oss_object_name: str, local_file_path: str) -> str:
retry_count = 0
object_key = os.path.join(self.oss_dir, oss_file_name)
object_key = os.path.join(self.oss_dir, oss_object_name)
resumable_store = oss2.ResumableStore(
root=self.upload_resumable_tmp_store)

while True:
try:
retry_count += 1
self.bucket.put_object_from_file(
oss2.resumable_upload(
self.bucket,
object_key,
local_file_path,
progress_callback=self._percentage)
store=resumable_store,
multipart_threshold=self.upload_multipart_threshold,
part_size=self.upload_part_size,
progress_callback=self._percentage,
num_threads=self.upload_num_threads)
break
except Exception:
if retry_count >= max_retries:
if retry_count >= self.upload_max_retries:
raise

return object_key

+ 10
- 12
modelscope/msdatasets/utils/upload_utils.py View File

@@ -1,23 +1,21 @@
from http.cookiejar import CookieJar

from .oss_utils import OssUtilities


class DatasetUploadManager(object):

def __init__(self, dataset_name: str, namespace: str, version: str,
cookies: CookieJar):
def __init__(self, dataset_name: str, namespace: str, version: str):
from modelscope.hub.api import HubApi
api = HubApi()
oss_config = api.get_dataset_access_config_session(
cookies=cookies,
_hub_api = HubApi()
_cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
_oss_config = _hub_api.get_dataset_access_config_session(
cookies=_cookies,
dataset_name=dataset_name,
namespace=namespace,
revision=version)

self.oss_utilities = OssUtilities(oss_config)
self.oss_utilities = OssUtilities(_oss_config)

def upload(self, oss_file_name: str, local_file_path: str) -> str:
oss_object_key = self.oss_utilities.upload(
oss_file_name=oss_file_name, local_file_path=local_file_path)
return oss_object_key
def upload(self, object_name: str, local_file_path: str) -> str:
object_key = self.oss_utilities.upload(
oss_object_name=object_name, local_file_path=local_file_path)
return object_key
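
A minimal usage sketch of the simplified `DatasetUploadManager`: cookies are now fetched inside the constructor, so callers only pass the dataset coordinates. All names and paths below are placeholders.

```python
from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager

manager = DatasetUploadManager(
    dataset_name='my_dataset',   # placeholder dataset name
    namespace='my_namespace',    # placeholder namespace
    version='v1.0.0')            # placeholder version

# Returns the OSS object key the file was uploaded to.
object_key = manager.upload(
    object_name='train/images.zip',
    local_file_path='/path/to/images.zip')
print(object_key)
```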

+ 61
- 1
modelscope/outputs.py View File

@@ -35,6 +35,7 @@ class OutputKeys(object):
UUID = 'uuid'
WORD = 'word'
KWS_LIST = 'kws_list'
TIMESTAMPS = 'timestamps'
SPLIT_VIDEO_NUM = 'split_video_num'
SPLIT_META_DICT = 'split_meta_dict'

@@ -56,6 +57,15 @@ TASK_OUTPUTS = {
# }
Tasks.ocr_recognition: [OutputKeys.TEXT],

# face 2d keypoint result for single sample
# {
# "keypoints": [
# [x1, y1]*106
# ],
# "poses": [pitch, roll, yaw]
# }
Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES],

# face detection result for single sample
# {
# "scores": [0.9, 0.1, 0.05, 0.05]
@@ -75,6 +85,14 @@ TASK_OUTPUTS = {
Tasks.face_detection:
[OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],

# facial expression recognition result for single sample
# {
# "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02],
# "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
# }
Tasks.facial_expression_recognition:
[OutputKeys.SCORES, OutputKeys.LABELS],

# face recognition result for single sample
# {
# "img_embedding": np.array with shape [1, D],
@@ -201,6 +219,21 @@ TASK_OUTPUTS = {
# }
Tasks.body_3d_keypoints: [OutputKeys.POSES],

# 2D hand keypoints result for single sample
# {
# "keypoints": [
# [[x, y, score] * 21],
# [[x, y, score] * 21],
# [[x, y, score] * 21],
# ],
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ]
# }
Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],

# video single object tracking result for single video
# {
# "boxes": [
@@ -242,7 +275,20 @@ TASK_OUTPUTS = {
# "output_img": np.ndarray with shape [height, width, 3]
# }
Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG],

# text driven segmentation result for single sample
# {
# "masks": [
# np.array # 2D array containing only 0, 255
# ]
# }
Tasks.text_driven_segmentation: [OutputKeys.MASKS],
# shop segmentation result for single sample
# {
# "masks": [
# np.array # 2D array containing only 0, 255
# ]
# }
Tasks.shop_segmentation: [OutputKeys.MASKS],
# movie scene segmentation result for a single video
# {
# "split_video_num":3,
@@ -541,6 +587,19 @@ TASK_OUTPUTS = {
# }
Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS],

# {
# 'labels': ['吸烟', '打电话', '吸烟'],  # i.e. ['smoking', 'making a phone call', 'smoking']
# 'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487],
# 'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]],
# 'timestamps': [1, 3, 5]
# }
Tasks.action_detection: [
OutputKeys.TIMESTAMPS,
OutputKeys.LABELS,
OutputKeys.SCORES,
OutputKeys.BOXES,
],

# {
# 'output': [
# [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509},
@@ -551,6 +610,7 @@ TASK_OUTPUTS = {
# {'label': '13421097', 'score': 2.75914817393641e-06}]]
# }
Tasks.faq_question_answering: [OutputKeys.OUTPUT],

# image person reid result for single sample
# {
# "img_embedding": np.array with shape [1, D],


+ 0
- 1
modelscope/pipelines/base.py View File

@@ -2,7 +2,6 @@

import os.path as osp
from abc import ABC, abstractmethod
from contextlib import contextmanager
from threading import Lock
from typing import Any, Dict, Generator, List, Mapping, Union



+ 16
- 1
modelscope/pipelines/builder.py View File

@@ -71,6 +71,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
Tasks.action_recognition: (Pipelines.action_recognition,
'damo/cv_TAdaConv_action-recognition'),
Tasks.action_detection: (Pipelines.action_detection,
'damo/cv_ResNetC3D_action-detection_detection2d'),
Tasks.live_category: (Pipelines.live_category,
'damo/cv_resnet50_live-category'),
Tasks.video_category: (Pipelines.video_category,
@@ -97,10 +99,18 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/cv_hrnetv2w32_body-2d-keypoints_image'),
Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints,
'damo/cv_canonical_body-3d-keypoints_video'),
Tasks.hand_2d_keypoints:
(Pipelines.hand_2d_keypoints,
'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'),
Tasks.face_detection: (Pipelines.face_detection,
'damo/cv_resnet_facedetection_scrfd10gkps'),
Tasks.face_recognition: (Pipelines.face_recognition,
'damo/cv_ir101_facerecognition_cfglint'),
Tasks.facial_expression_recognition:
(Pipelines.facial_expression_recognition,
'damo/cv_vgg19_facial-expression-recognition_fer'),
Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints,
'damo/cv_mobilenet_face-2d-keypoints_alignment'),
Tasks.video_multi_modal_embedding:
(Pipelines.video_multi_modal_embedding,
'damo/multi_modal_clip_vtretrival_msrvtt_53'),
@@ -147,9 +157,14 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/cv_vitb_video-single-object-tracking_ostrack'),
Tasks.image_reid_person: (Pipelines.image_reid_person,
'damo/cv_passvitb_image-reid-person_market'),
Tasks.text_driven_segmentation:
(Pipelines.text_driven_segmentation,
'damo/cv_vitl16_segmentation_text-driven-seg'),
Tasks.movie_scene_segmentation:
(Pipelines.movie_scene_segmentation,
'damo/cv_resnet50-bert_video-scene-segmentation_movienet')
'damo/cv_resnet50-bert_video-scene-segmentation_movienet'),
Tasks.shop_segmentation: (Pipelines.shop_segmentation,
'damo/cv_vitb16_segmentation_shop-seg'),
}
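
With the defaults registered above, the `model` argument becomes optional; a small sketch of task-only construction, which downloads the listed default model from the hub.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Each call resolves to the default model id registered in the table above.
shop_seg = pipeline(Tasks.shop_segmentation)        # damo/cv_vitb16_segmentation_shop-seg
hand_keypoints = pipeline(Tasks.hand_2d_keypoints)  # damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody
action_det = pipeline(Tasks.action_detection)       # damo/cv_ResNetC3D_action-detection_detection2d
```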




+ 19
- 3
modelscope/pipelines/cv/__init__.py View File

@@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .action_recognition_pipeline import ActionRecognitionPipeline
from .action_detection_pipeline import ActionDetectionPipeline
from .animal_recognition_pipeline import AnimalRecognitionPipeline
from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline
from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline
from .hand_2d_keypoints_pipeline import Hand2DKeypointsPipeline
from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline
from .crowd_counting_pipeline import CrowdCountingPipeline
@@ -42,15 +44,21 @@ if TYPE_CHECKING:
from .tinynas_classification_pipeline import TinynasClassificationPipeline
from .video_category_pipeline import VideoCategoryPipeline
from .virtual_try_on_pipeline import VirtualTryonPipeline
from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline
from .shop_segmentation_pipleline import ShopSegmentationPipeline
from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline
from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline
from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline

else:
_import_structure = {
'action_recognition_pipeline': ['ActionRecognitionPipeline'],
'action_detection_pipeline': ['ActionDetectionPipeline'],
'animal_recognition_pipeline': ['AnimalRecognitionPipeline'],
'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'],
'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'],
'hand_2d_keypoints_pipeline': ['Hand2DKeypointsPipeline'],
'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'],
'crowd_counting_pipeline': ['CrowdCountingPipeline'],
@@ -93,10 +101,18 @@ else:
'tinynas_classification_pipeline': ['TinynasClassificationPipeline'],
'video_category_pipeline': ['VideoCategoryPipeline'],
'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
'easycv_pipeline':
['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'],
'shop_segmentation_pipleline': ['ShopSegmentationPipeline'],
'easycv_pipeline': [
'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline',
'Face2DKeypointsPipeline'
],
'text_driven_segmentation_pipleline':
['TextDrivenSegmentationPipeline'],
'movie_scene_segmentation_pipeline':
['MovieSceneSegmentationPipeline'],
'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'],
'facial_expression_recognition_pipeline':
['FacialExpressionRecognitionPipeline']
}

import sys


+ 63
- 0
modelscope/pipelines/cv/action_detection_pipeline.py View File

@@ -0,0 +1,63 @@
import math
import os.path as osp
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.models.cv.action_detection import ActionDetONNX
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.action_detection, module_name=Pipelines.action_detection)
class ActionDetectionPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create an action detection pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
model_path = osp.join(self.model, ModelFile.ONNX_MODEL_FILE)
logger.info(f'loading model from {model_path}')
config_path = osp.join(self.model, ModelFile.CONFIGURATION)
logger.info(f'loading config from {config_path}')
self.cfg = Config.from_file(config_path)
self.cfg.MODEL.model_file = model_path
self.model = ActionDetONNX(self.model, self.cfg.MODEL,
self.device_name)
logger.info('load model done')

def preprocess(self, input: Input) -> Dict[str, Any]:
if isinstance(input, str):
video_name = input
else:
raise TypeError(f'input should be a str,'
f' but got {type(input)}')
result = {'video_name': video_name}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
preds = self.model.forward(input['video_name'])
labels = sum([pred['actions']['labels'] for pred in preds], [])
scores = sum([pred['actions']['scores'] for pred in preds], [])
boxes = sum([pred['actions']['boxes'] for pred in preds], [])
timestamps = sum([[pred['timestamp']] * len(pred['actions']['labels'])
for pred in preds], [])
out = {
OutputKeys.TIMESTAMPS: timestamps,
OutputKeys.LABELS: labels,
OutputKeys.SCORES: scores,
OutputKeys.BOXES: boxes
}
return out

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
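
A minimal usage sketch of the pipeline above; the model id comes from the builder.py default table in this diff, while 'test.mp4' is a placeholder video path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

action_detect = pipeline(
    Tasks.action_detection,
    model='damo/cv_ResNetC3D_action-detection_detection2d')

# preprocess() expects a video file path (str); 'test.mp4' is a placeholder.
result = action_detect('test.mp4')
print(result[OutputKeys.TIMESTAMPS])
print(result[OutputKeys.LABELS], result[OutputKeys.SCORES], result[OutputKeys.BOXES])
```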

+ 3
- 1
modelscope/pipelines/cv/easycv_pipelines/__init__.py View File

@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .detection_pipeline import EasyCVDetectionPipeline
from .segmentation_pipeline import EasyCVSegmentationPipeline
from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline
else:
_import_structure = {
'detection_pipeline': ['EasyCVDetectionPipeline'],
'segmentation_pipeline': ['EasyCVSegmentationPipeline']
'segmentation_pipeline': ['EasyCVSegmentationPipeline'],
'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline']
}

import sys


+ 41
- 0
modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py View File

@@ -0,0 +1,41 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from .base import EasyCVPipeline


@PIPELINES.register_module(
Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints)
class Face2DKeypointsPipeline(EasyCVPipeline):
"""Pipeline for face 2d keypoints detection."""

def __init__(self,
model: str,
model_file_pattern=ModelFile.TORCH_MODEL_FILE,
*args,
**kwargs):
"""
model (str): model id on modelscope hub or local model path.
model_file_pattern (str): model file pattern.
"""

super(Face2DKeypointsPipeline, self).__init__(
model=model,
model_file_pattern=model_file_pattern,
*args,
**kwargs)

def show_result(self, img, points, scale=2, save_path=None):
return self.predict_op.show_result(img, points, scale, save_path)

def __call__(self, inputs) -> Any:
output = self.predict_op(inputs)[0][0]
points = output['point']
poses = output['pose']

return {OutputKeys.KEYPOINTS: points, OutputKeys.POSES: poses}
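
A minimal usage sketch, assuming the default model id registered in builder.py; 'face.jpg' is a placeholder image path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

face_keypoints = pipeline(
    Tasks.face_2d_keypoints,
    model='damo/cv_mobilenet_face-2d-keypoints_alignment')

# Returns 106 facial keypoints plus pitch/roll/yaw pose angles (see outputs.py).
result = face_keypoints('face.jpg')  # placeholder image path
print(result[OutputKeys.KEYPOINTS], result[OutputKeys.POSES])
```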

+ 128
- 0
modelscope/pipelines/cv/facial_expression_recognition_pipeline.py View File

@@ -0,0 +1,128 @@
import os.path as osp
from typing import Any, Dict

import cv2
import numpy as np
import PIL
import torch

from modelscope.metainfo import Pipelines
from modelscope.models.cv.face_recognition.align_face import align_face
from modelscope.models.cv.facial_expression_recognition import \
FacialExpressionRecognition
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.facial_expression_recognition,
module_name=Pipelines.facial_expression_recognition)
class FacialExpressionRecognitionPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create a facial expression recognition pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
logger.info(f'loading model from {ckpt_path}')
device = torch.device(
f'cuda:{0}' if torch.cuda.is_available() else 'cpu')
fer = FacialExpressionRecognition(model_path=ckpt_path, device=device)
self.fer = fer
self.device = device
logger.info('load model done')

# face detect pipeline
det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps'
self.face_detection = pipeline(
Tasks.face_detection, model=det_model_id)

def _choose_face(self,
det_result,
img=None,
min_face=10,
top_face=1,
center_face=False):
'''
Choose the face with the maximum area.
Args:
det_result: output of the face detection pipeline
img: original image; needed to pick the most centered face when center_face is True
min_face: minimum valid face width/height
top_face: keep the faces with the largest areas
center_face: choose the most centered face among multiple faces, only effective if top_face > 1
'''
bboxes = np.array(det_result[OutputKeys.BOXES])
landmarks = np.array(det_result[OutputKeys.KEYPOINTS])
if bboxes.shape[0] == 0:
logger.info('Warning: No face detected!')
return None
# face idx with enough size
face_idx = []
for i in range(bboxes.shape[0]):
box = bboxes[i]
if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face:
face_idx += [i]
if len(face_idx) == 0:
logger.info(
f'Warning: Face size not enough, less than {min_face}x{min_face}!'
)
return None
bboxes = bboxes[face_idx]
landmarks = landmarks[face_idx]
# find max faces
boxes = np.array(bboxes)
area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
sort_idx = np.argsort(area)[-top_face:]
# find center face
if top_face > 1 and center_face and bboxes.shape[0] > 1:
img_center = [img.shape[1] // 2, img.shape[0] // 2]
min_dist = float('inf')
sel_idx = -1
for _idx in sort_idx:
box = boxes[_idx]
dist = np.square(
np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square(
np.abs((box[1] + box[3]) / 2 - img_center[1]))
if dist < min_dist:
min_dist = dist
sel_idx = _idx
sort_idx = [sel_idx]
main_idx = sort_idx[-1]
return bboxes[main_idx], landmarks[main_idx]

def preprocess(self, input: Input) -> Dict[str, Any]:
img = LoadImage.convert_to_ndarray(input)
img = img[:, :, ::-1]
det_result = self.face_detection(img.copy())
rtn = self._choose_face(det_result, img=img)
face_img = None
if rtn is not None:
_, face_lmks = rtn
face_lmks = face_lmks.reshape(5, 2)
face_img, _ = align_face(img, (112, 112), face_lmks)
face_img = face_img.astype(np.float32)
result = {}
result['img'] = face_img
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
result = self.fer(input)
assert result is not None
scores = result[0].tolist()
labels = result[1].tolist()
return {
OutputKeys.SCORES: scores,
OutputKeys.LABELS: labels,
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
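
A minimal usage sketch of the pipeline above, using the default model id from builder.py; 'face.jpg' is a placeholder image path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

fer = pipeline(
    Tasks.facial_expression_recognition,
    model='damo/cv_vgg19_facial-expression-recognition_fer')

# Scores are aligned with the expression labels described in outputs.py.
result = fer('face.jpg')  # placeholder image path
for label, score in zip(result[OutputKeys.LABELS], result[OutputKeys.SCORES]):
    print(label, score)
```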

+ 51
- 0
modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py View File

@@ -0,0 +1,51 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path

from modelscope.metainfo import Pipelines
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.constant import ModelFile, Tasks
from .easycv_pipelines.base import EasyCVPipeline


@PIPELINES.register_module(
Tasks.hand_2d_keypoints, module_name=Pipelines.hand_2d_keypoints)
class Hand2DKeypointsPipeline(EasyCVPipeline):
"""Pipeline for hand pose keypoint task."""

def __init__(self,
model: str,
model_file_pattern=ModelFile.TORCH_MODEL_FILE,
*args,
**kwargs):
"""
model (str): model id on modelscope hub or local model path.
model_file_pattern (str): model file pattern.
"""
self.model_dir = model
super(Hand2DKeypointsPipeline, self).__init__(
model=model,
model_file_pattern=model_file_pattern,
*args,
**kwargs)

def _build_predict_op(self):
"""Build EasyCV predictor."""
from easycv.predictors.builder import build_predictor
detection_predictor_type = self.cfg['DETECTION']['type']
detection_model_path = os.path.join(
self.model_dir, self.cfg['DETECTION']['model_path'])
detection_cfg_file = os.path.join(self.model_dir,
self.cfg['DETECTION']['config_file'])
detection_score_threshold = self.cfg['DETECTION']['score_threshold']
self.cfg.pipeline.predictor_config[
'detection_predictor_config'] = dict(
type=detection_predictor_type,
model_path=detection_model_path,
config_file=detection_cfg_file,
score_threshold=detection_score_threshold)
easycv_config = self._to_easycv_config()
pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
'model_path': self.model_path,
'config_file': easycv_config
})
return pipeline_op
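
A minimal usage sketch, assuming the default model id registered in builder.py; 'hands.jpg' is a placeholder image path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

hand_keypoints = pipeline(
    Tasks.hand_2d_keypoints,
    model='damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody')

# One box and 21 [x, y, score] keypoints per detected hand (see outputs.py).
result = hand_keypoints('hands.jpg')  # placeholder image path
print(result[OutputKeys.BOXES], result[OutputKeys.KEYPOINTS])
```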

+ 2
- 0
modelscope/pipelines/cv/ocr_detection_pipeline.py View File

@@ -149,6 +149,8 @@ class OCRDetectionPipeline(Pipeline):
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
rboxes = inputs['combined_rboxes'][0]
count = inputs['combined_counts'][0]
if count == 0 or count < rboxes.shape[0]:
raise Exception('modelscope error: No text detected')
rboxes = rboxes[:count, :]

# convert rboxes to polygons and find its coordinates on the original image
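
With the check above, an image that yields no text regions now raises instead of returning empty results; a caller-side sketch, where 'blank_page.jpg' is a placeholder path and the default ocr_detection model is resolved by builder.py.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

ocr_detect = pipeline(Tasks.ocr_detection)

try:
    result = ocr_detect('blank_page.jpg')  # placeholder image path
except Exception as err:
    # Raised by postprocess() when no text boxes survive the count check.
    print(f'no text detected: {err}')
```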


+ 58
- 0
modelscope/pipelines/cv/retina_face_detection_pipeline.py View File

@@ -0,0 +1,58 @@
import os.path as osp
from typing import Any, Dict

import cv2
import numpy as np
import PIL
import torch

from modelscope.metainfo import Pipelines
from modelscope.models.cv.face_detection import RetinaFaceDetection
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.face_detection, module_name=Pipelines.retina_face_detection)
class RetinaFaceDetectionPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create a face detection pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
logger.info(f'loading model from {ckpt_path}')
detector = RetinaFaceDetection(
model_path=ckpt_path, device=self.device)
self.detector = detector
logger.info('load model done')

def preprocess(self, input: Input) -> Dict[str, Any]:
img = LoadImage.convert_to_ndarray(input)
img = img.astype(np.float32)
result = {'img': img}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
result = self.detector(input)
assert result is not None
bboxes = result[0][:, :4].tolist()
scores = result[0][:, 4].tolist()
lms = result[1].tolist()
return {
OutputKeys.SCORES: scores,
OutputKeys.BOXES: bboxes,
OutputKeys.KEYPOINTS: lms,
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
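
A minimal usage sketch; since the face_detection default in builder.py still points at the SCRFD model, a RetinaFace-based hub model has to be named explicitly. The model id and image path below are placeholders, not values from this diff.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# '<retinaface-model-id>' is a placeholder for a hub model whose configuration
# selects Pipelines.retina_face_detection.
face_detect = pipeline(Tasks.face_detection, model='<retinaface-model-id>')

result = face_detect('face.jpg')  # placeholder image path
print(result[OutputKeys.SCORES], result[OutputKeys.BOXES], result[OutputKeys.KEYPOINTS])
```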

+ 51
- 0
modelscope/pipelines/cv/shop_segmentation_pipleline.py View File

@@ -0,0 +1,51 @@
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks


@PIPELINES.register_module(
Tasks.shop_segmentation, module_name=Pipelines.shop_segmentation)
class ShopSegmentationPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
model: model id on modelscope hub.
"""
super().__init__(model=model, auto_collate=False, **kwargs)

def preprocess(self, input: Input) -> Dict[str, Any]:
img = LoadImage.convert_to_ndarray(input)
img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img)
result = {
'img': img_tensor,
'ori_h': ori_h,
'ori_w': ori_w,
'crop_h': crop_h,
'crop_w': crop_w
}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:

outputs = self.model.inference(input['img'])
result = {
'data': outputs,
'ori_h': input['ori_h'],
'ori_w': input['ori_w'],
'crop_h': input['crop_h'],
'crop_w': input['crop_w'],
}
return result

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:

data = self.model.postprocess(inputs['data'], inputs['crop_h'],
inputs['crop_w'], inputs['ori_h'],
inputs['ori_w'])
outputs = {OutputKeys.MASKS: data}
return outputs
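
A minimal usage sketch with the default model id from builder.py; 'product.jpg' is a placeholder image path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

shop_seg = pipeline(
    Tasks.shop_segmentation, model='damo/cv_vitb16_segmentation_shop-seg')

# The mask is a 2D array containing only 0 and 255 (see outputs.py).
result = shop_seg('product.jpg')  # placeholder image path
mask = result[OutputKeys.MASKS]
```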

+ 51
- 0
modelscope/pipelines/cv/text_driven_segmentation_pipleline.py View File

@@ -0,0 +1,51 @@
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks


@PIPELINES.register_module(
Tasks.text_driven_segmentation,
module_name=Pipelines.text_driven_segmentation)
class TextDrivenSegmentationPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
model: model id on modelscope hub.
"""
super().__init__(model=model, auto_collate=False, **kwargs)

def preprocess(self, input: Dict) -> Dict[str, Any]:
img = LoadImage.convert_to_ndarray(input['image'])
img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img)
result = {
'img': img_tensor,
'ori_h': ori_h,
'ori_w': ori_w,
'crop_h': crop_h,
'crop_w': crop_w,
'text': input['text'],
}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
outputs = self.model.inference(input['img'], input['text'])
result = {
'data': outputs,
'ori_h': input['ori_h'],
'ori_w': input['ori_w'],
'crop_h': input['crop_h'],
'crop_w': input['crop_w'],
}
return result

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
data = self.model.postprocess(inputs['data'], inputs['crop_h'],
inputs['crop_w'], inputs['ori_h'],
inputs['ori_w'])
outputs = {OutputKeys.MASKS: data}
return outputs
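
A minimal usage sketch with the default model id from builder.py; note that preprocess() expects a dict with 'image' and 'text' keys. The image path and prompt are placeholders.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

text_seg = pipeline(
    Tasks.text_driven_segmentation,
    model='damo/cv_vitl16_segmentation_text-driven-seg')

# Input is a dict: the image to segment and the text describing the target.
result = text_seg({'image': 'shirt.jpg', 'text': 'the red shirt'})
mask = result[OutputKeys.MASKS]  # 2D array of 0/255
```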

Some files were not shown because too many files changed in this diff
