| @@ -1,11 +1,9 @@ | |||||
| pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/tests.txt | pip install -r requirements/tests.txt | ||||
| # install numpy<=1.18 for tensorflow==1.15.x | |||||
| pip install "numpy<=1.18" | |||||
| git config --global --add safe.directory /Maas-lib | git config --global --add safe.directory /Maas-lib | ||||
| @@ -26,4 +24,3 @@ else | |||||
| fi | fi | ||||
| echo "Running case with command: $ci_command" | echo "Running case with command: $ci_command" | ||||
| $ci_command | $ci_command | ||||
| #python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py | |||||
| @@ -1,19 +0,0 @@ | |||||
| pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||||
| pip install -r requirements/tests.txt | |||||
| # install numpy<=1.18 for tensorflow==1.15.x | |||||
| pip install "numpy<=1.18" | |||||
| # linter test | |||||
| # use internal project for pre-commit due to the network problem | |||||
| pre-commit run --all-files | |||||
| if [ $? -ne 0 ]; then | |||||
| echo "linter test failed, please run 'pre-commit run --all-files' to check" | |||||
| exit -1 | |||||
| fi | |||||
| PYTHONPATH=. python tests/run.py | |||||
| @@ -7,7 +7,8 @@ gpus='7 6 5 4 3 2 1 0' | |||||
| cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58' | cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58' | ||||
| cpu_sets_arr=($cpu_sets) | cpu_sets_arr=($cpu_sets) | ||||
| is_get_file_lock=false | is_get_file_lock=false | ||||
| CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND} | |||||
| # export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml' | |||||
| CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND} | |||||
| echo "ci command: $CI_COMMAND" | echo "ci command: $CI_COMMAND" | ||||
| for gpu in $gpus | for gpu in $gpus | ||||
| do | do | ||||
| @@ -16,6 +17,7 @@ do | |||||
| echo "get gpu lock $gpu" | echo "get gpu lock $gpu" | ||||
| CONTAINER_NAME="modelscope-ci-$gpu" | CONTAINER_NAME="modelscope-ci-$gpu" | ||||
| let is_get_file_lock=true | let is_get_file_lock=true | ||||
| # pull image if there is an update | # pull image if there is an update | ||||
| docker pull ${IMAGE_NAME}:${IMAGE_VERSION} | docker pull ${IMAGE_NAME}:${IMAGE_VERSION} | ||||
| docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ | docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ | ||||
| @@ -38,6 +40,7 @@ do | |||||
| --net host \ | --net host \ | ||||
| ${IMAGE_NAME}:${IMAGE_VERSION} \ | ${IMAGE_NAME}:${IMAGE_VERSION} \ | ||||
| $CI_COMMAND | $CI_COMMAND | ||||
| if [ $? -ne 0 ]; then | if [ $? -ne 0 ]; then | ||||
| echo "Running test case failed, please check the log!" | echo "Running test case failed, please check the log!" | ||||
| exit -1 | exit -1 | ||||
| @@ -25,4 +25,4 @@ python: | |||||
| install: | install: | ||||
| - requirements: requirements/docs.txt | - requirements: requirements/docs.txt | ||||
| - requirements: requirements/readthedocs.txt | - requirements: requirements/readthedocs.txt | ||||
| - requirements: requirements/runtime.txt | |||||
| - requirements: requirements/framework.txt | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:bdb1cef5a5fd5f938a856311011c4820ddc45946a470b9929c61e59b6a065633 | |||||
| size 161535 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:c05d58edee7398de37b8e479410676d6b97cfde69cc003e8356a348067e71988 | |||||
| size 7750 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:331ead75033fa2f01f6be72a2f8e34d581fcb593308067815d4bb136bb13b766 | |||||
| size 54390 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 | |||||
| size 87228 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:f5ecc371c8b0ca09d0e11df89bc549000937eafc451929586426fe657ade25a0 | |||||
| size 238607 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:2c7d2f279e3b317f1d0de18410a0585e122166fa2464c17b88a0c813f6c58bd4 | |||||
| size 67861 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:0b7c3bc7c82ea5fee9d83130041df01046d89143ff77058b04577455ff6fdc92 | |||||
| size 3191059 | |||||
| @@ -1,3 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | version https://git-lfs.github.com/spec/v1 | ||||
| oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f | |||||
| size 126815483 | |||||
| oid sha256:03002807dc2aa180c3ae104e764c7a4d6c421d186a5d552f97d338467ae6c443 | |||||
| size 12722029 | |||||
| @@ -64,7 +64,7 @@ RUN if [ "$USE_GPU" = "True" ] ; then \ | |||||
| # install modelscope | # install modelscope | ||||
| COPY requirements /var/modelscope | COPY requirements /var/modelscope | ||||
| RUN pip install --no-cache-dir --upgrade pip && \ | RUN pip install --no-cache-dir --upgrade pip && \ | ||||
| pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | |||||
| pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | |||||
| pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | ||||
| pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | ||||
| pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | ||||
| @@ -9,6 +9,8 @@ class Models(object): | |||||
| Model name should only contain model info but not task info. | Model name should only contain model info but not task info. | ||||
| """ | """ | ||||
| tinynas_detection = 'tinynas-detection' | |||||
| # vision models | # vision models | ||||
| detection = 'detection' | detection = 'detection' | ||||
| realtime_object_detection = 'realtime-object-detection' | realtime_object_detection = 'realtime-object-detection' | ||||
| @@ -22,12 +24,17 @@ class Models(object): | |||||
| body_2d_keypoints = 'body-2d-keypoints' | body_2d_keypoints = 'body-2d-keypoints' | ||||
| body_3d_keypoints = 'body-3d-keypoints' | body_3d_keypoints = 'body-3d-keypoints' | ||||
| crowd_counting = 'HRNetCrowdCounting' | crowd_counting = 'HRNetCrowdCounting' | ||||
| face_2d_keypoints = 'face-2d-keypoints' | |||||
| panoptic_segmentation = 'swinL-panoptic-segmentation' | panoptic_segmentation = 'swinL-panoptic-segmentation' | ||||
| image_reid_person = 'passvitb' | image_reid_person = 'passvitb' | ||||
| video_summarization = 'pgl-video-summarization' | video_summarization = 'pgl-video-summarization' | ||||
| swinL_semantic_segmentation = 'swinL-semantic-segmentation' | swinL_semantic_segmentation = 'swinL-semantic-segmentation' | ||||
| vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' | vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' | ||||
| text_driven_segmentation = 'text-driven-segmentation' | |||||
| resnet50_bert = 'resnet50-bert' | resnet50_bert = 'resnet50-bert' | ||||
| fer = 'fer' | |||||
| retinaface = 'retinaface' | |||||
| shop_segmentation = 'shop-segmentation' | |||||
| # EasyCV models | # EasyCV models | ||||
| yolox = 'YOLOX' | yolox = 'YOLOX' | ||||
| @@ -37,6 +44,7 @@ class Models(object): | |||||
| bert = 'bert' | bert = 'bert' | ||||
| palm = 'palm-v2' | palm = 'palm-v2' | ||||
| structbert = 'structbert' | structbert = 'structbert' | ||||
| deberta_v2 = 'deberta_v2' | |||||
| veco = 'veco' | veco = 'veco' | ||||
| translation = 'csanmt-translation' | translation = 'csanmt-translation' | ||||
| space_dst = 'space-dst' | space_dst = 'space-dst' | ||||
| @@ -104,13 +112,17 @@ class Pipelines(object): | |||||
| hicossl_video_embedding = 'hicossl-s3dg-video_embedding' | hicossl_video_embedding = 'hicossl-s3dg-video_embedding' | ||||
| body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' | body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' | ||||
| body_3d_keypoints = 'canonical_body-3d-keypoints_video' | body_3d_keypoints = 'canonical_body-3d-keypoints_video' | ||||
| hand_2d_keypoints = 'hrnetv2w18_hand-2d-keypoints_image' | |||||
| human_detection = 'resnet18-human-detection' | human_detection = 'resnet18-human-detection' | ||||
| object_detection = 'vit-object-detection' | object_detection = 'vit-object-detection' | ||||
| easycv_detection = 'easycv-detection' | easycv_detection = 'easycv-detection' | ||||
| easycv_segmentation = 'easycv-segmentation' | easycv_segmentation = 'easycv-segmentation' | ||||
| face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment' | |||||
| salient_detection = 'u2net-salient-detection' | salient_detection = 'u2net-salient-detection' | ||||
| image_classification = 'image-classification' | image_classification = 'image-classification' | ||||
| face_detection = 'resnet-face-detection-scrfd10gkps' | face_detection = 'resnet-face-detection-scrfd10gkps' | ||||
| facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' | |||||
| retina_face_detection = 'resnet50-face-detection-retinaface' | |||||
| live_category = 'live-category' | live_category = 'live-category' | ||||
| general_image_classification = 'vit-base_image-classification_ImageNet-labels' | general_image_classification = 'vit-base_image-classification_ImageNet-labels' | ||||
| daily_image_classification = 'vit-base_image-classification_Dailylife-labels' | daily_image_classification = 'vit-base_image-classification_Dailylife-labels' | ||||
| @@ -132,13 +144,17 @@ class Pipelines(object): | |||||
| image_to_image_generation = 'image-to-image-generation' | image_to_image_generation = 'image-to-image-generation' | ||||
| skin_retouching = 'unet-skin-retouching' | skin_retouching = 'unet-skin-retouching' | ||||
| tinynas_classification = 'tinynas-classification' | tinynas_classification = 'tinynas-classification' | ||||
| tinynas_detection = 'tinynas-detection' | |||||
| crowd_counting = 'hrnet-crowd-counting' | crowd_counting = 'hrnet-crowd-counting' | ||||
| action_detection = 'ResNetC3D-action-detection' | |||||
| video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' | video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' | ||||
| image_panoptic_segmentation = 'image-panoptic-segmentation' | image_panoptic_segmentation = 'image-panoptic-segmentation' | ||||
| video_summarization = 'googlenet_pgl_video_summarization' | video_summarization = 'googlenet_pgl_video_summarization' | ||||
| image_semantic_segmentation = 'image-semantic-segmentation' | image_semantic_segmentation = 'image-semantic-segmentation' | ||||
| image_reid_person = 'passvitb-image-reid-person' | image_reid_person = 'passvitb-image-reid-person' | ||||
| text_driven_segmentation = 'text-driven-segmentation' | |||||
| movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | ||||
| shop_segmentation = 'shop-segmentation' | |||||
| # nlp tasks | # nlp tasks | ||||
| sentence_similarity = 'sentence-similarity' | sentence_similarity = 'sentence-similarity' | ||||
| @@ -347,6 +363,7 @@ class Datasets(object): | |||||
| """ Names for different datasets. | """ Names for different datasets. | ||||
| """ | """ | ||||
| ClsDataset = 'ClsDataset' | ClsDataset = 'ClsDataset' | ||||
| Face2dKeypointsDataset = 'Face2dKeypointsDataset' | |||||
| SegDataset = 'SegDataset' | SegDataset = 'SegDataset' | ||||
| DetDataset = 'DetDataset' | DetDataset = 'DetDataset' | ||||
| DetImagesMixDataset = 'DetImagesMixDataset' | DetImagesMixDataset = 'DetImagesMixDataset' | ||||
| @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | from modelscope.utils.import_utils import LazyImportModule | ||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||
| from .frcrn import FRCRNModel | |||||
| from .frcrn import FRCRNDecorator | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'frcrn': ['FRCRNModel'], | |||||
| 'frcrn': ['FRCRNDecorator'], | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -1,3 +1,9 @@ | |||||
| """ | |||||
| The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d | |||||
| here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) | |||||
| and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch | |||||
| """ | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| import torch.nn.functional as F | import torch.nn.functional as F | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | import os | ||||
| from typing import Dict | from typing import Dict | ||||
| @@ -14,54 +15,10 @@ from .conv_stft import ConviSTFT, ConvSTFT | |||||
| from .unet import UNet | from .unet import UNet | ||||
class FTB(nn.Module):
    """Block that applies a T-F attention branch and a linear map over ``Dim``.

    Args:
        input_dim: size of the ``Dim`` axis of the incoming feature map.
        in_channel: number of channels of the incoming feature map.
        r_channel: reduced channel count used inside the attention branch.
    """

    def __init__(self, input_dim=257, in_channel=9, r_channel=5):
        super().__init__()
        self.in_channel = in_channel

        # 1x1 conv that reduces the channels before the attention is computed
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]),
            nn.BatchNorm2d(r_channel),
            nn.ReLU(),
        )

        # works on the (channel * Dim) axis flattened into one channel axis
        self.conv1d = nn.Sequential(
            nn.Conv1d(
                r_channel * input_dim, in_channel, kernel_size=9, padding=4),
            nn.BatchNorm1d(in_channel),
            nn.ReLU(),
        )

        self.freq_fc = nn.Linear(input_dim, input_dim, bias=False)

        # fuses the attended features with the untouched input
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]),
            nn.BatchNorm2d(in_channel),
            nn.ReLU(),
        )

    def forward(self, inputs):
        """Run the block.

        inputs should be [Batch, Ca, Dim, Time]; the result has the same shape.
        """
        # T-F attention
        reduced = self.conv1(inputs)
        batch, chans, dim, steps = reduced.size()
        flattened = reduced.reshape(batch, chans * dim, steps)
        attention = self.conv1d(flattened).reshape(batch, self.in_channel, 1,
                                                   steps)

        # the singleton Dim axis broadcasts, so this is [B,C,D,T] again
        attended = attention * inputs

        # nn.Linear acts on the last axis: transpose to [B,C,T,D] and back
        attended = self.freq_fc(attended.transpose(2, 3)).transpose(2, 3)

        merged = torch.cat([attended, inputs], 1)
        return self.conv2(merged)
| @MODELS.register_module( | @MODELS.register_module( | ||||
| Tasks.acoustic_noise_suppression, | Tasks.acoustic_noise_suppression, | ||||
| module_name=Models.speech_frcrn_ans_cirm_16k) | module_name=Models.speech_frcrn_ans_cirm_16k) | ||||
| class FRCRNModel(TorchModel): | |||||
| class FRCRNDecorator(TorchModel): | |||||
| r""" A decorator of FRCRN for integrating into modelscope framework """ | r""" A decorator of FRCRN for integrating into modelscope framework """ | ||||
| def __init__(self, model_dir: str, *args, **kwargs): | def __init__(self, model_dir: str, *args, **kwargs): | ||||
| @@ -78,13 +35,14 @@ class FRCRNModel(TorchModel): | |||||
| checkpoint = torch.load( | checkpoint = torch.load( | ||||
| model_bin_file, map_location=torch.device('cpu')) | model_bin_file, map_location=torch.device('cpu')) | ||||
| if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: | if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: | ||||
| self.model.load_state_dict( | |||||
| checkpoint['state_dict'], strict=False) | |||||
| # a model newly trained by a user is saved from FRCRNDecorator | |||||
| self.load_state_dict(checkpoint['state_dict']) | |||||
| else: | else: | ||||
| # The released model on Modelscope is based on FRCRN | |||||
| self.model.load_state_dict(checkpoint, strict=False) | self.model.load_state_dict(checkpoint, strict=False) | ||||
| def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||||
| result_list = self.model.forward(input['noisy']) | |||||
| def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||||
| result_list = self.model.forward(inputs['noisy']) | |||||
| output = { | output = { | ||||
| 'spec_l1': result_list[0], | 'spec_l1': result_list[0], | ||||
| 'wav_l1': result_list[1], | 'wav_l1': result_list[1], | ||||
| @@ -93,12 +51,12 @@ class FRCRNModel(TorchModel): | |||||
| 'wav_l2': result_list[4], | 'wav_l2': result_list[4], | ||||
| 'mask_l2': result_list[5] | 'mask_l2': result_list[5] | ||||
| } | } | ||||
| if 'clean' in input: | |||||
| if 'clean' in inputs: | |||||
| mix_result = self.model.loss( | mix_result = self.model.loss( | ||||
| input['noisy'], input['clean'], result_list, mode='Mix') | |||||
| inputs['noisy'], inputs['clean'], result_list, mode='Mix') | |||||
| output.update(mix_result) | output.update(mix_result) | ||||
| sisnr_result = self.model.loss( | sisnr_result = self.model.loss( | ||||
| input['noisy'], input['clean'], result_list, mode='SiSNR') | |||||
| inputs['noisy'], inputs['clean'], result_list, mode='SiSNR') | |||||
| output.update(sisnr_result) | output.update(sisnr_result) | ||||
| # logger hooker will use items under 'log_vars' | # logger hooker will use items under 'log_vars' | ||||
| output['log_vars'] = {k: mix_result[k].item() for k in mix_result} | output['log_vars'] = {k: mix_result[k].item() for k in mix_result} | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import torch | import torch | ||||
| from torch import nn | from torch import nn | ||||
| @@ -1,3 +1,8 @@ | |||||
| """ | |||||
| The implementation here is modified based on | |||||
| Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) | |||||
| and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch | |||||
| """ | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -3,15 +3,15 @@ | |||||
| # yapf: disable | # yapf: disable | ||||
| from . import (action_recognition, animal_recognition, body_2d_keypoints, | from . import (action_recognition, animal_recognition, body_2d_keypoints, | ||||
| body_3d_keypoints, cartoon, cmdssl_video_embedding, | body_3d_keypoints, cartoon, cmdssl_video_embedding, | ||||
| crowd_counting, face_detection, face_generation, | |||||
| image_classification, image_color_enhance, image_colorization, | |||||
| image_denoise, image_instance_segmentation, | |||||
| crowd_counting, face_2d_keypoints, face_detection, | |||||
| face_generation, image_classification, image_color_enhance, | |||||
| image_colorization, image_denoise, image_instance_segmentation, | |||||
| image_panoptic_segmentation, image_portrait_enhancement, | image_panoptic_segmentation, image_portrait_enhancement, | ||||
| image_reid_person, image_semantic_segmentation, | image_reid_person, image_semantic_segmentation, | ||||
| image_to_image_generation, image_to_image_translation, | image_to_image_generation, image_to_image_translation, | ||||
| movie_scene_segmentation, object_detection, | movie_scene_segmentation, object_detection, | ||||
| product_retrieval_embedding, realtime_object_detection, | product_retrieval_embedding, realtime_object_detection, | ||||
| salient_detection, super_resolution, | |||||
| salient_detection, shop_segmentation, super_resolution, | |||||
| video_single_object_tracking, video_summarization, virual_tryon) | video_single_object_tracking, video_summarization, virual_tryon) | ||||
| # yapf: enable | # yapf: enable | ||||
| @@ -0,0 +1,21 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
| if TYPE_CHECKING: | |||||
| from .action_detection_onnx import ActionDetONNX | |||||
| else: | |||||
| _import_structure = {'action_detection_onnx': ['ActionDetONNX']} | |||||
| import sys | |||||
| sys.modules[__name__] = LazyImportModule( | |||||
| __name__, | |||||
| globals()['__file__'], | |||||
| _import_structure, | |||||
| module_spec=__spec__, | |||||
| extra_objects={}, | |||||
| ) | |||||
| @@ -0,0 +1,177 @@ | |||||
| import os | |||||
| import os.path as osp | |||||
| import shutil | |||||
| import subprocess | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import onnxruntime as rt | |||||
| from modelscope.models import Model | |||||
| from modelscope.utils.constant import Devices | |||||
| from modelscope.utils.device import verify_device | |||||
class ActionDetONNX(Model):
    """Action detector that runs an ONNX model over clips sampled from a video.

    Frames are dumped to disk with ffmpeg, grouped into fixed-size clips, fed
    to an onnxruntime session, and the raw predictions are filtered by a
    score threshold followed by class-aware NMS.
    """

    def __init__(self, model_dir, config, *args, **kwargs):
        """Create the onnxruntime session and cache the inference settings.

        Args:
            model_dir: model directory, forwarded to the base class.
            config: mapping that must provide ``model_file``, ``fpn_strides``,
                ``pre_nms_thresh``, ``size_divisibility``, ``nms_thresh``,
                ``tmp_dir``, ``step``, ``input_type``, ``action_names`` and
                ``video_length_limit``.
        """
        # ``super()`` already binds ``self``; passing it explicitly would hand
        # the instance to the base class as ``model_dir``.
        super().__init__(model_dir, *args, **kwargs)
        # NOTE(review): a single-argument join returns the path unchanged, so
        # config['model_file'] is presumably already a full path -- confirm.
        model_file = osp.join(config['model_file'])
        device_type, device_id = verify_device(self._device_name)
        options = rt.SessionOptions()
        options.intra_op_num_threads = 1
        options.inter_op_num_threads = 1
        if device_type == Devices.gpu:
            sess = rt.InferenceSession(
                model_file,
                providers=['CUDAExecutionProvider'],
                sess_options=options,
                provider_options=[{
                    'device_id': device_id
                }])
        else:
            sess = rt.InferenceSession(
                model_file,
                providers=['CPUExecutionProvider'],
                sess_options=options)
        self.input_name = sess.get_inputs()[0].name
        self.sess = sess
        self.num_stride = len(config['fpn_strides'])
        # reshaped to (1, num_thresholds) so it broadcasts against the scores
        self.score_thresh = np.asarray(
            config['pre_nms_thresh'], dtype='float32').reshape((1, -1))
        self.size_divisibility = config['size_divisibility']
        self.nms_threshold = config['nms_thresh']
        self.tmp_dir = config['tmp_dir']
        self.temporal_stride = config['step']
        self.input_data_type = config['input_type']
        self.action_names = config['action_names']
        self.video_length_limit = config['video_length_limit']

    def resize_box(self, det, height, width, scale_h, scale_w):
        """Scale one ``(boxes, scores, labels)`` detection back to frame size.

        Columns 0/2 of the boxes are multiplied by ``scale_w`` and clipped to
        ``[0, width - 1]``; columns 1/3 use ``scale_h`` and ``height``. Labels
        are mapped to names through ``self.action_names``.

        Returns:
            dict with ``boxes`` (rounded int lists), ``scores`` and ``labels``.
        """
        bboxs = det[0]
        bboxs[:, [0, 2]] *= scale_w
        bboxs[:, [1, 3]] *= scale_h
        bboxs[:, [0, 2]] = bboxs[:, [0, 2]].clip(0, width - 1)
        bboxs[:, [1, 3]] = bboxs[:, [1, 3]].clip(0, height - 1)
        result = {
            'boxes': bboxs.round().astype('int32').tolist(),
            'scores': det[1].tolist(),
            'labels': [self.action_names[i] for i in det[2].tolist()]
        }
        return result

    def parse_frames(self, frame_names):
        """Load the given image files into one array shaped (1, c, t, h, w)."""
        # reverse the last axis of every decoded image (channel order flip)
        imgs = [cv2.imread(name)[:, :, ::-1] for name in frame_names]
        imgs = np.stack(imgs).astype(self.input_data_type).transpose(
            (3, 0, 1, 2))  # c,t,h,w
        imgs = imgs[None]
        return imgs

    def forward_img(self, imgs, h, w):
        """Run the session on one clip and return the NMS-filtered detections.

        ``h`` and ``w`` are fed to the model's ``height`` / ``width`` inputs.
        """
        pred = self.sess.run(None, {
            self.input_name: imgs,
            'height': np.asarray(h),
            'width': np.asarray(w)
        })
        dets = self.post_nms(
            pred,
            score_threshold=self.score_thresh,
            nms_threshold=self.nms_threshold)
        return dets

    def forward_video(self, video_name, scale):
        """Detect actions in a video file.

        Args:
            video_name: path of the video to analyse.
            scale: size preset passed to ``_get_sizes``.

        Returns:
            A list of ``{'timestamp': int, 'actions': dict}`` items, one per
            clip; empty when the video yields too few frames for one clip.

        Raises:
            RuntimeError: ffmpeg exited with an error and produced no frames.
        """
        min_size, max_size = self._get_sizes(scale)

        # splitext instead of a fixed ``[:-4]`` slice: a name without a
        # three-letter extension could otherwise collapse to '' and make the
        # cleanup below remove ``self.tmp_dir`` itself.
        video_stem = osp.splitext(osp.basename(video_name))[0]
        tmp_dir = osp.join(self.tmp_dir, video_stem)
        if osp.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
        os.makedirs(tmp_dir)

        try:
            frame_rate = 2
            # argument list instead of ``str.split(' ')`` so that paths
            # containing spaces reach ffmpeg intact
            cmd = [
                'ffmpeg', '-y', '-loglevel', 'quiet', '-ss', '0', '-t',
                str(self.video_length_limit), '-i', video_name, '-r',
                str(frame_rate), '-f', 'image2', f'{tmp_dir}/%06d.jpg'
            ]
            exit_code = subprocess.call(cmd)

            frame_names = [
                osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir))
                if name.endswith('.jpg')
            ]
            if exit_code != 0 and not frame_names:
                raise RuntimeError(
                    f'ffmpeg exited with code {exit_code} and extracted no '
                    f'frames from {video_name}')

            clip_len = frame_rate * 2
            hop = frame_rate * self.temporal_stride
            clips = [
                frame_names[i:i + clip_len]
                for i in range(0,
                               len(frame_names) - clip_len + 1, hop)
            ]
            if not clips:
                # fewer frames than one clip needs: nothing to detect
                return []

            # one timestamp per clip: 1, 1 + step, 1 + 2 * step, ...
            # (built per clip so that step == 1 does not lose the last one)
            timestamp = [
                1 + i * self.temporal_stride for i in range(len(clips))
            ]

            batch_imgs = [self.parse_frames(names) for names in clips]

            _, _, _, H, W = batch_imgs[0].shape
            scale_min = min_size / min(H, W)
            h, w = min(int(scale_min * H),
                       max_size), min(int(scale_min * W), max_size)
            # snap the model input size to a multiple of size_divisibility
            h = round(h / self.size_divisibility) * self.size_divisibility
            w = round(w / self.size_divisibility) * self.size_divisibility
            scale_h, scale_w = H / h, W / w

            results = []
            for imgs in batch_imgs:
                det = self.forward_img(imgs, h, w)
                det = self.resize_box(det[0], H, W, scale_h, scale_w)
                results.append(det)

            return [{
                'timestamp': t,
                'actions': res
            } for t, res in zip(timestamp, results)]
        finally:
            # always drop the extracted frames, even when inference fails
            shutil.rmtree(tmp_dir)

    def forward(self, video_name):
        """Detect actions in ``video_name`` using size preset 1."""
        return self.forward_video(video_name, scale=1)

    def post_nms(self, pred, score_threshold, nms_threshold=0.3):
        """Threshold the raw predictions and apply class-aware NMS.

        Args:
            pred: pair ``(pred_bboxes, pred_scores)`` as returned by the
                session, iterated item by item along the first axis.
            score_threshold: scores must be strictly greater to be kept.
            nms_threshold: overlap threshold forwarded to ``_nms``.

        Returns:
            list of ``(bbox, score, label)`` tuples, one per item.
        """
        pred_bboxes, pred_scores = pred
        dets = []
        for bboxes, scores in zip(pred_bboxes, pred_scores):
            candidate_inds = scores > score_threshold
            scores = scores[candidate_inds]
            candidate_nonzeros = candidate_inds.nonzero()
            # first index selects the box row, second is used as the label
            bboxes = bboxes[candidate_nonzeros[0]]
            labels = candidate_nonzeros[1]
            keep = self._nms(bboxes, scores, labels, nms_threshold)
            bbox = bboxes[keep]
            score = scores[keep]
            label = labels[keep]
            dets.append((bbox, score, label))
        return dets

    def _nms(self, boxes, scores, idxs, nms_threshold):
        """Return the indices of the boxes kept by per-label NMS."""
        if len(boxes) == 0:
            return []
        # shift every label's boxes by a label-specific offset so that boxes
        # of different labels can never overlap during suppression
        max_coordinate = boxes.max()
        offsets = idxs * (max_coordinate + 1)
        boxes_for_nms = boxes + offsets[:, None].astype('float32')
        # columns 2/3 become (col2 - col0) and (col3 - col1) before the call
        boxes_for_nms[:, 2] = boxes_for_nms[:, 2] - boxes_for_nms[:, 0]
        boxes_for_nms[:, 3] = boxes_for_nms[:, 3] - boxes_for_nms[:, 1]
        keep = cv2.dnn.NMSBoxes(
            boxes_for_nms.tolist(),
            scores.tolist(),
            score_threshold=0,
            nms_threshold=nms_threshold)
        # flatten a 2-D (k, 1) result to shape (k,)
        if len(keep.shape) == 2:
            keep = np.squeeze(keep, 1)
        return keep

    def _get_sizes(self, scale):
        """Map a scale preset to ``(min_size, max_size)``.

        1 -> (512, 896), 2 -> (768, 1280), anything else -> (1024, 1792).
        """
        if scale == 1:
            min_size, max_size = 512, 896
        elif scale == 2:
            min_size, max_size = 768, 1280
        else:
            min_size, max_size = 1024, 1792
        return min_size, max_size
| @@ -0,0 +1,20 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
| if TYPE_CHECKING: | |||||
| from .face_2d_keypoints_align import Face2DKeypoints | |||||
| else: | |||||
| _import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']} | |||||
| import sys | |||||
| sys.modules[__name__] = LazyImportModule( | |||||
| __name__, | |||||
| globals()['__file__'], | |||||
| _import_structure, | |||||
| module_spec=__spec__, | |||||
| extra_objects={}, | |||||
| ) | |||||
| @@ -0,0 +1,16 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from easycv.models.face.face_keypoint import FaceKeypoint | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||||
| from modelscope.utils.constant import Tasks | |||||
@MODELS.register_module(
    group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):
    """EasyCV ``FaceKeypoint`` registered for the face-2d-keypoints task.

    Both base classes are initialised explicitly, ``EasyCVBaseModel`` first.
    """

    def __init__(self, model_dir=None, *args, **kwargs):
        # NOTE(review): ``args`` and ``kwargs`` are handed over as two plain
        # positional objects (not unpacked) -- presumably EasyCVBaseModel
        # expects them that way; confirm against its signature.
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        FaceKeypoint.__init__(self, *args, **kwargs)
| @@ -0,0 +1,22 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
| if TYPE_CHECKING: | |||||
| from .retinaface import RetinaFaceDetection | |||||
| else: | |||||
| _import_structure = { | |||||
| 'retinaface': ['RetinaFaceDetection'], | |||||
| } | |||||
| import sys | |||||
| sys.modules[__name__] = LazyImportModule( | |||||
| __name__, | |||||
| globals()['__file__'], | |||||
| _import_structure, | |||||
| module_spec=__spec__, | |||||
| extra_objects={}, | |||||
| ) | |||||
| @@ -0,0 +1 @@ | |||||
| from .detection import RetinaFaceDetection | |||||
| @@ -0,0 +1,137 @@ | |||||
| # The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.backends.cudnn as cudnn | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import Tensor, TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.config import Config | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from .models.retinaface import RetinaFace | |||||
| from .utils import PriorBox, decode, decode_landm, py_cpu_nms | |||||
@MODELS.register_module(Tasks.face_detection, module_name=Models.retinaface)
class RetinaFaceDetection(TorchModel):
    """RetinaFace face detector wrapped as a modelscope ``TorchModel``.

    The network definition is read from the configuration file that sits next
    to the checkpoint, the checkpoint weights are loaded, and ``forward``
    returns score-filtered, NMS-filtered boxes and landmarks.
    """

    def __init__(self, model_path, device='cuda'):
        """Build the network and load its weights.

        Args:
            model_path (str): path to the checkpoint file; the configuration
                path is derived by replacing ``ModelFile.TORCH_MODEL_FILE``
                with ``ModelFile.CONFIGURATION`` in this string.
            device (str): device the network is moved to.
        """
        super().__init__(model_path)
        # NOTE: the next two statements change process-wide torch state.
        torch.set_grad_enabled(False)
        cudnn.benchmark = True
        self.model_path = model_path
        self.cfg = Config.from_file(
            model_path.replace(ModelFile.TORCH_MODEL_FILE,
                               ModelFile.CONFIGURATION))['models']
        self.net = RetinaFace(cfg=self.cfg)
        self.load_model()
        self.device = device
        self.net = self.net.to(self.device)
        # Kept for backward compatibility; forward() subtracts the same
        # constants directly on the NumPy image.
        self.mean = torch.tensor([[[[104]], [[117]], [[123]]]]).to(device)

    def check_keys(self, pretrained_state_dict):
        """Assert that the checkpoint shares at least one key with the net.

        Returns:
            bool: always ``True`` when the assertion passes.
        """
        ckpt_keys = set(pretrained_state_dict.keys())
        model_keys = set(self.net.state_dict().keys())
        used_pretrained_keys = model_keys & ckpt_keys
        assert len(
            used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint'
        return True

    def remove_prefix(self, state_dict, prefix):
        """Return a copy of ``state_dict`` with ``prefix`` stripped from keys.

        Keys that do not start with ``prefix`` are kept unchanged.
        """
        return {(key[len(prefix):] if key.startswith(prefix) else key): value
                for key, value in state_dict.items()}

    def load_model(self, load_to_cpu=False):
        """Load the checkpoint at ``self.model_path`` into ``self.net``.

        The checkpoint is always mapped to CPU first; ``load_to_cpu`` is
        accepted for interface compatibility and is not consulted.
        """
        pretrained_dict = torch.load(
            self.model_path, map_location=torch.device('cpu'))
        if 'state_dict' in pretrained_dict.keys():
            pretrained_dict = pretrained_dict['state_dict']
        # Strip the 'module.' prefix from checkpoint keys.
        pretrained_dict = self.remove_prefix(pretrained_dict, 'module.')
        self.check_keys(pretrained_dict)
        self.net.load_state_dict(pretrained_dict, strict=False)
        self.net.eval()

    def forward(self, input):
        """Detect faces in ``input['img']``.

        Args:
            input (dict): must contain ``'img'``, a tensor that is moved to
                CPU and converted to a NumPy array.
                NOTE(review): presumably an HxWx3 image -- three per-channel
                constants are subtracted and axes are transposed (2, 0, 1);
                confirm channel order with the caller.

        Returns:
            tuple: ``(dets, landms)`` NumPy arrays. Each ``dets`` row is
            ``[x1, y1, x2, y2, score]`` and each ``landms`` row holds 10
            landmark coordinates. Coordinates are mapped back to the
            resolution of the input image; scores are left untouched.
        """
        confidence_threshold = 0.9
        nms_threshold = 0.4
        top_k = 5000
        keep_top_k = 750

        img = np.float32(input['img'].cpu().numpy())
        im_height, im_width = img.shape[:2]
        ss = 1.0
        # Images whose longest side exceeds 1500 px are shrunk so that side
        # becomes 1000 px; results are scaled back by 1 / ss at the end.
        if max(im_height, im_width) > 1500:
            ss = 1000.0 / max(im_height, im_width)
            img = cv2.resize(img, (0, 0), fx=ss, fy=ss)
            im_height, im_width = img.shape[:2]

        box_scale = torch.Tensor([im_width, im_height, im_width, im_height])
        img -= (104, 117, 123)
        img = img.transpose(2, 0, 1)
        img = torch.from_numpy(img).unsqueeze(0).to(self.device)
        box_scale = box_scale.to(self.device)

        loc, conf, landms = self.net(img)  # forward pass
        del img

        priors = PriorBox(self.cfg, image_size=(im_height, im_width)).forward()
        prior_data = priors.to(self.device).data

        boxes = decode(loc.data.squeeze(0), prior_data, self.cfg['variance'])
        boxes = (boxes * box_scale).cpu().numpy()
        scores = conf.squeeze(0).data.cpu().numpy()[:, 1]

        landms = decode_landm(
            landms.data.squeeze(0), prior_data, self.cfg['variance'])
        landm_scale = torch.Tensor([im_width, im_height] * 5).to(self.device)
        landms = (landms * landm_scale).cpu().numpy()

        # ignore low scores
        inds = np.where(scores > confidence_threshold)[0]
        boxes, landms, scores = boxes[inds], landms[inds], scores[inds]

        # keep top-K before NMS
        order = scores.argsort()[::-1][:top_k]
        boxes, landms, scores = boxes[order], landms[order], scores[order]

        # do NMS
        dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
            np.float32, copy=False)
        keep = py_cpu_nms(dets, nms_threshold)
        dets = dets[keep, :]
        landms = landms[keep]

        # keep top-K after NMS
        dets = dets[:keep_top_k, :]
        landms = landms[:keep_top_k, :]

        # Undo the optional down-scaling. Only the four box coordinates are
        # rescaled: column 4 is the confidence score and must not be divided
        # by ``ss`` (doing so inflated scores for images that were shrunk).
        dets[:, :4] /= ss
        return dets, landms / ss
| @@ -0,0 +1,149 @@ | |||||
| # The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface | |||||
| import time | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| import torchvision.models as models | |||||
| import torchvision.models._utils as _utils | |||||
| from torch.autograd import Variable | |||||
def conv_bn(inp, oup, stride=1, leaky=0):
    """Return a 3x3 conv (padding 1, no bias) + BatchNorm + LeakyReLU stack.

    Args:
        inp (int): number of input channels.
        oup (int): number of output channels.
        stride (int): convolution stride.
        leaky (float): negative slope of the LeakyReLU.
    """
    conv = nn.Conv2d(inp, oup, 3, stride, 1, bias=False)
    norm = nn.BatchNorm2d(oup)
    act = nn.LeakyReLU(negative_slope=leaky, inplace=True)
    return nn.Sequential(conv, norm, act)
def conv_bn_no_relu(inp, oup, stride):
    """Return a 3x3 conv (padding 1, no bias) followed by BatchNorm only.

    Args:
        inp (int): number of input channels.
        oup (int): number of output channels.
        stride (int): convolution stride.
    """
    conv = nn.Conv2d(inp, oup, 3, stride, 1, bias=False)
    norm = nn.BatchNorm2d(oup)
    return nn.Sequential(conv, norm)
def conv_bn1X1(inp, oup, stride, leaky=0):
    """Return a 1x1 conv (no padding, no bias) + BatchNorm + LeakyReLU stack.

    Args:
        inp (int): number of input channels.
        oup (int): number of output channels.
        stride (int): convolution stride.
        leaky (float): negative slope of the LeakyReLU.
    """
    pointwise = nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    act = nn.LeakyReLU(negative_slope=leaky, inplace=True)
    return nn.Sequential(pointwise, norm, act)
def conv_dw(inp, oup, stride, leaky=0.1):
    """Return a depthwise 3x3 conv followed by a pointwise 1x1 conv.

    Each convolution (both without bias) is followed by BatchNorm and a
    LeakyReLU with slope ``leaky``. The depthwise convolution uses
    ``groups=inp`` and carries the ``stride``; the pointwise one maps
    ``inp`` channels to ``oup`` channels.
    """
    depthwise = [
        nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
        nn.BatchNorm2d(inp),
        nn.LeakyReLU(negative_slope=leaky, inplace=True),
    ]
    pointwise = [
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.LeakyReLU(negative_slope=leaky, inplace=True),
    ]
    return nn.Sequential(*depthwise, *pointwise)
class SSH(nn.Module):
    """Three-branch context module whose outputs are concatenated.

    The branches produce ``out_channel // 2``, ``out_channel // 4`` and
    ``out_channel // 4`` channels respectively, so the concatenation has
    ``out_channel`` channels. A ReLU is applied to the concatenated result.
    """

    def __init__(self, in_channel, out_channel):
        super(SSH, self).__init__()
        assert out_channel % 4 == 0
        # Narrow modules use a leaky slope of 0.1, wider ones a plain ReLU.
        leaky = 0.1 if out_channel <= 64 else 0
        half, quarter = out_channel // 2, out_channel // 4

        # Attribute names are part of the checkpoint key layout.
        self.conv3X3 = conv_bn_no_relu(in_channel, half, stride=1)

        self.conv5X5_1 = conv_bn(in_channel, quarter, stride=1, leaky=leaky)
        self.conv5X5_2 = conv_bn_no_relu(quarter, quarter, stride=1)

        self.conv7X7_2 = conv_bn(quarter, quarter, stride=1, leaky=leaky)
        self.conv7x7_3 = conv_bn_no_relu(quarter, quarter, stride=1)

    def forward(self, input):
        """Run the three branches on ``input`` and return their ReLU'd concat."""
        branch3 = self.conv3X3(input)

        # The second and third branches share their first convolution.
        shared = self.conv5X5_1(input)
        branch5 = self.conv5X5_2(shared)
        branch7 = self.conv7x7_3(self.conv7X7_2(shared))

        return F.relu(torch.cat([branch3, branch5, branch7], dim=1))
class FPN(nn.Module):
    """Top-down feature pyramid over three feature maps.

    Each input level is projected to ``out_channels`` with a 1x1 conv block.
    The coarser level is then upsampled (nearest) to the next finer level's
    spatial size, added to it, and smoothed by a 3x3 conv block.
    """

    def __init__(self, in_channels_list, out_channels):
        super(FPN, self).__init__()
        leaky = 0.1 if out_channels <= 64 else 0
        c1, c2, c3 = in_channels_list[0], in_channels_list[1], in_channels_list[2]

        self.output1 = conv_bn1X1(c1, out_channels, stride=1, leaky=leaky)
        self.output2 = conv_bn1X1(c2, out_channels, stride=1, leaky=leaky)
        self.output3 = conv_bn1X1(c3, out_channels, stride=1, leaky=leaky)

        self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky)
        self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky)

    def forward(self, input):
        """Fuse the feature maps held in the mapping ``input``.

        Args:
            input: mapping whose first three values (in iteration order) are
                the feature maps, finest first.

        Returns:
            list: the three fused feature maps, finest first.
        """
        feats = list(input.values())

        level1 = self.output1(feats[0])
        level2 = self.output2(feats[1])
        level3 = self.output3(feats[2])

        level2 = self.merge2(level2 + F.interpolate(
            level3, size=[level2.size(2), level2.size(3)], mode='nearest'))
        level1 = self.merge1(level1 + F.interpolate(
            level2, size=[level1.size(2), level1.size(3)], mode='nearest'))

        return [level1, level2, level3]
class MobileNetV1(nn.Module):
    """Small MobileNet-style backbone with a 1000-way linear classifier.

    Three sequential stages of depthwise-separable blocks end at 64, 128 and
    256 channels; global average pooling and a linear layer follow.

    NOTE(review): the trailing numbers on each layer are annotations carried
    over from the original code (presumably receptive-field sizes); their
    arithmetic was not re-derived here.
    """

    def __init__(self):
        super(MobileNetV1, self).__init__()
        stage1_layers = [
            conv_bn(3, 8, 2, leaky=0.1),  # 3
            conv_dw(8, 16, 1),  # 7
            conv_dw(16, 32, 2),  # 11
            conv_dw(32, 32, 1),  # 19
            conv_dw(32, 64, 2),  # 27
            conv_dw(64, 64, 1),  # 43
        ]
        self.stage1 = nn.Sequential(*stage1_layers)

        stage2_layers = [
            conv_dw(64, 128, 2),  # 43 + 16 = 59
            conv_dw(128, 128, 1),  # 59 + 32 = 91
            conv_dw(128, 128, 1),  # 91 + 32 = 123
            conv_dw(128, 128, 1),  # 123 + 32 = 155
            conv_dw(128, 128, 1),  # 155 + 32 = 187
            conv_dw(128, 128, 1),  # 187 + 32 = 219
        ]
        self.stage2 = nn.Sequential(*stage2_layers)

        stage3_layers = [
            conv_dw(128, 256, 2),  # 219 + 32 = 241 (sic)
            conv_dw(256, 256, 1),  # 241 + 64 = 301 (sic)
        ]
        self.stage3 = nn.Sequential(*stage3_layers)

        self.avg = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, 1000)

    def forward(self, x):
        """Return the 1000-way logits for the image batch ``x``."""
        for stage in (self.stage1, self.stage2, self.stage3):
            x = stage(x)
        pooled = self.avg(x).view(-1, 256)
        return self.fc(pooled)
| @@ -0,0 +1,145 @@ | |||||
| # The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface | |||||
| from collections import OrderedDict | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| import torchvision.models as models | |||||
| import torchvision.models._utils as _utils | |||||
| import torchvision.models.detection.backbone_utils as backbone_utils | |||||
| from .net import FPN, SSH, MobileNetV1 | |||||
class ClassHead(nn.Module):
    """1x1 conv head that emits two class scores per anchor."""

    def __init__(self, inchannels=512, num_anchors=3):
        super(ClassHead, self).__init__()
        self.num_anchors = num_anchors
        self.conv1x1 = nn.Conv2d(
            inchannels,
            self.num_anchors * 2,
            kernel_size=(1, 1),
            stride=1,
            padding=0)

    def forward(self, x):
        """Map an (N, C, H, W) feature map to (N, H*W*num_anchors, 2)."""
        scores = self.conv1x1(x)
        # Channels last, so each anchor's values are adjacent before
        # flattening.
        scores = scores.permute(0, 2, 3, 1).contiguous()
        batch = scores.shape[0]
        return scores.view(batch, -1, 2)
class BboxHead(nn.Module):
    """1x1 conv head that emits four box-regression values per anchor."""

    def __init__(self, inchannels=512, num_anchors=3):
        super(BboxHead, self).__init__()
        self.conv1x1 = nn.Conv2d(
            inchannels,
            num_anchors * 4,
            kernel_size=(1, 1),
            stride=1,
            padding=0)

    def forward(self, x):
        """Map an (N, C, H, W) feature map to (N, H*W*num_anchors, 4)."""
        deltas = self.conv1x1(x)
        deltas = deltas.permute(0, 2, 3, 1).contiguous()
        batch = deltas.shape[0]
        return deltas.view(batch, -1, 4)
class LandmarkHead(nn.Module):
    """1x1 conv head that emits ten landmark-regression values per anchor."""

    def __init__(self, inchannels=512, num_anchors=3):
        super(LandmarkHead, self).__init__()
        self.conv1x1 = nn.Conv2d(
            inchannels,
            num_anchors * 10,
            kernel_size=(1, 1),
            stride=1,
            padding=0)

    def forward(self, x):
        """Map an (N, C, H, W) feature map to (N, H*W*num_anchors, 10)."""
        offsets = self.conv1x1(x)
        offsets = offsets.permute(0, 2, 3, 1).contiguous()
        batch = offsets.shape[0]
        return offsets.view(batch, -1, 10)
class RetinaFace(nn.Module):
    """Backbone + FPN + SSH network with class, box and landmark heads."""

    def __init__(self, cfg=None):
        """
        :param cfg: Network related settings. The keys read here are
            ``name``, ``pretrain``, ``return_layers``, ``in_channel`` and
            ``out_channel``. Only ``name == 'Resnet50'`` is supported.
        """
        super(RetinaFace, self).__init__()
        if cfg['name'] != 'Resnet50':
            raise Exception('Invalid name')
        backbone = models.resnet50(pretrained=cfg['pretrain'])

        # Expose the intermediate backbone layers listed in the config.
        self.body = _utils.IntermediateLayerGetter(backbone,
                                                   cfg['return_layers'])

        base_channels = cfg['in_channel']
        in_channels_list = [base_channels * factor for factor in (2, 4, 8)]
        out_channels = cfg['out_channel']

        self.fpn = FPN(in_channels_list, out_channels)
        self.ssh1 = SSH(out_channels, out_channels)
        self.ssh2 = SSH(out_channels, out_channels)
        self.ssh3 = SSH(out_channels, out_channels)

        # One head of each kind per pyramid level.
        self.ClassHead = self._make_class_head(
            fpn_num=3, inchannels=cfg['out_channel'])
        self.BboxHead = self._make_bbox_head(
            fpn_num=3, inchannels=cfg['out_channel'])
        self.LandmarkHead = self._make_landmark_head(
            fpn_num=3, inchannels=cfg['out_channel'])

    def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2):
        """Return ``fpn_num`` classification heads as a ModuleList."""
        return nn.ModuleList(
            [ClassHead(inchannels, anchor_num) for _ in range(fpn_num)])

    def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2):
        """Return ``fpn_num`` box-regression heads as a ModuleList."""
        return nn.ModuleList(
            [BboxHead(inchannels, anchor_num) for _ in range(fpn_num)])

    def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2):
        """Return ``fpn_num`` landmark-regression heads as a ModuleList."""
        return nn.ModuleList(
            [LandmarkHead(inchannels, anchor_num) for _ in range(fpn_num)])

    def forward(self, inputs):
        """Run the detector on ``inputs``.

        Returns:
            tuple: ``(bbox_regressions, class_probabilities,
            ldm_regressions)``; the per-level head outputs are concatenated
            along dim 1 and the class scores are soft-maxed over the last
            dim.
        """
        pyramid = self.fpn(self.body(inputs))

        # SSH context module on every pyramid level
        ssh_modules = (self.ssh1, self.ssh2, self.ssh3)
        features = [ssh(level) for ssh, level in zip(ssh_modules, pyramid)]

        bbox_parts, class_parts, ldm_parts = [], [], []
        for idx, feature in enumerate(features):
            bbox_parts.append(self.BboxHead[idx](feature))
        for idx, feature in enumerate(features):
            class_parts.append(self.ClassHead[idx](feature))
        for idx, feature in enumerate(features):
            ldm_parts.append(self.LandmarkHead[idx](feature))

        bbox_regressions = torch.cat(bbox_parts, dim=1)
        classifications = torch.cat(class_parts, dim=1)
        ldm_regressions = torch.cat(ldm_parts, dim=1)

        return (bbox_regressions, F.softmax(classifications, dim=-1),
                ldm_regressions)
| @@ -0,0 +1,123 @@ | |||||
| # -------------------------------------------------------- | |||||
| # Modified from https://github.com/biubug6/Pytorch_Retinaface | |||||
| # -------------------------------------------------------- | |||||
| from itertools import product as product | |||||
| from math import ceil | |||||
| import numpy as np | |||||
| import torch | |||||
class PriorBox(object):
    """Generate anchor boxes for every cell of every feature-map level.

    ``cfg`` supplies ``min_sizes`` (one list of anchor sizes per level),
    ``steps`` (one stride per level) and ``clip``. ``image_size`` is indexed
    as ``(height, width)``. ``phase`` is accepted but not used.
    """

    def __init__(self, cfg, image_size=None, phase='train'):
        super(PriorBox, self).__init__()
        self.min_sizes = cfg['min_sizes']
        self.steps = cfg['steps']
        self.clip = cfg['clip']
        self.image_size = image_size
        # Feature-map height/width per level: image size over stride,
        # rounded up.
        self.feature_maps = []
        for step in self.steps:
            self.feature_maps.append([
                ceil(self.image_size[0] / step),
                ceil(self.image_size[1] / step)
            ])
        self.name = 's'

    def forward(self):
        """Return an (num_anchors, 4) tensor of ``[cx, cy, w, h]`` rows.

        All values are expressed as fractions of the image width/height.
        When ``clip`` is set the tensor is clamped to ``[0, 1]`` in place.
        """
        img_h, img_w = self.image_size[0], self.image_size[1]
        flat = []
        for level, fmap in enumerate(self.feature_maps):
            level_sizes = self.min_sizes[level]
            stride = self.steps[level]
            for row, col in product(range(fmap[0]), range(fmap[1])):
                # Anchor centre sits in the middle of the cell.
                cx = (col + 0.5) * stride / img_w
                cy = (row + 0.5) * stride / img_h
                for min_size in level_sizes:
                    flat.extend(
                        (cx, cy, min_size / img_w, min_size / img_h))

        # back to torch land
        output = torch.Tensor(flat).view(-1, 4)
        if self.clip:
            output.clamp_(max=1, min=0)
        return output
def py_cpu_nms(dets, thresh):
    """Greedy non-maximum suppression on CPU with NumPy.

    Args:
        dets: array whose columns 0-3 are ``x1, y1, x2, y2`` and whose
            column 4 is the score.
        thresh: boxes whose overlap ratio with an already kept box exceeds
            this value are discarded.

    Returns:
        list: indices into ``dets`` of the kept boxes, highest score first.
    """
    left, top, right, bottom = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores = dets[:, 4]

    # Box extents are inclusive, hence the +1.
    areas = (right - left + 1) * (bottom - top + 1)
    remaining = scores.argsort()[::-1]

    keep = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        keep.append(best)

        inter_w = np.maximum(
            0.0,
            np.minimum(right[best], right[rest])
            - np.maximum(left[best], left[rest]) + 1)
        inter_h = np.maximum(
            0.0,
            np.minimum(bottom[best], bottom[rest])
            - np.maximum(top[best], top[rest]) + 1)
        inter = inter_w * inter_h
        overlap = inter / (areas[best] + areas[rest] - inter)

        # Only boxes that do not overlap the kept one too much survive.
        remaining = rest[np.where(overlap <= thresh)[0]]

    return keep
| # Adapted from https://github.com/Hakuyume/chainer-ssd | |||||
def decode(loc, priors, variances):
    """Turn regression offsets into corner-form boxes using the priors.

    Args:
        loc (tensor): location predictions, shape [num_priors, 4].
        priors (tensor): prior boxes in centre-size form
            ``[cx, cy, w, h]``, shape [num_priors, 4].
        variances (list[float]): ``variances[0]`` scales the centre offsets,
            ``variances[1]`` scales the log-size offsets.

    Return:
        tensor of shape [num_priors, 4] holding ``[x1, y1, x2, y2]``.
    """
    prior_centers, prior_sizes = priors[:, :2], priors[:, 2:]

    centers = prior_centers + loc[:, :2] * variances[0] * prior_sizes
    sizes = prior_sizes * torch.exp(loc[:, 2:] * variances[1])

    # Centre/size -> top-left / bottom-right corners.
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)
def decode_landm(pre, priors, variances):
    """Turn landmark regression offsets into coordinates using the priors.

    Args:
        pre (tensor): landmark predictions, shape [num_priors, 10]
            (five ``(x, y)`` offset pairs).
        priors (tensor): prior boxes in centre-size form, shape
            [num_priors, 4].
        variances (list[float]): only ``variances[0]`` is used, as the
            offset scale.

    Return:
        tensor of shape [num_priors, 10] with the decoded landmarks.
    """
    prior_centers, prior_sizes = priors[:, :2], priors[:, 2:]
    # Each consecutive column pair is one landmark offset relative to the
    # prior centre, in units of the prior size.
    points = [
        prior_centers + pre[:, start:start + 2] * variances[0] * prior_sizes
        for start in range(0, 10, 2)
    ]
    return torch.cat(points, dim=1)
| @@ -0,0 +1,20 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
if not TYPE_CHECKING:
    # At runtime this module object is replaced in ``sys.modules`` by a
    # LazyImportModule built from the name -> exported-symbols mapping below.
    import sys

    _import_structure = {'fer': ['FacialExpressionRecognition']}

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
else:
    # Static type checkers see the real import.
    from .fer import FacialExpressionRecognition
| @@ -0,0 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .facial_expression_recognition import FacialExpressionRecognition | |||||
| @@ -0,0 +1,72 @@ | |||||
| # The implementation is based on Facial-Expression-Recognition, available at | |||||
| # https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch | |||||
| import os | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.backends.cudnn as cudnn | |||||
| import torch.nn.functional as F | |||||
| from PIL import Image | |||||
| from torch.autograd import Variable | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import Tensor, TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from . import transforms | |||||
| from .vgg import VGG | |||||
@MODELS.register_module(
    Tasks.facial_expression_recognition, module_name=Models.fer)
class FacialExpressionRecognition(TorchModel):
    """VGG19-based facial expression classifier wrapped as a ``TorchModel``.

    ``forward`` converts the input image to a 48x48 grey image, evaluates
    ten 44x44 crops and averages the network outputs over the crops.
    """

    def __init__(self, model_path, device='cuda'):
        """Build the network and load its weights.

        Args:
            model_path (str): path to the checkpoint file; the configuration
                path is derived by replacing ``ModelFile.TORCH_MODEL_FILE``
                with ``ModelFile.CONFIGURATION`` in this string.
            device (str): device the network and inputs are moved to.
        """
        super().__init__(model_path)
        # NOTE: the next two statements change process-wide torch state.
        torch.set_grad_enabled(False)
        cudnn.benchmark = True
        self.model_path = model_path
        self.device = device
        self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE,
                                           ModelFile.CONFIGURATION)
        self.net = VGG('VGG19', cfg_path=self.cfg_path)
        self.load_model()
        self.net = self.net.to(device)
        # Ten 44x44 crops, each converted to a tensor and stacked.
        self.transform_test = transforms.Compose([
            transforms.TenCrop(44),
            transforms.Lambda(lambda crops: torch.stack(
                [transforms.ToTensor()(crop) for crop in crops])),
        ])
        # Not read anywhere in this class; kept for backward compatibility.
        self.mean = np.array([[104, 117, 123]])

    def load_model(self, load_to_cpu=False):
        """Load the ``'net'`` entry of the checkpoint into ``self.net``.

        The checkpoint is always mapped to CPU first; ``load_to_cpu`` is
        accepted for interface compatibility and is not consulted.
        """
        pretrained_dict = torch.load(
            self.model_path, map_location=torch.device('cpu'))
        self.net.load_state_dict(pretrained_dict['net'], strict=True)
        self.net.eval()

    def forward(self, input):
        """Classify the expression shown in ``input['img']``.

        Args:
            input (dict): must contain ``'img'``, a tensor that is moved to
                CPU and converted with ``cv2.COLOR_BGR2GRAY``.
                NOTE(review): presumably an HxWx3 BGR uint8 image -- confirm
                with the caller.

        Returns:
            tuple: ``(score, predicted)`` where ``score`` is the softmax of
            the crop-averaged network output and ``predicted`` is the index
            of its maximum.
        """
        img = input['img']
        img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2GRAY)
        img = cv2.resize(img, (48, 48))
        # Replicate the grey channel three times for the 3-channel network.
        img = img[:, :, np.newaxis]
        img = np.concatenate((img, img, img), axis=2)

        img = Image.fromarray(np.uint8(img))
        inputs = self.transform_test(img)

        ncrops, c, h, w = inputs.shape
        inputs = inputs.view(-1, c, h, w)
        inputs = inputs.to(self.device)

        # ``Variable(..., volatile=True)`` no longer has any effect in
        # current PyTorch; ``torch.no_grad()`` is the supported way to run
        # inference without autograd bookkeeping.
        with torch.no_grad():
            outputs = self.net(inputs)

        outputs_avg = outputs.view(ncrops, -1).mean(0)  # avg over crops

        # ``outputs_avg`` is 1-D, so dim=0 is exactly what the deprecated
        # implicit-dim call resolved to; being explicit avoids the warning.
        score = F.softmax(outputs_avg, dim=0)
        _, predicted = torch.max(outputs_avg.data, 0)

        return score, predicted
| @@ -0,0 +1,118 @@ | |||||
| # The implementation is based on Facial-Expression-Recognition, available at | |||||
| # https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch | |||||
| import numbers | |||||
| import types | |||||
| import numpy as np | |||||
| import torch | |||||
| from PIL import Image | |||||
def to_tensor(pic):
    """Convert a PIL image into a CHW ``torch`` tensor.

    Args:
        pic: PIL image (only ``mode``, ``size`` and either the array
            interface or ``tobytes()`` are used).

    Returns:
        Tensor of shape ``(channels, height, width)``. Byte images are
        returned as float tensors scaled to ``[0, 1]`` (divided by 255);
        ``'I'`` and ``'I;16'`` images keep their integer dtype and values.
    """
    # handle PIL Image
    if pic.mode == 'I':
        # ``np.array(..., copy=False)`` raises under NumPy >= 2 whenever a
        # copy is required, so take a regular (copying) conversion instead.
        img = torch.from_numpy(np.array(pic, np.int32))
    elif pic.mode == 'I;16':
        img = torch.from_numpy(np.array(pic, np.int16))
    else:
        # Raw bytes -> uint8 tensor. ``frombuffer`` gives a read-only view,
        # hence the copy; this replaces the deprecated
        # ``torch.ByteStorage.from_buffer`` typed-storage API.
        raw = np.frombuffer(pic.tobytes(), dtype=np.uint8)
        img = torch.from_numpy(raw.copy())

    # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
    if pic.mode == 'YCbCr':
        nchannel = 3
    elif pic.mode == 'I;16':
        nchannel = 1
    else:
        # For the remaining modes the channel count equals the number of
        # letters in the mode string (e.g. 'L' -> 1, 'RGB' -> 3).
        nchannel = len(pic.mode)

    # ``pic.size`` is (width, height).
    img = img.view(pic.size[1], pic.size[0], nchannel)
    # put it from HWC to CHW format
    img = img.transpose(0, 1).transpose(0, 2).contiguous()
    if isinstance(img, torch.ByteTensor):
        return img.float().div(255)
    return img
def center_crop(img, output_size):
    """Crop the central region of ``img``.

    Args:
        img: image exposing ``size`` as ``(width, height)`` and ``crop``.
        output_size: a single number (square crop) or ``(height, width)``.

    Returns:
        The result of ``img.crop`` on the centred
        ``(left, upper, right, lower)`` box.
    """
    if isinstance(output_size, numbers.Number):
        side = int(output_size)
        output_size = (side, side)
    width, height = img.size
    crop_h, crop_w = output_size
    upper = int(round((height - crop_h) / 2.))
    left = int(round((width - crop_w) / 2.))
    box = (left, upper, left + crop_w, upper + crop_h)
    return img.crop(box)
def five_crop(img, size):
    """Crop the four corners and the centre of ``img``.

    Args:
        img: image exposing ``size`` as ``(width, height)`` and ``crop``.
        size: a single number (square crops) or ``(height, width)``.

    Returns:
        tuple: ``(top_left, top_right, bottom_left, bottom_right, center)``.

    Raises:
        ValueError: if the requested crop is larger than the image.
    """
    if not isinstance(size, numbers.Number):
        assert len(
            size) == 2, 'Please provide only two dimensions (h, w) for size.'
    else:
        size = (int(size), int(size))

    width, height = img.size
    crop_h, crop_w = size
    if crop_w > width or crop_h > height:
        raise ValueError(
            'Requested crop size {} is bigger than input size {}'.format(
                size, (height, width)))

    # Boxes are (left, upper, right, lower).
    corner_boxes = (
        (0, 0, crop_w, crop_h),
        (width - crop_w, 0, width, crop_h),
        (0, height - crop_h, crop_w, height),
        (width - crop_w, height - crop_h, width, height),
    )
    tl, tr, bl, br = (img.crop(box) for box in corner_boxes)
    center = center_crop(img, (crop_h, crop_w))
    return (tl, tr, bl, br, center)
class TenCrop(object):
    """Five crops of the image plus five crops of its flipped version.

    Args:
        size: a single number (square crops) or ``(height, width)``.
        vertical_flip (bool): flip top-bottom instead of left-right for the
            second set of crops.
    """

    def __init__(self, size, vertical_flip=False):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
            assert len(
                size
            ) == 2, 'Please provide only two dimensions (h, w) for size.'
        self.vertical_flip = vertical_flip

    def __call__(self, img):
        """Return a tuple with the ten crops of ``img``."""
        crops = five_crop(img, self.size)

        flip = (Image.FLIP_TOP_BOTTOM
                if self.vertical_flip else Image.FLIP_LEFT_RIGHT)
        flipped = img.transpose(flip)

        return crops + five_crop(flipped, self.size)
class Compose(object):
    """Chain several transforms; each one receives the previous result."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img):
        """Apply every transform in order and return the final result."""
        result = img
        for transform in self.transforms:
            result = transform(result)
        return result
class ToTensor(object):
    """Callable wrapper around :func:`to_tensor` for use in ``Compose``."""

    def __call__(self, pic):
        """Return ``to_tensor(pic)``."""
        tensor = to_tensor(pic)
        return tensor
class Lambda(object):
    """Wrap a user-supplied function so it can be used as a transform."""

    def __init__(self, lambd):
        # ``types.LambdaType`` is the plain function type, so ``def``
        # functions pass this check as well.
        assert isinstance(lambd, types.LambdaType)
        self.lambd = lambd

    def __call__(self, img):
        """Return the wrapped function applied to ``img``."""
        func = self.lambd
        return func(img)
| @@ -0,0 +1,40 @@ | |||||
| # The implementation is based on Facial-Expression-Recognition, available at | |||||
| # https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from torch.autograd import Variable | |||||
| from modelscope.utils.config import Config | |||||
class VGG(nn.Module):
    """VGG-style feature extractor with a 7-way linear classifier.

    The layer layout is read from the ``'models'`` section of the
    configuration file at ``cfg_path``, under the key ``vgg_name``.
    """

    def __init__(self, vgg_name, cfg_path):
        super(VGG, self).__init__()
        model_cfg = Config.from_file(cfg_path)['models']
        self.features = self._make_layers(model_cfg[vgg_name])
        self.classifier = nn.Linear(512, 7)

    def forward(self, x):
        """Return the 7-way logits for the image batch ``x``."""
        feats = self.features(x)
        flat = feats.view(feats.size(0), -1)
        # Dropout is only active while the module is in training mode.
        flat = F.dropout(flat, p=0.5, training=self.training)
        return self.classifier(flat)

    def _make_layers(self, cfg):
        """Build the feature extractor from the layer spec ``cfg``.

        ``'M'`` entries become 2x2 max-pooling; every other entry is taken
        as the output channel count of a 3x3 conv + BatchNorm + ReLU group.
        A 1x1 average pool is appended at the end.
        """
        layers = []
        in_channels = 3
        for spec in cfg:
            if spec == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            layers.append(
                nn.Conv2d(in_channels, spec, kernel_size=3, padding=1))
            layers.append(nn.BatchNorm2d(spec))
            layers.append(nn.ReLU(inplace=True))
            in_channels = spec
        layers.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*layers)
| @@ -0,0 +1,20 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
if not TYPE_CHECKING:
    # At runtime this module object is replaced in ``sys.modules`` by a
    # LazyImportModule built from the name -> exported-symbols mapping below.
    import sys

    _import_structure = {'shop_seg_base': ['SHOPSEG']}

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
else:
    # Static type checkers see the real import.
    from .shop_seg_base import SHOPSEG
| @@ -0,0 +1,59 @@ | |||||
| """ | |||||
| Base modules are adapted from https://github.com/open-mmlab/mmcv/, | |||||
| originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, | |||||
| https://github.com/open-mmlab/mmsegmentation/, | |||||
| originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, | |||||
| and adapted from https://github.com/raoyongming/DenseCLIP/, | |||||
| originally MIT License, Copyright (c) 2022 Rao, Yongming. | |||||
| """ | |||||
| import warnings | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
def resize(input,
           size=None,
           scale_factor=None,
           mode='nearest',
           align_corners=None,
           warning=True):
    """Thin wrapper around ``F.interpolate`` with an alignment warning.

    When ``warning`` is set, ``size`` is given and ``align_corners`` is
    truthy, a warning is emitted if the output is larger than the input in
    at least one dimension, all four sizes exceed 1, and neither
    ``output - 1`` is a multiple of ``input - 1`` (for height and width).

    Returns:
        The result of ``F.interpolate(input, size, scale_factor, mode,
        align_corners)``.
    """
    if warning and size is not None and align_corners:
        in_h, in_w = tuple(int(d) for d in input.shape[2:])
        out_h, out_w = tuple(int(d) for d in size)
        enlarges = out_h > in_h or out_w > in_w
        # The size > 1 checks also protect the modulo below from a zero
        # divisor.
        if (enlarges and out_h > 1 and out_w > 1 and in_h > 1 and in_w > 1
                and (out_h - 1) % (in_h - 1) and (out_w - 1) % (in_w - 1)):
            warnings.warn(
                f'When align_corners={align_corners}, '
                'the output would more aligned if '
                f'input size {(in_h, in_w)} is `x+1` and '
                f'out size {(out_h, out_w)} is `nx+1`')
    return F.interpolate(input, size, scale_factor, mode, align_corners)
class Upsample(nn.Module):
    """Module form of :func:`resize` with a fixed size or scale factor.

    Args:
        size: target spatial size; when truthy it takes precedence over
            ``scale_factor``.
        scale_factor: a single number applied to both spatial dimensions, or
            a tuple with one factor per spatial dimension (height, width).
        mode (str): interpolation mode passed to :func:`resize`.
        align_corners: passed through to :func:`resize`.
    """

    def __init__(self,
                 size=None,
                 scale_factor=None,
                 mode='nearest',
                 align_corners=None):
        super(Upsample, self).__init__()
        self.size = size
        if isinstance(scale_factor, tuple):
            self.scale_factor = tuple(float(factor) for factor in scale_factor)
        else:
            self.scale_factor = float(scale_factor) if scale_factor else None
        self.mode = mode
        self.align_corners = align_corners

    def _target_size(self, x):
        """Return the output spatial size for the input tensor ``x``."""
        if self.size:
            return self.size
        spatial = x.shape[-2:]
        if isinstance(self.scale_factor, tuple):
            # The constructor accepts per-axis factors; multiplying a
            # dimension by the tuple itself (as the code used to do) raised
            # a TypeError, so pair each dimension with its own factor.
            return [
                int(dim * factor)
                for dim, factor in zip(spatial, self.scale_factor)
            ]
        return [int(dim * self.scale_factor) for dim in spatial]

    def forward(self, x):
        """Resize ``x`` to the configured size / scale."""
        return resize(x, self._target_size(x), None, self.mode,
                      self.align_corners)
| @@ -0,0 +1,122 @@ | |||||
| """ FPNHead | |||||
| Base modules are adapted from https://github.com/open-mmlab/mmcv/, | |||||
| originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, | |||||
| https://github.com/open-mmlab/mmsegmentation/, | |||||
| originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, | |||||
| and adapted from https://github.com/raoyongming/DenseCLIP/, | |||||
| originally MIT License, Copyright (c) 2022 Rao, Yongming. | |||||
| """ | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from mmcv.cnn import ConvModule | |||||
| from timm.models.layers import drop, drop_path, trunc_normal_ | |||||
| from .common import Upsample, resize | |||||
class FPNHead(nn.Module):
    """Panoptic Feature Pyramid Networks.

    This head is the implementation of `Semantic FPN
    <https://arxiv.org/abs/1901.02446>`_.

    Every input level goes through its own stack of 3x3 ``ConvModule``
    layers; for levels coarser than the first one, each conv is followed by
    a x2 bilinear ``Upsample``. The per-level results are resized to the
    spatial size of the first level, summed, and classified per pixel with
    a 1x1 convolution.

    Args:
        channels (int): Channel count of every conv in the scale heads and
            input width of the classifier conv.
        num_classes (int): Number of output channels of the classifier.
        dropout_ratio (float): Probability of the ``nn.Dropout2d`` applied
            before the classifier. Values ``<= 0`` disable dropout.
        feature_strides (Sequence[int]): The strides for input feature maps.
            All strides are supposed to be powers of 2. The first one is of
            the largest resolution (i.e. the smallest stride).
        align_corners (bool): Forwarded to ``Upsample`` and ``resize``.
        **kwargs: Accepted for config compatibility and ignored.
    """

    def __init__(self,
                 channels,
                 num_classes,
                 dropout_ratio=0.1,
                 feature_strides=(4, 8, 16, 32),
                 align_corners=False,
                 **kwargs):
        super(FPNHead, self).__init__()
        self.act_cfg = dict(type='ReLU')
        self.channels = channels
        self.conv_cfg = None
        self.norm_cfg = dict(type='BN2d', requires_grad=True)
        self.align_corners = align_corners
        self.dropout_ratio = dropout_ratio

        self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
        if dropout_ratio > 0:
            self.dropout = nn.Dropout2d(dropout_ratio)
        else:
            self.dropout = None

        # forward() consumes the first four entries of its input list.
        self.in_index = [0, 1, 2, 3]
        assert min(feature_strides) == feature_strides[0]
        self.feature_strides = feature_strides

        base_stride = feature_strides[0]
        self.scale_heads = nn.ModuleList()
        for stride in feature_strides:
            # One conv(+upsample) stage per factor of two between this level
            # and the finest level; at least one conv for every level.
            head_length = max(
                1, int(np.log2(stride) - np.log2(base_stride)))
            scale_head = []
            for _ in range(head_length):
                scale_head.append(
                    ConvModule(
                        self.channels,
                        self.channels,
                        3,
                        padding=1,
                        conv_cfg=self.conv_cfg,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg))
                if stride != base_stride:
                    scale_head.append(
                        Upsample(
                            scale_factor=2,
                            mode='bilinear',
                            align_corners=self.align_corners))
            self.scale_heads.append(nn.Sequential(*scale_head))

        self.apply(self._init_weights)

    def _transform_inputs(self, inputs):
        """Transform inputs for decoder.

        Args:
            inputs (list[Tensor]): List of multi-level img features.

        Returns:
            list[Tensor]: The entries of ``inputs`` selected by
            ``self.in_index``, in that order.
        """
        return [inputs[i] for i in self.in_index]

    def cls_seg(self, feat):
        """Classify each pixel (optional dropout, then the 1x1 conv)."""
        if self.dropout is not None:
            feat = self.dropout(feat)
        return self.conv_seg(feat)

    def forward(self, inputs):
        """Fuse the selected feature levels and return per-pixel scores.

        Args:
            inputs (list[Tensor]): Multi-level features; the entries named
                by ``self.in_index`` are used.

        Returns:
            Tensor: Output of the classifier conv at the spatial size of
            the first scale head's output.
        """
        x = self._transform_inputs(inputs)
        output = self.scale_heads[0](x[0])
        for i in range(1, len(self.feature_strides)):
            # non inplace
            output = output + resize(
                self.scale_heads[i](x[i]),
                size=output.shape[2:],
                mode='bilinear',
                align_corners=self.align_corners)
        return self.cls_seg(output)

    def _init_weights(self, m):
        """Initialise one sub-module; used as the ``self.apply`` callback."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias.data, 0)
| @@ -0,0 +1,901 @@ | |||||
| """ | |||||
| Base modules are adapted from https://github.com/open-mmlab/mmcv/, | |||||
| originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, | |||||
| https://github.com/open-mmlab/mmsegmentation/, | |||||
| originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, | |||||
| and adapted from https://github.com/raoyongming/DenseCLIP/, | |||||
| originally MIT License, Copyright (c) 2022 Rao, Yongming. | |||||
| """ | |||||
| import math | |||||
| from collections import OrderedDict | |||||
| import torch | |||||
| import torch.nn.functional as F | |||||
| import torch.utils.checkpoint as checkpoint | |||||
| from timm.models.layers import drop, drop_path, trunc_normal_ | |||||
| from torch import nn | |||||
class Bottleneck(nn.Module):
    """Bottleneck residual block whose striding is done by average pooling.

    The main path is 1x1 conv -> 3x3 conv -> (avgpool when ``stride > 1``)
    -> 1x1 conv expanding to ``planes * expansion`` channels, each conv
    followed by batch norm. When the stride is larger than 1 or the input
    width differs from the output width, the shortcut is an avgpool followed
    by a 1x1 conv and batch norm.

    Args:
        inplanes (int): Number of input channels.
        planes (int): Width of the two inner convolutions.
        stride (int): Spatial reduction factor of the block.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        expanded = planes * self.expansion

        # all conv layers have stride 1. an avgpool is performed after the
        # second convolution when stride > 1
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        if stride > 1:
            self.avgpool = nn.AvgPool2d(stride)
        else:
            self.avgpool = nn.Identity()

        self.conv3 = nn.Conv2d(planes, expanded, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(expanded)

        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

        shortcut = None
        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # downsampling layer is prepended with an avgpool, and the
            # subsequent convolution has stride 1
            shortcut = nn.Sequential(
                OrderedDict([
                    ('-1', nn.AvgPool2d(stride)),
                    ('0',
                     nn.Conv2d(
                         inplanes, expanded, 1, stride=1, bias=False)),
                    ('1', nn.BatchNorm2d(expanded)),
                ]))
        self.downsample = shortcut

    def forward(self, x: torch.Tensor):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.conv1(x)
        out = self.relu(self.bn1(out))
        out = self.conv2(out)
        out = self.relu(self.bn2(out))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        residual = x
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        return self.relu(out)
class AttentionPool2d(nn.Module):
    """Multi-head attention pooling over a 2-D feature map.

    The map is flattened to a token sequence, its mean token is prepended,
    a positional embedding (bilinearly interpolated to the input grid) is
    added, and one multi-head self-attention pass is run with separate
    q/k/v projections and ``c_proj`` as output projection.

    Args:
        spacial_dim (int): Side length of the grid the positional embedding
            is stored for.
        embed_dim (int): Channel count of the input feature map.
        num_heads (int): Number of attention heads.
        output_dim (int, optional): Output channels; falls back to
            ``embed_dim`` when falsy.
    """

    def __init__(self,
                 spacial_dim: int,
                 embed_dim: int,
                 num_heads: int,
                 output_dim: int = None):
        super().__init__()
        num_positions = spacial_dim**2 + 1  # grid cells + the mean token
        self.positional_embedding = nn.Parameter(
            torch.randn(num_positions, embed_dim) / embed_dim**0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.spacial_dim = spacial_dim

    def forward(self, x):
        """Pool ``x`` of shape ``(B, C, H, W)``.

        Returns:
            tuple: ``(global_feat, feature_map)`` where ``global_feat`` is
            the attended mean token and ``feature_map`` holds the attended
            spatial tokens reshaped to ``(B, -1, H, W)``.
        """
        B, C, H, W = x.shape
        # NCHW -> (HW)NC
        tokens = x.reshape(B, C, H * W).permute(2, 0, 1)
        mean_token = tokens.mean(dim=0, keepdim=True)
        tokens = torch.cat([mean_token, tokens], dim=0)  # (HW+1)NC

        cls_pos = self.positional_embedding[0:1, :]
        stored_grid = self.positional_embedding[1:, ].reshape(
            1, self.spacial_dim, self.spacial_dim, self.embed_dim)
        spatial_pos = F.interpolate(
            stored_grid.permute(0, 3, 1, 2), size=(H, W), mode='bilinear')
        spatial_pos = spatial_pos.reshape(self.embed_dim, H * W).permute(1, 0)
        positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0)

        tokens = tokens + positional_embedding[:, None, :]
        qkv_bias = torch.cat(
            [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
        attended, _ = F.multi_head_attention_forward(
            query=tokens,
            key=tokens,
            value=tokens,
            embed_dim_to_check=tokens.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=qkv_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False)

        attended = attended.permute(1, 2, 0)
        global_feat = attended[:, :, 0]
        feature_map = attended[:, :, 1:].reshape(B, -1, H, W)
        return global_feat, feature_map
class CLIPResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1

    ``forward`` returns the outputs of the four residual stages; this class
    builds no final pooling layer.

    Args:
        layers (Sequence[int]): Number of ``Bottleneck`` blocks per stage
            (four entries are read).
        output_dim (int): Stored on the instance; not used by this class.
        input_resolution (int): Stored on the instance; not used by this
            class.
        width (int): Channel width of the stem output / first stage.
        pretrained (str, optional): Default checkpoint path for
            ``init_weights``.
        **kwargs: Accepted and ignored.
    """

    def __init__(self,
                 layers,
                 output_dim=512,
                 input_resolution=224,
                 width=64,
                 pretrained=None,
                 **kwargs):
        super().__init__()
        self.pretrained = pretrained
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        half_width = width // 2
        self.conv1 = nn.Conv2d(
            3, half_width, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(half_width)
        self.conv2 = nn.Conv2d(
            half_width, half_width, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(half_width)
        self.conv3 = nn.Conv2d(
            half_width, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(2)
        self.relu = nn.ReLU(inplace=True)

        # residual layers
        # this is a *mutable* variable used during construction
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

    def init_weights(self, pretrained=None):
        """Load the ``visual.*`` weights of a TorchScript checkpoint.

        Args:
            pretrained (str, optional): Checkpoint path; falls back to
                ``self.pretrained``. Nothing happens unless the resolved
                value is a string.
        """
        pretrained = pretrained or self.pretrained
        if not isinstance(pretrained, str):
            return

        scripted = torch.jit.load(pretrained, map_location='cpu')
        loaded = scripted.float().state_dict()
        state_dict = {
            name.replace('visual.', ''): weight
            for name, weight in loaded.items()
            if name.startswith('visual.')
        }

        # Non-strict load; report whatever did not line up.
        u, w = self.load_state_dict(state_dict, False)
        print(u, w, 'are misaligned params in CLIPResNet')

    def _make_layer(self, planes, blocks, stride=1):
        """Build one stage of ``blocks`` bottlenecks; only the first strides."""
        stage = [Bottleneck(self._inplanes, planes, stride)]
        self._inplanes = planes * Bottleneck.expansion
        stage.extend(
            Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return a tuple with the outputs of ``layer1`` .. ``layer4``."""
        x = x.type(self.conv1.weight.dtype)

        # stem: three conv-bn-relu units followed by the average pool
        stem_units = ((self.conv1, self.bn1), (self.conv2, self.bn2),
                      (self.conv3, self.bn3))
        for conv, bn in stem_units:
            x = self.relu(bn(conv(x)))
        x = self.avgpool(x)

        outs = []
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
            outs.append(x)
        return tuple(outs)
class CLIPResNetWithAttention(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool

    Args:
        layers (Sequence[int]): Number of ``Bottleneck`` blocks per stage
            (four entries are read).
        output_dim (int): Output channels of the attention pool.
        input_resolution (int): Image side length; ``input_resolution // 32``
            is the grid size of the attention pool's positional embedding.
        width (int): Channel width of the stem output / first stage.
        pretrained (str, optional): Default checkpoint path for
            ``init_weights``.
        **kwargs: Accepted and ignored.
    """

    def __init__(self,
                 layers,
                 output_dim=1024,
                 input_resolution=224,
                 width=64,
                 pretrained=None,
                 **kwargs):
        super().__init__()
        self.pretrained = pretrained
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        half_width = width // 2
        self.conv1 = nn.Conv2d(
            3, half_width, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(half_width)
        self.conv2 = nn.Conv2d(
            half_width, half_width, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(half_width)
        self.conv3 = nn.Conv2d(
            half_width, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(2)
        self.relu = nn.ReLU(inplace=True)

        # residual layers
        # this is a *mutable* variable used during construction
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32,
                                        output_dim)

    def _fit_pos_embed(self, weight):
        """Return ``weight`` resized to the attention pool's embedding shape.

        A tensor that already has the right shape is returned unchanged.
        Otherwise the first row is kept as is and the remaining rows are
        treated as a square grid and bilinearly interpolated to
        ``input_resolution // 32`` per side.
        """
        target = self.attnpool.positional_embedding
        if target.shape == weight.shape:
            return weight

        print(f'Resize the pos_embed shape from {weight.shape}'
              f' to {target.shape}')
        cls_pos = weight[0:1, :]
        grid_pos = weight[1:, ]
        channels = cls_pos.shape[1]
        H = W = self.input_resolution // 32
        old_h = int(math.sqrt(grid_pos.shape[0]))
        spatial_pos = F.interpolate(
            grid_pos.reshape(1, old_h, old_h, channels).permute(0, 3, 1, 2),
            size=(H, W),
            mode='bilinear')
        spatial_pos = spatial_pos.reshape(channels, H * W).permute(1, 0)
        resized = torch.cat([cls_pos, spatial_pos], dim=0)
        assert target.shape == resized.shape
        return resized

    def init_weights(self, pretrained=None):
        """Load the ``visual.*`` weights of a TorchScript checkpoint.

        Entries whose name contains ``positional_embedding`` are resized to
        this model's attention-pool grid when their shape differs.

        Args:
            pretrained (str, optional): Checkpoint path; falls back to
                ``self.pretrained``. Nothing happens unless the resolved
                value is a string.
        """
        pretrained = pretrained or self.pretrained
        if not isinstance(pretrained, str):
            return

        scripted = torch.jit.load(pretrained, map_location='cpu')
        loaded = scripted.float().state_dict()

        state_dict = {}
        for name, weight in loaded.items():
            if not name.startswith('visual.'):
                continue
            new_name = name.replace('visual.', '')
            if 'positional_embedding' in new_name:
                weight = self._fit_pos_embed(weight)
            state_dict[new_name] = weight

        # Non-strict load; report whatever did not line up.
        u, w = self.load_state_dict(state_dict, False)
        print(u, w, 'are misaligned params in CLIPResNet')

    def _make_layer(self, planes, blocks, stride=1):
        """Build one stage of ``blocks`` bottlenecks; only the first strides."""
        stage = [Bottleneck(self._inplanes, planes, stride)]
        self._inplanes = planes * Bottleneck.expansion
        stage.extend(
            Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return the four stage outputs plus the attention-pool result.

        Returns:
            tuple: ``(layer1, layer2, layer3, layer4, [x_global, x_local])``
            where the last entry is a list holding both outputs of
            ``self.attnpool`` applied to the ``layer4`` features.
        """
        x = x.type(self.conv1.weight.dtype)

        # stem: three conv-bn-relu units followed by the average pool
        stem_units = ((self.conv1, self.bn1), (self.conv2, self.bn2),
                      (self.conv3, self.bn3))
        for conv, bn in stem_units:
            x = self.relu(bn(conv(x)))
        x = self.avgpool(x)

        outs = []
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
            outs.append(x)

        x_global, x_local = self.attnpool(x)
        outs.append([x_global, x_local])
        return tuple(outs)
class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16.

    The input is converted to float32 for the normalisation and the result
    is converted back to the input's original dtype.
    """

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        normed = super().forward(x.to(torch.float32))
        return normed.to(input_dtype)
class QuickGELU(nn.Module):
    """Sigmoid-gated activation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Args:
        drop_prob (float, optional): Probability handed to ``drop_path``.
            ``None`` (the default) is treated as ``0.`` when the layer is
            called.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        # Stored as given so that extra_repr() keeps reporting the raw value.
        self.drop_prob = drop_prob

    def forward(self, x):
        # ``drop_path`` is expected to do arithmetic on the probability when
        # training, so never hand it the ``None`` default.
        drop_prob = 0. if self.drop_prob is None else self.drop_prob
        return drop_path(x, drop_prob, self.training)

    def extra_repr(self) -> str:
        return 'p={}'.format(self.drop_prob)
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention and MLP, both residual.

    Args:
        d_model (int): Token width.
        n_head (int): Number of attention heads.
        attn_mask (Tensor, optional): Mask passed to ``nn.MultiheadAttention``
            on every call.
        drop_path (float): Drop-path probability for both residual branches;
            ``nn.Identity`` is used when it is not greater than 0.
    """

    def __init__(self,
                 d_model: int,
                 n_head: int,
                 attn_mask: torch.Tensor = None,
                 drop_path=0.):
        super().__init__()

        hidden = d_model * 4
        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(
            OrderedDict([
                ('c_fc', nn.Linear(d_model, hidden)),
                ('gelu', QuickGELU()),
                ('c_proj', nn.Linear(hidden, d_model)),
            ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def attention(self, x: torch.Tensor):
        """Self-attend ``x``; the stored mask is moved to ``x``'s dtype/device."""
        mask = self.attn_mask
        if mask is not None:
            mask = mask.to(dtype=x.dtype, device=x.device)
        # The converted mask replaces the stored one, as before.
        self.attn_mask = mask
        attended = self.attn(x, x, x, need_weights=False, attn_mask=mask)
        return attended[0]

    def forward(self, x: torch.Tensor):
        attn_branch = self.attention(self.ln_1(x))
        x = x + self.drop_path(attn_branch)
        mlp_branch = self.mlp(self.ln_2(x))
        x = x + self.drop_path(mlp_branch)
        return x
class Transformer(nn.Module):
    """Sequential stack of ``ResidualAttentionBlock`` modules.

    Args:
        width (int): Token width of every block.
        layers (int): Number of blocks.
        heads (int): Attention heads per block.
        attn_mask (Tensor, optional): Mask handed to every block.
        drop_path_rate (float): Drop-path probability of the last block;
            the per-block values are spaced linearly from 0 to this value.
    """

    def __init__(self,
                 width: int,
                 layers: int,
                 heads: int,
                 attn_mask: torch.Tensor = None,
                 drop_path_rate=0.):
        super().__init__()
        self.width = width
        self.layers = layers

        # stochastic depth decay rule
        rates = torch.linspace(0, drop_path_rate, layers).tolist()
        blocks = [
            ResidualAttentionBlock(width, heads, attn_mask, rate)
            for rate in rates
        ]
        self.resblocks = nn.Sequential(*blocks)

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)
class Attention(nn.Module):
    """Multi-head scaled dot-product attention with separate q/k/v inputs.

    Args:
        dim (int): Token width of queries, keys and values.
        num_heads (int): Number of heads; each head gets ``dim // num_heads``
            channels.
        qkv_bias (bool): Whether the q/k/v projections have a bias.
        qk_scale (float, optional): Logit scale; falls back to
            ``head_dim ** -0.5`` when falsy.
        attn_drop (float): Dropout probability on the attention weights.
        proj_drop (float): Dropout probability on the projected output.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim**-0.5

        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, q, k, v):
        """Attend ``q`` over ``k``/``v``.

        Args:
            q (Tensor): Queries of shape ``(B, N, C)``.
            k (Tensor): Keys of shape ``(B, M, C)``.
            v (Tensor): Values; must have the same shape as ``k``.

        Returns:
            Tensor: Output of shape ``(B, N, C)``.
        """
        B, N, C = q.shape
        assert k.shape == v.shape
        B, M, C = k.shape
        head_dim = C // self.num_heads

        q = self.q_proj(q).reshape(B, N, self.num_heads, head_dim)
        k = self.k_proj(k).reshape(B, M, self.num_heads, head_dim)
        v = self.v_proj(v).reshape(B, M, self.num_heads, head_dim)

        # (B, heads, N, M) attention logits
        attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale
        attn = attn.softmax(dim=-1)
        # Previously self.attn_drop was built but never applied, so the
        # ``attn_drop`` argument had no effect. It is a no-op for the default
        # probability of 0 and in eval mode.
        attn = self.attn_drop(attn)

        x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class TransformerDecoderLayer(nn.Module):
    """Pre-norm decoder layer: self-attention, cross-attention, then MLP.

    Args:
        d_model (int): Token width.
        nhead (int): Number of heads of both attention modules.
        dropout (float): Dropout used for the attention output projections,
            inside the MLP and on the MLP output.
    """

    def __init__(
        self,
        d_model,
        nhead,
        dropout=0.1,
    ):
        super().__init__()
        self.self_attn = Attention(d_model, nhead, proj_drop=dropout)
        self.cross_attn = Attention(d_model, nhead, proj_drop=dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        hidden = d_model * 4
        self.mlp = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, d_model),
        )

    def forward(self, x, mem):
        """Update ``x`` by attending to itself and then to ``mem``.

        ``mem`` is used unnormalised as both key and value of the
        cross-attention.
        """
        normed = self.norm1(x)
        x = x + self.self_attn(normed, normed, normed)

        query = self.norm2(x)
        x = x + self.cross_attn(query, mem, mem)

        x = x + self.dropout(self.mlp(self.norm3(x)))
        return x
class CLIPVisionTransformer(nn.Module):
    """Vision transformer that returns intermediate feature maps.

    The image is patchified with a strided conv, a class token and an
    interpolated positional embedding are added, and the token sequence is
    run through ``Transformer``. The spatial tokens after the blocks listed
    in ``out_indices`` are reshaped to feature maps and passed through
    ``fpn1`` .. ``fpn4`` respectively.

    Note:
        ``fpn1`` .. ``fpn4`` are only created for ``patch_size`` 16 or 8,
        while ``forward`` reads all four unconditionally. With any other
        patch size (including the default 32) the module can be built but
        ``forward`` fails when it looks them up.

    Args:
        input_resolution (int): Image side length the positional embedding
            is stored for.
        patch_size (int): Kernel size and stride of the patch conv.
        width (int): Token width.
        layers (int): Number of transformer blocks.
        heads (int): Attention heads per block.
        output_dim (int): Width of the projected embeddings (only used with
            ``get_embeddings``).
        drop_path_rate (float): Maximum drop-path probability.
        out_indices (Sequence[int]): Block indices whose outputs are
            returned; at most four are supported by ``forward``.
        pretrained (str, optional): Default checkpoint path for
            ``init_weights``.
        get_embeddings (bool): Also return the projected global and
            per-patch embeddings.
        **kwargs: Accepted and ignored.
    """

    def __init__(self,
                 input_resolution=224,
                 patch_size=32,
                 width=768,
                 layers=12,
                 heads=12,
                 output_dim=512,
                 drop_path_rate=0.0,
                 out_indices=(3, 5, 7, 11),
                 pretrained=None,
                 get_embeddings=False,
                 **kwargs):
        super().__init__()
        self.pretrained = pretrained
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False)

        scale = width**-0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(
            (input_resolution // patch_size)**2 + 1, width))
        self.spatial_size = input_resolution // patch_size
        self.ln_pre = LayerNorm(width)
        self.get_embeddings = get_embeddings

        self.transformer = Transformer(
            width, layers, heads, drop_path_rate=drop_path_rate)

        self.out_indices = out_indices

        if get_embeddings:
            self.ln_post = LayerNorm(width)
            self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

        embed_dim = width

        if patch_size == 16:
            # x4 up, x2 up, same scale, x2 down
            self.fpn1 = nn.Sequential(
                nn.GroupNorm(1, embed_dim),
                nn.ConvTranspose2d(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
                nn.SyncBatchNorm(embed_dim),
                nn.GELU(),
                nn.ConvTranspose2d(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(
                nn.GroupNorm(1, embed_dim),
                nn.ConvTranspose2d(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn3 = nn.GroupNorm(1, embed_dim)

            self.fpn4 = nn.Sequential(
                nn.GroupNorm(1, embed_dim),
                nn.MaxPool2d(kernel_size=2, stride=2))

        elif patch_size == 8:
            # x2 up, same scale, x2 down, x4 down
            self.fpn1 = nn.Sequential(
                nn.GroupNorm(1, embed_dim),
                nn.ConvTranspose2d(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.GroupNorm(1, embed_dim)

            self.fpn3 = nn.Sequential(
                nn.GroupNorm(1, embed_dim),
                nn.MaxPool2d(kernel_size=2, stride=2),
            )

            self.fpn4 = nn.Sequential(
                nn.GroupNorm(1, embed_dim),
                nn.MaxPool2d(kernel_size=4, stride=4),
            )

    def init_weights(self, pretrained=None):
        """Load the ``visual.*`` weights of a TorchScript checkpoint.

        When the checkpoint's ``positional_embedding`` has a different shape
        than this model's, its first row is kept and the remaining rows are
        treated as a square grid and bilinearly interpolated to
        ``spatial_size`` per side. The grid side and channel count are read
        from the checkpoint tensor itself.

        Args:
            pretrained (str, optional): Checkpoint path; falls back to
                ``self.pretrained``. Nothing happens unless the resolved
                value is a string.
        """
        pretrained = pretrained or self.pretrained
        if not isinstance(pretrained, str):
            return

        loaded = torch.jit.load(
            pretrained, map_location='cpu').float().state_dict()

        state_dict = {}
        for k in loaded.keys():
            if k.startswith('visual.'):
                new_k = k.replace('visual.', '')
                state_dict[new_k] = loaded[k]

        if 'positional_embedding' in state_dict.keys():
            old_pos = state_dict['positional_embedding']
            if self.positional_embedding.shape != old_pos.shape:
                print(
                    f'Resize the pos_embed shape from {old_pos.shape} to'
                    f' {self.positional_embedding.shape}')
                cls_pos = old_pos[0:1, :]
                grid_pos = old_pos[1:, ]
                # Derive the stored grid instead of assuming 14 x 14 x 768.
                old_size = int(math.sqrt(grid_pos.shape[0]))
                channels = old_pos.shape[1]
                spatial_pos = F.interpolate(
                    grid_pos.reshape(1, old_size, old_size,
                                     channels).permute(0, 3, 1, 2),
                    size=(self.spatial_size, self.spatial_size),
                    mode='bilinear')
                spatial_pos = spatial_pos.reshape(
                    channels,
                    self.spatial_size * self.spatial_size).permute(1, 0)
                positional_embedding = torch.cat([cls_pos, spatial_pos],
                                                 dim=0)
                state_dict['positional_embedding'] = positional_embedding
                assert self.positional_embedding.shape == state_dict[
                    'positional_embedding'].shape

        # Non-strict load; report whatever did not line up.
        u, w = self.load_state_dict(state_dict, False)
        print(u, w, 'are misaligned params in vision transformer')

    def forward(self, x: torch.Tensor):
        """Return the selected feature maps (and embeddings if enabled).

        Returns:
            tuple: One feature map per hit in ``out_indices``, each passed
            through its ``fpn`` module. With ``get_embeddings`` a final
            ``[global_embedding, visual_embedding]`` list is appended.
        """
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        B, C, H, W = x.shape
        x = x.reshape(x.shape[0], x.shape[1],
                      -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

        # Prepend one class token per sample (broadcast via the zeros).
        x1 = self.class_embedding.to(x.dtype)
        x2 = torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([x1 + x2, x], dim=1)

        # Interpolate the stored positional grid to the actual token grid.
        pos = self.positional_embedding.to(x.dtype)
        cls_pos = pos[0, :] + self.class_embedding.to(x.dtype)
        spatial_pos = F.interpolate(
            pos[1:, ].reshape(1, self.spatial_size, self.spatial_size,
                              C).permute(0, 3, 1, 2),
            size=(H, W),
            mode='bilinear')
        spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1)
        pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)

        x = x + pos
        x = self.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND

        # Hard-wired switch for activation checkpointing of the blocks.
        gradientcheckpoint = False

        features = []
        for i, blk in enumerate(self.transformer.resblocks):
            if gradientcheckpoint:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)

            if i in self.out_indices:
                # Drop the class token and fold tokens back to (B, C, H, W).
                xp = x.permute(1, 0, 2)[:, 1:, :].permute(0, 2, 1).reshape(
                    B, -1, H, W)
                features.append(xp.contiguous())

        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        features = [ops[i](feat) for i, feat in enumerate(features)]

        if self.get_embeddings:
            x = x.permute(1, 0, 2)
            x = self.ln_post(x)
            x = x @ self.proj

            global_embedding = x[:, 0]
            visual_embedding = x[:, 1:].reshape(B, H, W,
                                                -1).permute(0, 3, 1,
                                                            2)  # B C H W

            features.append([global_embedding, visual_embedding])

        return tuple(features)
class CLIPTextEncoder(nn.Module):
    """Text transformer that returns one projected feature per sequence.

    ``positional_embedding`` and ``text_projection`` are allocated with
    ``torch.empty`` and only receive values through ``init_weights`` (or
    other external loading).

    Args:
        context_length (int): Sequence length; also the side of the
            attention mask.
        vocab_size (int): Size of the token embedding table.
        transformer_width (int): Token width.
        transformer_heads (int): Attention heads per block.
        transformer_layers (int): Number of transformer blocks.
        embed_dim (int): Output width of the text projection.
        out_dim (int): Accepted but not used by this class.
        pretrained (str, optional): Default checkpoint path for
            ``init_weights``.
        **kwargs: Accepted and ignored.
    """

    def __init__(self,
                 context_length=77,
                 vocab_size=49408,
                 transformer_width=512,
                 transformer_heads=8,
                 transformer_layers=12,
                 embed_dim=1024,
                 out_dim=256,
                 pretrained=None,
                 **kwargs):
        super().__init__()

        self.pretrained = pretrained
        self.context_length = context_length

        causal_mask = self.build_attention_mask()
        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=causal_mask)

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(
            torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)
        self.text_projection = nn.Parameter(
            torch.empty(transformer_width, embed_dim))

    def init_weights(self, pretrained=None):
        """Load the text-side weights of a TorchScript checkpoint.

        Copies every ``transformer.*`` entry plus ``positional_embedding``,
        ``text_projection``, ``token_embedding*`` and ``ln_final*``. A
        positional embedding longer than ``context_length`` is cut to its
        first ``context_length`` rows.

        Args:
            pretrained (str, optional): Checkpoint path; falls back to
                ``self.pretrained``. Nothing happens unless the resolved
                value is a string.
        """
        pretrained = pretrained or self.pretrained
        if not isinstance(pretrained, str):
            return

        scripted = torch.jit.load(pretrained, map_location='cpu')
        loaded = scripted.float().state_dict()

        state_dict = {}
        for name, weight in loaded.items():
            if name.startswith('transformer.'):
                state_dict[name] = weight

            is_text_param = (
                name in ('positional_embedding', 'text_projection')
                or name.startswith(('token_embedding', 'ln_final')))
            if not is_text_param:
                continue

            too_long = (
                name == 'positional_embedding'
                and weight.size(0) > self.context_length)
            if too_long:
                weight = weight[:self.context_length]
                print('positional_embedding is tuncated from 77 to',
                      self.context_length)
            state_dict[name] = weight

        # Non-strict load; report whatever did not line up.
        u, w = self.load_state_dict(state_dict, False)
        print(u, w, 'are misaligned params in text encoder')

    def build_attention_mask(self):
        """Return a ``context_length`` square additive causal mask.

        Entries strictly above the diagonal are ``-inf``; the diagonal and
        everything below it are zero.
        """
        # pytorch uses additive attention mask; fill with -inf
        side = self.context_length
        mask = torch.full((side, side), float('-inf'))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def forward(self, text):
        """Encode token ids ``text`` of shape ``(batch, context_length)``.

        Returns:
            Tensor: ``(batch, embed_dim)`` features read at the position of
            the largest token id of each row.
        """
        x = self.token_embedding(text) + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # presumably the end-of-text token has the largest id - TODO confirm
        rows = torch.arange(x.shape[0])
        picked = x[rows, text.argmax(dim=-1), ...]
        return picked @ self.text_projection
class CLIPTextContextEncoder(nn.Module):
    """Text transformer that splices context vectors into each prompt.

    ``positional_embedding`` and ``text_projection`` are allocated with
    ``torch.empty`` and only receive values through ``init_weights`` (or
    other external loading).

    Args:
        context_length (int): Total sequence length seen by the
            transformer (text tokens plus context tokens); also the side
            of the attention mask.
        vocab_size (int): Size of the token embedding table.
        transformer_width (int): Token width.
        transformer_heads (int): Attention heads per block.
        transformer_layers (int): Number of transformer blocks.
        embed_dim (int): Output width of the text projection.
        out_dim (int): Accepted but not used by this class.
        pretrained (str, optional): Default checkpoint path for
            ``init_weights``.
        **kwargs: Accepted and ignored.
    """

    def __init__(self,
                 context_length=22,
                 vocab_size=49408,
                 transformer_width=512,
                 transformer_heads=8,
                 transformer_layers=12,
                 embed_dim=1024,
                 out_dim=256,
                 pretrained=None,
                 **kwargs):
        super().__init__()

        self.pretrained = pretrained
        self.context_length = context_length

        causal_mask = self.build_attention_mask()
        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=causal_mask)

        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(
            torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)
        self.text_projection = nn.Parameter(
            torch.empty(transformer_width, embed_dim))

    def init_weights(self, pretrained=None):
        """Load the text-side weights of a TorchScript checkpoint.

        Copies every ``transformer.*`` entry plus ``positional_embedding``,
        ``text_projection``, ``token_embedding*`` and ``ln_final*``. A
        positional embedding longer than ``context_length`` is cut to its
        first ``context_length`` rows.

        Args:
            pretrained (str, optional): Checkpoint path; falls back to
                ``self.pretrained``. Nothing happens unless the resolved
                value is a string.
        """
        pretrained = pretrained or self.pretrained
        if not isinstance(pretrained, str):
            return

        scripted = torch.jit.load(pretrained, map_location='cpu')
        loaded = scripted.float().state_dict()

        state_dict = {}
        for name, weight in loaded.items():
            if name.startswith('transformer.'):
                state_dict[name] = weight

            is_text_param = (
                name in ('positional_embedding', 'text_projection')
                or name.startswith(('token_embedding', 'ln_final')))
            if not is_text_param:
                continue

            too_long = (
                name == 'positional_embedding'
                and weight.size(0) > self.context_length)
            if too_long:
                weight = weight[:self.context_length]
                print('positional_embedding is tuncated from 77 to',
                      self.context_length)
            state_dict[name] = weight

        # Non-strict load; report whatever did not line up.
        u, w = self.load_state_dict(state_dict, False)
        print(u, w, 'are misaligned params in text encoder')

    def build_attention_mask(self):
        """Return a ``context_length`` square additive causal mask.

        Entries strictly above the diagonal are ``-inf``; the diagonal and
        everything below it are zero.
        """
        # pytorch uses additive attention mask; fill with -inf
        side = self.context_length
        mask = torch.full((side, side), float('-inf'))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def forward(self, text, context=None):
        """Encode every prompt in ``text`` together with every context.

        Args:
            text (Tensor): Token ids of shape ``(K, N1)``.
            context (Tensor): Context vectors of shape ``(B, N2, C)``.
                Despite the ``None`` default a tensor is required; its
                shape is read unconditionally.

        Returns:
            Tensor: Features of shape ``(B, K, embed_dim)``.
        """
        tokens = self.token_embedding(text)  # n_class, n_text, C
        # NOTE(review): original note suggested e.g. 150 classes x 5(?) x 512
        # for the text embeddings and 1 x 8 x 512 for context - confirm.
        K, N1, C = tokens.shape
        B, N2, C = context.shape

        # The context is inserted after the first text token, so the picked
        # position of every prompt moves back by N2.
        eos_indx = text.argmax(dim=-1) + N2
        eos_indx = eos_indx.reshape(1, K).expand(B, K).reshape(-1)

        tokens = tokens.reshape(1, K, N1, C).expand(B, K, N1, C)
        context = context.reshape(B, 1, N2, C).expand(B, K, N2, C)

        first_token = tokens[:, :, 0:1]
        remaining = tokens[:, :, 1:]
        x = torch.cat([first_token, context, remaining], dim=2)
        x = x.reshape(B * K, N1 + N2, C)

        x = x + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        rows = torch.arange(x.shape[0])
        x = x[rows, eos_indx] @ self.text_projection
        return x.reshape(B, K, self.embed_dim)
class ContextDecoder(nn.Module):
    """Stack of decoder layers that updates text features from visual ones.

    Both inputs are expected to have ``visual_dim`` channels on their last
    axis; they are projected to ``transformer_width``, passed through the
    decoder layers, and projected back to ``visual_dim``.

    Args:
        transformer_width (int): Hidden width used inside the decoder.
        transformer_heads (int): Second argument given to each
            ``TransformerDecoderLayer``.
        transformer_layers (int): Number of decoder layers.
        visual_dim (int): Channel size of the inputs and of the output.
        dropout (float): Third argument given to each decoder layer.
        **kwargs: Accepted and ignored.
    """

    def __init__(self,
                 transformer_width=256,
                 transformer_heads=4,
                 transformer_layers=6,
                 visual_dim=1024,
                 dropout=0.1,
                 **kwargs):
        super().__init__()

        # Visual memory: norm -> linear -> norm.
        memory_layers = [
            nn.LayerNorm(visual_dim),
            nn.Linear(visual_dim, transformer_width),
            nn.LayerNorm(transformer_width),
        ]
        self.memory_proj = nn.Sequential(*memory_layers)

        # Text queries: norm -> linear.
        text_layers = [
            nn.LayerNorm(visual_dim),
            nn.Linear(visual_dim, transformer_width),
        ]
        self.text_proj = nn.Sequential(*text_layers)

        decoder_layers = []
        for _ in range(transformer_layers):
            decoder_layers.append(
                TransformerDecoderLayer(transformer_width, transformer_heads,
                                        dropout))
        self.decoder = nn.ModuleList(decoder_layers)

        self.out_proj = nn.Sequential(
            nn.LayerNorm(transformer_width),
            nn.Linear(transformer_width, visual_dim))

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise ``nn.Linear`` and ``nn.LayerNorm`` submodules."""
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if not isinstance(m, nn.Linear):
            return
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, text, visual):
        """Return the decoded text features projected back to ``visual_dim``.

        Args:
            text: Text features whose last axis has ``visual_dim`` channels.
            visual: Three-dimensional visual features.
        """
        # The unpack is unused otherwise but enforces a 3-D ``visual``.
        _batch, _tokens, _channels = visual.shape
        memory = self.memory_proj(visual)
        hidden = self.text_proj(text)
        for block in self.decoder:
            hidden = block(hidden, memory)
        return self.out_proj(hidden)
| @@ -0,0 +1,217 @@ | |||||
| """ FPNneck | |||||
| Base modules are adapted from https://github.com/open-mmlab/mmcv/, | |||||
| originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, | |||||
| https://github.com/open-mmlab/mmsegmentation/, | |||||
| originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, | |||||
| and adapted from https://github.com/raoyongming/DenseCLIP/, | |||||
| originally MIT License, Copyright (c) 2022 Rao, Yongming. | |||||
| """ | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from mmcv.cnn import ConvModule | |||||
| from timm.models.layers import drop, drop_path, trunc_normal_ | |||||
| from .common import resize | |||||
class FPN(nn.Module):
    """Feature Pyramid Network.

    This neck is the implementation of `Feature Pyramid Networks for Object
    Detection <https://arxiv.org/abs/1612.03144>`_.

    Args:
        in_channels (list[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
        num_outs (int): Number of output scales.
        start_level (int): Index of the start input backbone level used to
            build the feature pyramid. Default: 0.
        end_level (int): Index of the end input backbone level (exclusive) to
            build the feature pyramid. Default: -1, which means the last level.
        add_extra_convs (bool | str): If bool, it decides whether to add conv
            layers on top of the original feature maps. Default: False.
            If True, its actual mode is specified by `extra_convs_on_inputs`.
            If str, it specifies the source feature map of the extra convs.
            Only the following options are allowed

            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
            - 'on_lateral': Last feature map after lateral convs.
            - 'on_output': The last output feature map after fpn convs.
        extra_convs_on_inputs (bool, deprecated): Only consulted when
            `add_extra_convs` is True. If True, it is equivalent to
            `add_extra_convs='on_input'`. If False, it is equivalent to
            `add_extra_convs='on_output'`. Default: False.
        relu_before_extra_convs (bool): Whether to apply relu before the extra
            conv. Default: False.
        no_norm_on_lateral (bool): If True, the lateral convs are built
            without a norm layer. Default: False.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (dict): Config dict for activation layer in ConvModule.
            Default: None.
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: dict(mode='nearest').
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_outs,
                 start_level=0,
                 end_level=-1,
                 add_extra_convs=False,
                 extra_convs_on_inputs=False,
                 relu_before_extra_convs=False,
                 no_norm_on_lateral=False,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=None,
                 upsample_cfg=dict(mode='nearest')):
        super(FPN, self).__init__()
        assert isinstance(in_channels, list)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_ins = len(in_channels)
        self.num_outs = num_outs
        self.relu_before_extra_convs = relu_before_extra_convs
        self.no_norm_on_lateral = no_norm_on_lateral
        self.fp16_enabled = False
        # Copy so the shared default dict is never mutated through self.
        self.upsample_cfg = upsample_cfg.copy()

        if end_level == -1:
            self.backbone_end_level = self.num_ins
            assert num_outs >= self.num_ins - start_level
        else:
            # if end_level < inputs, no extra level is allowed
            self.backbone_end_level = end_level
            assert end_level <= len(in_channels)
            assert num_outs == end_level - start_level
        self.start_level = start_level
        self.end_level = end_level
        self.add_extra_convs = add_extra_convs
        assert isinstance(add_extra_convs, (str, bool))
        if isinstance(add_extra_convs, str):
            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
        elif add_extra_convs:  # True
            if extra_convs_on_inputs:
                # For compatibility with previous release
                # TODO: deprecate `extra_convs_on_inputs`
                self.add_extra_convs = 'on_input'
            else:
                self.add_extra_convs = 'on_output'

        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()

        # One 1x1 lateral conv and one 3x3 output conv per used input level.
        for i in range(self.start_level, self.backbone_end_level):
            l_conv = ConvModule(
                in_channels[i],
                out_channels,
                1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
                act_cfg=act_cfg,
                inplace=False)
            fpn_conv = ConvModule(
                out_channels,
                out_channels,
                3,
                padding=1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
                inplace=False)

            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        # add extra conv layers (e.g., RetinaNet)
        extra_levels = num_outs - self.backbone_end_level + self.start_level
        if self.add_extra_convs and extra_levels >= 1:
            for i in range(extra_levels):
                # NOTE: from here on the local name `in_channels` is rebound
                # to a single int; the original list stays in
                # self.in_channels.
                if i == 0 and self.add_extra_convs == 'on_input':
                    in_channels = self.in_channels[self.backbone_end_level - 1]
                else:
                    in_channels = out_channels
                extra_fpn_conv = ConvModule(
                    in_channels,
                    out_channels,
                    3,
                    stride=2,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False)
                self.fpn_convs.append(extra_fpn_conv)

        self.apply(self._init_weights)

    def forward(self, inputs):
        """Build the pyramid from ``inputs`` and return it as a tuple.

        Args:
            inputs: Sequence of feature maps, one per entry of
                ``in_channels``.

        Returns:
            tuple: Output feature maps; extra levels are appended when
            ``num_outs`` exceeds the number of lateral levels.
        """
        assert len(inputs) == len(self.in_channels)

        # build laterals
        laterals = [
            lateral_conv(inputs[i + self.start_level])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
            #  it cannot co-exist with `size` in `F.interpolate`.
            if 'scale_factor' in self.upsample_cfg:
                laterals[i - 1] = laterals[i - 1] + resize(
                    laterals[i], **self.upsample_cfg)
            else:
                prev_shape = laterals[i - 1].shape[2:]
                laterals[i - 1] = laterals[i - 1] + resize(
                    laterals[i], size=prev_shape, **self.upsample_cfg)

        # build outputs
        # part 1: from original levels
        outs = [
            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
        ]
        # part 2: add extra levels
        if self.num_outs > len(outs):
            # use max pool to get more levels on top of outputs
            # (e.g., Faster R-CNN, Mask R-CNN)
            if not self.add_extra_convs:
                for i in range(self.num_outs - used_backbone_levels):
                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
            # add conv layers on top of original feature maps (RetinaNet)
            else:
                if self.add_extra_convs == 'on_input':
                    extra_source = inputs[self.backbone_end_level - 1]
                elif self.add_extra_convs == 'on_lateral':
                    extra_source = laterals[-1]
                elif self.add_extra_convs == 'on_output':
                    extra_source = outs[-1]
                else:
                    raise NotImplementedError
                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
                # Remaining extra levels are chained on the previous output.
                for i in range(used_backbone_levels + 1, self.num_outs):
                    if self.relu_before_extra_convs:
                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
                    else:
                        outs.append(self.fpn_convs[i](outs[-1]))
        return tuple(outs)

    def _init_weights(self, m):
        """Initialise Linear, LayerNorm and Conv2d submodules (via apply)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias.data, 0)
| @@ -0,0 +1,157 @@ | |||||
| """ | |||||
| Base modules are adapted from https://github.com/open-mmlab/mmcv/, | |||||
| originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, | |||||
| https://github.com/open-mmlab/mmsegmentation/, | |||||
| originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, | |||||
| and adapted from https://github.com/raoyongming/DenseCLIP/, | |||||
| originally MIT License, Copyright (c) 2022 Rao, Yongming. | |||||
| """ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from .head_fpn import FPNHead | |||||
| from .models import (CLIPTextContextEncoder, CLIPVisionTransformer, | |||||
| ContextDecoder) | |||||
| from .neck_fpn import FPN | |||||
| from .utils import SimpleTokenizer, tokenize | |||||
class SHOPSEG(nn.Module):
    """Text-prompted shop segmentation network.

    Combines a CLIP vision backbone, a CLIP text encoder with learnable
    context tokens, a context decoder that adjusts text embeddings from
    visual features, an FPN neck and an FPN head with two output classes.

    Args:
        model_dir (str): Directory holding ``bpe_simple_vocab_16e6.txt.gz``.
        context_length (int): Number of token positions used for each
            prompt. The learnable context gets the remaining positions of
            the text encoder.
        context_feature (str): Only ``'attention'`` is implemented.
        score_concat_index (int): Index of the backbone feature map that the
            score map is concatenated to.
        tau (float): Stored as ``self.tau``.
        token_embed_dim (int): Channel size of the learnable context tokens.
        text_dim (int): Channel size of the text embeddings (size of
            ``gamma``).
        **args: Accepted and ignored.
    """

    def __init__(self,
                 model_dir,
                 context_length=22,
                 context_feature='attention',
                 score_concat_index=2,
                 tau=0.07,
                 token_embed_dim=512,
                 text_dim=512,
                 **args):
        super(SHOPSEG, self).__init__()

        self.model_dir = model_dir
        self.tokenizer = SimpleTokenizer(model_dir
                                         + '/bpe_simple_vocab_16e6.txt.gz')

        backbone = CLIPVisionTransformer(
            input_resolution=1024,
            patch_size=16,
            width=768,
            layers=12,
            output_dim=512,
            drop_path_rate=0.1,
            pretrained=False,
            get_embeddings=True)

        text_encoder = CLIPTextContextEncoder(
            context_length=30,
            vocab_size=49408,
            transformer_width=512,
            transformer_heads=8,
            transformer_layers=12,
            embed_dim=512,
            pretrained=False)

        context_decoder = ContextDecoder(
            transformer_width=256,
            transformer_heads=4,
            transformer_layers=3,
            visual_dim=512,
            dropout=0.1)

        # The third input carries two extra channels: the score map that
        # after_extract_feat concatenates at score_concat_index.
        neck = FPN(
            in_channels=[768, 768, 768 + 2, 768], out_channels=256, num_outs=4)
        head_fpd = FPNHead(channels=256, num_classes=2)

        self.backbone = backbone
        self.text_encoder = text_encoder
        self.context_decoder = context_decoder
        self.context_length = context_length
        self.score_concat_index = score_concat_index

        self.context_feature = context_feature
        # Previously overwritten by a hard-coded 0.07 at the end of
        # __init__, which silently discarded the caller's value.
        self.tau = tau

        # Positions left for learnable context after the prompt tokens.
        num_context_tokens = (
            self.text_encoder.context_length - self.context_length)
        self.contexts = nn.Parameter(
            torch.randn(1, num_context_tokens, token_embed_dim))
        nn.init.trunc_normal_(self.contexts)
        self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4)

        self.neck = neck
        self.head_fpn = head_fpd

    def encode_text(self, text, context_length):
        """Tokenize ``text`` to ``context_length`` ids, truncating if needed."""
        output = tokenize(self.tokenizer, text, context_length, True)
        return output

    def extract_feat(self, img):
        """Extract features from images."""
        x = self.backbone(img)
        return x

    def after_extract_feat(self, x, name_list):
        """Fuse text prompts with backbone features.

        Args:
            x: Backbone output. ``x[0:4]`` are feature maps and ``x[4]`` is
                a ``(global_feat, visual_embeddings)`` pair with
                ``visual_embeddings`` of shape ``(B, C, H, W)``.
            name_list: Prompt strings. Sample ``i`` uses the embeddings at
                positions ``2 * i`` and ``2 * i + 1``.

        Returns:
            tuple: ``(x_orig, score_map)`` where ``x_orig`` is the list of
            four feature maps with ``score_map`` concatenated on the channel
            axis at ``score_concat_index``.

        Raises:
            ValueError: If ``context_feature`` is not ``'attention'``.
        """
        if self.context_feature != 'attention':
            # Without this guard the code failed later on an unbound local.
            raise ValueError(
                'unsupported context_feature: {!r}'.format(
                    self.context_feature))

        x_orig = list(x[0:4])
        global_feat, visual_embeddings = x[4]
        B, C, H, W = visual_embeddings.shape

        # Global feature first, then one token per spatial location.
        global_token = global_feat.reshape(B, C, 1)
        spatial_tokens = visual_embeddings.reshape(B, C, H * W)
        visual_context = torch.cat([global_token, spatial_tokens],
                                   dim=2).permute(0, 2, 1)

        texts = torch.cat([
            self.encode_text(c, context_length=self.context_length)
            for c in name_list
        ])
        texts = texts.to(global_feat.device)
        text_embeddings = self.text_encoder(texts, self.contexts)
        text_embeddings = text_embeddings.expand(B, -1, -1)

        # Update the text embeddings from the visual context; gamma scales
        # the correction.
        text_diff = self.context_decoder(text_embeddings, visual_context)
        text_embeddings = text_embeddings + self.gamma * text_diff

        # compute score map and concat
        visual_embeddings = F.normalize(visual_embeddings, dim=1, p=2)
        text = F.normalize(text_embeddings, dim=2, p=2)

        # Each sample is scored only against its own pair of prompts.
        score_map_list = [
            torch.einsum('bchw,bkc->bkhw', visual_embeddings[i:i + 1],
                         text[i:i + 1, 2 * i:2 * i + 2]) for i in range(B)
        ]
        score_map = torch.cat(score_map_list, dim=0)  # b * 2 * h * w

        x_orig[self.score_concat_index] = torch.cat(
            [x_orig[self.score_concat_index], score_map], dim=1)
        return x_orig, score_map

    def forward(self, img, text_list=None):
        """Return the head prediction for ``img``.

        Args:
            img: Image batch passed to the backbone.
            text_list: One prompt per image. Defaults to the same built-in
                prompt for every image.
        """
        if text_list is None:
            bsz = img.size()[0]
            # NOTE(review): the spelling is kept byte-for-byte because this
            # string is tokenized and fed to the model; changing it changes
            # the model input.
            text_list = ['foregeound'] * bsz
        x = self.extract_feat(img)

        # Every prompt is paired with an 'others' prompt placed before it;
        # prompts are cut to their first 20 characters.
        name_list = []
        for name in text_list:
            name_list.append('others')
            name_list.append(name[0:20])

        x_orig, score_map = self.after_extract_feat(x, name_list)
        x_orig = list(self.neck(x_orig))
        pred = self.head_fpn(x_orig)
        return pred
| @@ -0,0 +1,115 @@ | |||||
| import os.path as osp | |||||
| from typing import Any, Dict | |||||
| import json | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from PIL import Image | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.models.cv.shop_segmentation import SHOPSEG | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.preprocessors import LoadImage | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
# Module-level logger used by ShopSegmentation to report device selection.
logger = get_logger()

# Only the registered model wrapper is part of the public API.
__all__ = ['ShopSegmentation']
@MODELS.register_module(
    Tasks.shop_segmentation, module_name=Models.shop_segmentation)
class ShopSegmentation(TorchModel):
    """Shop segmentation model.

    Wraps ``SHOPSEG`` with weight loading, image pre-processing and mask
    post-processing.
    """

    def __init__(self, model_dir, device_id=0, *args, **kwargs):
        """Build the network and load its weights from ``model_dir``.

        Args:
            model_dir (str): Directory containing the checkpoint and the
                files ``SHOPSEG`` needs.
            device_id (int): CUDA device index. CPU is used when it is
                negative or CUDA is unavailable.
        """
        super().__init__(
            model_dir=model_dir, device_id=device_id, *args, **kwargs)

        self.model = SHOPSEG(model_dir=model_dir)
        # Load onto CPU first so checkpoints saved from a GPU also work on
        # CPU-only hosts; the model is moved to the GPU below when possible.
        pretrained_params = torch.load(
            osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
            map_location='cpu')

        self.model.load_state_dict(pretrained_params)
        self.model.eval()
        self.device_id = device_id
        if self.device_id >= 0 and torch.cuda.is_available():
            self.model.to('cuda:{}'.format(self.device_id))
            logger.info('Use GPU: {}'.format(self.device_id))
        else:
            self.device_id = -1
            logger.info('Use CPU for inference')

    def preprocess(self, img, size=1024):
        """Resize, normalise and pad an image to a ``size`` x ``size`` tensor.

        Args:
            img: Array of shape ``(height, width, 3)`` accepted by
                ``Image.fromarray``.
            size (int): Side length of the padded square output.

        Returns:
            tuple: ``(tensor, h, w, crop_h, crop_w)`` where ``tensor`` has
            shape ``(1, 3, size, size)``, ``h``/``w`` are the original
            dimensions and ``crop_h``/``crop_w`` the resized dimensions that
            occupy the top-left corner of the padded tensor.
        """
        mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
        std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

        h, w, _ = img.shape
        ratio = 1.0 * size / max(h, w)
        # Clamp to one pixel so extreme aspect ratios cannot produce a
        # zero-sized resize target.
        crop_h = max(1, int(ratio * h))
        crop_w = max(1, int(ratio * w))

        pil_img = Image.fromarray(img)
        pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
        np_img = np.array(pil_img, dtype=np.float32) / 255.
        # Per-channel normalisation, broadcast over the last axis.
        np_img = (np_img - mean) / std

        img_pad = np.zeros((size, size, 3), dtype=np.float32)
        img_pad[:crop_h, :crop_w] = np_img

        img_pad = torch.from_numpy(img_pad).permute(2, 0,
                                                    1).unsqueeze(0).float()
        return img_pad, h, w, crop_h, crop_w

    def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
        """Turn a padded prediction into a binary mask of the original size.

        Args:
            tensors: 2-D array; values are scaled by 255 and clipped to
                ``[0, 255]``.
            crop_h, crop_w: Region of ``tensors`` that holds real image
                content (as returned by ``preprocess``).
            ori_h, ori_w: Size of the returned mask.

        Returns:
            ``np.uint8`` array of shape ``(ori_h, ori_w)`` containing only
            0 and 255.
        """
        output = np.clip(tensors * 255., a_min=0, a_max=255.)
        crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)

        pil_output = Image.fromarray(crop_output)
        pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
        np_output = np.array(pil_output, dtype=np.uint8)
        # Bilinear resizing introduces intermediate values; re-binarise.
        return np.where(np_output >= 128, 255, 0).astype(np.uint8)

    def forward(self, image):
        """
        image should be numpy array, dtype=np.uint8, shape: height*width*3
        """
        image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
            image, size=1024)
        pred = self.inference(image_tensor)
        # postprocess takes no ``size`` argument; passing one raised
        # TypeError on every call.
        msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)

        outputs = {OutputKeys.MASKS: msk}
        return outputs

    def inference(self, image):
        """
        image should be tensor, 1 * 3 * 1024 * 1024

        Returns the per-pixel argmax class of the first batch element as a
        numpy array of shape (1024, 1024).
        """
        with torch.no_grad():
            if self.device_id != -1:
                image = image.to(torch.device('cuda', self.device_id))
            output = self.model(image)
            output = F.interpolate(output, size=(1024, 1024), mode='bilinear')
            output = F.softmax(output, dim=1)
            output = torch.argmax(output, dim=1)
            output = output[0]
        return output.detach().cpu().numpy()
| @@ -0,0 +1,199 @@ | |||||
| """ CLIP Tokenizer | |||||
| Adapted from https://github.com/openai/CLIP. | |||||
| Originally MIT License, Copyright (c) 2021 OpenAI. | |||||
| """ | |||||
| import gzip | |||||
| import html | |||||
| import os | |||||
| from functools import lru_cache | |||||
| from typing import Any, List, Union | |||||
| import ftfy | |||||
| import regex as re | |||||
| import torch | |||||
@lru_cache()
def default_bpe():
    """Return the path of the BPE vocabulary file next to this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, 'bpe_simple_vocab_16e6.txt.gz')
@lru_cache()
def bytes_to_unicode():
    """Map every byte value (0-255) to a single unicode character.

    Bytes in the three ranges listed below map to the character with the
    same code point. Every other byte is mapped, in increasing order, to
    code points starting at 256, so no byte maps to a character inside the
    0-255 range that is not one of the listed ones.

    Returns:
        dict[int, str]: 256 entries; the listed ranges come first in
        iteration order, followed by the remapped bytes.
    """
    printable = [
        *range(ord('!'), ord('~') + 1),
        *range(ord('¡'), ord('¬') + 1),
        *range(ord('®'), ord('ÿ') + 1),
    ]
    table = {byte: chr(byte) for byte in printable}
    remapped = [byte for byte in range(2**8) if byte not in table]
    for offset, byte in enumerate(remapped):
        table[byte] = chr(2**8 + offset)
    return table
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word: Sequence of symbols (symbols being variable-length strings),
            typically a tuple. Must support slicing.

    Returns:
        set: ``{(word[i], word[i + 1])}`` for every valid ``i``. Empty for
        words with fewer than two symbols, including an empty word, which
        previously raised ``IndexError``.
    """
    return set(zip(word, word[1:]))
def basic_clean(text):
    """Run ``ftfy.fix_text``, unescape HTML entities twice, then strip."""
    repaired = ftfy.fix_text(text)
    # Two passes so doubly-escaped entities are decoded as well.
    unescaped_once = html.unescape(repaired)
    unescaped_twice = html.unescape(unescaped_once)
    return unescaped_twice.strip()
def whitespace_clean(text):
    """Collapse each whitespace run to one space and strip both ends."""
    return re.sub(r'\s+', ' ', text).strip()
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer driven by a gzip-compressed merges file.

    The vocabulary is built from the 256 byte symbols, the same symbols with
    an end-of-word marker ``</w>``, one entry per merge rule, and the two
    special tokens ``<|startoftext|>`` and ``<|endoftext|>`` (added last).
    """

    def __init__(self, bpe_path: str = default_bpe()):
        """Read merge rules from ``bpe_path`` and build the lookup tables.

        Args:
            bpe_path: Path to the gzip-compressed, UTF-8 encoded merges
                file. Its first line is skipped.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Context manager so the file handle is closed deterministically.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode('utf-8').split('\n')
        merges = merges[1:49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + '</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        # Lower rank means the merge is applied earlier.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {
            '<|startoftext|>': '<|startoftext|>',
            '<|endoftext|>': '<|endoftext|>'
        }
        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE)

    def bpe(self, token):
        """Apply the merge rules to ``token``.

        Args:
            token: Non-empty string of byte symbols.

        Returns:
            str: The resulting BPE symbols joined by single spaces. Results
            are memoised in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + '</w>', )
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            # Pick the adjacent pair with the best (lowest) merge rank.
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # ``first`` does not occur again: copy the tail as-is.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lowercase ``text`` and return its list of token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Work on byte symbols so any UTF-8 input is representable.
            token = ''.join(self.byte_encoder[b]
                            for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token]
                              for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Convert token ids back to text; ``</w>`` becomes a space."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors='replace').replace('</w>', ' ')
        return text
def tokenize(tokenizer,
             texts,
             context_length: int = 77,
             truncate: bool = False) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    tokenizer
        Object exposing an ``encoder`` mapping with the start/end special
        tokens and an ``encode(text)`` method returning a list of ids
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        Number of columns of the result; shorter sequences are zero-padded
    truncate: bool
        Whether to truncate the text in case its encoding is longer than the
        context length. A truncated row always ends with the end token

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens,
    shape = [number of input strings, context_length]

    Raises
    ------
    RuntimeError
        If an encoding exceeds ``context_length`` and ``truncate`` is False
    """
    batch = [texts] if isinstance(texts, str) else texts

    start_id = tokenizer.encoder['<|startoftext|>']
    end_id = tokenizer.encoder['<|endoftext|>']
    encoded = [[start_id] + tokenizer.encode(text) + [end_id]
               for text in batch]

    result = torch.zeros(len(encoded), context_length, dtype=torch.long)
    for row, ids in enumerate(encoded):
        too_long = len(ids) > context_length
        if too_long and not truncate:
            raise RuntimeError(
                f'Input {batch[row]} is too long for context length {context_length}'
            )
        if too_long:
            ids = ids[:context_length]
            ids[-1] = end_id
        result[row, :len(ids)] = torch.tensor(ids)

    return result
| @@ -0,0 +1 @@ | |||||
| from .lseg_base import TextDrivenSegmentation | |||||
| @@ -0,0 +1,170 @@ | |||||
| """ CLIP | |||||
| Adapted from https://github.com/openai/CLIP. | |||||
| Originally MIT License, Copyright (c) 2021 OpenAI. | |||||
| """ | |||||
| import hashlib | |||||
| import os | |||||
| import urllib | |||||
| import warnings | |||||
| from typing import Any, List, Union | |||||
| import torch | |||||
| from PIL import Image | |||||
| from pkg_resources import packaging | |||||
| from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, | |||||
| ToTensor) | |||||
| from tqdm import tqdm | |||||
| from .model import build_model | |||||
| from .simple_tokenizer import SimpleTokenizer as _Tokenizer | |||||
# Prefer torchvision's InterpolationMode enum; when that name cannot be
# imported, fall back to the PIL constant.
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC

# Only warn (do not fail) when running on PyTorch older than 1.7.1.
if packaging.version.parse(
        torch.__version__) < packaging.version.parse('1.7.1'):
    warnings.warn('PyTorch version 1.7.1 or higher is recommended')

# Public API of this module.
__all__ = ['load', 'tokenize']
| def _convert_image_to_rgb(image): | |||||
| return image.convert('RGB') | |||||
def _transform(n_px):
    """Build the image preprocessing pipeline for an ``n_px`` input size.

    Steps, in order: bicubic resize to ``n_px``, centre crop to ``n_px``,
    RGB conversion, tensor conversion, per-channel normalisation.
    """
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)
    steps = [
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize(channel_mean, channel_std),
    ]
    return Compose(steps)
def load(name: str,
         device: Union[str, torch.device] = 'cuda'
         if torch.cuda.is_available() else 'cpu',
         jit: bool = False,
         root: str = None):
    """Build the CLIP model and its matching image transform.

    Args:
        name: Unused; kept for signature compatibility.
        device: Device the model is moved to. Defaults to ``'cuda'`` when
            CUDA is available, otherwise ``'cpu'``.
        jit: Must be False. This adaptation never loads a TorchScript
            archive, so the JIT path had no model to patch and always
            failed with ``UnboundLocalError``.
        root: Unused; kept for signature compatibility.

    Returns:
        tuple: ``(model, transform)`` where ``transform`` is built for
        ``model.visual.input_resolution``.

    Raises:
        NotImplementedError: If ``jit`` is True.
    """
    if jit:
        raise NotImplementedError(
            'jit=True is not supported: no TorchScript archive is loaded '
            'by this function, use jit=False')

    model = build_model().to(device)
    if str(device) == 'cpu':
        # Make sure all weights are float32 when running on CPU.
        model.float()
    return model, _transform(model.visual.input_resolution)
def tokenize(
        _tokenizer,
        texts: Union[str, List[str]],
        context_length: int = 77,
        truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    _tokenizer
        Object exposing an ``encoder`` mapping with the start/end special
        tokens and an ``encode(text)`` method returning a list of ids
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        Number of columns of the result; shorter sequences are zero-padded
    truncate: bool
        Whether to truncate the text in case its encoding is longer than the
        context length. A truncated row always ends with the end token

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens,
    shape = [number of input strings, context_length].
    We return LongTensor when torch version is <1.8.0, since older
    index_select requires indices to be long.

    Raises
    ------
    RuntimeError
        If an encoding exceeds ``context_length`` and ``truncate`` is False
    """
    batch = [texts] if isinstance(texts, str) else texts

    start_id = _tokenizer.encoder['<|startoftext|>']
    end_id = _tokenizer.encoder['<|endoftext|>']
    encoded = [[start_id] + _tokenizer.encode(text) + [end_id]
               for text in batch]

    needs_long = packaging.version.parse(
        torch.__version__) < packaging.version.parse('1.8.0')
    index_dtype = torch.long if needs_long else torch.int
    result = torch.zeros(len(encoded), context_length, dtype=index_dtype)

    for row, ids in enumerate(encoded):
        too_long = len(ids) > context_length
        if too_long and not truncate:
            raise RuntimeError(
                f'Input {batch[row]} is too long for context length {context_length}'
            )
        if too_long:
            ids = ids[:context_length]
            ids[-1] = end_id
        result[row, :len(ids)] = torch.tensor(ids)

    return result
| @@ -0,0 +1,28 @@ | |||||
| """ | |||||
| Adapted from https://github.com/isl-org/lang-seg. | |||||
| Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. | |||||
| """ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from .lseg_net import LSeg | |||||
class TextDrivenSegmentation(nn.Module):
    """Per-sample wrapper that scores each image against one text label."""

    def __init__(self, model_dir):
        super().__init__()
        self.net = LSeg(model_dir=model_dir)
        self.model_dir = model_dir

    def forward(self, img, txt_list):
        """Run the network once per batch element and stack the results.

        Args:
            img: batched image tensor; element ``i`` is passed on as the
                one-element slice ``img[i:i + 1]``.
            txt_list: sequence indexed like the batch; entry ``i`` is paired
                with the fixed label ``'others'`` to form the label set.

        Returns:
            The per-sample network outputs concatenated along dim 0.
        """
        per_sample = [
            self.net(img[idx:idx + 1], labelset=['others', txt_list[idx]])
            for idx in range(img.size()[0])
        ]
        return torch.cat(per_sample, dim=0)
| @@ -0,0 +1,334 @@ | |||||
| """ | |||||
| Adapted from https://github.com/isl-org/lang-seg. | |||||
| Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. | |||||
| """ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from .lseg_vit import _make_pretrained_clip_vitl16_384, forward_vit | |||||
def _make_encoder(
    backbone,
    features,
    use_pretrained=True,
    groups=1,
    expand=False,
    exportable=True,
    hooks=None,
    use_vit_only=False,
    use_readout='ignore',
    enable_attention_hooks=False,
):
    """Build the CLIP model, the hooked ViT backbone and the scratch convs.

    Only ``backbone == 'clip_vitl16_384'`` is supported. ``exportable`` and
    ``use_vit_only`` are accepted but not used by this function.

    Returns:
        tuple: ``(clip_pretrained, pretrained, scratch)``.

    Raises:
        NotImplementedError: for any other backbone name.
    """
    if backbone != 'clip_vitl16_384':
        raise NotImplementedError(f"Backbone '{backbone}' not implemented")

    clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384(
        use_pretrained,
        hooks=hooks,
        use_readout=use_readout,
        enable_attention_hooks=enable_attention_hooks,
    )
    # Input channel counts of the four feature maps fed to the scratch convs.
    stage_channels = [256, 512, 1024, 1024]
    scratch = _make_scratch(
        stage_channels, features, groups=groups, expand=expand)
    return clip_pretrained, pretrained, scratch
def _make_scratch(in_shape, out_shape, groups=1, expand=False):
    """Create a bare module holding four 3x3, bias-free projection convs.

    Args:
        in_shape: sequence of four input channel counts, one per stage.
        out_shape (int): base number of output channels.
        groups (int): ``groups`` argument forwarded to every conv.
        expand: only the literal ``True`` widens stages 2-4 to 2x, 4x and 8x
            ``out_shape``; any other value keeps all stages at ``out_shape``.

    Returns:
        nn.Module: with attributes ``layer1_rn`` ... ``layer4_rn``.
    """
    multipliers = (1, 2, 4, 8) if expand is True else (1, 1, 1, 1)

    scratch = nn.Module()
    for stage, factor in enumerate(multipliers):
        projection = nn.Conv2d(
            in_shape[stage],
            out_shape * factor,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
            groups=groups,
        )
        setattr(scratch, f'layer{stage + 1}_rn', projection)
    return scratch
class Interpolate(nn.Module):
    """Module wrapper around ``nn.functional.interpolate``."""

    def __init__(self, scale_factor, mode, align_corners=False):
        """Store the resampling configuration.

        Args:
            scale_factor (float): spatial scaling factor
            mode (str): interpolation mode
            align_corners (bool): forwarded to the interpolation call
        """
        super().__init__()
        self.interp = nn.functional.interpolate
        self.scale_factor = scale_factor
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        """Resample ``x`` with the stored configuration.

        Args:
            x (tensor): input

        Returns:
            tensor: interpolated data
        """
        options = {
            'scale_factor': self.scale_factor,
            'mode': self.mode,
            'align_corners': self.align_corners,
        }
        return self.interp(x, **options)
class ResidualConvUnit(nn.Module):
    """Two ReLU -> 3x3 conv stages with a skip connection."""

    def __init__(self, features):
        """Build the two convolutions.

        Args:
            features (int): number of input and output channels
        """
        super().__init__()
        conv_options = dict(kernel_size=3, stride=1, padding=1, bias=True)
        self.conv1 = nn.Conv2d(features, features, **conv_options)
        self.conv2 = nn.Conv2d(features, features, **conv_options)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply the residual unit.

        Args:
            x (tensor): input

        Returns:
            tensor: output
        """
        # NOTE: self.relu is in-place, so ``x`` itself is rectified by the
        # first call and the skip term added below is the rectified input.
        branch = self.conv1(self.relu(x))
        branch = self.conv2(self.relu(branch))
        return branch + x
class FeatureFusionBlock(nn.Module):
    """Fuse one or two feature maps and upsample the result by 2x."""

    def __init__(self, features):
        """Create the two residual units.

        Args:
            features (int): number of channels
        """
        super().__init__()
        self.resConfUnit1 = ResidualConvUnit(features)
        self.resConfUnit2 = ResidualConvUnit(features)

    def forward(self, *xs):
        """Fuse the inputs.

        ``xs[0]`` is the running feature map. When exactly two inputs are
        given, ``resConfUnit1(xs[1])`` is accumulated into ``xs[0]`` in place.

        Returns:
            tensor: output, bilinearly upsampled by a factor of 2
        """
        fused = xs[0]
        if len(xs) == 2:
            fused += self.resConfUnit1(xs[1])
        refined = self.resConfUnit2(fused)
        return nn.functional.interpolate(
            refined, scale_factor=2, mode='bilinear', align_corners=True)
class ResidualConvUnit_custom(nn.Module):
    """Residual convolution unit with a configurable activation and BN."""

    def __init__(self, features, activation, bn):
        """Build the unit.

        Args:
            features (int): number of input and output channels
            activation (callable): activation applied before each conv
            bn: batch-norm switch. The convs carry a bias only when this is
                falsy; the BatchNorm layers are created and applied only
                when it is the literal ``True``.
        """
        super().__init__()

        self.bn = bn
        self.groups = 1

        self.conv1 = nn.Conv2d(
            features,
            features,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=not self.bn,
            groups=self.groups,
        )
        self.conv2 = nn.Conv2d(
            features,
            features,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=not self.bn,
            groups=self.groups,
        )

        if self.bn is True:
            self.bn1 = nn.BatchNorm2d(features)
            self.bn2 = nn.BatchNorm2d(features)

        self.activation = activation
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Apply activation -> conv (-> BN) twice and add the input back.

        Args:
            x (tensor): input

        Returns:
            tensor: output
        """
        use_bn = self.bn is True

        out = self.conv1(self.activation(x))
        if use_bn:
            out = self.bn1(out)

        out = self.conv2(self.activation(out))
        if use_bn:
            out = self.bn2(out)

        # The former ``groups > 1`` branch was removed: ``groups`` is fixed
        # to 1 in __init__ and the ``conv_merge`` layer it called is never
        # created, so the branch could only ever fail.
        return self.skip_add.add(out, x)
class FeatureFusionBlock_custom(nn.Module):
    """Fuse feature maps, upsample by 2x and project with a 1x1 conv."""

    def __init__(
        self,
        features,
        activation,
        deconv=False,
        bn=False,
        expand=False,
        align_corners=True,
    ):
        """Build the block.

        Args:
            features (int): number of input channels
            activation (callable): activation handed to both residual units
            deconv: stored on the instance; not otherwise used here
            bn: batch-norm switch handed to both residual units
            expand: the literal ``True`` halves the output channel count
            align_corners (bool): forwarded to the bilinear upsampling
        """
        super().__init__()

        self.deconv = deconv
        self.align_corners = align_corners
        self.groups = 1
        self.expand = expand

        out_features = features // 2 if self.expand is True else features
        self.out_conv = nn.Conv2d(
            features,
            out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
            groups=1,
        )

        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs):
        """Fuse the inputs.

        ``xs[0]`` is the running feature map; when exactly two inputs are
        given, ``resConfUnit1(xs[1])`` is added to it first.

        Returns:
            tensor: output
        """
        fused = xs[0]
        if len(xs) == 2:
            fused = self.skip_add.add(fused, self.resConfUnit1(xs[1]))

        refined = self.resConfUnit2(fused)
        upsampled = nn.functional.interpolate(
            refined,
            scale_factor=2,
            mode='bilinear',
            align_corners=self.align_corners)
        return self.out_conv(upsampled)
| @@ -0,0 +1,107 @@ | |||||
| import os.path as osp | |||||
| from typing import Any, Dict | |||||
| import json | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from PIL import Image | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.models.cv.text_driven_segmentation import \ | |||||
| TextDrivenSegmentation | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.preprocessors import LoadImage | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
| __all__ = ['TextDrivenSeg'] | |||||
@MODELS.register_module(
    Tasks.text_driven_segmentation,
    module_name=Models.text_driven_segmentation)
class TextDrivenSeg(TorchModel):
    """Text driven segmentation model.

    Wraps ``TextDrivenSegmentation``: an RGB image and a text prompt go in,
    a binary (0 / 255) uint8 mask at the original image size comes out.
    """

    def __init__(self, model_dir, device_id=0, *args, **kwargs):
        """Load the network weights and pick the inference device.

        Args:
            model_dir (str): directory holding the checkpoint file named by
                ``ModelFile.TORCH_MODEL_BIN_FILE``.
            device_id (int): CUDA device index. A negative value, or CUDA
                being unavailable, selects the CPU (stored as ``-1``).
        """
        super().__init__(
            model_dir=model_dir, device_id=device_id, *args, **kwargs)

        self.model = TextDrivenSegmentation(model_dir=model_dir)
        # Load onto the CPU first so a checkpoint saved from a GPU can also
        # be restored on CPU-only hosts; the model is moved to CUDA below.
        pretrained_params = torch.load(
            osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
            map_location='cpu')
        self.model.load_state_dict(pretrained_params)
        self.model.eval()
        if device_id >= 0 and torch.cuda.is_available():
            self.model.to('cuda:{}'.format(device_id))
            logger.info('Use GPU: {}'.format(device_id))
        else:
            device_id = -1
            logger.info('Use CPU for inference')
        self.device_id = device_id

    def preprocess(self, img, size=640):
        """Resize, normalise and zero-pad an image to ``size`` x ``size``.

        Args:
            img: numpy array of shape (height, width, 3).
            size (int): side length of the square network input.

        Returns:
            tuple: ``(tensor, h, w, crop_h, crop_w)`` where ``tensor`` has
            shape (1, 3, size, size), ``h``/``w`` are the original image
            dimensions and ``crop_h``/``crop_w`` the resized dimensions that
            occupy the top-left corner of the padded input.
        """
        mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
        std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

        h, w, c = img.shape
        ratio = 1.0 * size / max(h, w)
        # Clamp to at least one pixel: for extreme aspect ratios the short
        # side would otherwise round down to 0, which cannot be resized to.
        crop_h = max(1, int(ratio * h))
        crop_w = max(1, int(ratio * w))

        pil_img = Image.fromarray(img)
        pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
        np_img = np.array(pil_img, dtype=np.float32) / 255.
        np_img = (np_img - mean) / std

        img_pad = np.zeros((size, size, 3), dtype=np.float32)
        img_pad[:crop_h, :crop_w] = np_img
        img_pad = torch.from_numpy(img_pad).permute(2, 0,
                                                    1).unsqueeze(0).float()
        return img_pad, h, w, crop_h, crop_w

    def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
        """Turn the padded class map into a mask at the original size.

        Args:
            tensors: 2-D numpy array as returned by ``inference``.
            crop_h, crop_w (int): valid (unpadded) region of ``tensors``.
            ori_h, ori_w (int): size of the original image.

        Returns:
            numpy.ndarray: uint8 array of shape (ori_h, ori_w) holding only
            the values 0 and 255.
        """
        output = np.clip(tensors * 255., a_min=0, a_max=255.)
        crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)
        pil_output = Image.fromarray(crop_output)
        pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
        np_output = np.array(pil_output, dtype=np.uint8)
        # Re-binarise after the bilinear resize.
        return np.where(np_output < 128, 0, 255).astype(np.uint8)

    def forward(self, image, text):
        """Segment ``image`` according to ``text``.

        image should be numpy array, dtype=np.uint8, shape: height*width*3

        Returns:
            dict: ``{OutputKeys.MASKS: mask}``.
        """
        image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
            image, size=640)
        pred = self.inference(image_tensor, text)
        # postprocess() takes no ``size`` argument; passing one raised
        # TypeError on every call.
        msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)
        outputs = {OutputKeys.MASKS: msk}
        return outputs

    def inference(self, image, text):
        """Run the network and return the per-pixel argmax class map.

        image should be tensor, 1 * 3 * 640 * 640

        Returns:
            numpy.ndarray: 2-D integer array with the same spatial size as
            ``image``.
        """
        if self.device_id == -1:
            device = torch.device('cpu')
        else:
            device = torch.device('cuda', self.device_id)

        with torch.no_grad():
            # The text prompt must be passed on every device; the CPU path
            # previously omitted it and failed with TypeError.
            output = self.model(image.to(device), [text])
            output = F.interpolate(
                output, size=tuple(image.shape[-2:]), mode='bilinear')
            output = F.softmax(output, dim=1)
            output = torch.argmax(output, dim=1)
            pred = output[0].cpu().numpy()
        return pred
| @@ -0,0 +1,197 @@ | |||||
| """ | |||||
| Adapted from https://github.com/isl-org/lang-seg. | |||||
| Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. | |||||
| """ | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from . import clip | |||||
| from .lseg_blocks import (FeatureFusionBlock, FeatureFusionBlock_custom, | |||||
| Interpolate, _make_encoder, forward_vit) | |||||
| from .simple_tokenizer import SimpleTokenizer | |||||
class depthwise_clipseg_conv(nn.Module):
    """Apply one shared single-channel 3x3 conv to every channel in turn."""

    def __init__(self):
        super().__init__()
        self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1)

    def depthwise_clipseg(self, x, channels):
        """Convolve the first ``channels`` channels of ``x`` independently.

        Returns the per-channel results concatenated along dim 1.
        """
        per_channel = []
        for idx in range(channels):
            single = x[:, idx].unsqueeze(1)
            per_channel.append(self.depthwise(single))
        return torch.cat(per_channel, dim=1)

    def forward(self, x):
        """Convolve every channel of ``x`` with the shared kernel."""
        return self.depthwise_clipseg(x, x.shape[1])
class depthwise_conv(nn.Module):
    """Shared single-channel convolution applied to every channel.

    Channels are folded into the batch dimension, convolved with one
    1 -> 1 channel kernel, and unfolded again.
    """

    def __init__(self, kernel_size=3, stride=1, padding=1):
        """Create the shared conv.

        Args:
            kernel_size, stride, padding: forwarded to ``nn.Conv2d``.
        """
        super(depthwise_conv, self).__init__()
        self.depthwise = nn.Conv2d(
            1, 1, kernel_size=kernel_size, stride=stride, padding=padding)

    def forward(self, x):
        """Convolve each channel of a 4D NCHW tensor independently.

        Args:
            x (tensor): input of shape (N, C, H, W)

        Returns:
            tensor: output of shape (N, C, H_out, W_out)
        """
        # support for 4D tensor with NCHW
        channels, height, width = x.shape[1:]
        x = x.reshape(-1, 1, height, width)
        x = self.depthwise(x)
        # Unfold with the conv's actual output size. Reusing the input H/W
        # only works when the conv preserves the spatial size; otherwise it
        # yields a wrong batch dimension or a shape error.
        out_height, out_width = x.shape[-2:]
        return x.view(-1, channels, out_height, out_width)
class depthwise_block(nn.Module):
    """``depthwise_conv`` followed by an optional activation."""

    def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
        """Build the block.

        NOTE: ``kernel_size``, ``stride`` and ``padding`` are accepted but
        not forwarded; the inner conv is always built with 3 / 1 / 1.

        Args:
            activation (str): ``'relu'``, ``'lrelu'`` or ``'tanh'``. Any
                other value leaves ``self.activation`` unset.
        """
        super().__init__()
        self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
        known_activations = (
            ('relu', nn.ReLU),
            ('lrelu', nn.LeakyReLU),
            ('tanh', nn.Tanh),
        )
        for key, layer_cls in known_activations:
            if activation == key:
                self.activation = layer_cls()
                break

    def forward(self, x, act=True):
        """Convolve ``x`` and, when ``act`` is truthy, apply the activation."""
        out = self.depthwise(x)
        return self.activation(out) if act else out
class bottleneck_block(nn.Module):
    """``depthwise_conv`` plus a channel-wise max shortcut."""

    def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
        """Build the block.

        NOTE: ``kernel_size``, ``stride`` and ``padding`` are accepted but
        not forwarded; the inner conv is always built with 3 / 1 / 1.

        Args:
            activation (str): ``'relu'``, ``'lrelu'`` or ``'tanh'``. Any
                other value leaves ``self.activation`` unset.
        """
        super().__init__()
        self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
        known_activations = (
            ('relu', nn.ReLU),
            ('lrelu', nn.LeakyReLU),
            ('tanh', nn.Tanh),
        )
        for key, layer_cls in known_activations:
            if activation == key:
                self.activation = layer_cls()
                break

    def forward(self, x, act=True):
        """Add the per-pixel maximum over channels to the convolved input.

        The maximum is taken over dim 1 of the *input* (kept as a size-1
        dim) and broadcast-added to the conv output; the activation is
        applied afterwards when ``act`` is truthy.
        """
        channel_max = x.max(dim=1, keepdim=True)[0]
        out = self.depthwise(x) + channel_max
        return self.activation(out) if act else out
class BaseModel(torch.nn.Module):
    """Module base class that can restore its weights from a file."""

    def load(self, path):
        """Load model weights from a checkpoint file.

        The file is read onto the CPU. If the loaded object contains an
        ``'optimizer'`` key, the weights are taken from its ``'model'``
        entry; otherwise the loaded object itself is used as the state dict.

        Args:
            path (str): file path
        """
        checkpoint = torch.load(path, map_location=torch.device('cpu'))
        state = checkpoint['model'] if 'optimizer' in checkpoint else checkpoint
        self.load_state_dict(state)
def _make_fusion_block(features, use_bn):
    """Create a ``FeatureFusionBlock_custom`` with the fixed LSeg settings.

    Args:
        features (int): channel count of the block
        use_bn: batch-norm switch passed through as ``bn``

    Returns:
        FeatureFusionBlock_custom: block using a non in-place ReLU, no
        deconv, no channel expansion and ``align_corners=True``.
    """
    fixed_options = dict(
        activation=nn.ReLU(False),
        deconv=False,
        bn=use_bn,
        expand=False,
        align_corners=True,
    )
    return FeatureFusionBlock_custom(features, **fixed_options)
class LSeg(BaseModel):
    """Language-driven segmentation network.

    Per-pixel image embeddings are compared with CLIP text embeddings of the
    given labels to produce one score map per label.
    """

    def __init__(
        self,
        features=256,
        backbone='clip_vitl16_384',
        readout='project',
        use_bn=True,
        model_dir=None,
    ):
        """Build backbone, decoder and tokenizer.

        Args:
            features (int): channel width of the fusion decoder
            backbone (str): key into the hook table; only
                ``'clip_vitl16_384'`` has an entry
            readout (str): readout mode forwarded to the encoder factory
            use_bn: batch-norm switch for the fusion blocks
            model_dir (str): directory containing
                ``bpe_simple_vocab_16e6.txt.gz``
        """
        super().__init__()

        hook_table = {
            'clip_vitl16_384': [5, 11, 17, 23],
        }

        # Instantiate backbone and reassemble blocks
        encoder_parts = _make_encoder(
            backbone,
            features,
            groups=1,
            expand=False,
            exportable=False,
            hooks=hook_table[backbone],
            use_readout=readout,
        )
        self.clip_pretrained, self.pretrained, self.scratch = encoder_parts

        for stage in (1, 2, 3, 4):
            setattr(self.scratch, f'refinenet{stage}',
                    _make_fusion_block(features, use_bn))

        # NOTE(review): ``.exp()`` is applied to the Parameter, so what gets
        # stored is the tensor it returns. forward() does not read this
        # attribute; it scales by ``self.tau`` instead.
        self.logit_scale = nn.Parameter(torch.ones([])
                                        * np.log(1 / 0.07)).exp()

        self.out_c = 512
        self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1)

        self.scratch.output_conv = nn.Sequential(
            Interpolate(scale_factor=2, mode='bilinear', align_corners=True), )

        self.tau = 0.07
        self.model_dir = model_dir
        self.tokenizer = SimpleTokenizer(model_dir
                                         + '/bpe_simple_vocab_16e6.txt.gz')

    def forward(self, x, labelset=''):
        """Score every pixel of ``x`` against every label.

        Args:
            x (tensor): image batch fed to the ViT backbone
            labelset: string or list of strings to tokenize

        Returns:
            tensor: float score maps with the label index on dim 1,
            upsampled 2x by ``scratch.output_conv``.
        """
        tokens = clip.tokenize(self.tokenizer, labelset)

        vit_1, vit_2, vit_3, vit_4 = forward_vit(self.pretrained, x)

        decoder = self.scratch
        reduced_1 = decoder.layer1_rn(vit_1)
        reduced_2 = decoder.layer2_rn(vit_2)
        reduced_3 = decoder.layer3_rn(vit_3)
        reduced_4 = decoder.layer4_rn(vit_4)

        # Top-down fusion from the deepest stage to the shallowest.
        fused = decoder.refinenet4(reduced_4)
        fused = decoder.refinenet3(fused, reduced_3)
        fused = decoder.refinenet2(fused, reduced_2)
        fused = decoder.refinenet1(fused, reduced_1)

        tokens = tokens.to(x.device)
        text_features = self.clip_pretrained.encode_text(tokens)

        pixel_features = decoder.head1(fused)
        batch, _, rows, cols = pixel_features.shape
        pixel_features = pixel_features.permute(0, 2, 3,
                                                1).reshape(-1, self.out_c)

        # normalized features
        pixel_features = pixel_features / pixel_features.norm(
            dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(
            dim=-1, keepdim=True)

        logits = pixel_features @ text_features.t() / self.tau

        score_maps = logits.float().view(batch, rows, cols,
                                         -1).permute(0, 3, 1, 2)
        return decoder.output_conv(score_maps)
| @@ -0,0 +1,543 @@ | |||||
| """ | |||||
| Adapted from https://github.com/isl-org/lang-seg. | |||||
| Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. | |||||
| """ | |||||
| import math | |||||
| import types | |||||
| import timm | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| import torch.utils.checkpoint as checkpoint | |||||
| from . import clip | |||||
# Module-level store filled by the hooks created in get_activation().
activations = {}


def get_activation(name):
    """Return a forward hook that records a module's output.

    The hook stores the output in the module-level ``activations`` dict
    under ``name``, overwriting any previous entry.
    """

    def _record(module, inputs, output):
        activations[name] = output

    return _record
# Module-level store filled by the hooks created in get_attention().
attention = {}


def get_attention(name):
    """Return a forward hook that recomputes and records attention weights.

    The hook reads the hooked module's ``qkv``, ``num_heads`` and ``scale``
    attributes, rebuilds the softmax-normalised ``q @ k^T * scale`` map from
    the module's first input, and stores it in the module-level
    ``attention`` dict under ``name``.
    """

    def _capture(module, inputs, output):
        tokens = inputs[0]
        batch, seq_len, dim = tokens.shape
        heads = module.num_heads

        qkv = module.qkv(tokens)
        qkv = qkv.reshape(batch, seq_len, 3, heads, dim // heads)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        query, key = qkv[0], qkv[1]

        scores = (query @ key.transpose(-2, -1)) * module.scale
        attention[name] = scores.softmax(dim=-1)  # [:,:,1,1:]

    return _capture
def get_mean_attention_map(attn, token, shape):
    """Upsample one token's attention row to image size and average it.

    Args:
        attn (tensor): attention weights indexed as
            ``attn[:, :, token, 1:]`` (the first key position is dropped).
        token (int): index of the query token to visualise.
        shape: sequence whose entries 2 and 3 give the target height and
            width; each is divided by 16 to obtain the grid the remaining
            key positions are folded into (presumably the patch size -
            confirm against the backbone).

    Returns:
        tensor: bicubically upsampled map, averaged over dim 0 after the
        leading dim has been squeezed.
    """
    grid = torch.Size([shape[2] // 16, shape[3] // 16])
    token_attn = attn[:, :, token, 1:].unflatten(2, grid).float()
    upsampled = torch.nn.functional.interpolate(
        token_attn, size=shape[2:], mode='bicubic', align_corners=False)
    return torch.mean(upsampled.squeeze(0), 0)
class Slice(nn.Module):
    """Drop the first ``start_index`` entries along dim 1."""

    def __init__(self, start_index=1):
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        """Return ``x`` without its leading ``start_index`` tokens."""
        first_kept = self.start_index
        return x[:, first_kept:]
class AddReadout(nn.Module):
    """Add the readout token to every remaining token."""

    def __init__(self, start_index=1):
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        """Return ``x[:, start_index:]`` with the readout broadcast-added.

        The readout is token 0, or the mean of tokens 0 and 1 when
        ``start_index`` equals 2.
        """
        readout = x[:, 0]
        if self.start_index == 2:
            readout = (readout + x[:, 1]) / 2
        return x[:, self.start_index:] + readout.unsqueeze(1)
class ProjectReadout(nn.Module):
    """Concatenate the readout token to each token and project back."""

    def __init__(self, in_features, start_index=1):
        super().__init__()
        self.start_index = start_index
        self.project = nn.Sequential(
            nn.Linear(2 * in_features, in_features), nn.GELU())

    def forward(self, x):
        """Project ``[token, readout]`` pairs down to ``in_features``.

        Token 0 serves as the readout; it is expanded to match
        ``x[:, start_index:]`` and concatenated on the last dim.
        """
        kept_tokens = x[:, self.start_index:]
        readout = x[:, 0].unsqueeze(1).expand_as(kept_tokens)
        return self.project(torch.cat((kept_tokens, readout), -1))
class Transpose(nn.Module):
    """Swap two fixed dimensions of the input."""

    def __init__(self, dim0, dim1):
        super().__init__()
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x):
        """Return ``x`` with ``dim0`` and ``dim1`` exchanged."""
        return x.transpose(self.dim0, self.dim1)
def forward_vit(pretrained, x):
    """Run the hooked backbone and post-process its four captured features.

    Args:
        pretrained: object exposing ``model`` (with ``forward_flex`` and
            ``patch_size``), an ``activations`` mapping with keys ``'1'`` to
            ``'4'``, and ``act_postprocess1`` ... ``act_postprocess4``
            sequential modules.
        x (tensor): 4D input batch.

    Returns:
        tuple: the four post-processed feature tensors, stage 1 first.
    """
    _, _, height, width = x.shape

    # encoder: the return value is discarded, the features are read from
    # ``pretrained.activations`` afterwards.
    pretrained.model.forward_flex(x)

    stages = (1, 2, 3, 4)
    postprocess = [getattr(pretrained, f'act_postprocess{k}') for k in stages]
    feats = [pretrained.activations[str(k)] for k in stages]

    # First two post-processing steps.
    feats = [post[0:2](feat) for post, feat in zip(postprocess, feats)]

    patch = pretrained.model.patch_size
    unflatten = nn.Sequential(
        nn.Unflatten(
            2,
            torch.Size([
                height // patch[1],
                width // patch[0],
            ]),
        ))
    # Only 3-D tensors still need their last dim folded into a 2-D grid.
    feats = [unflatten(feat) if feat.ndim == 3 else feat for feat in feats]

    # Remaining steps, from index 3 on (index 2 is skipped).
    feats = [
        post[3:len(post)](feat) for post, feat in zip(postprocess, feats)
    ]
    return tuple(feats)
def _resize_pos_embed(self, posemb, gs_h, gs_w):
    """Bilinearly resize the grid part of a position embedding.

    The first ``self.start_index`` tokens are kept unchanged. The remaining
    tokens of batch entry 0 are treated as a square grid, resized to
    ``gs_h`` x ``gs_w`` and flattened again.

    Returns:
        tensor: embedding of shape ``(1, start_index + gs_h * gs_w, dim)``.
    """
    split = self.start_index
    prefix_tokens = posemb[:, :split]
    grid_tokens = posemb[0, split:]

    old_side = int(math.sqrt(len(grid_tokens)))

    grid = grid_tokens.reshape(1, old_side, old_side, -1).permute(0, 3, 1, 2)
    grid = F.interpolate(grid, size=(gs_h, gs_w), mode='bilinear')
    grid = grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)

    return torch.cat([prefix_tokens, grid], dim=1)
def forward_flex(self, x):
    """ViT forward pass with the position embedding resized to the input.

    Meant to be bound to a vision-transformer instance: it reads
    ``pos_embed``, ``patch_size``, ``patch_embed``, ``cls_token``, the
    optional ``dist_token``, ``pos_drop``, ``blocks`` and ``norm`` from
    ``self`` and calls ``self._resize_pos_embed``.

    Args:
        x (tensor): 4D input batch.

    Returns:
        tensor: normalised token sequence after all transformer blocks.
    """
    batch, _, height, width = x.shape

    pos_embed = self._resize_pos_embed(self.pos_embed,
                                       height // self.patch_size[1],
                                       width // self.patch_size[0])

    if hasattr(self.patch_embed, 'backbone'):
        x = self.patch_embed.backbone(x)
        if isinstance(x, (list, tuple)):
            # last feature if backbone outputs list/tuple of features
            x = x[-1]

    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)

    # Prefix tokens: the class token, followed by the distillation token
    # when the model has one.
    prefix = [self.cls_token.expand(batch, -1, -1)]
    dist_token = getattr(self, 'dist_token', None)
    if dist_token is not None:
        prefix.append(dist_token.expand(batch, -1, -1))
    x = torch.cat((*prefix, x), dim=1)

    x = self.pos_drop(x + pos_embed)

    # The gradient-checkpoint path was removed: it was guarded by a local
    # constant that was always False, so it could never run.
    for blk in self.blocks:
        x = blk(x)

    return self.norm(x)
def get_readout_oper(vit_features, features, use_readout, start_index=1):
    """Build one readout module per entry of ``features``.

    Args:
        vit_features (int): token width, used only for ``'project'``.
        features: sequence whose length sets the number of modules.
        use_readout (str): ``'ignore'``, ``'add'`` or ``'project'``.
        start_index (int): forwarded to the readout modules.

    Returns:
        list: for ``'ignore'`` and ``'add'`` the same module instance is
        repeated; for ``'project'`` each entry is a separate
        ``ProjectReadout``.

    Raises:
        AssertionError: if ``use_readout`` is not one of the three modes.
    """
    if use_readout == 'ignore':
        return [Slice(start_index)] * len(features)
    if use_readout == 'add':
        return [AddReadout(start_index)] * len(features)
    if use_readout == 'project':
        return [ProjectReadout(vit_features, start_index) for _ in features]

    # Raised explicitly rather than via ``assert False`` so the check is not
    # stripped under ``python -O``; the exception type is unchanged.
    raise AssertionError(
        "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
    )
def adapt_input_conv(in_chans, conv_weight):
    """Adapt a conv weight to a different number of input channels.

    Args:
        in_chans (int): number of input channels wanted.
        conv_weight (tensor): weight of shape (out, in, kh, kw).

    Returns:
        tensor: adapted weight in the original dtype.
        * ``in_chans == 1``: input channels are summed (in groups of three
          when the weight has more than 3 input channels).
        * ``in_chans == 3``: values are returned unchanged.
        * otherwise: a 3-channel weight is tiled, cut to ``in_chans`` and
          rescaled by ``3 / in_chans``.

    Raises:
        NotImplementedError: ``in_chans`` is neither 1 nor 3 and the weight
            does not have exactly 3 input channels.
    """
    original_dtype = conv_weight.dtype
    # Some weights are in torch.half, ensure it's float for sum on CPU
    weight = conv_weight.float()
    out_ch, in_ch, k_h, k_w = weight.shape

    if in_chans == 1:
        if in_ch > 3:
            assert weight.shape[1] % 3 == 0
            # For models with space2depth stems
            weight = weight.reshape(out_ch, in_ch // 3, 3, k_h, k_w)
            weight = weight.sum(dim=2, keepdim=False)
        else:
            weight = weight.sum(dim=1, keepdim=True)
    elif in_chans != 3:
        if in_ch != 3:
            raise NotImplementedError(
                'Weight format not supported by conversion.')
        # NOTE this strategy should be better than random init, but there
        # could be other combinations of the original RGB input layer
        # weights that'd work better for specific cases.
        copies = int(math.ceil(in_chans / 3))
        weight = weight.repeat(1, copies, 1, 1)[:, :in_chans, :, :]
        weight *= (3 / float(in_chans))

    return weight.to(original_dtype)
| @torch.no_grad() | |||||
| def _load_weights(model, checkpoint_path, prefix=''): | |||||
| """ Load weights from .npz checkpoints for official Google Brain Flax implementation | |||||
| """ | |||||
| import numpy as np | |||||
| def _n2p(w, t=True): | |||||
| if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1: | |||||
| w = w.flatten() | |||||
| if t: | |||||
| if w.ndim == 4: | |||||
| w = w.transpose([3, 2, 0, 1]) | |||||
| elif w.ndim == 3: | |||||
| w = w.transpose([2, 0, 1]) | |||||
| elif w.ndim == 2: | |||||
| w = w.transpose([1, 0]) | |||||
| return torch.from_numpy(w) | |||||
| w = np.load(checkpoint_path) | |||||
| if not prefix and 'opt/target/embedding/kernel' in w: | |||||
| prefix = 'opt/target/' | |||||
| if hasattr(model.patch_embed, 'backbone'): | |||||
| # hybrid | |||||
| backbone = model.patch_embed.backbone | |||||
| stem_only = not hasattr(backbone, 'stem') | |||||
| stem = backbone if stem_only else backbone.stem | |||||
| stem.conv.weight.copy_( | |||||
| adapt_input_conv(stem.conv.weight.shape[1], | |||||
| _n2p(w[f'{prefix}conv_root/kernel']))) | |||||
| stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale'])) | |||||
| stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias'])) | |||||
| if not stem_only: | |||||
| for i, stage in enumerate(backbone.stages): | |||||
| for j, block in enumerate(stage.blocks): | |||||
| bp = f'{prefix}block{i + 1}/unit{j + 1}/' | |||||
| for r in range(3): | |||||
| getattr(block, f'conv{r + 1}').weight.copy_( | |||||
| _n2p(w[f'{bp}conv{r + 1}/kernel'])) | |||||
| getattr(block, f'norm{r + 1}').weight.copy_( | |||||
| _n2p(w[f'{bp}gn{r + 1}/scale'])) | |||||
| getattr(block, f'norm{r + 1}').bias.copy_( | |||||
| _n2p(w[f'{bp}gn{r + 1}/bias'])) | |||||
| if block.downsample is not None: | |||||
| block.downsample.conv.weight.copy_( | |||||
| _n2p(w[f'{bp}conv_proj/kernel'])) | |||||
| block.downsample.norm.weight.copy_( | |||||
| _n2p(w[f'{bp}gn_proj/scale'])) | |||||
| block.downsample.norm.bias.copy_( | |||||
| _n2p(w[f'{bp}gn_proj/bias'])) | |||||
| embed_conv_w = _n2p(w[f'{prefix}embedding/kernel']) | |||||
| else: | |||||
| embed_conv_w = adapt_input_conv(model.patch_embed.proj.weight.shape[1], | |||||
| _n2p(w[f'{prefix}embedding/kernel'])) | |||||
| model.patch_embed.proj.weight.copy_(embed_conv_w) | |||||
| model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias'])) | |||||
| model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False)) | |||||
| pos_embed_w = _n2p( | |||||
| w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False) | |||||
| if pos_embed_w.shape != model.pos_embed.shape: | |||||
| pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights | |||||
| pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens', | |||||
| 1), | |||||
| model.patch_embed.grid_size) | |||||
| model.pos_embed.copy_(pos_embed_w) | |||||
| model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale'])) | |||||
| model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias'])) | |||||
| if isinstance( | |||||
| model.head, nn.Linear | |||||
| ) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]: | |||||
| model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel'])) | |||||
| model.head.bias.copy_(_n2p(w[f'{prefix}head/bias'])) | |||||
| # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights | |||||
| # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w: | |||||
| # model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel'])) | |||||
| # model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias'])) | |||||
| for i, block in enumerate(model.blocks.children()): | |||||
| block_prefix = f'{prefix}Transformer/encoderblock_{i}/' | |||||
| mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/' | |||||
| block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'])) | |||||
| block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'])) | |||||
| block.attn.qkv.weight.copy_( | |||||
| torch.cat([ | |||||
| _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T | |||||
| for n in ('query', 'key', 'value') | |||||
| ])) | |||||
| block.attn.qkv.bias.copy_( | |||||
| torch.cat([ | |||||
| _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) | |||||
| for n in ('query', 'key', 'value') | |||||
| ])) | |||||
| block.attn.proj.weight.copy_( | |||||
| _n2p(w[f'{mha_prefix}out/kernel']).flatten(1)) | |||||
| block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'])) | |||||
| for r in range(2): | |||||
| getattr(block.mlp, f'fc{r + 1}').weight.copy_( | |||||
| _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel'])) | |||||
| getattr(block.mlp, f'fc{r + 1}').bias.copy_( | |||||
| _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias'])) | |||||
| block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale'])) | |||||
| block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias'])) | |||||
def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()):
    """Resample a position-embedding table onto a different token grid.

    The first ``num_prefix_tokens`` rows are carried over untouched. The
    remaining rows of batch entry 0 are viewed as a square grid, resized
    with bicubic interpolation to ``gs_new`` and flattened back into a
    token sequence.

    Adapted from
    https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224

    Args:
        posemb: Source embedding, indexed as ``[batch, token, ...]``.
        posemb_new: Target embedding; only ``shape[1]`` (token count) is read.
        num_prefix_tokens: Number of leading tokens that are not part of
            the spatial grid.
        gs_new: Target grid size passed to ``F.interpolate`` as ``size``.
            When empty, a square grid is derived from the target token count.

    Returns:
        The prefix rows concatenated with the resized grid rows along dim 1.
    """
    target_tokens = posemb_new.shape[1]
    if num_prefix_tokens:
        prefix_part = posemb[:, :num_prefix_tokens]
        grid_part = posemb[0, num_prefix_tokens:]
        target_tokens -= num_prefix_tokens
    else:
        prefix_part = posemb[:, :0]
        grid_part = posemb[0]

    old_side = int(math.sqrt(len(grid_part)))
    if not len(gs_new):  # backwards compatibility
        new_side = int(math.sqrt(target_tokens))
        gs_new = [new_side, new_side]
    assert len(gs_new) >= 2

    # (tokens, dim) -> (1, dim, side, side) so the grid can be interpolated.
    grid_part = grid_part.reshape(1, old_side, old_side, -1)
    grid_part = grid_part.permute(0, 3, 1, 2)
    grid_part = F.interpolate(
        grid_part, size=gs_new, mode='bicubic', align_corners=False)
    # Back to a flat (1, tokens, dim) sequence.
    grid_part = grid_part.permute(0, 2, 3, 1)
    grid_part = grid_part.reshape(1, gs_new[0] * gs_new[1], -1)
    return torch.cat([prefix_part, grid_part], dim=1)
def _make_pretrained_clip_vitl16_384(pretrained,
                                     use_readout='ignore',
                                     hooks=None,
                                     enable_attention_hooks=False):
    """Build the CLIP model and the hooked ViT-L/16 (384) backbone.

    Args:
        pretrained: Not read by this function; the timm model is always
            created with ``pretrained=False``.
        use_readout: Forwarded to ``_make_vit_b16_backbone``.
        hooks: Indices of the transformer blocks to hook; defaults to
            ``[5, 11, 17, 23]`` when ``None``.
        enable_attention_hooks: Forwarded to ``_make_vit_b16_backbone``.

    Returns:
        ``(clip_model, backbone)`` where ``clip_model`` is the first value
        returned by ``clip.load('ViT-B/32', ...)`` and ``backbone`` is the
        module returned by ``_make_vit_b16_backbone``.
    """
    clip_pretrained, _ = clip.load('ViT-B/32', device='cpu', jit=False)
    vit = timm.create_model('vit_large_patch16_384', pretrained=False)

    if hooks is None:
        hooks = [5, 11, 17, 23]

    backbone = _make_vit_b16_backbone(
        vit,
        features=[256, 512, 1024, 1024],
        hooks=hooks,
        vit_features=1024,
        use_readout=use_readout,
        enable_attention_hooks=enable_attention_hooks,
    )
    return clip_pretrained, backbone
def _make_vit_b16_backbone(
    model,
    features=[96, 192, 384, 768],
    size=[384, 384],
    hooks=[2, 5, 8, 11],
    vit_features=768,
    use_readout='ignore',
    start_index=1,
    enable_attention_hooks=False,
):
    """Wrap a ViT ``model`` with forward hooks and four post-processing heads.

    Args:
        model: Vision transformer exposing an indexable ``blocks`` attribute.
        features: Output channel count of each of the four heads.
        size: Input image size; each side is divided by 16 to get the
            token grid used by ``nn.Unflatten``.
        hooks: Indices into ``model.blocks`` for the four tapped blocks.
        vit_features: Channel width fed into each head's 1x1 convolution.
        use_readout: Forwarded to ``get_readout_oper``.
        start_index: Forwarded to ``get_readout_oper`` and stored on the model.
        enable_attention_hooks: When true, additionally hook the ``attn``
            sub-module of each tapped block.

    Returns:
        An ``nn.Module`` holding the model, the hook result containers and
        ``act_postprocess1`` .. ``act_postprocess4``.
    """
    pretrained = nn.Module()
    pretrained.model = model

    vit_blocks = pretrained.model.blocks
    for slot in range(4):
        vit_blocks[hooks[slot]].register_forward_hook(
            get_activation(str(slot + 1)))
    pretrained.activations = activations

    if enable_attention_hooks:
        for slot in range(4):
            vit_blocks[hooks[slot]].attn.register_forward_hook(
                get_attention(f'attn_{slot + 1}'))
        pretrained.attention = attention

    readout_oper = get_readout_oper(vit_features, features, use_readout,
                                    start_index)

    def _shared_layers(stage):
        # Layers common to every head: readout, tokens -> 2D map, 1x1 conv.
        return [
            readout_oper[stage],
            Transpose(1, 2),
            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[stage],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
        ]

    # Layers are created in the same order as a straight-line construction
    # would, so seeded weight initialisation is unaffected.
    head1 = _shared_layers(0)
    head1.append(
        nn.ConvTranspose2d(
            in_channels=features[0],
            out_channels=features[0],
            kernel_size=4,
            stride=4,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ))
    pretrained.act_postprocess1 = nn.Sequential(*head1)

    head2 = _shared_layers(1)
    head2.append(
        nn.ConvTranspose2d(
            in_channels=features[1],
            out_channels=features[1],
            kernel_size=2,
            stride=2,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ))
    pretrained.act_postprocess2 = nn.Sequential(*head2)

    # The third head keeps the token-grid resolution.
    pretrained.act_postprocess3 = nn.Sequential(*_shared_layers(2))

    head4 = _shared_layers(3)
    head4.append(
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ))
    pretrained.act_postprocess4 = nn.Sequential(*head4)

    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]

    # Bind the helpers onto this model instance so they can be used with
    # interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex,
                                                     pretrained.model)
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model)

    return pretrained
| @@ -0,0 +1,458 @@ | |||||
| """ | |||||
| Adapted from https://github.com/isl-org/lang-seg. | |||||
| Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. | |||||
| """ | |||||
| from collections import OrderedDict | |||||
| from typing import Tuple, Union | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn.functional as F | |||||
| from torch import nn | |||||
class Bottleneck(nn.Module):
    """Residual bottleneck block (1x1 -> 3x3 -> 1x1) with avg-pool striding.

    Every convolution has stride 1. When ``stride > 1`` the spatial
    reduction is done by an average pool after the second convolution, and
    by another average pool in front of the shortcut projection.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        """Create the block.

        Args:
            inplanes: Number of input channels.
            planes: Bottleneck width; the block outputs
                ``planes * expansion`` channels.
            stride: Spatial reduction factor applied via average pooling.
        """
        super().__init__()
        out_planes = planes * self.expansion

        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu2 = nn.ReLU(inplace=True)

        if stride > 1:
            self.avgpool = nn.AvgPool2d(stride)
        else:
            self.avgpool = nn.Identity()

        self.conv3 = nn.Conv2d(planes, out_planes, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu3 = nn.ReLU(inplace=True)

        self.stride = stride

        # A projection shortcut is needed whenever the residual branch
        # changes the spatial size or the channel count.
        needs_projection = (
            stride > 1 or inplanes != planes * Bottleneck.expansion)
        if needs_projection:
            shortcut = OrderedDict()
            shortcut['-1'] = nn.AvgPool2d(stride)
            shortcut['0'] = nn.Conv2d(
                inplanes, out_planes, 1, stride=1, bias=False)
            shortcut['1'] = nn.BatchNorm2d(out_planes)
            self.downsample = nn.Sequential(shortcut)
        else:
            self.downsample = None

    def forward(self, x: torch.Tensor):
        """Apply the residual branch and add the (possibly projected) input."""
        shortcut = x if self.downsample is None else None

        out = self.conv1(x)
        out = self.relu1(self.bn1(out))
        out = self.conv2(out)
        out = self.relu2(self.bn2(out))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if shortcut is None:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu3(out)
class AttentionPool2d(nn.Module):
    """Pool a 2D feature map into one vector with multi-head attention.

    The mean of all spatial positions is prepended as an extra token and
    used as the only query; every token (mean included) serves as key and
    value.
    """

    def __init__(self,
                 spacial_dim: int,
                 embed_dim: int,
                 num_heads: int,
                 output_dim: int = None):
        """Create the pooling layer.

        Args:
            spacial_dim: Side length of the (square) input feature map.
            embed_dim: Channel count of the input feature map.
            num_heads: Number of attention heads.
            output_dim: Output width; falls back to ``embed_dim`` when falsy.
        """
        super().__init__()
        num_positions = spacial_dim**2 + 1  # +1 for the mean token
        self.positional_embedding = nn.Parameter(
            torch.randn(num_positions, embed_dim) / embed_dim**0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        """Return one pooled vector per batch element for an NCHW input."""
        tokens = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
        mean_token = tokens.mean(dim=0, keepdim=True)
        tokens = torch.cat([mean_token, tokens], dim=0)  # (HW+1)NC
        pos = self.positional_embedding[:, None, :].to(tokens.dtype)
        tokens = tokens + pos  # (HW+1)NC

        qkv_bias = torch.cat(
            [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
        pooled, _ = F.multi_head_attention_forward(
            query=tokens[:1],
            key=tokens,
            value=tokens,
            embed_dim_to_check=tokens.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=qkv_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False)
        return pooled.squeeze(0)
class ModifiedResNet(nn.Module):
    """ResNet variant that differs from torchvision's in three ways.

    - The stem has 3 convolutions instead of 1 and ends with an average
      pool instead of a max pool.
    - Strided convolutions are anti-aliased: an avgpool is prepended to
      convolutions with stride > 1.
    - The final pooling layer is QKV attention instead of an average pool.
    """

    def __init__(self,
                 layers,
                 output_dim,
                 heads,
                 input_resolution=224,
                 width=64):
        """Create the network.

        Args:
            layers: Number of bottleneck blocks in each of the four stages.
            output_dim: Width of the pooled output embedding.
            heads: Number of heads in the attention pool.
            input_resolution: Input image side length; divided by 32 to
                size the attention pool.
            width: Base channel width of the first residual stage.
        """
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # Three-convolution stem.
        half_width = width // 2
        self.conv1 = nn.Conv2d(
            3, half_width, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(half_width)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            half_width, half_width, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(half_width)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(
            half_width, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.relu3 = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool2d(2)

        # Residual stages; ``_inplanes`` is updated by each ``_make_layer``
        # call so the next stage knows its input width.
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        feature_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(input_resolution // 32, feature_dim,
                                        heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        """Build one stage of ``blocks`` bottlenecks; only the first strides."""
        stage = [Bottleneck(self._inplanes, planes, stride)]
        self._inplanes = planes * Bottleneck.expansion
        stage.extend(
            Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run stem, four residual stages and the attention pool."""
        x = x.type(self.conv1.weight.dtype)

        stem_steps = (
            (self.conv1, self.bn1, self.relu1),
            (self.conv2, self.bn2, self.relu2),
            (self.conv3, self.bn3, self.relu3),
        )
        for conv, norm, act in stem_steps:
            x = act(norm(conv(x)))
        x = self.avgpool(x)

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        return self.attnpool(x)
class LayerNorm(nn.LayerNorm):
    """``nn.LayerNorm`` that computes in float32 and restores the input dtype.

    This lets the layer be used with fp16 activations.
    """

    def forward(self, x: torch.Tensor):
        """Normalise ``x`` in float32 and cast the result back to ``x.dtype``."""
        normalised = super().forward(x.type(torch.float32))
        return normalised.type(x.dtype)
class QuickGELU(nn.Module):
    """Sigmoid-gated activation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor):
        """Apply the activation element-wise."""
        gate = torch.sigmoid(1.702 * x)
        return x * gate
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention then MLP, both residual."""

    def __init__(self,
                 d_model: int,
                 n_head: int,
                 attn_mask: torch.Tensor = None):
        """Create the block.

        Args:
            d_model: Token embedding width.
            n_head: Number of attention heads.
            attn_mask: Optional mask handed to ``nn.MultiheadAttention``.
        """
        super().__init__()
        hidden = d_model * 4

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)

        mlp_layers = OrderedDict()
        mlp_layers['c_fc'] = nn.Linear(d_model, hidden)
        mlp_layers['gelu'] = QuickGELU()
        mlp_layers['c_proj'] = nn.Linear(hidden, d_model)
        self.mlp = nn.Sequential(mlp_layers)

        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        """Self-attend over ``x``.

        The stored mask is a plain attribute, so it is cast to the dtype
        and device of ``x`` here and the converted mask is stored back.
        """
        mask = self.attn_mask
        if mask is not None:
            mask = mask.to(dtype=x.dtype, device=x.device)
        self.attn_mask = mask

        attended = self.attn(x, x, x, need_weights=False, attn_mask=mask)
        return attended[0]

    def forward(self, x: torch.Tensor):
        """Apply the attention and MLP sub-layers with residual connections."""
        x = x + self.attention(self.ln_1(x))
        return x + self.mlp(self.ln_2(x))
class Transformer(nn.Module):
    """A sequential stack of ``ResidualAttentionBlock`` modules."""

    def __init__(self, width, layers, heads, attn_mask=None):
        """Create ``layers`` blocks of width ``width`` with ``heads`` heads.

        Args:
            width: Token embedding width of every block.
            layers: Number of blocks in the stack.
            heads: Number of attention heads per block.
            attn_mask: Optional mask passed to every block.
        """
        super().__init__()
        self.width = width
        self.layers = layers
        blocks = [
            ResidualAttentionBlock(width, heads, attn_mask)
            for _ in range(layers)
        ]
        self.resblocks = nn.Sequential(*blocks)

    def forward(self, x: torch.Tensor):
        """Run ``x`` through every block in order."""
        return self.resblocks(x)
class VisionTransformer(nn.Module):
    """Patch-based image encoder that returns the projected class token."""

    def __init__(self, input_resolution: int, patch_size: int, width: int,
                 layers: int, heads: int, output_dim: int):
        """Create the encoder.

        Args:
            input_resolution: Input image side length.
            patch_size: Side length of each patch (conv kernel and stride).
            width: Token embedding width.
            layers: Number of transformer blocks.
            heads: Number of attention heads per block.
            output_dim: Width of the final projection.
        """
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim

        # Non-overlapping patch embedding.
        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False)

        scale = width**-0.5
        num_patches = (input_resolution // patch_size)**2
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(
            scale * torch.randn(num_patches + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor):
        """Encode a batch of images into one vector per image."""
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        batch, channels = x.shape[0], x.shape[1]
        x = x.reshape(batch, channels, -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

        # Adding to a zero tensor broadcasts the class embedding to one
        # token per batch element.
        class_token = self.class_embedding.to(x.dtype)
        batch_zeros = torch.zeros(
            batch, 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([class_token + batch_zeros, x],
                      dim=1)  # shape = [*, grid ** 2 + 1, width]

        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        x = self.ln_post(x[:, 0, :])
        if self.proj is None:
            return x
        return x @ self.proj
class CLIP(nn.Module):
    """Joint image/text encoder producing scaled cosine-similarity logits."""

    def __init__(
            self,
            embed_dim: int,
            # vision
            image_resolution: int,
            vision_layers: Union[Tuple[int, int, int, int], int],
            vision_width: int,
            vision_patch_size: int,
            # text
            context_length: int,
            vocab_size: int,
            transformer_width: int,
            transformer_heads: int,
            transformer_layers: int):
        """Create both towers.

        Args:
            embed_dim: Width of the shared image/text embedding space.
            image_resolution: Input image side length.
            vision_layers: A tuple/list selects ``ModifiedResNet`` (blocks
                per stage); an int selects ``VisionTransformer`` (depth).
            vision_width: Base width of the vision tower.
            vision_patch_size: Patch size for the ``VisionTransformer``.
            context_length: Number of text token positions.
            vocab_size: Size of the token embedding table.
            transformer_width: Width of the text transformer.
            transformer_heads: Heads in the text transformer.
            transformer_layers: Depth of the text transformer.
        """
        super().__init__()

        # Must be set first: ``build_attention_mask`` reads it.
        self.context_length = context_length

        use_resnet = isinstance(vision_layers, (tuple, list))
        if use_resnet:
            vision_heads = vision_width * 32 // 64
            self.visual = ModifiedResNet(
                layers=vision_layers,
                output_dim=embed_dim,
                heads=vision_heads,
                input_resolution=image_resolution,
                width=vision_width)
        else:
            vision_heads = vision_width // 64
            self.visual = VisionTransformer(
                input_resolution=image_resolution,
                patch_size=vision_patch_size,
                width=vision_width,
                layers=vision_layers,
                heads=vision_heads,
                output_dim=embed_dim)

        text_mask = self.build_attention_mask()
        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=text_mask)

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(
            torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.text_projection = nn.Parameter(
            torch.empty(transformer_width, embed_dim))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.initialize_parameters()

    def initialize_parameters(self):
        """Initialise embeddings, the text transformer and ResNet extras."""
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        if isinstance(self.visual, ModifiedResNet):
            attnpool = self.visual.attnpool
            if attnpool is not None:
                pool_std = attnpool.c_proj.in_features**-0.5
                for linear in (attnpool.q_proj, attnpool.k_proj,
                               attnpool.v_proj, attnpool.c_proj):
                    nn.init.normal_(linear.weight, std=pool_std)

            # Zero the scale of the last BatchNorm in every bottleneck.
            stages = (self.visual.layer1, self.visual.layer2,
                      self.visual.layer3, self.visual.layer4)
            for stage in stages:
                for name, param in stage.named_parameters():
                    if name.endswith('bn3.weight'):
                        nn.init.zeros_(param)

        width = self.transformer.width
        depth = self.transformer.layers
        proj_std = (width**-0.5) * ((2 * depth)**-0.5)
        attn_std = width**-0.5
        fc_std = (2 * width)**-0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(
                self.text_projection, std=self.transformer.width**-0.5)

    def build_attention_mask(self):
        """Return the additive causal mask for the text transformer.

        The mask is ``context_length`` x ``context_length``: ``-inf``
        strictly above the diagonal and zero on and below it.
        """
        side = self.context_length
        mask = torch.full((side, side), float('-inf'))
        mask.triu_(1)  # keep -inf only above the diagonal
        return mask

    @property
    def dtype(self):
        """Dtype of the vision tower's first convolution weight."""
        return self.visual.conv1.weight.dtype

    def encode_image(self, image):
        """Cast ``image`` to the model dtype and run the vision tower."""
        return self.visual(image.type(self.dtype))

    def encode_text(self, text):
        """Embed token ids and project the feature at each row's argmax id."""
        model_dtype = self.dtype
        x = self.token_embedding(text).type(model_dtype)

        x = x + self.positional_embedding.type(model_dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(model_dtype)

        # Pick, per sequence, the position holding the largest token id.
        rows = torch.arange(x.shape[0])
        picked = x[rows, text.argmax(dim=-1)]
        return picked @ self.text_projection

    def forward(self, image, text):
        """Return ``(logits_per_image, logits_per_text)`` for a batch pair."""
        image_features = self.encode_image(image)
        text_features = self.encode_text(text)

        # Normalise so the dot product below is a cosine similarity.
        image_norm = image_features.norm(dim=1, keepdim=True)
        image_features = image_features / image_norm
        text_norm = text_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_norm

        logit_scale = self.logit_scale.exp()
        scaled_image = logit_scale * image_features
        logits_per_image = scaled_image @ text_features.t()
        logits_per_text = logits_per_image.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text
def convert_weights(model: nn.Module):
    """Convert applicable parameters of ``model`` to fp16, in place.

    Affected tensors: weights and biases of ``Conv1d``/``Conv2d``/``Linear``
    layers, the projection weights and biases of ``MultiheadAttention``,
    and any non-``None`` ``text_projection`` / ``proj`` attribute.
    """
    mha_tensor_names = (
        'in_proj_weight',
        'q_proj_weight',
        'k_proj_weight',
        'v_proj_weight',
        'in_proj_bias',
        'bias_k',
        'bias_v',
    )

    def _convert_weights_to_fp16(ll):
        if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            ll.weight.data = ll.weight.data.half()
            if ll.bias is not None:
                ll.bias.data = ll.bias.data.half()

        if isinstance(ll, nn.MultiheadAttention):
            for tensor_name in mha_tensor_names:
                tensor = getattr(ll, tensor_name)
                if tensor is None:
                    continue
                tensor.data = tensor.data.half()

        for name in ('text_projection', 'proj'):
            attr = getattr(ll, name, None)
            if attr is None:
                continue
            attr.data = attr.data.half()

    model.apply(_convert_weights_to_fp16)
def build_model():
    """Build the fixed-configuration CLIP model in fp16 and eval mode.

    An integer ``vision_layers`` makes ``CLIP`` use its
    ``VisionTransformer`` image tower.
    """
    model = CLIP(
        embed_dim=512,
        image_resolution=224,
        vision_layers=12,
        vision_width=768,
        vision_patch_size=32,
        context_length=77,
        vocab_size=49408,
        transformer_width=512,
        transformer_heads=8,
        transformer_layers=12,
    )
    convert_weights(model)
    model = model.eval()
    return model
| @@ -0,0 +1,156 @@ | |||||
| """ CLIP | |||||
| Adapted from https://github.com/openai/CLIP. | |||||
| Originally MIT License, Copyright (c) 2021 OpenAI. | |||||
| """ | |||||
| import gzip | |||||
| import html | |||||
| import os | |||||
| from functools import lru_cache | |||||
| import ftfy | |||||
| import regex as re | |||||
@lru_cache()
def default_bpe():
    """Return the path of ``bpe_simple_vocab_16e6.txt.gz`` beside this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, 'bpe_simple_vocab_16e6.txt.gz')
@lru_cache()
def bytes_to_unicode():
    """Return a dict mapping each byte value (0-255) to a unicode character.

    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    The table also avoids mapping to whitespace/control characters the bpe code barfs on.

    Bytes in the ranges ``!``-``~``, ``¡``-``¬`` and ``®``-``ÿ`` map to the
    character with the same code point; every other byte is assigned the
    next free code point starting at 256.
    """
    byte_values = [
        *range(ord('!'), ord('~') + 1),
        *range(ord('¡'), ord('¬') + 1),
        *range(ord('®'), ord('ÿ') + 1),
    ]
    code_points = list(byte_values)
    kept_as_is = frozenset(byte_values)

    shifted = 0
    for byte in range(2**8):
        if byte in kept_as_is:
            continue
        byte_values.append(byte)
        code_points.append(2**8 + shifted)
        shifted += 1

    return dict(zip(byte_values, map(chr, code_points)))
def get_pairs(word):
    """Return the set of adjacent symbol pairs in ``word``.

    ``word`` is a sequence of symbols (symbols being variable-length
    strings). Indexing an empty ``word`` raises ``IndexError``.
    """
    leading = word[0]
    followers = word[1:]
    # Pair every symbol with its successor: (w0, w1), (w1, w2), ...
    predecessors = (leading, ) + tuple(followers[:-1])
    return set(zip(predecessors, followers))
def basic_clean(text):
    """Run ``ftfy.fix_text``, unescape HTML entities twice, and strip."""
    fixed = ftfy.fix_text(text)
    # Two passes so doubly-escaped entities are unescaped as well.
    unescaped = html.unescape(html.unescape(fixed))
    return unescaped.strip()
def whitespace_clean(text):
    """Collapse each whitespace run to one space and strip both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer backed by a gzip-compressed merges file."""

    def __init__(self, bpe_path: str = default_bpe()):
        """Load the merge table and build the encoder/decoder vocabularies.

        Args:
            bpe_path: Path to the gzip-compressed merges file. The first
                line is skipped and at most ``49152 - 256 - 2`` merges
                are read.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        # Read inside a context manager so the file handle is closed.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode('utf-8').split('\n')
        merges = merges[1:49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]

        # Vocabulary order: byte symbols, byte symbols with the
        # end-of-word marker, merged symbols, then the two special tokens.
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + '</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])

        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # Special tokens are pre-seeded so ``bpe`` returns them unsplit.
        self.cache = {
            '<|startoftext|>': '<|startoftext|>',
            '<|endoftext|>': '<|endoftext|>'
        }
        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE)

    def bpe(self, token):
        """Apply the learned merges to ``token``.

        Args:
            token: A non-empty string of byte-encoded characters.

        Returns:
            The merged symbols joined by single spaces; the last symbol
            carries the ``</w>`` end-of-word marker. Results are memoised
            in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]

        word = tuple(token[:-1]) + (token[-1] + '</w>', )
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            # Merge the best-ranked (lowest rank number) pair first.
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram

            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # ``first`` does not occur again: copy the remainder.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if (word[i] == first and i < len(word) - 1
                        and word[i + 1] == second):
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1

            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)

        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case ``text`` and return its list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Map the token's UTF-8 bytes to their printable stand-ins.
            token = ''.join(self.byte_encoder[b]
                            for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token]
                              for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text.

        Each ``</w>`` marker becomes a space; bytes that do not form valid
        UTF-8 are replaced instead of raising.
        """
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors='replace').replace('</w>', ' ')
        return text
| @@ -0,0 +1,24 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
if TYPE_CHECKING:
    # Static-analysis-only import. It must name the same symbol that
    # ``_import_structure`` exports at runtime.
    # NOTE(review): confirm ``TinynasDetector`` is the class name defined
    # in tinynas_detector.py.
    from .tinynas_detector import TinynasDetector

else:
    # module name -> public names that LazyImportModule resolves on demand
    _import_structure = {
        'tinynas_detector': ['TinynasDetector'],
    }

    import sys

    # Replace this package in ``sys.modules`` with the lazy proxy.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
| @@ -0,0 +1,16 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import copy | |||||
| from .darknet import CSPDarknet | |||||
| from .tinynas import load_tinynas_net | |||||
def build_backbone(cfg):
    """Instantiate the backbone described by ``cfg``.

    Args:
        cfg: Mapping with a ``'name'`` key selecting the backbone. It is
            deep-copied first, so the caller's object is not modified.
            The remaining entries are passed to ``CSPDarknet`` as keyword
            arguments, or to ``load_tinynas_net`` as one mapping.

    Returns:
        The constructed backbone.

    Raises:
        ValueError: If ``cfg['name']`` is not a supported backbone. An
            unknown name used to return ``None`` silently.
    """
    backbone_cfg = copy.deepcopy(cfg)
    name = backbone_cfg.pop('name')
    if name == 'CSPDarknet':
        return CSPDarknet(**backbone_cfg)
    if name == 'TinyNAS':
        return load_tinynas_net(backbone_cfg)
    raise ValueError(
        f"Unsupported backbone name {name!r}; "
        "expected 'CSPDarknet' or 'TinyNAS'.")
| @@ -0,0 +1,126 @@ | |||||
| # Copyright (c) Megvii Inc. All rights reserved. | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import torch | |||||
| from torch import nn | |||||
| from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer, | |||||
| SPPBottleneck) | |||||
class CSPDarknet(nn.Module):
    """Backbone made of a ``Focus`` stem followed by four ``dark`` stages.

    Each stage starts with a stride-2 3x3 convolution that doubles the
    channel count and ends with a ``CSPLayer``; ``dark5`` additionally
    inserts an ``SPPBottleneck`` between the two.

    Args:
        dep_mul: depth multiplier; the base CSP depth is
            ``max(round(dep_mul * 3), 1)``.
        wid_mul: width multiplier; the stem emits ``int(wid_mul * 64)``
            channels.
        out_features: must be non-empty; stored on the instance (the
            forward pass in this class always returns all five maps).
        depthwise: use ``DWConv`` instead of ``BaseConv`` for the stride-2
            convolutions and inside the CSP layers.
        act: activation name forwarded to every sub-block.
        reparam: forwarded to every ``CSPLayer``.
    """

    def __init__(
            self,
            dep_mul,
            wid_mul,
            out_features=('dark3', 'dark4', 'dark5'),
            depthwise=False,
            act='silu',
            reparam=False,
    ):
        super().__init__()
        assert out_features, 'please provide output features of Darknet'
        self.out_features = out_features

        down_conv = DWConv if depthwise else BaseConv
        width = int(wid_mul * 64)
        depth = max(round(dep_mul * 3), 1)
        # Keyword arguments shared by every CSPLayer in the backbone.
        csp_common = dict(depthwise=depthwise, act=act, reparam=reparam)

        # stem
        self.stem = Focus(3, width, 3, act=act)

        # dark2
        self.dark2 = nn.Sequential(
            down_conv(width, width * 2, 3, 2, act=act),
            CSPLayer(width * 2, width * 2, n=depth, **csp_common),
        )

        # dark3
        self.dark3 = nn.Sequential(
            down_conv(width * 2, width * 4, 3, 2, act=act),
            CSPLayer(width * 4, width * 4, n=depth * 3, **csp_common),
        )

        # dark4
        self.dark4 = nn.Sequential(
            down_conv(width * 4, width * 8, 3, 2, act=act),
            CSPLayer(width * 8, width * 8, n=depth * 3, **csp_common),
        )

        # dark5
        self.dark5 = nn.Sequential(
            down_conv(width * 8, width * 16, 3, 2, act=act),
            SPPBottleneck(width * 16, width * 16, activation=act),
            CSPLayer(
                width * 16,
                width * 16,
                n=depth,
                shortcut=False,
                **csp_common),
        )

    def init_weights(self, pretrain=None):
        """Load backbone weights from the checkpoint file ``pretrain``.

        Does nothing when ``pretrain`` is ``None``. Otherwise the
        checkpoint's ``state_dict`` entries are merged over a copy of the
        current state dict, skipping keys whose first component is ``fc``
        or whose last component is ``total_ops`` / ``total_params``.
        """
        if pretrain is None:
            return

        # NOTE: torch.load unpickles the file; only use trusted checkpoints.
        checkpoint = torch.load(pretrain, map_location='cpu')
        merged = self.state_dict().copy()
        for key, value in checkpoint['state_dict'].items():
            parts = key.split('.')
            if parts[0] == 'fc' or parts[-1] in ('total_ops',
                                                 'total_params'):
                continue
            merged[key] = value

        self.load_state_dict(merged)
        print(f' load pretrain backbone from {pretrain}')

    def forward(self, x):
        """Run all stages and return ``[stem, dark2, dark3, dark4, dark5]``."""
        features_out = []
        stages = (self.stem, self.dark2, self.dark3, self.dark4, self.dark5)
        for stage in stages:
            x = stage(x)
            features_out.append(x)
        return features_out
| @@ -0,0 +1,347 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from ..core.base_ops import Focus, SPPBottleneck, get_activation | |||||
| from ..core.repvgg_block import RepVggBlock | |||||
class ConvKXBN(nn.Module):
    """Bias-free KxK convolution followed by batch normalisation.

    The convolution uses ``(kernel_size - 1) // 2`` padding and a single
    group; no activation is applied.
    """

    def __init__(self, in_c, out_c, kernel_size, stride):
        super().__init__()
        same_pad = (kernel_size - 1) // 2
        self.conv1 = nn.Conv2d(
            in_c, out_c, kernel_size, stride, same_pad, groups=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_c)

    def forward(self, x):
        """Return ``bn1(conv1(x))``."""
        convolved = self.conv1(x)
        return self.bn1(convolved)
class ConvKXBNRELU(nn.Module):
    """``ConvKXBN`` followed by an activation.

    Args:
        in_c, out_c, kernel_size, stride: forwarded to ``ConvKXBN``.
        act: activation name passed to ``get_activation``; ``None`` selects
            ``torch.relu``.
    """

    def __init__(self, in_c, out_c, kernel_size, stride, act='silu'):
        super().__init__()
        self.conv = ConvKXBN(in_c, out_c, kernel_size, stride)
        self.activation_function = (
            torch.relu if act is None else get_activation(act))

    def forward(self, x):
        """Apply the conv+BN block, then the activation."""
        return self.activation_function(self.conv(x))
class ResConvK1KX(nn.Module):
    """Residual unit: 1x1 ``ConvKXBN`` -> activation -> KxK ``RepVggBlock``.

    The shortcut (``residual_downsample`` then ``residual_proj``) is only
    added when ``stride != 2``; a stride-2 unit returns the convolution
    path alone. The shortcut modules are still created for every stride.

    Args:
        in_c: input channels.
        out_c: output channels.
        btn_c: bottleneck channels between the two convolutions.
        kernel_size: kernel size of the second convolution.
        stride: stride of the second convolution.
        force_resproj: use a 1x1 projection on the shortcut even when
            ``in_c == out_c``.
        act: activation name for ``get_activation``; ``None`` selects
            ``torch.relu``.
    """

    def __init__(self,
                 in_c,
                 out_c,
                 btn_c,
                 kernel_size,
                 stride,
                 force_resproj=False,
                 act='silu'):
        super().__init__()
        self.stride = stride
        self.conv1 = ConvKXBN(in_c, btn_c, 1, 1)
        self.conv2 = RepVggBlock(
            btn_c, out_c, kernel_size, stride, act='identity')

        self.activation_function = (
            torch.relu if act is None else get_activation(act))

        self.residual_downsample = (
            nn.AvgPool2d(kernel_size=2, stride=2)
            if stride == 2 else nn.Identity())

        self.residual_proj = (
            ConvKXBN(in_c, out_c, 1, 1)
            if in_c != out_c or force_resproj else nn.Identity())

    def forward(self, x):
        """Run the unit; the shortcut is added only for non-stride-2 units."""
        with_shortcut = self.stride != 2
        if with_shortcut:
            shortcut = self.residual_proj(self.residual_downsample(x))

        out = self.activation_function(self.conv1(x))
        out = self.conv2(out)
        if with_shortcut:
            out = out + shortcut
        return self.activation_function(out)
class SuperResConvK1KX(nn.Module):
    """Stack of ``num_blocks`` ``ResConvK1KX`` units.

    Only the first unit changes the channel count (``in_c`` -> ``out_c``)
    and uses ``stride``; the rest map ``out_c`` -> ``out_c`` with stride 1.
    With ``with_spp`` an ``SPPBottleneck`` is inserted right after the
    first unit.

    Args:
        in_c, out_c, btn_c, kernel_size: channel / kernel configuration
            shared by all units.
        stride: stride of the first unit.
        num_blocks: number of ``ResConvK1KX`` units.
        with_spp: insert the SPP bottleneck after the first unit.
        act: activation name; ``None`` selects ``torch.relu``.
    """

    def __init__(self,
                 in_c,
                 out_c,
                 btn_c,
                 kernel_size,
                 stride,
                 num_blocks,
                 with_spp=False,
                 act='silu'):
        super().__init__()
        self.act = torch.relu if act is None else get_activation(act)
        self.block_list = nn.ModuleList()
        for block_id in range(num_blocks):
            first = block_id == 0
            unit = ResConvK1KX(
                in_c if first else out_c,
                out_c,
                btn_c,
                kernel_size,
                stride if first else 1,
                False,  # as a part of CSPLayer, DO NOT need force_resproj
                act=act)
            self.block_list.append(unit)
            if first and with_spp:
                self.block_list.append(SPPBottleneck(out_c, out_c))

    def forward(self, x):
        """Feed ``x`` through every entry of ``block_list`` in order."""
        result = x
        for layer in self.block_list:
            result = layer(result)
        return result
class ResConvKXKX(nn.Module):
    """Residual unit with two KxK convolutions, or a plain downsampler.

    With ``stride == 2`` the unit is a single 3x3 stride-2 ``ConvKXBNRELU``.
    Otherwise it is KxK ``ConvKXBN`` -> activation -> KxK ``RepVggBlock``
    with a shortcut added before the final activation.

    Args:
        in_c: input channels.
        out_c: output channels.
        btn_c: bottleneck channels between the two convolutions.
        kernel_size: kernel size of both residual-path convolutions.
        stride: unit stride.
        force_resproj: use a 1x1 projection on the shortcut even when
            ``in_c == out_c``.
        act: activation name; ``None`` selects ``torch.relu``.
    """

    def __init__(self,
                 in_c,
                 out_c,
                 btn_c,
                 kernel_size,
                 stride,
                 force_resproj=False,
                 act='silu'):
        super().__init__()
        self.stride = stride
        if self.stride == 2:
            self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act)
            return

        self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1)
        self.conv2 = RepVggBlock(
            btn_c, out_c, kernel_size, stride, act='identity')

        self.activation_function = (
            torch.relu if act is None else get_activation(act))

        # The stride-2 case returned above, so this selects the identity.
        self.residual_downsample = (
            nn.AvgPool2d(kernel_size=2, stride=2)
            if stride == 2 else nn.Identity())

        self.residual_proj = (
            ConvKXBN(in_c, out_c, 1, 1)
            if in_c != out_c or force_resproj else nn.Identity())

    def forward(self, x):
        """Downsample for stride 2, otherwise run the residual path."""
        if self.stride == 2:
            return self.downsampler(x)

        shortcut = self.residual_proj(self.residual_downsample(x))
        out = self.activation_function(self.conv1(x))
        out = self.conv2(out) + shortcut
        return self.activation_function(out)
class SuperResConvKXKX(nn.Module):
    """Stack of ``num_blocks`` ``ResConvKXKX`` units.

    Only the first unit changes the channel count (``in_c`` -> ``out_c``)
    and uses ``stride``; the rest map ``out_c`` -> ``out_c`` with stride 1.
    With ``with_spp`` an ``SPPBottleneck`` is inserted right after the
    first unit.

    Args:
        in_c, out_c, btn_c, kernel_size: channel / kernel configuration
            shared by all units.
        stride: stride of the first unit.
        num_blocks: number of ``ResConvKXKX`` units.
        with_spp: insert the SPP bottleneck after the first unit.
        act: activation name; ``None`` selects ``torch.relu``.
    """

    def __init__(self,
                 in_c,
                 out_c,
                 btn_c,
                 kernel_size,
                 stride,
                 num_blocks,
                 with_spp=False,
                 act='silu'):
        super().__init__()
        self.act = torch.relu if act is None else get_activation(act)
        self.block_list = nn.ModuleList()
        for block_id in range(num_blocks):
            first = block_id == 0
            unit = ResConvKXKX(
                in_c if first else out_c,
                out_c,
                btn_c,
                kernel_size,
                stride if first else 1,
                False,  # as a part of CSPLayer, DO NOT need force_resproj
                act=act)
            self.block_list.append(unit)
            if first and with_spp:
                self.block_list.append(SPPBottleneck(out_c, out_c))

    def forward(self, x):
        """Feed ``x`` through every entry of ``block_list`` in order."""
        result = x
        for layer in self.block_list:
            result = layer(result)
        return result
class TinyNAS(nn.Module):
    """Backbone assembled from a list of block descriptions.

    Each entry of ``structure_info`` is a dict whose ``class`` key selects
    the block type (``ConvKXBNRELU``, ``SuperResConvK1KX`` or
    ``SuperResConvKXKX``); the keys ``in``, ``out``, ``k``, ``s`` and, for
    the super blocks, ``btn`` and ``L`` configure it.

    Args:
        structure_info: iterable of block description dicts.
        out_indices: indices of the blocks whose outputs are returned.
        out_channels: per entry of ``out_indices``, the channel count of an
            optional 1x1 output convolution, or ``None`` for no
            convolution. Must have the same length as ``out_indices``.
        with_spp: add an SPP bottleneck to the last block when it is a
            super block.
        use_focus: build ``ConvKXBNRELU`` entries as ``Focus`` instead.
            NOTE(review): in that case the entry's stride and ``act`` are
            not forwarded to ``Focus`` -- confirm this is intended.
        need_conv1: create and apply the 1x1 output convolutions.
        act: activation name forwarded to the blocks.

    Raises:
        ValueError: if a block description names an unsupported class.
            Such entries used to be skipped silently, which left
            ``conv1_list`` out of step with ``block_list``.
    """

    def __init__(self,
                 structure_info=None,
                 out_indices=[0, 1, 2, 4, 5],
                 out_channels=[None, None, 128, 256, 512],
                 with_spp=False,
                 use_focus=False,
                 need_conv1=True,
                 act='silu'):
        super(TinyNAS, self).__init__()
        assert len(out_indices) == len(out_channels)
        # Copy so instances never alias the shared default lists.
        self.out_indices = list(out_indices)
        out_channels = list(out_channels)
        self.need_conv1 = need_conv1

        self.block_list = nn.ModuleList()
        if need_conv1:
            self.conv1_list = nn.ModuleList()

        for idx, block_info in enumerate(structure_info):
            block_class = block_info['class']
            # Only the final block may receive the SPP bottleneck.
            spp = with_spp if idx == len(structure_info) - 1 else False

            if block_class == 'ConvKXBNRELU':
                if use_focus:
                    the_block = Focus(block_info['in'], block_info['out'],
                                      block_info['k'])
                else:
                    the_block = ConvKXBNRELU(
                        block_info['in'],
                        block_info['out'],
                        block_info['k'],
                        block_info['s'],
                        act=act)
            elif block_class == 'SuperResConvK1KX':
                the_block = SuperResConvK1KX(
                    block_info['in'],
                    block_info['out'],
                    block_info['btn'],
                    block_info['k'],
                    block_info['s'],
                    block_info['L'],
                    spp,
                    act=act)
            elif block_class == 'SuperResConvKXKX':
                the_block = SuperResConvKXKX(
                    block_info['in'],
                    block_info['out'],
                    block_info['btn'],
                    block_info['k'],
                    block_info['s'],
                    block_info['L'],
                    spp,
                    act=act)
            else:
                raise ValueError(
                    'Unsupported block class at index {}: {!r}'.format(
                        idx, block_class))
            self.block_list.append(the_block)

            if need_conv1:
                conv1 = None
                if idx in self.out_indices:
                    channels = out_channels[self.out_indices.index(idx)]
                    if channels is not None:
                        conv1 = nn.Conv2d(block_info['out'], channels, 1)
                # Keep conv1_list index-aligned with block_list.
                self.conv1_list.append(conv1)

    def init_weights(self, pretrain=None):
        """No-op; present for interface parity with other backbones."""
        pass

    def forward(self, x):
        """Return the outputs of the blocks listed in ``out_indices``.

        When ``need_conv1`` is set and a block has a 1x1 output
        convolution, the convolved tensor is returned for that block in
        place of the raw output; the raw output still feeds the next block.
        """
        output = x
        stage_feature_list = []
        for idx, block in enumerate(self.block_list):
            output = block(output)
            if idx not in self.out_indices:
                continue
            if self.need_conv1 and self.conv1_list[idx] is not None:
                stage_feature_list.append(self.conv1_list[idx](output))
            else:
                stage_feature_list.append(output)
        return stage_feature_list
def load_tinynas_net(backbone_cfg):
    """Build a ``TinyNAS`` backbone from ``backbone_cfg``.

    The items of ``backbone_cfg.net_structure_str`` are stripped and
    concatenated into one Python literal, which is parsed with
    ``ast.literal_eval`` into a list of layer dicts. The ``nbitsA`` and
    ``nbitsW`` keys are dropped from every layer before the list is handed
    to ``TinyNAS`` together with the remaining config attributes.

    Args:
        backbone_cfg: object exposing ``net_structure_str``,
            ``out_indices``, ``out_channels``, ``with_spp``, ``use_focus``,
            ``act`` and ``need_conv1`` as attributes.

    Returns:
        TinyNAS: the constructed backbone.
    """
    import ast

    fragments = (piece.strip() for piece in backbone_cfg.net_structure_str)
    struct_info = ast.literal_eval(''.join(fragments))

    for layer in struct_info:
        for unused_key in ('nbitsA', 'nbitsW'):
            if unused_key in layer:
                del layer[unused_key]

    return TinyNAS(
        structure_info=struct_info,
        out_indices=backbone_cfg.out_indices,
        out_channels=backbone_cfg.out_channels,
        with_spp=backbone_cfg.with_spp,
        use_focus=backbone_cfg.use_focus,
        act=backbone_cfg.act,
        need_conv1=backbone_cfg.need_conv1,
    )
| @@ -0,0 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| @@ -0,0 +1,474 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import math | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from .repvgg_block import RepVggBlock | |||||
class SiLU(nn.Module):
    """Export-friendly version of nn.SiLU(): ``x * sigmoid(x)``."""

    @staticmethod
    def forward(x):
        """Return ``x`` multiplied element-wise by its sigmoid."""
        gate = torch.sigmoid(x)
        return x * gate
def get_activation(name='silu', inplace=True):
    """Create the activation module called ``name``.

    Args:
        name: ``'silu'``, ``'relu'`` or ``'lrelu'`` (LeakyReLU with
            negative slope 0.1).
        inplace: forwarded to the activation constructor.

    Returns:
        nn.Module: the activation module.

    Raises:
        AttributeError: if ``name`` is not one of the supported names.
    """
    if name == 'silu':
        return nn.SiLU(inplace=inplace)
    if name == 'relu':
        return nn.ReLU(inplace=inplace)
    if name == 'lrelu':
        return nn.LeakyReLU(0.1, inplace=inplace)
    raise AttributeError('Unsupported act type: {}'.format(name))
def get_norm(name, out_channels, inplace=True):
    """Create the normalisation layer called ``name``.

    Args:
        name: ``'bn'`` for ``nn.BatchNorm2d`` or ``'gn'`` for a 32-group
            ``nn.GroupNorm``.
        out_channels: number of channels to normalise.
        inplace: accepted for signature compatibility; not used.

    Returns:
        nn.Module: the normalisation layer.

    Raises:
        AttributeError: if ``name`` is not supported. Previously this case
            failed with an ``UnboundLocalError`` on the return statement.
    """
    if name == 'bn':
        return nn.BatchNorm2d(out_channels)
    if name == 'gn':
        return nn.GroupNorm(num_channels=out_channels, num_groups=32)
    # Same exception type and wording style as get_activation.
    raise AttributeError('Unsupported norm type: {}'.format(name))
class BaseConv(nn.Module):
    """A Conv2d -> optional norm -> optional activation block.

    The convolution uses ``(ksize - 1) // 2`` padding.

    Args:
        in_channels: input channels.
        out_channels: output channels.
        ksize: kernel size.
        stride: convolution stride.
        groups: convolution groups.
        bias: whether the convolution has a bias term.
        act: activation name for ``get_activation``; ``None`` disables the
            activation.
        norm: norm name for ``get_norm``; ``None`` disables normalisation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride=1,
                 groups=1,
                 bias=False,
                 act='silu',
                 norm='bn'):
        super().__init__()
        # same padding
        pad = (ksize - 1) // 2
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            groups=groups,
            bias=bias,
        )
        self.with_norm = norm is not None
        self.with_act = act is not None
        # The ``bn`` / ``act`` attributes only exist when enabled.
        if self.with_norm:
            self.bn = get_norm(norm, out_channels, inplace=True)
        if self.with_act:
            self.act = get_activation(act, inplace=True)

    def forward(self, x):
        """Apply the convolution, then norm and activation when enabled."""
        x = self.conv(x)
        if self.with_norm:
            x = self.bn(x)
        if self.with_act:
            x = self.act(x)
        return x

    def fuseforward(self, x):
        """Forward pass that skips the norm layer.

        The activation is only applied when the block has one; this used
        to raise ``AttributeError`` for blocks built with ``act=None``.
        """
        x = self.conv(x)
        return self.act(x) if self.with_act else x
class DepthWiseConv(nn.Module):
    """Depthwise KxK convolution followed by a pointwise 1x1 convolution.

    ``forward`` applies the layers named in ``self.order``: depthwise conv,
    depthwise norm, pointwise conv, activation.

    NOTE(review): ``pwnorm`` is created but is not part of ``self.order``,
    so it is never applied -- confirm whether that is intended before
    changing it, since doing so would alter the outputs of trained models.

    Args:
        in_channels: input channels (also the depthwise group count).
        out_channels: output channels of the pointwise convolution.
        ksize: depthwise kernel size.
        stride: depthwise stride.
        groups: accepted for signature compatibility; not used.
        bias: whether both convolutions have a bias term.
        act: activation name; ``None`` disables the activation.
        norm: norm name; ``None`` disables the norm layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride=1,
                 groups=None,
                 bias=False,
                 act='silu',
                 norm='bn'):
        super().__init__()
        padding = (ksize - 1) // 2
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=ksize,
            stride=stride,
            padding=padding,
            groups=in_channels,
            bias=bias,
        )

        self.pointwise = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias)
        if norm is not None:
            self.dwnorm = get_norm(norm, in_channels, inplace=True)
            self.pwnorm = get_norm(norm, out_channels, inplace=True)

        if act is not None:
            self.act = get_activation(act, inplace=True)

        self.with_norm = norm is not None
        self.with_act = act is not None
        self.order = ['depthwise', 'dwnorm', 'pointwise', 'act']

    def forward(self, x):
        """Apply every layer named in ``self.order`` that exists."""
        for layer_name in self.order:
            # ``dwnorm`` / ``act`` are absent when norm / act is None; a
            # direct ``__getattr__`` call raised AttributeError for them.
            layer = getattr(self, layer_name, None)
            if layer is not None:
                x = layer(x)
        return x
class DWConv(nn.Module):
    """Depthwise ``BaseConv`` followed by a pointwise 1x1 ``BaseConv``.

    Both convolutions carry their own norm and activation, as provided by
    ``BaseConv``.
    """

    def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'):
        super().__init__()
        # One group per input channel makes the first conv depthwise.
        self.dconv = BaseConv(
            in_channels,
            in_channels,
            ksize=ksize,
            stride=stride,
            groups=in_channels,
            act=act)
        self.pconv = BaseConv(
            in_channels, out_channels, ksize=1, stride=1, groups=1, act=act)

    def forward(self, x):
        """Apply the depthwise block, then the pointwise block."""
        depthwise_out = self.dconv(x)
        return self.pconv(depthwise_out)
class Bottleneck(nn.Module):
    """Standard bottleneck: two convolutions with an optional shortcut.

    Args:
        in_channels: input channels.
        out_channels: output channels.
        shortcut: add the input to the output; only takes effect when
            ``in_channels == out_channels``.
        expansion: hidden width as a fraction of ``out_channels``.
        depthwise: use ``DWConv`` for the second convolution (ignored when
            ``reparam`` is set).
        act: activation name forwarded to both convolutions.
        reparam: use a 3x3 first convolution and a ``RepVggBlock`` as the
            second one.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            shortcut=True,
            expansion=0.5,
            depthwise=False,
            act='silu',
            reparam=False,
    ):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        first_ksize = 3 if reparam else 1
        self.conv1 = BaseConv(
            in_channels, hidden_channels, first_ksize, stride=1, act=act)

        if reparam:
            second_conv = RepVggBlock
        elif depthwise:
            second_conv = DWConv
        else:
            second_conv = BaseConv
        self.conv2 = second_conv(
            hidden_channels, out_channels, 3, stride=1, act=act)

        self.use_add = shortcut and in_channels == out_channels

    def forward(self, x):
        """Run both convolutions; add ``x`` back when the shortcut is on."""
        out = self.conv2(self.conv1(x))
        return out + x if self.use_add else out
class ResLayer(nn.Module):
    """Residual layer with `in_channels` inputs.

    A 1x1 convolution halves the channels, a 3x3 convolution restores
    them, and the result is added to the input. Both convolutions use the
    ``lrelu`` activation.
    """

    def __init__(self, in_channels: int):
        super().__init__()
        half = in_channels // 2
        self.layer1 = BaseConv(
            in_channels, half, ksize=1, stride=1, act='lrelu')
        self.layer2 = BaseConv(
            half, in_channels, ksize=3, stride=1, act='lrelu')

    def forward(self, x):
        """Return ``x + layer2(layer1(x))``."""
        squeezed = self.layer1(x)
        restored = self.layer2(squeezed)
        return x + restored
class SPPBottleneck(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP.

    A 1x1 convolution halves the channels; the result is concatenated with
    one stride-1 max-pooled copy per entry of ``kernel_sizes`` and fused by
    a second 1x1 convolution.

    Args:
        in_channels: input channels.
        out_channels: output channels.
        kernel_sizes: max-pool kernel sizes; each pool pads by ``ks // 2``.
        activation: activation name for both convolutions.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_sizes=(5, 9, 13),
                 activation='silu'):
        super().__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, hidden_channels, 1, stride=1, act=activation)

        pools = []
        for ks in kernel_sizes:
            pools.append(
                nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2))
        self.m = nn.ModuleList(pools)

        # One branch per pool plus the un-pooled branch.
        fused_channels = hidden_channels * (len(kernel_sizes) + 1)
        self.conv2 = BaseConv(
            fused_channels, out_channels, 1, stride=1, act=activation)

    def forward(self, x):
        """Reduce, pool at every scale, concatenate on dim 1 and fuse."""
        reduced = self.conv1(x)
        branches = [reduced]
        for pool in self.m:
            branches.append(pool(reduced))
        return self.conv2(torch.cat(branches, dim=1))
class CSPLayer(nn.Module):
    """C3 in yolov5, CSP Bottleneck with 3 convolutions.

    Two 1x1 convolutions split the input into two branches; one branch
    goes through ``n`` bottlenecks, then both are concatenated on the
    channel axis and fused by a third 1x1 convolution.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            n=1,
            shortcut=True,
            expansion=0.5,
            depthwise=False,
            act='silu',
            reparam=False,
    ):
        """
        Args:
            in_channels (int): input channels.
            out_channels (int): output channels.
            n (int): number of Bottlenecks. Default value: 1.
            shortcut (bool): forwarded to every Bottleneck.
            expansion (float): branch width as a fraction of
                ``out_channels``.
            depthwise (bool): forwarded to every Bottleneck.
            act (str): activation name for all convolutions.
            reparam (bool): forwarded to every Bottleneck.
        """
        super().__init__()
        branch_channels = int(out_channels * expansion)
        self.conv1 = BaseConv(
            in_channels, branch_channels, 1, stride=1, act=act)
        self.conv2 = BaseConv(
            in_channels, branch_channels, 1, stride=1, act=act)
        self.conv3 = BaseConv(
            2 * branch_channels, out_channels, 1, stride=1, act=act)

        bottlenecks = []
        for _ in range(n):
            # The bottlenecks keep the branch width (expansion 1.0).
            bottlenecks.append(
                Bottleneck(
                    branch_channels,
                    branch_channels,
                    shortcut,
                    1.0,
                    depthwise,
                    act=act,
                    reparam=reparam))
        self.m = nn.Sequential(*bottlenecks)

    def forward(self, x):
        """Split, process the first branch, concatenate and fuse."""
        main_branch = self.conv1(x)
        side_branch = self.conv2(x)
        main_branch = self.m(main_branch)
        fused = torch.cat((main_branch, side_branch), dim=1)
        return self.conv3(fused)
class Focus(nn.Module):
    """Focus width and height information into channel space.

    The input is sampled at every second position of its last two axes in
    four phase-shifted patterns; the four patches are concatenated on the
    channel axis and passed through one ``BaseConv``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=1,
                 stride=1,
                 act='silu'):
        super().__init__()
        # Four stacked patches quadruple the channel count.
        self.conv = BaseConv(
            in_channels * 4, out_channels, ksize, stride, act=act)

    def forward(self, x):
        """Stack the four sub-sampled patches and apply the convolution."""
        # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
        even = slice(None, None, 2)
        odd = slice(1, None, 2)
        # Order: top-left, bottom-left, top-right, bottom-right.
        patches = [
            x[..., rows, cols] for cols in (even, odd)
            for rows in (even, odd)
        ]
        stacked = torch.cat(patches, dim=1)
        return self.conv(stacked)
class fast_Focus(nn.Module):
    """``Focus`` variant that slices with fixed 2x2 stride-2 convolutions.

    Four frozen depthwise convolutions each pick one pixel of every 2x2
    window (top-left, bottom-left, top-right, bottom-right); their outputs
    are concatenated on the channel axis and passed through one
    ``BaseConv``.

    Fixes over the previous version:
      * ``super(Focus, self).__init__()`` raised ``TypeError`` because this
        class does not derive from ``Focus``.
      * the slicing convolutions were hard-coded to 3 channels although
        the fusing convolution expects ``in_channels * 4``; they now
        follow ``in_channels``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=1,
                 stride=1,
                 act='silu'):
        super().__init__()
        self.conv1 = self.focus_conv(w1=1.0, channels=in_channels)
        self.conv2 = self.focus_conv(w3=1.0, channels=in_channels)
        self.conv3 = self.focus_conv(w2=1.0, channels=in_channels)
        self.conv4 = self.focus_conv(w4=1.0, channels=in_channels)

        self.conv = BaseConv(
            in_channels * 4, out_channels, ksize, stride, act=act)

    def forward(self, x):
        """Concatenate the four sliced views on dim 1 and fuse them."""
        sliced = [self.conv1(x), self.conv2(x), self.conv3(x), self.conv4(x)]
        return self.conv(torch.cat(sliced, 1))

    def focus_conv(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0, channels=3):
        """Build a frozen depthwise 2x2 stride-2 convolution.

        Args:
            w1, w2, w3, w4: kernel weights laid out as
                ``[[w1, w2], [w3, w4]]``.
            channels: number of input (and output) channels.

        Returns:
            nn.Conv2d: bias-free convolution whose weight does not
            require gradients.
        """
        conv = nn.Conv2d(
            channels, channels, 2, 2, groups=channels, bias=False)
        conv.weight = self.init_weights_constant(w1, w2, w3, w4, channels)
        conv.weight.requires_grad = False
        return conv

    def init_weights_constant(self,
                              w1=0.0,
                              w2=0.0,
                              w3=0.0,
                              w4=0.0,
                              channels=3):
        """Return a ``(channels, 1, 2, 2)`` parameter of identical kernels."""
        kernel = torch.tensor([[[[w1, w2], [w3, w4]]]])
        return nn.Parameter(kernel.repeat(channels, 1, 1, 1))
| # shufflenet block | |||||
def channel_shuffle(x, groups=2):
    """Interleave the channels of ``x`` across ``groups`` groups.

    The channel axis is viewed as ``(groups, channels // groups)``, the two
    axes are swapped, and the tensor is flattened back to four dimensions.

    Args:
        x: 4-D tensor with channels on axis 1.
        groups: number of channel groups.

    Returns:
        Tensor of the same shape with shuffled channel order.
    """
    n, c, w, h = x.shape
    per_group = c // groups
    grouped = x.view(n, groups, per_group, w, h)
    swapped = grouped.transpose(1, 2).contiguous()
    return swapped.view(n, -1, w, h)
def conv_1x1_bn(in_c, out_c, stride=1):
    """Return a bias-free 1x1 conv -> BatchNorm2d -> in-place ReLU stack."""
    layers = [
        nn.Conv2d(in_c, out_c, 1, stride, 0, bias=False),
        nn.BatchNorm2d(out_c),
        nn.ReLU(True),
    ]
    return nn.Sequential(*layers)
def conv_bn(in_c, out_c, stride=2):
    """Return a bias-free padded 3x3 conv -> BatchNorm2d -> in-place ReLU."""
    layers = [
        nn.Conv2d(in_c, out_c, 3, stride, 1, bias=False),
        nn.BatchNorm2d(out_c),
        nn.ReLU(True),
    ]
    return nn.Sequential(*layers)
class ShuffleBlock(nn.Module):
    """ShuffleNet-style block with two branches and a channel shuffle.

    With ``downsample`` both branches process the full input and each
    emits ``out_c // 2`` channels. Without it the input is split in half
    along the channel axis: the first half passes through untouched and
    the second half goes through ``branch2``. All 3x3 depthwise
    convolutions here use stride 1, including in the ``downsample``
    configuration.

    Args:
        in_c: input channels.
        out_c: output channels; must equal ``in_c`` when ``downsample`` is
            False.
        downsample: select the two-branch configuration.
    """

    def __init__(self, in_c, out_c, downsample=False):
        super().__init__()
        self.downsample = downsample
        half_c = out_c // 2

        def pointwise(c_in, c_out):
            # 1*1 pw conv + BN + ReLU
            return [
                nn.Conv2d(c_in, c_out, 1, 1, 0, bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(True),
            ]

        def depthwise(channels):
            # 3*3 dw conv (stride 1) + BN
            return [
                nn.Conv2d(
                    channels, channels, 3, 1, 1, groups=channels,
                    bias=False),
                nn.BatchNorm2d(channels),
            ]

        if downsample:
            self.branch1 = nn.Sequential(*depthwise(in_c),
                                         *pointwise(in_c, half_c))
            self.branch2 = nn.Sequential(*pointwise(in_c, half_c),
                                         *depthwise(half_c),
                                         *pointwise(half_c, half_c))
        else:
            # in_c = out_c
            assert in_c == out_c
            self.branch2 = nn.Sequential(*pointwise(half_c, half_c),
                                         *depthwise(half_c),
                                         *pointwise(half_c, half_c))

    def forward(self, x):
        """Run the branches, concatenate on dim 1, then shuffle channels."""
        if self.downsample:
            # if it is downsampling, we don't need to do channel split
            merged = torch.cat((self.branch1(x), self.branch2(x)), 1)
        else:
            # channel split
            split_at = x.shape[1] // 2
            kept = x[:, :split_at, :, :]
            processed = self.branch2(x[:, split_at:, :, :])
            merged = torch.cat((kept, processed), 1)
        return channel_shuffle(merged, 2)
class ShuffleCSPLayer(nn.Module):
    """CSP layer that ends in a channel shuffle instead of a fusing conv.

    Two 1x1 convolutions split the input into two branches; one branch
    goes through ``n`` bottlenecks, then both are concatenated on the
    channel axis and shuffled with two groups.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            n=1,
            shortcut=True,
            expansion=0.5,
            depthwise=False,
            act='silu',
    ):
        """
        Args:
            in_channels (int): input channels.
            out_channels (int): output channels.
            n (int): number of Bottlenecks. Default value: 1.
            shortcut (bool): forwarded to every Bottleneck.
            expansion (float): branch width as a fraction of
                ``out_channels``.
            depthwise (bool): forwarded to every Bottleneck.
            act (str): activation name for all convolutions.
        """
        super().__init__()
        branch_channels = int(out_channels * expansion)
        self.conv1 = BaseConv(
            in_channels, branch_channels, 1, stride=1, act=act)
        self.conv2 = BaseConv(
            in_channels, branch_channels, 1, stride=1, act=act)

        bottlenecks = []
        for _ in range(n):
            # The bottlenecks keep the branch width (expansion 1.0).
            bottlenecks.append(
                Bottleneck(
                    branch_channels,
                    branch_channels,
                    shortcut,
                    1.0,
                    depthwise,
                    act=act))
        self.m = nn.Sequential(*bottlenecks)

    def forward(self, x):
        """Split, process the first branch, concatenate and shuffle."""
        main_branch = self.conv1(x)
        side_branch = self.conv2(x)
        main_branch = self.m(main_branch)
        merged = torch.cat((main_branch, side_branch), dim=1)
        # add channel shuffle
        return channel_shuffle(merged, 2)
| @@ -0,0 +1,324 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
class Swish(nn.Module):
    """Swish activation, ``x * sigmoid(x)``, optionally computed in place.

    Args:
        inplace: when True the input tensor itself is multiplied and
            returned; otherwise a new tensor is returned.
    """

    def __init__(self, inplace=True):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        """Return ``x`` scaled element-wise by its sigmoid."""
        gate = F.sigmoid(x)
        if not self.inplace:
            return x * gate
        x.mul_(gate)
        return x
def get_activation(name='silu', inplace=True):
    """Resolve ``name`` to an activation module.

    Args:
        name: ``None`` (returns ``nn.Identity``), an ``nn.Module`` instance
            (returned unchanged), or one of the strings ``'silu'``,
            ``'relu'``, ``'lrelu'`` (LeakyReLU, slope 0.1), ``'swish'``,
            ``'hardsigmoid'``.
        inplace: forwarded to the constructor of string-selected
            activations.

    Returns:
        nn.Module: the activation module.

    Raises:
        AttributeError: for an unknown string or an unsupported type.
    """
    if name is None:
        return nn.Identity()
    if isinstance(name, nn.Module):
        return name
    if not isinstance(name, str):
        raise AttributeError('Unsupported act type: {}'.format(name))

    if name == 'silu':
        return nn.SiLU(inplace=inplace)
    if name == 'relu':
        return nn.ReLU(inplace=inplace)
    if name == 'lrelu':
        return nn.LeakyReLU(0.1, inplace=inplace)
    if name == 'swish':
        return Swish(inplace=inplace)
    if name == 'hardsigmoid':
        return nn.Hardsigmoid(inplace=inplace)
    raise AttributeError('Unsupported act type: {}'.format(name))
class ConvBNLayer(nn.Module):
    """Bias-free Conv2d -> BatchNorm2d -> activation.

    Args:
        ch_in: input channels.
        ch_out: output channels.
        filter_size: convolution kernel size.
        stride: convolution stride.
        groups: convolution groups.
        padding: convolution padding.
        act: activation spec passed to ``get_activation``.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 act=None):
        super().__init__()
        conv_kwargs = dict(
            in_channels=ch_in,
            out_channels=ch_out,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False)
        self.conv = nn.Conv2d(**conv_kwargs)
        self.bn = nn.BatchNorm2d(ch_out)
        self.act = get_activation(act, inplace=True)

    def forward(self, x):
        """Apply convolution, batch norm and activation in that order."""
        return self.act(self.bn(self.conv(x)))
class RepVGGBlock(nn.Module):
    """Re-parameterizable block with parallel 3x3 and 1x1 conv+BN branches.

    In training form (``deploy=False``) the output is
    ``act(rbr_dense(x) + rbr_1x1(x))``; the identity-BN branch is disabled
    (``rbr_identity`` is always ``None``). In deploy form the branches are
    replaced by a single 3x3 convolution ``rbr_reparam``.

    Args:
        ch_in: Number of input channels.
        ch_out: Number of output channels.
        act: Activation. ``None``, a ``str`` or a ``dict`` is resolved through
            ``get_activation``; any other object is used as the activation
            callable unchanged.
        deploy: Build the block directly in its fused (deploy) form.
    """

    def __init__(self, ch_in, ch_out, act='relu', deploy=False):
        super(RepVGGBlock, self).__init__()
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.deploy = deploy
        # ``in_channels`` / ``groups`` are read by ``_fuse_bn_tensor`` when it
        # builds the identity kernel.
        self.in_channels = ch_in
        self.groups = 1
        if self.deploy is False:
            self.rbr_dense = ConvBNLayer(
                ch_in, ch_out, 3, stride=1, padding=1, act=None)
            self.rbr_1x1 = ConvBNLayer(
                ch_in, ch_out, 1, stride=1, padding=0, act=None)
            self.rbr_identity = None
        else:
            self.rbr_reparam = nn.Conv2d(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=3,
                stride=1,
                padding=1,
                groups=1)
        self.act = get_activation(act) if act is None or isinstance(
            act, (str, dict)) else act

    def forward(self, x):
        """Run the fused convolution (deploy) or the sum of the branches."""
        if self.deploy:
            y = self.rbr_reparam(x)
        else:
            if self.rbr_identity is None:
                y = self.rbr_dense(x) + self.rbr_1x1(x)
            else:
                y = self.rbr_dense(x) + self.rbr_1x1(x) + self.rbr_identity(x)
        y = self.act(y)
        return y

    def switch_to_deploy(self):
        """Fuse the training branches into ``rbr_reparam`` in place.

        Calling this on a block that is already fused is a no-op; the
        training branches no longer exist at that point, so there is nothing
        left to fuse.
        """
        if hasattr(self, 'rbr_reparam'):
            return
        kernel, bias = self.get_equivalent_kernel_bias()
        self.rbr_reparam = nn.Conv2d(
            in_channels=self.ch_in,
            out_channels=self.ch_out,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1)
        self.rbr_reparam.weight.data = kernel
        self.rbr_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()
        self.__delattr__('rbr_dense')
        self.__delattr__('rbr_1x1')
        if hasattr(self, 'rbr_identity'):
            self.__delattr__('rbr_identity')
        if hasattr(self, 'id_tensor'):
            self.__delattr__('id_tensor')
        self.deploy = True

    def get_equivalent_kernel_bias(self):
        """Return the ``(kernel, bias)`` of the 3x3 conv equal to all branches."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
        return kernel3x3 + self._pad_1x1_to_3x3_tensor(
            kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel to 3x3; ``None`` maps to ``0``."""
        if kernel1x1 is None:
            return 0
        else:
            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold a branch's batch norm into a conv kernel and bias.

        Args:
            branch: ``None``, a ``ConvBNLayer``, or a bare ``nn.BatchNorm2d``
                (identity branch).

        Returns:
            ``(kernel, bias)``; ``(0, 0)`` when ``branch`` is ``None``.
        """
        if branch is None:
            return 0, 0
        if isinstance(branch, ConvBNLayer):
            kernel = branch.conv.weight
            running_mean = branch.bn.running_mean
            running_var = branch.bn.running_var
            gamma = branch.bn.weight
            beta = branch.bn.bias
            eps = branch.bn.eps
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            if not hasattr(self, 'id_tensor'):
                # Identity expressed as a 3x3 kernel: a single 1 at the
                # centre tap of each channel's own input slice.
                input_dim = self.in_channels // self.groups
                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
                                        dtype=np.float32)
                for i in range(self.in_channels):
                    kernel_value[i, i % input_dim, 1, 1] = 1
                self.id_tensor = torch.from_numpy(kernel_value).to(
                    branch.weight.device)
            kernel = self.id_tensor
            running_mean = branch.running_mean
            running_var = branch.running_var
            gamma = branch.weight
            beta = branch.bias
            eps = branch.eps
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * t, beta - running_mean * gamma / std
class BasicBlock(nn.Module):
    """A single ``RepVGGBlock`` with an optional residual connection.

    Args:
        ch_in: Number of input channels; must equal ``ch_out``.
        ch_out: Number of output channels.
        act: Activation spec passed to ``RepVGGBlock``.
        shortcut: When true, the input is added to the block output.
    """

    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
        super().__init__()
        assert ch_in == ch_out
        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
        self.shortcut = shortcut

    def forward(self, x):
        """Return ``x + conv2(x)`` with the shortcut, else ``conv2(x)``."""
        out = self.conv2(x)
        if not self.shortcut:
            return out
        return x + out
class BasicBlock_3x3(nn.Module):
    """3x3 ``ConvBNLayer`` followed by a ``RepVGGBlock``, optional residual.

    Args:
        ch_in: Number of input channels; must equal ``ch_out``.
        ch_out: Number of output channels.
        act: Activation spec used by both layers.
        shortcut: When true, the input is added to the block output.
    """

    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
        super().__init__()
        assert ch_in == ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=act)
        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
        self.shortcut = shortcut

    def forward(self, x):
        """Apply ``conv1`` then ``conv2``; add ``x`` when the shortcut is on."""
        out = self.conv2(self.conv1(x))
        if not self.shortcut:
            return out
        return x + out
class BasicBlock_3x3_Reverse(nn.Module):
    """``RepVGGBlock`` followed by a 3x3 ``ConvBNLayer``, optional residual.

    Same layers as ``BasicBlock_3x3`` but applied in the opposite order.

    Args:
        ch_in: Number of input channels; must equal ``ch_out``.
        ch_out: Number of output channels.
        act: Activation spec used by both layers.
        shortcut: When true, the input is added to the block output.
    """

    def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
        super().__init__()
        assert ch_in == ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=act)
        self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
        self.shortcut = shortcut

    def forward(self, x):
        """Apply ``conv2`` then ``conv1``; add ``x`` when the shortcut is on."""
        out = self.conv1(self.conv2(x))
        if not self.shortcut:
            return out
        return x + out
class SPP(nn.Module):
    """Pooling pyramid: concatenate the input with stride-1 max-pooled copies
    of it and project the result with a ``ConvBNLayer``.

    Args:
        ch_in: Channel count of the concatenated tensor fed to the conv.
        ch_out: Number of output channels.
        k: Kernel size of the projection conv (padding is ``k // 2``).
        pool_size: Iterable of max-pool kernel sizes.
        act: Activation spec for the projection conv.
    """

    def __init__(
        self,
        ch_in,
        ch_out,
        k,
        pool_size,
        act='swish',
    ):
        super().__init__()
        # Kept as a plain list for iteration in ``forward``; each pool is
        # also registered as a submodule under ``pool<i>``.
        self.pool = []
        for idx, kernel in enumerate(pool_size):
            layer = nn.MaxPool2d(
                kernel_size=kernel,
                stride=1,
                padding=kernel // 2,
                ceil_mode=False)
            self.add_module('pool{}'.format(idx), layer)
            self.pool.append(layer)
        self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act)

    def forward(self, x):
        """Concatenate ``x`` and its pooled versions on dim 1, then project."""
        features = [x] + [pool(x) for pool in self.pool]
        return self.conv(torch.cat(features, dim=1))
class CSPStage(nn.Module):
    """CSP-style stage that concatenates every intermediate block output.

    The input is projected twice with 1x1 convs to ``ch_out // 2`` channels.
    The second projection runs through ``n`` blocks (plus an optional SPP
    module); the first projection and the output of *each* module in the
    chain are concatenated on dim 1 and fused by ``conv3``.

    Args:
        block_fn: One of ``'BasicBlock'``, ``'BasicBlock_3x3'`` or
            ``'BasicBlock_3x3_Reverse'``.
        ch_in: Number of input channels.
        ch_out: Number of output channels.
        n: Number of blocks in the chain.
        act: Activation spec for all layers.
        spp: Insert an ``SPP`` module after block ``(n - 1) // 2``.

    Raises:
        NotImplementedError: If ``block_fn`` is not a supported name.
    """

    def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False):
        super(CSPStage, self).__init__()

        ch_mid = int(ch_out // 2)
        self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
        self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
        self.convs = nn.Sequential()

        next_ch_in = ch_mid
        for i in range(n):
            if block_fn == 'BasicBlock':
                self.convs.add_module(
                    str(i),
                    BasicBlock(next_ch_in, ch_mid, act=act, shortcut=False))
            elif block_fn == 'BasicBlock_3x3':
                self.convs.add_module(
                    str(i),
                    BasicBlock_3x3(next_ch_in, ch_mid, act=act, shortcut=True))
            elif block_fn == 'BasicBlock_3x3_Reverse':
                self.convs.add_module(
                    str(i),
                    BasicBlock_3x3_Reverse(
                        next_ch_in, ch_mid, act=act, shortcut=True))
            else:
                raise NotImplementedError(
                    'Unsupported block_fn: {}'.format(block_fn))
            if i == (n - 1) // 2 and spp:
                self.convs.add_module(
                    'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act))
            next_ch_in = ch_mid
        # ``forward`` concatenates y1 plus one ``ch_mid``-wide tensor per
        # module in ``self.convs``. Size the fusion conv from the real module
        # count so the optional SPP entry is accounted for (equals
        # ``ch_mid * (n + 1)`` when ``spp`` is off).
        self.conv3 = ConvBNLayer(
            ch_mid * (len(self.convs) + 1), ch_out, 1, act=act)

    def forward(self, x):
        """Fuse ``conv1(x)`` with the output of every module in the chain."""
        y1 = self.conv1(x)
        y2 = self.conv2(x)

        mid_out = [y1]
        for conv in self.convs:
            y2 = conv(y2)
            mid_out.append(y2)
        y = torch.cat(mid_out, axis=1)
        y = self.conv3(y)
        return y
| @@ -0,0 +1,205 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| import torch.nn.init as init | |||||
| from torch.nn.parameter import Parameter | |||||
def get_activation(name='silu', inplace=True):
    """Build an activation module from its short name.

    Args:
        name: ``'silu'``, ``'relu'``, ``'lrelu'`` (LeakyReLU with slope 0.1)
            or ``'identity'``.
        inplace: Forwarded to the activation constructor (unused for
            ``'identity'``).

    Returns:
        The freshly constructed ``nn.Module``.

    Raises:
        AttributeError: If ``name`` is not one of the supported names.
    """
    if name == 'silu':
        return nn.SiLU(inplace=inplace)
    if name == 'relu':
        return nn.ReLU(inplace=inplace)
    if name == 'lrelu':
        return nn.LeakyReLU(0.1, inplace=inplace)
    if name == 'identity':
        return nn.Identity()
    raise AttributeError('Unsupported act type: {}'.format(name))
def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1):
    """Build the basic rep-style cell: a bias-free conv followed by a BN.

    Returns:
        An ``nn.Sequential`` whose children are named ``conv`` and ``bn``.
    """
    conv = nn.Conv2d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        groups=groups,
        bias=False)
    bn = nn.BatchNorm2d(num_features=out_channels)

    cell = nn.Sequential()
    for child_name, child in (('conv', conv), ('bn', bn)):
        cell.add_module(child_name, child)
    return cell
class RepVggBlock(nn.Module):
    '''RepVggBlock is a basic rep-style block, including training and deploy status
    This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
    '''

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=1,
                 dilation=1,
                 groups=1,
                 padding_mode='zeros',
                 deploy=False,
                 use_se=False,
                 act='relu',
                 norm=None):
        """Initialization of the class.

        Args:
            in_channels (int): Number of channels in the input image
            out_channels (int): Number of channels produced by the convolution
            kernel_size (int or tuple): Size of the convolving kernel
                (asserted to be 3)
            stride (int or tuple, optional): Stride of the convolution. Default: 1
            padding (int or tuple, optional): Zero-padding added to both sides of
                the input (asserted to be 1). Default: 1
            dilation (int or tuple, optional): Spacing between kernel elements.
                Only used by the deploy-time convolution. Default: 1
            groups (int, optional): Number of blocked connections from input
                channels to output channels. Default: 1
            padding_mode (string, optional): Default: 'zeros'
            deploy: Whether to be deploy status or training status. Default: False
            use_se: Whether to use se; ``True`` raises ``NotImplementedError``.
                Default: False
            act: Activation name (resolved by ``get_activation``) or a ready
                callable. Default: 'relu'
            norm: Accepted but not used by this block.
        """
        super().__init__()
        self.deploy = deploy
        self.groups = groups
        self.in_channels = in_channels
        self.out_channels = out_channels

        assert kernel_size == 3
        assert padding == 1

        # Padding of the 1x1 branch so its output aligns with the 3x3 branch.
        padding_11 = padding - kernel_size // 2

        self.nonlinearity = get_activation(act) if isinstance(act,
                                                              str) else act

        if use_se:
            raise NotImplementedError('se block not supported yet')
        self.se = nn.Identity()

        if deploy:
            self.rbr_reparam = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
                bias=True,
                padding_mode=padding_mode)
            return

        self.rbr_identity = None
        self.rbr_dense = conv_bn(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups)
        self.rbr_1x1 = conv_bn(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=stride,
            padding=padding_11,
            groups=groups)

    def forward(self, inputs):
        '''Forward process'''
        if hasattr(self, 'rbr_reparam'):
            fused = self.rbr_reparam(inputs)
            return self.nonlinearity(self.se(fused))

        id_out = 0 if self.rbr_identity is None else self.rbr_identity(inputs)
        branches = self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out
        return self.nonlinearity(self.se(branches))

    def get_equivalent_kernel_bias(self):
        """Return ``(kernel, bias)`` of the single 3x3 conv equal to all branches."""
        k_dense, b_dense = self._fuse_bn_tensor(self.rbr_dense)
        k_point, b_point = self._fuse_bn_tensor(self.rbr_1x1)
        k_ident, b_ident = self._fuse_bn_tensor(self.rbr_identity)
        kernel = k_dense + self._pad_1x1_to_3x3_tensor(k_point) + k_ident
        bias = b_dense + b_point + b_ident
        return kernel, bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel to 3x3; ``None`` maps to ``0``."""
        if kernel1x1 is None:
            return 0
        return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold a branch's batch norm into a conv kernel and bias.

        ``branch`` is ``None`` (yields ``(0, 0)``), an ``nn.Sequential`` with
        ``conv`` and ``bn`` children, or a bare ``nn.BatchNorm2d`` standing
        for the identity branch.
        """
        if branch is None:
            return 0, 0

        if isinstance(branch, nn.Sequential):
            kernel = branch.conv.weight
            bn = branch.bn
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            if not hasattr(self, 'id_tensor'):
                # Identity as a 3x3 kernel: one 1 at the centre tap of each
                # output channel's matching input slot.
                input_dim = self.in_channels // self.groups
                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
                                        dtype=np.float32)
                channels = np.arange(self.in_channels)
                kernel_value[channels, channels % input_dim, 1, 1] = 1
                self.id_tensor = torch.from_numpy(kernel_value).to(
                    branch.weight.device)
            kernel = self.id_tensor
            bn = branch

        running_mean = bn.running_mean
        gamma = bn.weight
        beta = bn.bias
        std = (bn.running_var + bn.eps).sqrt()
        scale = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * scale, beta - running_mean * gamma / std

    def switch_to_deploy(self):
        """Fuse the training branches into ``rbr_reparam``; no-op if fused."""
        if hasattr(self, 'rbr_reparam'):
            return

        kernel, bias = self.get_equivalent_kernel_bias()
        dense_conv = self.rbr_dense.conv
        self.rbr_reparam = nn.Conv2d(
            in_channels=dense_conv.in_channels,
            out_channels=dense_conv.out_channels,
            kernel_size=dense_conv.kernel_size,
            stride=dense_conv.stride,
            padding=dense_conv.padding,
            dilation=dense_conv.dilation,
            groups=dense_conv.groups,
            bias=True)
        self.rbr_reparam.weight.data = kernel
        self.rbr_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()

        # Drop everything that only the training form needs.
        for required in ('rbr_dense', 'rbr_1x1'):
            delattr(self, required)
        for optional in ('rbr_identity', 'id_tensor'):
            if hasattr(self, optional):
                delattr(self, optional)
        self.deploy = True
| @@ -0,0 +1,196 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import numpy as np | |||||
| import torch | |||||
| import torchvision | |||||
| __all__ = [ | |||||
| 'filter_box', | |||||
| 'postprocess_airdet', | |||||
| 'bboxes_iou', | |||||
| 'matrix_iou', | |||||
| 'adjust_box_anns', | |||||
| 'xyxy2xywh', | |||||
| 'xyxy2cxcywh', | |||||
| ] | |||||
def multiclass_nms(multi_bboxes,
                   multi_scores,
                   score_thr,
                   iou_thr,
                   max_num=100,
                   score_factors=None):
    """NMS for multi-class bboxes.

    Args:
        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
        multi_scores (Tensor): shape (n, #class); every column is treated as
            a class score.
        score_thr (float): bbox threshold, bboxes with scores not greater
            than it will not be considered.
        iou_thr (float): NMS IoU threshold
        max_num (int): if positive, only the first ``max_num`` indices
            returned by NMS are kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS. Note the ``score_thr`` mask is computed on the
            unscaled scores.

    Returns:
        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k,) and
        (k,). Labels are 0-based column indices of ``multi_scores``. When
        nothing passes the threshold, the three tensors are empty with the
        same trailing shapes.
    """
    num_classes = multi_scores.size(1)
    if multi_bboxes.shape[1] > 4:
        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
    else:
        bboxes = multi_bboxes[:, None].expand(
            multi_scores.size(0), num_classes, 4)
    scores = multi_scores
    # filter out boxes with low scores
    valid_mask = scores > score_thr
    # We use masked_select for ONNX exporting purpose,
    # which is equivalent to bboxes = bboxes[valid_mask]
    # (TODO): as ONNX does not support repeat now,
    # we have to use this ugly code
    bboxes = torch.masked_select(
        bboxes,
        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
                    -1)).view(-1, 4)
    if score_factors is not None:
        scores = scores * score_factors[:, None]
    scores = torch.masked_select(scores, valid_mask)
    labels = valid_mask.nonzero(as_tuple=False)[:, 1]

    if bboxes.numel() == 0:
        # Keep the same (k, 4) layout as the non-empty path so callers can
        # concatenate or index the result without special-casing k == 0.
        bboxes = multi_bboxes.new_zeros((0, 4))
        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
        scores = multi_bboxes.new_zeros((0, ))
        return bboxes, scores, labels

    keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)
    if max_num > 0:
        keep = keep[:max_num]
    return bboxes[keep], scores[keep], labels[keep]
def filter_box(output, scale_range):
    """Keep rows whose box area lies strictly inside a scale range.

    Args:
        output: (N, 5+class) shape; columns 0-3 are read as x1, y1, x2, y2.
        scale_range: ``(min_scale, max_scale)``; a row is kept when
            ``min_scale**2 < w * h < max_scale**2``.

    Returns:
        The rows of ``output`` that pass the area test.
    """
    min_scale, max_scale = scale_range
    width = output[:, 2] - output[:, 0]
    height = output[:, 3] - output[:, 1]
    big_enough = width * height > min_scale * min_scale
    small_enough = width * height < max_scale * max_scale
    return output[big_enough & small_enough]
def filter_results(boxlist, num_classes, nms_thre):
    """Apply class-aware NMS to a box list and return the surviving entries.

    Args:
        boxlist: Object exposing ``bbox``, ``get_field('scores')``,
            ``get_field('labels')`` and indexing by a tensor of indices.
        num_classes: Accepted but not used.
        nms_thre: IoU threshold passed to ``batched_nms``.
    """
    keep = torchvision.ops.batched_nms(
        boxlist.bbox,
        boxlist.get_field('scores'),
        boxlist.get_field('labels'),
        nms_thre,
    )
    return boxlist[keep]
def postprocess_airdet(prediction,
                       num_classes,
                       conf_thre=0.7,
                       nms_thre=0.45,
                       imgs=None):
    """Convert raw predictions to per-image NMS-filtered detections.

    Args:
        prediction (Tensor): 3-D tensor indexed ``[image, box, column]``.
            Columns 0-3 are read as (cx, cy, w, h), column 4 is not read and
            columns 5+ are used as per-class scores. NOTE: columns 0-3 are
            overwritten in place with (x1, y1, x2, y2).
        num_classes: Accepted but not used.
        conf_thre (float): Score threshold passed to ``multiclass_nms``.
        nms_thre (float): IoU threshold passed to ``multiclass_nms``.
        imgs: Accepted but not used.

    Returns:
        list: One entry per image: ``None`` when the image has no candidate
        rows, otherwise a tensor with 7 columns
        (x1, y1, x2, y2, score, score, label).
    """
    centers = prediction[:, :, :2]
    half_sizes = prediction[:, :, 2:4] / 2
    # torch.cat materialises the corners before the in-place write below.
    corners = torch.cat((centers - half_sizes, centers + half_sizes), dim=2)
    prediction[:, :, :4] = corners

    output = [None for _ in range(len(prediction))]
    for i, image_pred in enumerate(prediction):
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue

        multi_bboxes = image_pred[:, :4]
        multi_scores = image_pred[:, 5:]
        detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores,
                                                    conf_thre, nms_thre, 500)
        # Use only the four coordinate columns so the result always has 7
        # columns, including when NMS returns an empty box tensor.
        output[i] = torch.cat(
            (detections[:, :4], scores[:, None], scores[:, None],
             labels[:, None]),
            dim=1)
    return output
def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
    """Pairwise IoU between two sets of boxes.

    Args:
        bboxes_a (Tensor): shape (N, 4).
        bboxes_b (Tensor): shape (M, 4).
        xyxy (bool): ``True`` for (x1, y1, x2, y2) boxes, ``False`` for
            (cx, cy, w, h) boxes.

    Returns:
        Tensor of shape (N, M).

    Raises:
        IndexError: If either input does not have exactly 4 columns.
    """
    if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
        raise IndexError

    if xyxy:
        a_min, a_max = bboxes_a[:, :2], bboxes_a[:, 2:]
        b_min, b_max = bboxes_b[:, :2], bboxes_b[:, 2:]
        area_a = torch.prod(a_max - a_min, 1)
        area_b = torch.prod(b_max - b_min, 1)
    else:
        a_min = bboxes_a[:, :2] - bboxes_a[:, 2:] / 2
        a_max = bboxes_a[:, :2] + bboxes_a[:, 2:] / 2
        b_min = bboxes_b[:, :2] - bboxes_b[:, 2:] / 2
        b_max = bboxes_b[:, :2] + bboxes_b[:, 2:] / 2
        area_a = torch.prod(bboxes_a[:, 2:], 1)
        area_b = torch.prod(bboxes_b[:, 2:], 1)

    top_left = torch.max(a_min[:, None], b_min)
    bottom_right = torch.min(a_max[:, None], b_max)

    # 1 where the pair overlaps on both axes, 0 otherwise.
    overlaps = (top_left < bottom_right).type(top_left.type()).prod(dim=2)
    area_i = torch.prod(bottom_right - top_left, 2) * overlaps
    return area_i / (area_a[:, None] + area_b - area_i)
def matrix_iou(a, b):
    """
    Return the pairwise IoU of ``a`` (N, 4) and ``b`` (M, 4) as an (N, M)
    array; numpy version for data augmentation. Boxes are (x1, y1, x2, y2).
    """
    top_left = np.maximum(a[:, np.newaxis, :2], b[:, :2])
    bottom_right = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])

    has_overlap = (top_left < bottom_right).all(axis=2)
    inter = np.prod(bottom_right - top_left, axis=2) * has_overlap

    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    # The small constant avoids division by zero for degenerate boxes.
    union = area_a[:, np.newaxis] + area_b - inter + 1e-12
    return inter / union
def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
    """Scale, shift and clip box coordinates in place.

    Even columns are scaled by ``scale_ratio``, shifted by ``padw`` and
    clipped to ``[0, w_max]``; odd columns use ``padh`` and ``h_max``.
    ``bbox`` is modified in place and also returned.
    """
    for first_col, offset, upper in ((0, padw, w_max), (1, padh, h_max)):
        cols = slice(first_col, None, 2)
        bbox[:, cols] = np.clip(bbox[:, cols] * scale_ratio + offset, 0, upper)
    return bbox
def xyxy2xywh(bboxes):
    """Convert (x1, y1, x2, y2) boxes to (x1, y1, w, h) in place.

    ``bboxes`` is modified in place and also returned.
    """
    for origin_col, extent_col in ((0, 2), (1, 3)):
        bboxes[:, extent_col] = bboxes[:, extent_col] - bboxes[:, origin_col]
    return bboxes
def xyxy2cxcywh(bboxes):
    """Convert (x1, y1, x2, y2) boxes to (cx, cy, w, h) in place.

    ``bboxes`` is modified in place and also returned.
    """
    for origin_col, extent_col in ((0, 2), (1, 3)):
        # First turn the far corner into a size, then move the near corner
        # to the centre using that size.
        bboxes[:, extent_col] = bboxes[:, extent_col] - bboxes[:, origin_col]
        bboxes[:, origin_col] = bboxes[:, origin_col] + bboxes[:, extent_col] * 0.5
    return bboxes
| @@ -0,0 +1,181 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import os.path as osp | |||||
| import pickle | |||||
| import cv2 | |||||
| import torch | |||||
| import torchvision | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base.base_torch_model import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.config import Config | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from .backbone import build_backbone | |||||
| from .head import build_head | |||||
| from .neck import build_neck | |||||
| from .utils import parse_config | |||||
class SingleStageDetector(TorchModel):
    """
    The base class of single stage detector.

    Builds backbone, neck and head from the ``airdet_s.py`` config found in
    the model directory, loads the pretrained weights and the class label
    map, and offers pre/post-processing helpers around the forward pass.
    """

    def __init__(self, model_dir: str, *args, **kwargs):
        """
        init model by cfg

        Args:
            model_dir: Directory holding ``airdet_s.py``, the checkpoint named
                by ``config.model.name`` and the pickled label map named by
                ``config.model.class_map``.
        """
        super().__init__(model_dir, *args, **kwargs)

        config_path = osp.join(model_dir, 'airdet_s.py')
        config = parse_config(config_path)
        self.cfg = config
        model_path = osp.join(model_dir, config.model.name)
        label_map = osp.join(model_dir, config.model.class_map)
        # NOTE(security): pickle executes arbitrary code on load; the model
        # directory must come from a trusted source.
        with open(label_map, 'rb') as f:
            self.label_map = pickle.load(f)
        self.size_divisible = config.dataset.size_divisibility
        self.num_classes = config.model.head.num_classes
        self.conf_thre = config.model.head.nms_conf_thre
        self.nms_thre = config.model.head.nms_iou_thre

        self.backbone = build_backbone(self.cfg.model.backbone)
        self.neck = build_neck(self.cfg.model.neck)
        self.head = build_head(self.cfg.model.head)
        self.load_pretrain_model(model_path)

    def load_pretrain_model(self, pretrain_model):
        """Load the ``'model'`` entry of a checkpoint into this module.

        Every occurrence of ``'module.'`` is removed from the keys before a
        strict ``load_state_dict``.
        """
        state_dict = torch.load(pretrain_model, map_location='cpu')['model']
        new_state_dict = {}
        for k, v in state_dict.items():
            k = k.replace('module.', '')
            new_state_dict[k] = v
        self.load_state_dict(new_state_dict, strict=True)

    def inference(self, x):
        """Dispatch to the training or evaluation forward pass."""
        if self.training:
            return self.forward_train(x)
        else:
            return self.forward_eval(x)

    def forward_train(self, x):
        """Training forward pass; not implemented (returns ``None``)."""
        pass

    def forward_eval(self, x):
        """Run backbone, neck and head and return the head's prediction."""
        x = self.backbone(x)
        x = self.neck(x)
        prediction = self.head(x)
        return prediction

    def preprocess(self, image):
        """Turn an image array into a zero-padded float32 batch of one.

        The array is permuted with ``(2, 0, 1)`` (presumably HWC -> CHW;
        confirm against the caller). When ``size_divisible`` is positive,
        dims 1 and 2 are padded at the end with zeros up to the next
        multiple of it.
        """
        image = torch.from_numpy(image).type(torch.float32)
        image = image.permute(2, 0, 1)
        shape = image.shape  # c, h, w
        if self.size_divisible > 0:
            import math
            stride = self.size_divisible
            shape = list(shape)
            shape[1] = int(math.ceil(shape[1] / stride) * stride)
            shape[2] = int(math.ceil(shape[2] / stride) * stride)
            shape = tuple(shape)
        pad_img = image.new(*shape).zero_()
        pad_img[:, :image.shape[1], :image.shape[2]].copy_(image)
        pad_img = pad_img.unsqueeze(0)
        return pad_img

    def postprocess(self, preds):
        """Run NMS on raw predictions and map label indices to class names.

        Returns:
            tuple: ``(bboxes, scores, labels)`` where the first two are numpy
            arrays and ``labels`` is a list of names looked up as
            ``label_map[idx + 1][0]['name']``.
        """
        bboxes, scores, labels_idx = postprocess_gfocal(
            preds, self.num_classes, self.conf_thre, self.nms_thre)
        bboxes = bboxes.cpu().numpy()
        scores = scores.cpu().numpy()
        labels_idx = labels_idx.cpu().numpy()
        labels = [self.label_map[idx + 1][0]['name'] for idx in labels_idx]

        return (bboxes, scores, labels)
def multiclass_nms(multi_bboxes,
                   multi_scores,
                   score_thr,
                   iou_thr,
                   max_num=100,
                   score_factors=None):
    """NMS for multi-class bboxes.

    Args:
        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
        multi_scores (Tensor): shape (n, #class); every column is treated as
            a class score.
        score_thr (float): bbox threshold, bboxes with scores not greater
            than it will not be considered.
        iou_thr (float): NMS IoU threshold
        max_num (int): if positive, only the first ``max_num`` indices
            returned by NMS are kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS. Note the ``score_thr`` mask is computed on the
            unscaled scores.

    Returns:
        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k,) and
        (k,). Labels are 0-based column indices of ``multi_scores``. When
        nothing passes the threshold, the three tensors are empty with the
        same trailing shapes.
    """
    num_classes = multi_scores.size(1)
    if multi_bboxes.shape[1] > 4:
        bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
    else:
        bboxes = multi_bboxes[:, None].expand(
            multi_scores.size(0), num_classes, 4)
    scores = multi_scores
    # filter out boxes with low scores
    valid_mask = scores > score_thr
    # We use masked_select for ONNX exporting purpose,
    # which is equivalent to bboxes = bboxes[valid_mask]
    # (TODO): as ONNX does not support repeat now,
    # we have to use this ugly code
    bboxes = torch.masked_select(
        bboxes,
        torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
                    -1)).view(-1, 4)
    if score_factors is not None:
        scores = scores * score_factors[:, None]
    scores = torch.masked_select(scores, valid_mask)
    labels = valid_mask.nonzero(as_tuple=False)[:, 1]

    if bboxes.numel() == 0:
        # Keep the same (k, 4) layout as the non-empty path so callers can
        # index or convert the result without special-casing k == 0.
        bboxes = multi_bboxes.new_zeros((0, 4))
        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
        scores = multi_bboxes.new_zeros((0, ))
        return bboxes, scores, labels

    keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)
    if max_num > 0:
        keep = keep[:max_num]
    return bboxes[keep], scores[keep], labels[keep]
def postprocess_gfocal(prediction, num_classes, conf_thre=0.05, nms_thre=0.7):
    """Run multi-class NMS on the predictions of a single image.

    Args:
        prediction (Tensor): 3-D tensor whose first dimension must be 1.
            For the single image, columns 0-3 are used as boxes and columns
            4+ as per-class scores.
        num_classes: Accepted but not used.
        conf_thre (float): Score threshold passed to ``multiclass_nms``.
        nms_thre (float): IoU threshold passed to ``multiclass_nms``.

    Returns:
        tuple: ``(bboxes, scores, labels)``. When the image has no candidate
        rows, empty tensors of shape (0, 4), (0,) and (0,) are returned.
    """
    assert prediction.shape[0] == 1
    image_pred = prediction[0]

    if not image_pred.size(0):
        # No candidates at all: return well-formed empty results instead of
        # falling through to names that were never bound.
        return (image_pred.new_zeros((0, 4)), image_pred.new_zeros((0, )),
                image_pred.new_zeros((0, ), dtype=torch.long))

    multi_bboxes = image_pred[:, :4]
    multi_scores = image_pred[:, 4:]
    detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores,
                                                conf_thre, nms_thre, 500)
    return detections, scores, labels
| @@ -0,0 +1,16 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import copy | |||||
| from .gfocal_v2_tiny import GFocalHead_Tiny | |||||
def build_head(cfg):
    """Instantiate the detection head described by ``cfg``.

    ``cfg`` is deep-copied, its ``'name'`` entry selects the head class and
    the remaining entries become constructor keyword arguments. Only
    ``'GFocalV2'`` is supported.

    Raises:
        NotImplementedError: For any other head name.
    """
    params = copy.deepcopy(cfg)
    head_name = params.pop('name')
    if head_name != 'GFocalV2':
        raise NotImplementedError
    return GFocalHead_Tiny(**params)
| @@ -0,0 +1,361 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import functools | |||||
| from functools import partial | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from ..core.base_ops import BaseConv, DWConv | |||||
class Scale(nn.Module):
    """Multiply the input by a single learnable scalar.

    Args:
        scale: Initial value of the scalar parameter.
    """

    def __init__(self, scale=1.0):
        super().__init__()
        initial = torch.tensor(scale, dtype=torch.float)
        self.scale = nn.Parameter(initial)

    def forward(self, x):
        """Return ``x`` multiplied by the learnable scale."""
        scaled = x * self.scale
        return scaled
def multi_apply(func, *args, **kwargs):
    """Map ``func`` over parallel iterables and transpose the results.

    ``func`` is called once per position of ``args`` (with ``kwargs`` bound
    to every call) and must return a tuple. The i-th returned list collects
    the i-th element of every call's result.

    Returns:
        tuple of lists, one per element of ``func``'s return tuple.
    """
    bound = func if not kwargs else partial(func, **kwargs)
    per_call = map(bound, *args)
    return tuple(list(column) for column in zip(*per_call))
def xyxy2CxCywh(xyxy, size=None):
    """Convert (x1, y1, x2, y2) boxes to (cx, cy, w, h) along the last dim.

    Args:
        xyxy (Tensor): Boxes with the four coordinates in the last dimension.
        size: Optional pair; when given, ``w`` is clamped to
            ``[0, size[1]]`` and ``h`` to ``[0, size[0]]``.

    Returns:
        A new tensor with (cx, cy, w, h) in the last dimension.
    """
    x1, y1, x2, y2 = (xyxy[..., col] for col in range(4))

    width = x2 - x1
    height = y2 - y1
    if size is not None:
        width = width.clamp(min=0, max=size[1])
        height = height.clamp(min=0, max=size[0])

    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    return torch.stack([center_x, center_y, width, height], dim=-1)
def distance2bbox(points, distance, max_shape=None):
    """Decode distance prediction to bounding box.

    Args:
        points (Tensor): Reference points, (x, y) in the last dimension.
        distance (Tensor): (left, top, right, bottom) distances in the last
            dimension.
        max_shape: Optional pair; x coordinates are clamped to
            ``[0, max_shape[1]]`` and y coordinates to ``[0, max_shape[0]]``.

    Returns:
        Tensor with (x1, y1, x2, y2) in the last dimension.
    """
    px, py = points[..., 0], points[..., 1]
    corners = [
        px - distance[..., 0],
        py - distance[..., 1],
        px + distance[..., 2],
        py + distance[..., 3],
    ]
    if max_shape is not None:
        # Order is x1, y1, x2, y2: x values use index 1, y values index 0.
        limits = (max_shape[1], max_shape[0], max_shape[1], max_shape[0])
        corners = [
            coord.clamp(min=0, max=limit)
            for coord, limit in zip(corners, limits)
        ]
    return torch.stack(corners, -1)
def bbox2distance(points, bbox, max_dis=None, eps=0.1):
    """Encode bounding boxes as distances from reference points.

    Args:
        points (Tensor): shape (n, 2+), (x, y) in the first two columns.
        bbox (Tensor): shape (n, 4), (x1, y1, x2, y2).
        max_dis: Optional upper bound; when given, every distance is clamped
            to ``[0, max_dis - eps]``.
        eps: Margin subtracted from ``max_dis``.

    Returns:
        Tensor of shape (n, 4) holding (left, top, right, bottom).
    """
    px, py = points[:, 0], points[:, 1]
    sides = [
        px - bbox[:, 0],
        py - bbox[:, 1],
        bbox[:, 2] - px,
        bbox[:, 3] - py,
    ]
    if max_dis is not None:
        sides = [side.clamp(min=0, max=max_dis - eps) for side in sides]
    return torch.stack(sides, -1)
class Integral(nn.Module):
    """A fixed layer for calculating integral result from distribution.

    The last input dimension is split into 4 groups of ``reg_max + 1``
    logits; each group is soft-maxed and reduced to its expected value over
    the bin positions ``0 .. reg_max``.
    """

    def __init__(self, reg_max=16):
        super().__init__()
        self.reg_max = reg_max
        bin_positions = torch.linspace(0, self.reg_max, self.reg_max + 1)
        self.register_buffer('project', bin_positions)

    def forward(self, x):
        """Forward feature from the regression head to get integral result of
        bounding box location.

        ``x`` must be 3-D with a last dimension of ``4 * (reg_max + 1)``;
        the result has the same leading two dimensions and a last dimension
        of 4.
        """
        num_bins = self.reg_max + 1
        leading = x.size()[:-1]
        probs = F.softmax(x.reshape(*leading, 4, num_bins), dim=-1)

        b, nb, ne, _ = probs.size()
        flat = probs.reshape(b * nb * ne, num_bins)
        weights = self.project.type_as(flat).unsqueeze(1)
        return torch.matmul(flat, weights).reshape(b, nb, 4)
class GFocalHead_Tiny(nn.Module):
    """Detection head following Generalized Focal Loss V2.

    Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality
    Estimation for Dense Object Detection.

    Every pyramid level owns its own (non-shared) classification tower,
    regression tower and quality-estimation branch. Box sides are predicted
    as discrete distributions over ``reg_max + 1`` bins and decoded with
    :class:`Integral`.
    """

    def __init__(
            self,
            num_classes,
            in_channels,
            stacked_convs=4,  # 4
            feat_channels=256,
            reg_max=12,
            reg_topk=4,
            reg_channels=64,
            strides=(8, 16, 32),
            add_mean=True,
            norm='gn',
            act='relu',
            start_kernel_size=3,
            conv_groups=1,
            conv_type='BaseConv',
            simOTA_cls_weight=1.0,
            simOTA_iou_weight=3.0,
            octbase=8,
            simlqe=False,
            **kwargs):
        """Store the configuration and build all per-level layers.

        Args:
            num_classes: number of object classes.
            in_channels: per-level input channel counts (indexed by level).
            stacked_convs: number of convs in each cls/reg tower.
            feat_channels: tower width; a list gives one width per level, any
                other value is repeated for every stride.
            reg_max: largest bin index of the side-distance distribution.
            reg_topk: number of top probabilities fed to the quality branch.
            reg_channels: hidden width of the quality branch.
            strides: stride of each pyramid level.
            add_mean: also feed the mean of the top-k probabilities.
            norm, act: forwarded to the conv module.
            start_kernel_size: kernel size of the first conv of each tower.
            conv_groups: ``groups`` forwarded to the conv module.
            conv_type: ``'DWConv'`` selects ``DWConv``; anything else
                selects ``BaseConv``.
            simOTA_cls_weight, simOTA_iou_weight, octbase, **kwargs: accepted
                but not used anywhere in this class.
            simlqe: feed the raw regression logits (instead of top-k
                statistics) to the quality branch.
        """
        # Initialise nn.Module first so attribute registration is well defined.
        super(GFocalHead_Tiny, self).__init__()
        self.simlqe = simlqe
        self.num_classes = num_classes
        self.in_channels = in_channels
        # Copy so the head never aliases the default or a caller-owned list.
        self.strides = list(strides)
        self.feat_channels = feat_channels if isinstance(feat_channels, list) \
            else [feat_channels] * len(self.strides)

        self.cls_out_channels = num_classes + 1  # add 1 for keep consistance with former models
        # and will be deprecated in future.
        self.stacked_convs = stacked_convs
        self.conv_groups = conv_groups
        self.reg_max = reg_max
        self.reg_topk = reg_topk
        self.reg_channels = reg_channels
        self.add_mean = add_mean
        # Number of statistics per box side fed to the quality branch.
        self.total_dim = reg_topk
        self.start_kernel_size = start_kernel_size

        self.norm = norm
        self.act = act
        self.conv_module = DWConv if conv_type == 'DWConv' else BaseConv

        if add_mean:
            self.total_dim += 1

        self.integral = Integral(self.reg_max)
        self._init_layers()

    def _build_not_shared_convs(self, in_channel, feat_channels):
        """Build the towers and the quality branch of a single level.

        Returns:
            ``(cls_convs, reg_convs, reg_conf)``: two ``nn.ModuleList`` towers
            of ``stacked_convs`` convs each and an ``nn.Sequential`` ending in
            a sigmoid that outputs one channel.
        """
        self.relu = nn.ReLU(inplace=True)
        cls_convs = nn.ModuleList()
        reg_convs = nn.ModuleList()

        for i in range(self.stacked_convs):
            # Only the first conv sees the input width / custom kernel size.
            chn = feat_channels if i > 0 else in_channel
            kernel_size = 3 if i > 0 else self.start_kernel_size
            cls_convs.append(
                self.conv_module(
                    chn,
                    feat_channels,
                    kernel_size,
                    stride=1,
                    groups=self.conv_groups,
                    norm=self.norm,
                    act=self.act))
            reg_convs.append(
                self.conv_module(
                    chn,
                    feat_channels,
                    kernel_size,
                    stride=1,
                    groups=self.conv_groups,
                    norm=self.norm,
                    act=self.act))
        if not self.simlqe:
            conf_vector = [nn.Conv2d(4 * self.total_dim, self.reg_channels, 1)]
        else:
            conf_vector = [
                nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1)
            ]
        conf_vector += [self.relu]
        conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()]
        reg_conf = nn.Sequential(*conf_vector)

        return cls_convs, reg_convs, reg_conf

    def _init_layers(self):
        """Initialize layers of the head."""
        self.relu = nn.ReLU(inplace=True)
        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        self.reg_confs = nn.ModuleList()

        for i in range(len(self.strides)):
            cls_convs, reg_convs, reg_conf = self._build_not_shared_convs(
                self.in_channels[i], self.feat_channels[i])
            self.cls_convs.append(cls_convs)
            self.reg_convs.append(reg_convs)
            self.reg_confs.append(reg_conf)

        # One classification and one regression predictor per level.
        self.gfl_cls = nn.ModuleList([
            nn.Conv2d(
                self.feat_channels[i], self.cls_out_channels, 3, padding=1)
            for i in range(len(self.strides))
        ])

        self.gfl_reg = nn.ModuleList([
            nn.Conv2d(
                self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1)
            for i in range(len(self.strides))
        ])

        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])

    def forward(self,
                xin,
                labels=None,
                imgs=None,
                conf_thre=0.05,
                nms_thre=0.7):
        """Run the head on multi-level features.

        Args:
            xin: sequence of per-level feature maps, one per stride.
            labels: optional iterable of targets exposing ``.bbox`` and
                ``.get_field('labels')``; required in training mode.
            imgs, conf_thre, nms_thre: accepted but unused here.

        Returns:
            The result of ``self.loss(...)`` in training mode, otherwise the
            tensor produced by :meth:`get_bboxes`.

        Raises:
            ValueError: if the module is in training mode and ``labels`` is
                ``None``.
        """
        # prepare labels during training
        if labels is not None:
            gt_bbox_list = []
            gt_cls_list = []
            for label in labels:
                gt_bbox_list.append(label.bbox)
                gt_cls_list.append((label.get_field('labels')
                                    - 1).long())  # labels starts from 1

        # prepare priors for label assignment and bbox decode
        mlvl_priors_list = [
            self.get_single_level_center_priors(
                xin[i].shape[0],
                xin[i].shape[-2:],
                stride,
                dtype=torch.float32,
                device=xin[0].device) for i, stride in enumerate(self.strides)
        ]
        mlvl_priors = torch.cat(mlvl_priors_list, dim=1)

        # forward for bboxes and classification prediction
        cls_scores, bbox_preds = multi_apply(
            self.forward_single,
            xin,
            self.cls_convs,
            self.reg_convs,
            self.gfl_cls,
            self.gfl_reg,
            self.reg_confs,
            self.scales,
        )
        flatten_cls_scores = torch.cat(cls_scores, dim=1)
        flatten_bbox_preds = torch.cat(bbox_preds, dim=1)

        # calculating losses or bboxes decoded
        if self.training:
            if labels is None:
                raise ValueError(
                    'labels must be provided when the head is in training mode'
                )
            # NOTE(review): no `loss` method is defined in this class; confirm
            # that training is supported before relying on this branch.
            return self.loss(flatten_cls_scores, flatten_bbox_preds,
                             gt_bbox_list, gt_cls_list, mlvl_priors)
        return self.get_bboxes(flatten_cls_scores, flatten_bbox_preds,
                               mlvl_priors)

    def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg,
                       reg_conf, scale):
        """Forward feature of a single scale level.

        Returns:
            ``(cls_score, bbox_pred)``, both flattened over the spatial axes
            and transposed so that channels come last. The class score is the
            sigmoid classification output multiplied by the quality score.
        """
        cls_feat = x
        reg_feat = x

        for cls_conv in cls_convs:
            cls_feat = cls_conv(cls_feat)
        for reg_conv in reg_convs:
            reg_feat = reg_conv(reg_feat)

        bbox_pred = scale(gfl_reg(reg_feat)).float()
        N, C, H, W = bbox_pred.size()
        # Per-side distribution over the reg_max + 1 bins.
        prob = F.softmax(
            bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2)
        if not self.simlqe:
            # Quality is estimated from the top-k bin probabilities
            # (optionally with their mean appended).
            prob_topk, _ = prob.topk(self.reg_topk, dim=2)

            if self.add_mean:
                stat = torch.cat(
                    [prob_topk, prob_topk.mean(dim=2, keepdim=True)], dim=2)
            else:
                stat = prob_topk

            quality_score = reg_conf(stat.reshape(N, 4 * self.total_dim, H, W))
        else:
            # simlqe: the quality branch consumes the raw regression logits.
            quality_score = reg_conf(
                bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W))

        cls_score = gfl_cls(cls_feat).sigmoid() * quality_score

        flatten_cls_score = cls_score.flatten(start_dim=2).transpose(1, 2)
        flatten_bbox_pred = bbox_pred.flatten(start_dim=2).transpose(1, 2)
        return flatten_cls_score, flatten_bbox_pred

    def get_single_level_center_priors(self, batch_size, featmap_size, stride,
                                       dtype, device):
        """Generate the priors of one feature level.

        Returns:
            Tensor of shape ``(batch_size, h * w, 4)`` whose last axis is
            ``(x, y, stride, stride)``, with ``x``/``y`` being the grid index
            multiplied by ``stride``, enumerated row by row.
        """
        h, w = featmap_size
        x_range = (torch.arange(0, int(w), dtype=dtype,
                                device=device)) * stride
        y_range = (torch.arange(0, int(h), dtype=dtype,
                                device=device)) * stride

        x = x_range.repeat(h, 1)
        y = y_range.unsqueeze(-1).repeat(1, w)

        y = y.flatten()
        x = x.flatten()
        strides = x.new_full((x.shape[0], ), stride)
        priors = torch.stack([x, y, strides, strides], dim=-1)

        return priors.unsqueeze(0).repeat(batch_size, 1, 1)

    def sample(self, assign_result, gt_bboxes):
        """Split priors into positives/negatives from an assignment result.

        ``assign_result.gt_inds`` is read as: ``> 0`` positive (1-based index
        of the matched ground truth), ``== 0`` negative.

        Returns:
            ``(pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds)``.
        """
        pos_inds = torch.nonzero(
            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
        neg_inds = torch.nonzero(
            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
        # Convert the 1-based assignment to a 0-based ground-truth index.
        pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1

        if gt_bboxes.numel() == 0:
            # hack for index error case
            assert pos_assigned_gt_inds.numel() == 0
            pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
        else:
            if len(gt_bboxes.shape) < 2:
                gt_bboxes = gt_bboxes.view(-1, 4)
            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]

        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds

    def get_bboxes(self,
                   cls_preds,
                   reg_preds,
                   mlvl_center_priors,
                   img_meta=None):
        """Decode predictions into boxes concatenated with class scores.

        The expected side distances are multiplied by the prior's stride
        (index 2 of the prior) and added to / subtracted from the prior
        location. ``img_meta`` is unused.

        Returns:
            Tensor whose last axis is ``4`` box coordinates followed by the
            first ``num_classes`` class scores.
        """
        dis_preds = self.integral(reg_preds) * mlvl_center_priors[..., 2, None]
        bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds)

        res = torch.cat([bboxes, cls_preds[..., 0:self.num_classes]], dim=-1)

        return res
| @@ -0,0 +1,16 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import copy | |||||
| from .giraffe_fpn import GiraffeNeck | |||||
| from .giraffe_fpn_v2 import GiraffeNeckV2 | |||||
def build_neck(cfg):
    """Instantiate the neck described by ``cfg``.

    Args:
        cfg: mapping with a ``'name'`` entry selecting the neck class; every
            other entry is passed to that class as a keyword argument. The
            mapping is deep-copied, so the caller's object is not modified.

    Returns:
        A ``GiraffeNeck`` or ``GiraffeNeckV2`` instance.

    Raises:
        KeyError: if ``cfg`` has no ``'name'`` entry.
        ValueError: if ``'name'`` is not a supported neck (previously this
            case silently returned ``None``).
    """
    neck_cfg = copy.deepcopy(cfg)
    name = neck_cfg.pop('name')
    if name == 'GiraffeNeck':
        return GiraffeNeck(**neck_cfg)
    elif name == 'GiraffeNeckV2':
        return GiraffeNeckV2(**neck_cfg)
    raise ValueError(
        'unknown neck name {!r}; expected GiraffeNeck or GiraffeNeckV2'.format(
            name))
| @@ -0,0 +1,235 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import collections | |||||
| import itertools | |||||
| import os | |||||
| import networkx as nx | |||||
| from omegaconf import OmegaConf | |||||
# Record for one graph node: its id, the ids of its lower-numbered
# neighbours, and a type flag (0 = input, 1 = output, -1 = neither).
Node = collections.namedtuple('Node', ['id', 'inputs', 'type'])


def get_graph_info(graph):
    """Classify the nodes of ``graph`` by their position among neighbours.

    Nodes are visited as ``0 .. graph.number_of_nodes() - 1``. A node whose
    id is smaller than all of its neighbours is an input node; a node whose
    id is larger than all of its neighbours is an output node. Every node is
    expected to have at least one neighbour.

    Returns:
        ``(nodes, input_nodes, output_nodes)``: a list of :data:`Node`
        records and the lists of input and output node ids.
    """
    records = []
    input_nodes = []
    output_nodes = []
    for node_id in range(graph.number_of_nodes()):
        neighbours = sorted(graph.neighbors(node_id))
        node_type = -1
        if node_id < neighbours[0]:
            input_nodes.append(node_id)
            node_type = 0
        if node_id > neighbours[-1]:
            output_nodes.append(node_id)
            node_type = 1
        lower_neighbours = [n for n in neighbours if n < node_id]
        records.append(Node(node_id, lower_neighbours, node_type))
    return records, input_nodes, output_nodes
def nodeid_trans(id, cur_level, num_levels):
    """Map a graph node index and a level index to a flat node id.

    Odd ``id`` values are offset by ``((id + 1) // 2) * num_levels * 2``;
    even values additionally add ``(num_levels - cur_level) * 2 - 1``, which
    reverses the level order inside that layer.

    Returns:
        ``cur_level`` plus the computed offset.
    """
    layer_offset = ((id + 1) // 2) * num_levels * 2
    if id % 2 == 1:
        return cur_level + int(layer_offset)
    mirrored_level = (num_levels - cur_level) * 2 - 1
    return cur_level + int(mirrored_level + layer_offset)
def gen_log2n_graph_file(log2n_graph_file, depth_multiplier):
    """Write the edge list of a log2n skip-connection graph to a text file.

    For every node ``i`` in ``range(depth_multiplier)`` one line ``"src,dst"``
    is written for each jump ``j`` in ``1, 2, 4, 8, 16, 32`` with
    ``i - j >= 0``, where ``src = i - j`` and ``dst = i``.

    Args:
        log2n_graph_file: path of the file to create or overwrite.
        depth_multiplier: number of nodes in the graph.
    """
    # The context manager closes the file even if a write fails.
    with open(log2n_graph_file, 'w') as f:
        for i in range(depth_multiplier):
            for j in [1, 2, 4, 8, 16, 32]:
                if i - j < 0:
                    # Jumps are ascending, so every later one is too large.
                    break
                f.write('%d,%d\n' % (i - j, i))
def get_log2n_graph(depth_multiplier):
    """Build a graph in which each node links back by power-of-two jumps.

    Node ``i`` receives an edge ``(i - j, i)`` for every ``j`` in
    ``1, 2, 4, 8, 16, 32`` that does not reach below node 0.

    Returns:
        ``(nodes, connections)``: the node ids ``0 .. depth_multiplier - 1``
        and the list of ``(source, target)`` edges.
    """
    jumps = (1, 2, 4, 8, 16, 32)
    node_ids = list(range(depth_multiplier))
    # Jumps are ascending, so filtering on `jump <= target` matches stopping
    # at the first jump that would go below zero.
    edges = [(target - jump, target)
             for target in node_ids
             for jump in jumps
             if jump <= target]
    return node_ids, edges
def get_dense_graph(depth_multiplier):
    """Build a graph in which every node links back to all earlier nodes.

    Returns:
        ``(nodes, connections)``: the node ids ``0 .. depth_multiplier - 1``
        and the edges ``(j, i)`` for every ``j < i``, grouped by ``i``.
    """
    node_ids = list(range(depth_multiplier))
    edges = [(source, target)
             for target in node_ids
             for source in range(target)]
    return node_ids, edges
def giraffeneck_config(min_level,
                       max_level,
                       weight_method=None,
                       depth_multiplier=5,
                       with_backslash=False,
                       with_slash=False,
                       with_skip_connect=False,
                       skip_connect_type='dense'):
    """Graph config with log2n merge and panet.

    Builds a skip-connection graph with ``depth_multiplier`` nodes and expands
    it over the pyramid levels ``min_level .. max_level`` into one fusion-node
    description per (graph node, level) pair, plus one output node per level.

    Args:
        min_level, max_level: inclusive range of pyramid levels; a node at
            level ``i`` gets ``'reduction': 1 << i``.
        weight_method: stored in every node; falsy values become
            ``'fastattn'``.
        depth_multiplier: number of nodes of the skip-connection graph.
        with_backslash, with_slash: also connect each node to the node
            computed by the corresponding diagonal helper, when it exists.
        with_skip_connect: connect non-input graph nodes to the same-level
            nodes of their lower-numbered graph neighbours.
        skip_connect_type: ``'dense'`` or ``'log2n'``.

    Returns:
        Dict sorted by node id. Each value is a dict with the keys
        ``'reduction'``, ``'inputs_offsets'``, ``'weight_method'`` and
        ``'is_out'`` (1 for the per-level output nodes, otherwise 0).

    Raises:
        ValueError: if ``skip_connect_type`` is not supported.
    """
    if skip_connect_type == 'dense':
        nodes, connections = get_dense_graph(depth_multiplier)
    elif skip_connect_type == 'log2n':
        nodes, connections = get_log2n_graph(depth_multiplier)
    else:
        raise ValueError(
            "unknown skip_connect_type {!r}; expected 'dense' or 'log2n'".
            format(skip_connect_type))
    graph = nx.Graph()
    graph.add_nodes_from(nodes)
    graph.add_edges_from(connections)

    # Never populated in this function, so the drop handling below is
    # currently inert; it is kept so that behaviour stays unchanged.
    drop_node = []
    nodes, input_nodes, output_nodes = get_graph_info(graph)

    weight_method = weight_method or 'fastattn'

    num_levels = max_level - min_level + 1
    # Per-level id used as the input offset of input graph nodes.
    node_ids = {min_level + i: [i] for i in range(num_levels)}

    pnodes = {}

    def update_drop_node(new_id, input_offsets):
        """Append ``new_id`` to ``input_offsets``, skipping dropped nodes.

        While ``new_id`` is dropped, its own (non-dropped) inputs are
        inherited and the id is decremented.
        """
        while new_id in drop_node:
            if new_id in pnodes:
                for n in pnodes[new_id]['inputs_offsets']:
                    if n not in input_offsets and n not in drop_node:
                        input_offsets.append(n)
            new_id = new_id - 1
        if new_id not in input_offsets:
            input_offsets.append(new_id)

    def cal_backslash_node(node_id):
        """Return the backslash-diagonal neighbour id, or -1 if none."""
        ind = node_id // num_levels
        mod = node_id % num_levels
        if ind % 2 == 0:  # even
            if mod == (num_levels - 1):
                return -1
            return (ind - 1) * num_levels + (num_levels - 1 - mod - 1)
        # odd
        if mod == 0:
            return -1
        return (ind - 1) * num_levels + (num_levels - 1 - mod + 1)

    def cal_slash_node(node_id):
        """Return the slash-diagonal neighbour id, or -1 if none."""
        ind = node_id // num_levels
        mod = node_id % num_levels
        if ind % 2 == 1:  # odd
            if mod == (num_levels - 1):
                return -1
            return (ind - 1) * num_levels + (num_levels - 1 - mod - 1)
        # even
        if mod == 0:
            return -1
        return (ind - 1) * num_levels + (num_levels - 1 - mod + 1)

    # top-down layer
    for i in range(max_level, min_level - 1, -1):
        level_idx = i - min_level
        for graph_idx, node in enumerate(nodes):
            input_offsets = []
            if graph_idx in input_nodes:
                input_offsets.append(node_ids[i][0])
            elif with_skip_connect:
                for input_id in node.inputs:
                    skip_id = nodeid_trans(input_id, level_idx, num_levels)
                    update_drop_node(skip_id, input_offsets)

            # add top2down
            new_id = nodeid_trans(graph_idx, level_idx, num_levels)

            # add last node
            update_drop_node(new_id - 1, input_offsets)

            if with_backslash:
                backslash = cal_backslash_node(new_id)
                if backslash != -1 and backslash not in input_offsets:
                    input_offsets.append(backslash)

            if with_slash:
                slash = cal_slash_node(new_id)
                if slash != -1 and slash not in input_offsets:
                    input_offsets.append(slash)

            if new_id in drop_node:
                input_offsets = []

            pnodes[new_id] = {
                'reduction': 1 << i,
                'inputs_offsets': input_offsets,
                'weight_method': weight_method,
                'is_out': 0,
            }

        # Output node of this level: fuses all output graph nodes.
        out_offsets = [
            nodeid_trans(out_id, level_idx, num_levels)
            for out_id in output_nodes
        ]
        pnodes[node_ids[i][0] + num_levels * (len(nodes) + 1)] = {
            'reduction': 1 << i,
            'inputs_offsets': out_offsets,
            'weight_method': weight_method,
            'is_out': 1,
        }

    pnodes = dict(sorted(pnodes.items(), key=lambda x: x[0]))
    return pnodes
def get_graph_config(fpn_name,
                     min_level=3,
                     max_level=7,
                     weight_method='concat',
                     depth_multiplier=5,
                     with_backslash=False,
                     with_slash=False,
                     with_skip_connect=False,
                     skip_connect_type='dense'):
    """Return the fusion-graph config registered under ``fpn_name``.

    Only ``'giraffeneck'`` is registered; its config is produced by
    ``giraffeneck_config`` with the remaining arguments. The config is built
    before the name lookup, so an unknown ``fpn_name`` raises ``KeyError``
    only after the config has been generated.
    """
    giraffe_kwargs = dict(
        min_level=min_level,
        max_level=max_level,
        weight_method=weight_method,
        depth_multiplier=depth_multiplier,
        with_backslash=with_backslash,
        with_slash=with_slash,
        with_skip_connect=with_skip_connect,
        skip_connect_type=skip_connect_type)
    registry = {'giraffeneck': giraffeneck_config(**giraffe_kwargs)}
    return registry[fpn_name]
| @@ -0,0 +1,661 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import logging | |||||
| import math | |||||
| from collections import OrderedDict | |||||
| from functools import partial | |||||
| from typing import Callable, List, Optional, Tuple, Union | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from timm import create_model | |||||
| from timm.models.layers import (Swish, create_conv2d, create_pool2d, | |||||
| get_act_layer) | |||||
| from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer | |||||
| from .giraffe_config import get_graph_config | |||||
# Default activation class used for the `act_layer` argument of the conv
# blocks and of GiraffeLayer in this module.
_ACT_LAYER = Swish
class SequentialList(nn.Sequential):
    """Sequential container whose stages map a list of tensors to a list of
    tensors (exists to work around torchscript typing issues list -> list).
    """

    def __init__(self, *args):
        super().__init__(*args)

    def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
        """Feed ``x`` through every contained module in order."""
        features = x
        for stage in self:
            features = stage(features)
        return features
| class ConvBnAct2d(nn.Module): | |||||
| def __init__(self, | |||||
| in_channels, | |||||
| out_channels, | |||||
| kernel_size, | |||||
| stride=1, | |||||
| dilation=1, | |||||
| padding='', | |||||
| bias=False, | |||||
| norm_layer=nn.BatchNorm2d, | |||||
| act_layer=_ACT_LAYER): | |||||
| super(ConvBnAct2d, self).__init__() | |||||
| self.conv = create_conv2d( | |||||
| in_channels, | |||||
| out_channels, | |||||
| kernel_size, | |||||
| stride=stride, | |||||
| dilation=dilation, | |||||
| padding=padding, | |||||
| bias=bias) | |||||
| self.bn = None if norm_layer is None else norm_layer(out_channels) | |||||
| self.act = None if act_layer is None else act_layer(inplace=True) | |||||
| def forward(self, x): | |||||
| x = self.conv(x) | |||||
| if self.bn is not None: | |||||
| x = self.bn(x) | |||||
| if self.act is not None: | |||||
| x = self.act(x) | |||||
| return x | |||||
| class SeparableConv2d(nn.Module): | |||||
| """ Separable Conv | |||||
| """ | |||||
| def __init__(self, | |||||
| in_channels, | |||||
| out_channels, | |||||
| kernel_size=3, | |||||
| stride=1, | |||||
| dilation=1, | |||||
| padding='', | |||||
| bias=False, | |||||
| channel_multiplier=1.0, | |||||
| pw_kernel_size=1, | |||||
| norm_layer=nn.BatchNorm2d, | |||||
| act_layer=_ACT_LAYER): | |||||
| super(SeparableConv2d, self).__init__() | |||||
| self.conv_dw = create_conv2d( | |||||
| in_channels, | |||||
| int(in_channels * channel_multiplier), | |||||
| kernel_size, | |||||
| stride=stride, | |||||
| dilation=dilation, | |||||
| padding=padding, | |||||
| depthwise=True) | |||||
| self.conv_pw = create_conv2d( | |||||
| int(in_channels * channel_multiplier), | |||||
| out_channels, | |||||
| pw_kernel_size, | |||||
| padding=padding, | |||||
| bias=bias) | |||||
| self.bn = None if norm_layer is None else norm_layer(out_channels) | |||||
| self.act = None if act_layer is None else act_layer(inplace=True) | |||||
| def forward(self, x): | |||||
| x = self.conv_dw(x) | |||||
| x = self.conv_pw(x) | |||||
| if self.bn is not None: | |||||
| x = self.bn(x) | |||||
| if self.act is not None: | |||||
| x = self.act(x) | |||||
| return x | |||||
def _init_weight(
    m,
    n='',
):
    """ Weight initialization as per Tensorflow official implementations.

    Args:
        m: module to initialise. Only ``SeparableConv2d``, ``ConvBnAct2d``
            and ``nn.BatchNorm2d`` instances are touched.
        n: name of the module; names containing ``'box_net'`` or
            ``'class_net'`` use the head initialisers, and names containing
            ``'class_net.predict'`` get the prior-probability bias.
    """

    def _fan_in_out(w, groups=1):
        if w.dim() < 2:
            raise ValueError(
                'Fan in and fan out can not be computed for tensor with fewer than 2 dimensions'
            )
        receptive_field_size = w[0][0].numel() if w.dim() > 2 else 1
        fan_in = w.size(1) * receptive_field_size
        fan_out = (w.size(0) * receptive_field_size) // groups
        return fan_in, fan_out

    def _glorot_uniform(w, gain=1, groups=1):
        fan_in, fan_out = _fan_in_out(w, groups)
        scaled_gain = gain / max(1., (fan_in + fan_out) / 2.)  # fan avg
        limit = math.sqrt(3.0 * scaled_gain)
        w.data.uniform_(-limit, limit)

    def _variance_scaling(w, gain=1, groups=1):
        fan_in, _ = _fan_in_out(w, groups)
        scaled_gain = gain / max(1., fan_in)  # fan in
        w.data.normal_(std=math.sqrt(scaled_gain))

    def _reset_bias(bias):
        # 'class_net.predict' can only match inside a head name, so this one
        # check reproduces both the head and the non-head bias handling.
        if bias is None:
            return
        if 'class_net.predict' in n:
            bias.data.fill_(-math.log((1 - 0.01) / 0.01))
        else:
            bias.data.zero_()

    in_head = 'box_net' in n or 'class_net' in n

    if isinstance(m, SeparableConv2d):
        init_fn = _variance_scaling if in_head else _glorot_uniform
        init_fn(m.conv_dw.weight, groups=m.conv_dw.groups)
        init_fn(m.conv_pw.weight)
        _reset_bias(m.conv_pw.bias)
    elif isinstance(m, ConvBnAct2d):
        if in_head:
            m.conv.weight.data.normal_(std=.01)
        else:
            _glorot_uniform(m.conv.weight)
        _reset_bias(m.conv.bias)
    elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1.0)
        m.bias.data.zero_()
| def _init_weight_alt( | |||||
| m, | |||||
| n='', | |||||
| ): | |||||
| """ Weight initialization alternative, based on EfficientNet bacbkone init w/ class bias addition | |||||
| NOTE: this will likely be removed after some experimentation | |||||
| """ | |||||
| if isinstance(m, nn.Conv2d): | |||||
| fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | |||||
| fan_out //= m.groups | |||||
| m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) | |||||
| if m.bias is not None: | |||||
| if 'class_net.predict' in n: | |||||
| m.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) | |||||
| else: | |||||
| m.bias.data.zero_() | |||||
| elif isinstance(m, nn.BatchNorm2d): | |||||
| m.weight.data.fill_(1.0) | |||||
| m.bias.data.zero_() | |||||
class Interpolate2d(nn.Module):
    r"""Module wrapper around ``torch.nn.functional.interpolate``.

    One can either give a :attr:`scale_factor` or the target output
    :attr:`size` to calculate the output size.

    Args:
        size (int or Tuple[int, int], optional): output spatial size, passed
            through unchanged.
        scale_factor (float or Tuple[float, float], optional): multiplier for
            the spatial size. Tuples are converted element-wise to float;
            other truthy values are converted to float; falsy values are
            stored as ``None``.
        mode (str, optional): interpolation algorithm forwarded to
            ``F.interpolate``. Default: ``'nearest'``
        align_corners (bool, optional): forwarded to ``F.interpolate``;
            stored as ``None`` when :attr:`mode` is ``'nearest'``.
            Default: ``False``
    """
    __constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name']
    name: str
    size: Optional[Union[int, Tuple[int, int]]]
    scale_factor: Optional[Union[float, Tuple[float, float]]]
    mode: str
    align_corners: Optional[bool]

    def __init__(self,
                 size: Optional[Union[int, Tuple[int, int]]] = None,
                 scale_factor: Optional[Union[float, Tuple[float,
                                                           float]]] = None,
                 mode: str = 'nearest',
                 align_corners: bool = False) -> None:
        super().__init__()
        self.name = type(self).__name__
        self.size = size
        if isinstance(scale_factor, tuple):
            normalized = tuple(float(factor) for factor in scale_factor)
        elif scale_factor:
            normalized = float(scale_factor)
        else:
            normalized = None
        self.scale_factor = normalized
        self.mode = mode
        # align_corners is dropped for 'nearest' and kept for every other mode.
        if mode == 'nearest':
            self.align_corners = None
        else:
            self.align_corners = align_corners

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Resample ``input`` with the stored settings."""
        return F.interpolate(
            input,
            size=self.size,
            scale_factor=self.scale_factor,
            mode=self.mode,
            align_corners=self.align_corners,
            recompute_scale_factor=False)
class ResampleFeatureMap(nn.Sequential):
    """Sequential stage that adapts channel count and spatial resolution.

    Sub-modules are registered under the names ``'conv'``, ``'downsample'``
    and ``'upsample'`` as needed:

    * a 1x1 ``ConvBnAct2d`` (no activation) when ``in_channels`` differs from
      ``out_channels``;
    * for ``reduction_ratio > 1`` a pooling layer (``'max'`` / ``'avg'``) or
      an ``Interpolate2d`` using ``downsample`` as its mode, with the conv
      placed before or after it depending on ``conv_after_downsample``;
    * for ``reduction_ratio < 1`` an ``Interpolate2d`` upsampling by
      ``int(1 // reduction_ratio)``.

    With equal channels and ``reduction_ratio == 1`` the container is empty.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 reduction_ratio=1.,
                 pad_type='',
                 downsample=None,
                 upsample=None,
                 norm_layer=nn.BatchNorm2d,
                 apply_bn=False,
                 conv_after_downsample=False,
                 redundant_bias=False):
        super().__init__()
        down_mode = downsample or 'max'
        up_mode = upsample or 'nearest'
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.reduction_ratio = reduction_ratio
        self.conv_after_downsample = conv_after_downsample

        channel_proj = None
        if in_channels != out_channels:
            channel_proj = ConvBnAct2d(
                in_channels,
                out_channels,
                kernel_size=1,
                padding=pad_type,
                norm_layer=norm_layer if apply_bn else None,
                bias=not apply_bn or redundant_bias,
                act_layer=None)

        if reduction_ratio > 1:
            if channel_proj is not None and not self.conv_after_downsample:
                self.add_module('conv', channel_proj)
            if down_mode in ('max', 'avg'):
                stride_size = int(reduction_ratio)
                reducer = create_pool2d(
                    down_mode,
                    kernel_size=stride_size + 1,
                    stride=stride_size,
                    padding=pad_type)
            else:
                reducer = Interpolate2d(
                    scale_factor=1. / reduction_ratio, mode=down_mode)
            self.add_module('downsample', reducer)
            if channel_proj is not None and self.conv_after_downsample:
                self.add_module('conv', channel_proj)
            return

        if channel_proj is not None:
            self.add_module('conv', channel_proj)
        if reduction_ratio < 1:
            factor = int(1 // reduction_ratio)
            self.add_module('upsample',
                            Interpolate2d(scale_factor=factor, mode=up_mode))
class GiraffeCombine(nn.Module):
    """Resample a set of input feature maps and fuse them into one tensor.

    Each entry of ``inputs_offsets`` indexes the list passed to
    :meth:`forward`. Offsets below ``len(feature_info)`` take their channel
    count and reduction from ``feature_info``; larger offsets take the
    reduction from ``fpn_config`` and the channel count from the
    ``feature_info`` entry of the matching level. With
    ``weight_method='concat'`` the resamplers keep the input channel count;
    otherwise they project to ``fpn_channels`` of the target level.

    Supported ``weight_method`` values in :meth:`forward`: ``'attn'``
    (softmax-weighted sum), ``'fastattn'`` (ReLU-weighted sum normalised by
    the weight total plus ``0.0001``), ``'sum'`` and ``'concat'``.
    """

    def __init__(self,
                 feature_info,
                 fpn_config,
                 fpn_channels,
                 inputs_offsets,
                 target_reduction,
                 pad_type='',
                 downsample=None,
                 upsample=None,
                 norm_layer=nn.BatchNorm2d,
                 apply_resample_bn=False,
                 conv_after_downsample=False,
                 redundant_bias=False,
                 weight_method='attn'):
        super().__init__()
        self.inputs_offsets = inputs_offsets
        self.weight_method = weight_method

        self.resample = nn.ModuleDict()
        reduction_base = feature_info[0]['reduction']

        target_channels_idx = int(
            math.log(target_reduction // reduction_base, 2))
        num_features = len(feature_info)
        for offset in inputs_offsets:
            if offset < num_features:
                in_channels = feature_info[offset]['num_chs']
                input_reduction = feature_info[offset]['reduction']
            else:
                input_reduction = fpn_config[offset]['reduction']
                # The channel count comes from the feature_info entry that
                # has the same level as this node.
                input_channels_idx = int(
                    math.log(input_reduction // reduction_base, 2))
                in_channels = feature_info[input_channels_idx]['num_chs']

            if weight_method == 'concat':
                resample_out_channels = in_channels
            else:
                resample_out_channels = fpn_channels[target_channels_idx]

            self.resample[str(offset)] = ResampleFeatureMap(
                in_channels,
                resample_out_channels,
                reduction_ratio=target_reduction / input_reduction,
                pad_type=pad_type,
                downsample=downsample,
                upsample=upsample,
                norm_layer=norm_layer,
                apply_bn=apply_resample_bn,
                conv_after_downsample=conv_after_downsample,
                redundant_bias=redundant_bias)

        if weight_method in ('attn', 'fastattn'):
            self.edge_weights = nn.Parameter(
                torch.ones(len(inputs_offsets)), requires_grad=True)  # WSM
        else:
            self.edge_weights = None

    def forward(self, x: List[torch.Tensor]):
        """Fuse the selected inputs.

        Returns:
            The fused tensor, or ``None`` when there are no input offsets.

        Raises:
            ValueError: if ``weight_method`` is not supported.
        """
        dtype = x[0].dtype
        if len(self.inputs_offsets) == 0:
            return None

        nodes = [
            resample(x[offset]) for offset, resample in zip(
                self.inputs_offsets, self.resample.values())
        ]

        method = self.weight_method
        if method == 'attn':
            normalized_weights = torch.softmax(
                self.edge_weights.to(dtype=dtype), dim=0)
            weighted = torch.stack(nodes, dim=-1) * normalized_weights
            return torch.sum(weighted, dim=-1)
        if method == 'fastattn':
            edge_weights = nn.functional.relu(
                self.edge_weights.to(dtype=dtype))
            weights_norm = torch.sum(edge_weights) + 0.0001
            weighted = torch.stack(
                [(feat * edge_weights[k]) / weights_norm
                 for k, feat in enumerate(nodes)],
                dim=-1)
            return torch.sum(weighted, dim=-1)
        if method == 'sum':
            return torch.sum(torch.stack(nodes, dim=-1), dim=-1)
        if method == 'concat':
            return torch.cat(nodes, dim=1)
        raise ValueError('unknown weight_method {}'.format(
            self.weight_method))
class GiraffeNode(nn.Module):
    """Two-stage node: fuse a list of tensors, then post-process the result.

    A small wrapper used in place of nn.Sequential for torchscript typing:
    the input is a List[Tensor] and the output is a single Tensor.
    """

    def __init__(self, combine: nn.Module, after_combine: nn.Module):
        super().__init__()
        self.combine = combine
        self.after_combine = after_combine

    def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
        fused = self.combine(x)
        # A combine stage that yields nothing short-circuits the node: None
        # is passed through and ``after_combine`` is not called.
        return None if fused is None else self.after_combine(fused)
class GiraffeLayer(nn.Module):
    """One layer of fusion nodes built from an FPN graph configuration.

    For every ``(node_id, node_cfg)`` entry of ``fpn_config`` a
    :class:`GiraffeNode` is created: a ``GiraffeCombine`` that gathers the
    node's inputs, followed by a merge stack chosen by ``merge_type``.

    Args:
        feature_info: sequence of dicts with ``'num_chs'`` and
            ``'reduction'`` keys, one per incoming feature map.
        fpn_config: mapping ``node_id -> node_cfg``; each ``node_cfg`` is
            read for ``'is_out'``, ``'reduction'``, ``'inputs_offsets'`` and
            ``'weight_method'``.
        inner_fpn_channels: per-level channel table used for nodes whose
            ``'is_out'`` is not 1.
        outer_fpn_channels: per-level channel table used for nodes whose
            ``'is_out'`` is 1.
        num_levels: number of trailing feature maps returned by ``forward``.
        pad_type, downsample, upsample, norm_layer, apply_resample_bn,
        conv_after_downsample, redundant_bias: forwarded to
            ``GiraffeCombine`` (``pad_type``, ``norm_layer`` and
            ``redundant_bias`` are also used by the ``'conv'`` merge stack).
        act_layer: activation class used by the ``'conv'`` merge stack.
        conv_bn_relu_pattern: when false, the ``'conv'`` merge stack applies
            the activation before the conv instead of inside it.
        separable_conv: pick ``SeparableConv2d`` over ``ConvBnAct2d`` in the
            ``'conv'`` merge stack.
        merge_type: ``'csp'``, ``'shuffle'`` or ``'conv'``; any other value
            leaves the merge stack empty.
    """

    def __init__(self,
                 feature_info,
                 fpn_config,
                 inner_fpn_channels,
                 outer_fpn_channels,
                 num_levels=5,
                 pad_type='',
                 downsample=None,
                 upsample=None,
                 norm_layer=nn.BatchNorm2d,
                 act_layer=_ACT_LAYER,
                 apply_resample_bn=False,
                 conv_after_downsample=True,
                 conv_bn_relu_pattern=False,
                 separable_conv=True,
                 redundant_bias=False,
                 merge_type='conv'):
        super(GiraffeLayer, self).__init__()
        self.num_levels = num_levels
        # NOTE(review): this attribute is hard-coded and does not mirror the
        # ``conv_bn_relu_pattern`` argument (the argument itself is honoured
        # below). Left unchanged; confirm whether anything reads it.
        self.conv_bn_relu_pattern = False
        # Index -> feature description; grows by one entry per built node.
        self.feature_info = dict(enumerate(feature_info))

        self.fnode = nn.ModuleList()
        reduction_base = feature_info[0]['reduction']
        for i, fnode_cfg in fpn_config.items():
            logging.debug('fnode {} : {}'.format(i, fnode_cfg))

            if fnode_cfg['is_out'] == 1:
                fpn_channels = outer_fpn_channels
            else:
                fpn_channels = inner_fpn_channels

            reduction = fnode_cfg['reduction']
            # log2 is exact for powers of two, so the int() truncation cannot
            # land one level too low because of floating-point rounding.
            fpn_channels_idx = int(math.log2(reduction // reduction_base))

            combine = GiraffeCombine(
                self.feature_info,
                fpn_config,
                fpn_channels,
                tuple(fnode_cfg['inputs_offsets']),
                target_reduction=reduction,
                pad_type=pad_type,
                downsample=downsample,
                upsample=upsample,
                norm_layer=norm_layer,
                apply_resample_bn=apply_resample_bn,
                conv_after_downsample=conv_after_downsample,
                redundant_bias=redundant_bias,
                weight_method=fnode_cfg['weight_method'])

            after_combine = nn.Sequential()

            # The merge stack is sized for the sum of all input channels.
            in_channels = sum(self.feature_info[offset]['num_chs']
                              for offset in fnode_cfg['inputs_offsets'])
            out_channels = fpn_channels[fpn_channels_idx]

            if merge_type == 'csp':
                after_combine.add_module(
                    'CspLayer',
                    CSPLayer(
                        in_channels,
                        out_channels,
                        2,
                        shortcut=True,
                        depthwise=False,
                        act='silu'))
            elif merge_type == 'shuffle':
                after_combine.add_module(
                    'shuffleBlock', ShuffleBlock(in_channels, in_channels))
                after_combine.add_module(
                    'conv1x1',
                    create_conv2d(in_channels, out_channels, kernel_size=1))
            elif merge_type == 'conv':
                after_combine.add_module(
                    'conv1x1',
                    create_conv2d(in_channels, out_channels, kernel_size=1))
                conv_kwargs = dict(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    padding=pad_type,
                    bias=False,
                    norm_layer=norm_layer,
                    act_layer=act_layer)
                if not conv_bn_relu_pattern:
                    # Activation goes in front of the conv instead of
                    # being fused into it.
                    conv_kwargs['bias'] = redundant_bias
                    conv_kwargs['act_layer'] = None
                    after_combine.add_module('act', act_layer(inplace=True))
                after_combine.add_module(
                    'conv',
                    SeparableConv2d(**conv_kwargs)
                    if separable_conv else ConvBnAct2d(**conv_kwargs))

            self.fnode.append(
                GiraffeNode(combine=combine, after_combine=after_combine))
            self.feature_info[i] = dict(
                num_chs=fpn_channels[fpn_channels_idx], reduction=reduction)

        # Expose only the last ``num_levels`` entries as this layer's output
        # description.
        out_node = list(self.feature_info.keys())[-num_levels::]
        self.out_feature_info = [self.feature_info[k] for k in out_node]
        self.feature_info = self.out_feature_info

    def forward(self, x: List[torch.Tensor]):
        """Run every node in order and return the last ``num_levels`` maps.

        Each node sees the inputs plus the outputs of all earlier nodes. The
        work is done on a copy so the caller's list is not extended.
        """
        feats = list(x)
        for fn in self.fnode:
            feats.append(fn(feats))
        return feats[-self.num_levels::]
class GiraffeNeck(nn.Module):
    """Neck that selects backbone levels, adds coarser ones, and runs a GiraffeLayer.

    ``forward`` picks the feature maps listed in ``self.in_features``,
    appends one extra map per module in ``self.resample`` (each computed
    from the current last map) and feeds the list through ``self.cell``.

    Args:
        min_level, max_level: level range, forwarded to ``get_graph_config``;
            ``min_level`` also fixes the first entry of ``self.in_features``.
        num_levels: number of feature levels handled by the neck.
        norm_layer: normalisation class; ``nn.BatchNorm2d`` when falsy.
        norm_kwargs: optional keyword arguments bound onto ``norm_layer``.
        act_type: name passed to ``get_act_layer``.
        fpn_config: ready-made graph config; when falsy one is generated
            from ``fpn_name`` and the graph options below.
        fpn_name, weight_method, depth_multiplier, with_backslash,
        with_slash, with_skip_connect, skip_connect_type: forwarded to
            ``get_graph_config``.
        fpn_channels: inner channel table, scaled by ``width_multiplier``.
        out_fpn_channels: channel table for output nodes.
        width_multiplier: scale applied to every entry of ``fpn_channels``.
        separable_conv, merge_type, pad_type, downsample_type,
        upsample_type, apply_resample_bn, conv_after_downsample,
        redundant_bias, conv_bn_relu_pattern: forwarded to the resample
            modules and/or the ``GiraffeLayer``.
        feature_info: sequence of dicts with ``'num_chs'`` and
            ``'reduction'`` describing the incoming feature maps.
        alternate_init: choose ``_init_weight_alt`` over ``_init_weight`` in
            ``init_weights``.
    """

    def __init__(self, min_level, max_level, num_levels, norm_layer,
                 norm_kwargs, act_type, fpn_config, fpn_name, fpn_channels,
                 out_fpn_channels, weight_method, depth_multiplier,
                 width_multiplier, with_backslash, with_slash,
                 with_skip_connect, skip_connect_type, separable_conv,
                 feature_info, merge_type, pad_type, downsample_type,
                 upsample_type, apply_resample_bn, conv_after_downsample,
                 redundant_bias, conv_bn_relu_pattern, alternate_init):
        super(GiraffeNeck, self).__init__()

        self.num_levels = num_levels
        self.min_level = min_level
        # Indices of the backbone outputs consumed by forward().
        first = self.min_level - 1
        self.in_features = list(range(7))[first:first + num_levels]
        self.alternate_init = alternate_init

        norm_layer = norm_layer or nn.BatchNorm2d
        if norm_kwargs:
            norm_layer = partial(norm_layer, **norm_kwargs)
        act_layer = get_act_layer(act_type) or _ACT_LAYER
        fpn_config = fpn_config or get_graph_config(
            fpn_name,
            min_level=min_level,
            max_level=max_level,
            weight_method=weight_method,
            depth_multiplier=depth_multiplier,
            with_backslash=with_backslash,
            with_slash=with_slash,
            with_skip_connect=with_skip_connect,
            skip_connect_type=skip_connect_type)

        # Width scale. Work on a copy: scaling the caller's list in place
        # would compound the multiplier every time a neck is built from the
        # same config object.
        scaled_channels = [int(c * width_multiplier) for c in fpn_channels]
        if out_fpn_channels is fpn_channels:
            # The in-place version also rescaled an aliased outer table;
            # keep that result for callers passing one list for both.
            out_fpn_channels = scaled_channels
        fpn_channels = scaled_channels

        # Extra levels are appended below; do not grow the caller's list.
        feature_info = list(feature_info)

        self.resample = nn.ModuleDict()
        for level in range(num_levels):
            if level < len(feature_info):
                in_chs = feature_info[level]['num_chs']
                reduction = feature_info[level]['reduction']
            else:
                # Adds a coarser level by downsampling the last feature map
                reduction_ratio = 2
                self.resample[str(level)] = ResampleFeatureMap(
                    in_channels=in_chs,
                    out_channels=feature_info[level - 1]['num_chs'],
                    pad_type=pad_type,
                    downsample=downsample_type,
                    upsample=upsample_type,
                    norm_layer=norm_layer,
                    reduction_ratio=reduction_ratio,
                    apply_bn=apply_resample_bn,
                    conv_after_downsample=conv_after_downsample,
                    redundant_bias=redundant_bias,
                )
                in_chs = feature_info[level - 1]['num_chs']
                reduction = int(reduction * reduction_ratio)
                feature_info.append(dict(num_chs=in_chs, reduction=reduction))

        self.cell = SequentialList()
        logging.debug('building giraffeNeck')
        giraffe_layer = GiraffeLayer(
            feature_info=feature_info,
            fpn_config=fpn_config,
            inner_fpn_channels=fpn_channels,
            outer_fpn_channels=out_fpn_channels,
            num_levels=num_levels,
            pad_type=pad_type,
            downsample=downsample_type,
            upsample=upsample_type,
            norm_layer=norm_layer,
            act_layer=act_layer,
            separable_conv=separable_conv,
            apply_resample_bn=apply_resample_bn,
            conv_after_downsample=conv_after_downsample,
            conv_bn_relu_pattern=conv_bn_relu_pattern,
            redundant_bias=redundant_bias,
            merge_type=merge_type)
        self.cell.add_module('giraffeNeck', giraffe_layer)

    def init_weights(self, pretrained=False):
        """Initialise every sub-module whose name does not contain 'backbone'.

        ``pretrained`` is accepted but not used here.
        """
        init_fn = _init_weight_alt if self.alternate_init else _init_weight
        for n, m in self.named_modules():
            if 'backbone' not in n:
                init_fn(m, n)

    def forward(self, x: List[torch.Tensor]):
        """Select the input levels, add the resampled ones, run the cell.

        ``x`` may be any indexable sequence (list or tuple); it is only read,
        and a fresh list is built from it.
        """
        feats = [x[f] for f in self.in_features]
        for resample in self.resample.values():
            feats.append(resample(feats[-1]))
        return self.cell(feats)
| @@ -0,0 +1,203 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from ..core.base_ops import BaseConv, CSPLayer, DWConv | |||||
| from ..core.neck_ops import CSPStage | |||||
class GiraffeNeckV2(nn.Module):
    """Three-level neck that fuses backbone features through five merge nodes.

    The three selected backbone maps are called x2, x1, x0 in selection
    order (``forward`` halves the size with stride-2 convs when moving
    towards x0 and doubles it with nearest upsampling when moving towards
    x2). Nodes x3..x7 concatenate resized neighbours along the channel axis
    and pass the result through a merge block.

    Args:
        depth: multiplier for the block count of every merge block
            (``round(3 * depth)``).
        width: multiplier applied to all channel counts.
        in_features: indices of the backbone outputs to consume.
        in_channels: unscaled channel counts of the three inputs, in the
            same order as ``in_features``.
        out_channels: unscaled channel counts of the three outputs.
        depthwise: use ``DWConv`` instead of ``BaseConv`` for the stride-2
            convs (and pass the flag to ``CSPLayer``).
        act: activation name forwarded to every conv and merge block.
        spp: forwarded to ``CSPStage`` (only used when ``reparam_mode``).
        reparam_mode: build merge blocks with ``CSPStage`` when true,
            otherwise with ``CSPLayer``.
        block_name: forwarded to ``CSPStage``.
    """

    def __init__(
        self,
        depth=1.0,
        width=1.0,
        in_features=[2, 3, 4],
        in_channels=[256, 512, 1024],
        out_channels=[256, 512, 1024],
        depthwise=False,
        act='silu',
        spp=True,
        reparam_mode=True,
        block_name='BasicBlock',
    ):
        super().__init__()
        # Store copies: the defaults are shared list objects, so keeping a
        # reference would let one instance's edits leak into every other.
        self.in_features = list(in_features)
        self.in_channels = list(in_channels)
        Conv = DWConv if depthwise else BaseConv

        def scaled(*channels):
            # Width-scaled total of one or more unscaled channel counts.
            return int(sum(channels) * width)

        def make_downsample(channels):
            # 3x3 stride-2 conv that keeps the channel count.
            return Conv(channels, channels, 3, 2, act=act)

        def make_merge(merge_in, merge_out):
            # Merge block applied after channel concatenation.
            if reparam_mode:
                return CSPStage(
                    block_name,
                    merge_in,
                    merge_out,
                    round(3 * depth),
                    act=act,
                    spp=spp)
            return CSPLayer(
                merge_in,
                merge_out,
                round(3 * depth),
                False,
                depthwise=depthwise,
                act=act)

        # NOTE: attributes are created in the same order as before so that
        # module registration (and therefore state_dict key order) is
        # unchanged.
        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')

        # node x3: input x0, x1
        self.bu_conv13 = make_downsample(scaled(in_channels[1]))
        self.merge_3 = make_merge(
            scaled(in_channels[1], in_channels[2]), scaled(in_channels[2]))

        # node x4: input x1, x2, x3
        self.bu_conv24 = make_downsample(scaled(in_channels[0]))
        self.merge_4 = make_merge(
            scaled(in_channels[0], in_channels[1], in_channels[2]),
            scaled(in_channels[1]))

        # node x5: input x2, x4
        self.merge_5 = make_merge(
            scaled(in_channels[1], in_channels[0]), scaled(out_channels[0]))

        # node x7: input x4, x5
        self.bu_conv57 = make_downsample(scaled(out_channels[0]))
        self.merge_7 = make_merge(
            scaled(out_channels[0], in_channels[1]), scaled(out_channels[1]))

        # node x6: input x3, x4, x7
        self.bu_conv46 = make_downsample(scaled(in_channels[1]))
        self.bu_conv76 = make_downsample(scaled(out_channels[1]))
        self.merge_6 = make_merge(
            scaled(in_channels[1], out_channels[1], in_channels[2]),
            scaled(out_channels[2]))

    def init_weights(self):
        """No-op; present so the neck offers the same hook as other necks."""
        pass

    def forward(self, out_features):
        """Fuse the selected backbone feature maps.

        Args:
            out_features: indexable collection of backbone outputs; the
                entries at ``self.in_features`` are used.

        Returns:
            Tuple[Tensor]: the outputs of nodes ``(x5, x7, x6)``.
        """
        # backbone
        x2, x1, x0 = [out_features[f] for f in self.in_features]

        # node x3
        x13 = self.bu_conv13(x1)
        x3 = self.merge_3(torch.cat([x0, x13], 1))

        # node x4
        x34 = self.upsample(x3)
        x24 = self.bu_conv24(x2)
        x4 = self.merge_4(torch.cat([x1, x24, x34], 1))

        # node x5
        x45 = self.upsample(x4)
        x5 = self.merge_5(torch.cat([x2, x45], 1))

        # node x7
        x57 = self.bu_conv57(x5)
        x7 = self.merge_7(torch.cat([x4, x57], 1))

        # node x6
        x46 = self.bu_conv46(x4)
        x76 = self.bu_conv76(x7)
        x6 = self.merge_6(torch.cat([x3, x46, x76], 1))

        return (x5, x7, x6)
| @@ -0,0 +1,16 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.constant import Tasks | |||||
| from .detector import SingleStageDetector | |||||
@MODELS.register_module(
    Tasks.image_object_detection, module_name=Models.tinynas_detection)
class TinynasDetector(SingleStageDetector):
    """``SingleStageDetector`` registered for image object detection.

    The class adds no behaviour of its own; it exists so the detector is
    registered in ``MODELS`` under ``Models.tinynas_detection``.
    """

    def __init__(self, model_dir, *args, **kwargs):
        # Everything is delegated to the base detector.
        super().__init__(model_dir, *args, **kwargs)
| @@ -0,0 +1,30 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| # The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||||
| import importlib | |||||
| import os | |||||
| import sys | |||||
| from os.path import dirname, join | |||||
def get_config_by_file(config_file):
    """Import a Python config file and return an instance of its ``Config``.

    The file's directory is added to ``sys.path`` and the file is imported
    by its base name (the text before the first ``.``); the module's
    ``Config`` class is then instantiated without arguments.

    Args:
        config_file (str): path of the Python config file.

    Returns:
        The object created by calling the module's ``Config`` class.

    Raises:
        ImportError: if the module cannot be imported or ``Config()`` cannot
            be created. The underlying exception is attached as
            ``__cause__`` so the real failure is not lost.
    """
    try:
        config_dir = os.path.dirname(config_file)
        # Register the directory once; repeated calls must not keep growing
        # sys.path.
        if config_dir not in sys.path:
            sys.path.append(config_dir)
        # The config may have been written after the import system cached
        # the directory contents.
        importlib.invalidate_caches()
        # NOTE: import_module returns the cached module when another file
        # with the same base name was imported earlier.
        current_config = importlib.import_module(
            os.path.basename(config_file).split('.')[0])
        exp = current_config.Config()
    except Exception as err:
        raise ImportError(
            "{} doesn't contains class named 'Config'".format(
                config_file)) from err
    return exp
def parse_config(config_file):
    """
    Get the config object described by a config file.

    Args:
        config_file (str): file path of config.

    Returns:
        Whatever ``get_config_by_file(config_file)`` returns.

    Raises:
        AssertionError: if ``config_file`` is None.
    """
    # Raised explicitly instead of via ``assert`` so the check still runs
    # when Python is started with -O (which strips assert statements).
    if config_file is None:
        raise AssertionError('plz provide config file')
    return get_config_by_file(config_file)
| @@ -1867,11 +1867,13 @@ class MPlug(PreTrainedModel): | |||||
| ModelFile.TORCH_MODEL_BIN_FILE) | ModelFile.TORCH_MODEL_BIN_FILE) | ||||
| checkpoint = torch.load(checkpoint_path, map_location='cpu') | checkpoint = torch.load(checkpoint_path, map_location='cpu') | ||||
| if 'model' in checkpoint: | if 'model' in checkpoint: | ||||
| state_dict = checkpoint['model'] | |||||
| else: | |||||
| state_dict = checkpoint['module'] | |||||
| checkpoint = checkpoint['model'] | |||||
| checkpoint = { | |||||
| k.replace('model.', ''): v | |||||
| for k, v in checkpoint.items() | |||||
| } | |||||
| msg = model.load_state_dict(state_dict, strict=False) | |||||
| msg = model.load_state_dict(checkpoint, strict=False) | |||||
| print('load checkpoint from %s' % checkpoint_path) | print('load checkpoint from %s' % checkpoint_path) | ||||
| print(msg) | print(msg) | ||||
| return model | return model | ||||
| @@ -9,12 +9,15 @@ if TYPE_CHECKING: | |||||
| from .bert_for_sequence_classification import BertForSequenceClassification | from .bert_for_sequence_classification import BertForSequenceClassification | ||||
| from .bert_for_document_segmentation import BertForDocumentSegmentation | from .bert_for_document_segmentation import BertForDocumentSegmentation | ||||
| from .csanmt_for_translation import CsanmtForTranslation | from .csanmt_for_translation import CsanmtForTranslation | ||||
| from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, | |||||
| BertForMaskedLM) | |||||
| from .masked_language import ( | |||||
| StructBertForMaskedLM, | |||||
| VecoForMaskedLM, | |||||
| BertForMaskedLM, | |||||
| DebertaV2ForMaskedLM, | |||||
| ) | |||||
| from .nncrf_for_named_entity_recognition import ( | from .nncrf_for_named_entity_recognition import ( | ||||
| TransformerCRFForNamedEntityRecognition, | TransformerCRFForNamedEntityRecognition, | ||||
| LSTMCRFForNamedEntityRecognition) | LSTMCRFForNamedEntityRecognition) | ||||
| from .palm_v2 import PalmForTextGeneration | |||||
| from .token_classification import SbertForTokenClassification | from .token_classification import SbertForTokenClassification | ||||
| from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification | from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification | ||||
| from .space import SpaceForDialogIntent | from .space import SpaceForDialogIntent | ||||
| @@ -22,7 +25,6 @@ if TYPE_CHECKING: | |||||
| from .space import SpaceForDialogStateTracking | from .space import SpaceForDialogStateTracking | ||||
| from .star_text_to_sql import StarForTextToSql | from .star_text_to_sql import StarForTextToSql | ||||
| from .task_models import (InformationExtractionModel, | from .task_models import (InformationExtractionModel, | ||||
| SequenceClassificationModel, | |||||
| SingleBackboneTaskModelBase) | SingleBackboneTaskModelBase) | ||||
| from .bart_for_text_error_correction import BartForTextErrorCorrection | from .bart_for_text_error_correction import BartForTextErrorCorrection | ||||
| from .gpt3 import GPT3ForTextGeneration | from .gpt3 import GPT3ForTextGeneration | ||||
| @@ -36,8 +38,10 @@ else: | |||||
| 'csanmt_for_translation': ['CsanmtForTranslation'], | 'csanmt_for_translation': ['CsanmtForTranslation'], | ||||
| 'bert_for_sequence_classification': ['BertForSequenceClassification'], | 'bert_for_sequence_classification': ['BertForSequenceClassification'], | ||||
| 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], | 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], | ||||
| 'masked_language': | |||||
| ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'], | |||||
| 'masked_language': [ | |||||
| 'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM', | |||||
| 'DebertaV2ForMaskedLM' | |||||
| ], | |||||
| 'nncrf_for_named_entity_recognition': [ | 'nncrf_for_named_entity_recognition': [ | ||||
| 'TransformerCRFForNamedEntityRecognition', | 'TransformerCRFForNamedEntityRecognition', | ||||
| 'LSTMCRFForNamedEntityRecognition' | 'LSTMCRFForNamedEntityRecognition' | ||||
| @@ -0,0 +1,73 @@ | |||||
| # flake8: noqa | |||||
| # There's no way to ignore "F401 '...' imported but unused" warnings in this | |||||
| # module, but to preserve other warnings. So, don't check this module at all. | |||||
| # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. | |||||
| # Copyright 2020 The HuggingFace Team. All rights reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
# Module name -> public names it provides.
# NOTE(review): at runtime this first table is replaced by the one built in
# the ``else`` branch below, which does not list 'DebertaV2OnnxConfig'.
_import_structure = {
    'configuration_deberta_v2': [
        'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP',
        'DebertaV2Config',
        'DebertaV2OnnxConfig',
    ],
    'tokenization_deberta_v2': ['DebertaV2Tokenizer'],
}

if TYPE_CHECKING:
    # Eager imports, seen by static type checkers only.
    from .configuration_deberta_v2 import DebertaV2Config
    from .tokenization_deberta_v2 import DebertaV2Tokenizer
    from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast
    from .modeling_deberta_v2 import (
        DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
        DebertaV2ForMaskedLM,
        DebertaV2ForMultipleChoice,
        DebertaV2ForQuestionAnswering,
        DebertaV2ForSequenceClassification,
        DebertaV2ForTokenClassification,
        DebertaV2Model,
        DebertaV2PreTrainedModel,
    )
else:
    import sys

    # Table handed to LazyImportModule, which takes this module's place in
    # sys.modules.
    _import_structure = {
        'configuration_deberta_v2': [
            'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP',
            'DebertaV2Config',
        ],
        'tokenization_deberta_v2': ['DebertaV2Tokenizer'],
        'tokenization_deberta_v2_fast': ['DebertaV2TokenizerFast'],
        'modeling_deberta_v2': [
            'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST',
            'DebertaV2ForMaskedLM',
            'DebertaV2ForMultipleChoice',
            'DebertaV2ForQuestionAnswering',
            'DebertaV2ForSequenceClassification',
            'DebertaV2ForTokenClassification',
            'DebertaV2Model',
            'DebertaV2PreTrainedModel',
        ],
    }

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__)
| @@ -0,0 +1,130 @@ | |||||
| # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. | |||||
| # Copyright 2020, Microsoft and the HuggingFace Inc. team. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """ DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config""" | |||||
| from collections import OrderedDict | |||||
| from typing import TYPE_CHECKING, Any, Mapping, Optional, Union | |||||
| from transformers import PretrainedConfig | |||||
| from modelscope.utils import logger as logging | |||||
| logger = logging.get_logger(__name__) | |||||
class DebertaV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
    DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the DeBERTa
    [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 128100):
            Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DebertaV2Model`].
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 24):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"`
            are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (`int`, *optional*, defaults to 0):
            The vocabulary size of the `token_type_ids` passed when calling [`DebertaV2Model`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-7):
            The epsilon used by the layer normalization layers.
        relative_attention (`bool`, *optional*, defaults to `False`):
            Whether use relative position encoding.
        max_relative_positions (`int`, *optional*, defaults to -1):
            The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value
            as `max_position_embeddings`.
        pad_token_id (`int`, *optional*, defaults to 0):
            The value used to pad input_ids.
        position_biased_input (`bool`, *optional*, defaults to `True`):
            Whether add absolute position embedding to content embedding.
        pos_att_type (`List[str]` or `str`, *optional*):
            The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
            `["c2p"]`, `["p2c", "c2p"]`. For backwards compatibility a single string is also accepted; it is
            lower-cased and split on `|` (e.g. `"p2c|c2p"`).
        pooler_dropout (`float`, *optional*, defaults to 0):
            Stored as `pooler_dropout`.
        pooler_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            Stored as `pooler_hidden_act`.
        pooler_hidden_size (`int`, *optional*, keyword only via `**kwargs`):
            Stored as `pooler_hidden_size`; defaults to `hidden_size` when not given.
    """
    model_type = 'deberta_v2'

    def __init__(self,
                 vocab_size=128100,
                 hidden_size=1536,
                 num_hidden_layers=24,
                 num_attention_heads=24,
                 intermediate_size=6144,
                 hidden_act='gelu',
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=0,
                 initializer_range=0.02,
                 layer_norm_eps=1e-7,
                 relative_attention=False,
                 max_relative_positions=-1,
                 pad_token_id=0,
                 position_biased_input=True,
                 pos_att_type=None,
                 pooler_dropout=0,
                 pooler_hidden_act='gelu',
                 **kwargs):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.relative_attention = relative_attention
        self.max_relative_positions = max_relative_positions
        self.pad_token_id = pad_token_id
        self.position_biased_input = position_biased_input

        # Backwards compatibility: accept the legacy "p2c|c2p" string form.
        # isinstance (rather than an exact type comparison) also covers
        # str subclasses.
        if isinstance(pos_att_type, str):
            pos_att_type = [x.strip() for x in pos_att_type.lower().split('|')]

        self.pos_att_type = pos_att_type
        self.vocab_size = vocab_size
        self.layer_norm_eps = layer_norm_eps

        # Not a named parameter: taken from the extra keyword arguments and
        # falling back to the encoder width.
        self.pooler_hidden_size = kwargs.get('pooler_hidden_size', hidden_size)
        self.pooler_dropout = pooler_dropout
        self.pooler_hidden_act = pooler_hidden_act
| @@ -0,0 +1,546 @@ | |||||
| # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. | |||||
| # Copyright 2020 Microsoft and the HuggingFace Inc. team. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """Tokenization classes for DeBERTa. mainly copied from :module:`~transformers.tokenization_deberta`""" | |||||
| import os | |||||
| import unicodedata | |||||
| from typing import Any, Dict, List, Optional, Tuple | |||||
| import sentencepiece as sp | |||||
| from transformers.tokenization_utils import PreTrainedTokenizer | |||||
| PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} | |||||
| PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} | |||||
| PRETRAINED_INIT_CONFIGURATION = {} | |||||
| VOCAB_FILES_NAMES = {'vocab_file': 'spm.model'} | |||||
class DebertaV2Tokenizer(PreTrainedTokenizer):
    r"""
    DeBERTa-v2 tokenizer built on [SentencePiece](https://github.com/google/sentencepiece), with optional
    word segmentation by [jieba](https://github.com/fxsjy/jieba) before the SentencePiece step.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file holding the vocabulary. It must exist on disk.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Lowercase the text before tokenizing.
        split_by_punct (`bool`, *optional*, defaults to `False`):
            Forwarded to `SPMTokenizer`; when set, text is split on punctuation before SentencePiece encoding.
        split_chinese (`bool`, *optional*, defaults to `True`):
            When set, the text is cut with jieba and the segments are re-joined with single spaces before
            SentencePiece encoding. Requires the `jieba` package.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            Beginning-of-sequence token. Sequences built with special tokens start with `cls_token`, not with
            this token.
        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            End-of-sequence token. Sequences built with special tokens end with `sep_token`, not with this
            token.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            Token returned for ids that fall outside the vocabulary.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            Separator placed after every sequence when special tokens are added.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            Token used for padding, e.g. when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            Classifier token; the first token of a sequence built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            Token used for masking values in masked language modeling.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments handed to `SentencePieceProcessor.__init__()` (for example `enable_sampling`,
            `nbest_size`, `alpha`). See the
            [SentencePiece Python wrapper](https://github.com/google/sentencepiece/tree/master/python).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 split_by_punct=False,
                 split_chinese=True,
                 bos_token='[CLS]',
                 eos_token='[SEP]',
                 unk_token='[UNK]',
                 sep_token='[SEP]',
                 pad_token='[PAD]',
                 cls_token='[CLS]',
                 mask_token='[MASK]',
                 sp_model_kwargs: Optional[Dict[str, Any]] = None,
                 **kwargs) -> None:
        if sp_model_kwargs is None:
            sp_model_kwargs = {}
        self.sp_model_kwargs = sp_model_kwargs

        super().__init__(
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            split_by_punct=split_by_punct,
            split_chinese=split_chinese,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        vocab_found = os.path.isfile(vocab_file)
        if not vocab_found:
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                ' model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
            )

        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        self.split_chinese = split_chinese
        self.vocab_file = vocab_file
        self._tokenizer = SPMTokenizer(
            vocab_file,
            split_by_punct=split_by_punct,
            sp_model_kwargs=self.sp_model_kwargs)

        # jieba is only imported when Chinese word segmentation is requested.
        self.jieba = None
        if self.split_chinese:
            try:
                import jieba
            except ImportError:
                raise ImportError(
                    'You need to install jieba to split chinese and use DebertaV2Tokenizer. '
                    'See https://pypi.org/project/jieba/ for installation.')
            self.jieba = jieba

    @property
    def vocab_size(self):
        """Number of entries in the underlying SentencePiece vocabulary."""
        return len(self.vocab)

    @property
    def vocab(self):
        """The piece-to-id mapping held by the wrapped `SPMTokenizer`."""
        return self._tokenizer.vocab

    def get_vocab(self):
        """Return a copy of the base vocabulary extended with the added tokens."""
        merged = self.vocab.copy()
        merged.update(self.get_added_vocab())
        return merged

    def _tokenize(self, text: str) -> List[str]:
        """Split `text` into sub-word pieces, applying lowercasing and jieba segmentation when enabled."""
        if self.do_lower_case:
            text = text.lower()
        if self.split_chinese:
            segments = list(self.jieba.cut(text))
            text = ' '.join(segments)
        return self._tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """Map a token string to its SentencePiece id."""
        return self._tokenizer.spm.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Map an id to its token string; ids at or beyond the vocabulary size yield the unknown token."""
        if index < self.vocab_size:
            return self._tokenizer.spm.IdToPiece(index)
        return self.unk_token

    def convert_tokens_to_string(self, tokens):
        """Join a sequence of token strings back into a single string."""
        return self._tokenizer.decode(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one sequence, or a pair of sequences, with the special tokens.

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Ids of the second sequence, when encoding a pair.

        Returns:
            `List[int]`: The ids with the special tokens inserted.
        """
        head = [self.cls_token_id]
        tail = [self.sep_token_id]
        if token_ids_1 is None:
            return head + token_ids_0 + tail
        return head + token_ids_0 + tail + token_ids_1 + tail

    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        """
        Build a mask marking where special tokens sit (1) and where sequence tokens sit (0).

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Ids of the second sequence, when encoding a pair.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Set when the id lists already contain the special tokens; the base class implementation is
                used in that case.

        Returns:
            `List[int]`: A list of 0/1 flags, 1 for a special token and 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True)

        mask = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            mask = mask + ([0] * len(token_ids_1)) + [1]
        return mask

    def create_token_type_ids_from_sequences(self,
                                             token_ids_0,
                                             token_ids_1=None):
        """
        Build the token type ids for one sequence or a pair of sequences.

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        The first segment (including [CLS] and its [SEP]) is all 0s; the second segment (including its trailing
        [SEP]) is all 1s. Without `token_ids_1` only the 0s are returned.

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Ids of the second sequence, when encoding a pair.

        Returns:
            `List[int]`: The token type ids.
        """
        sep_ids = [self.sep_token_id]
        cls_ids = [self.cls_token_id]
        first_segment = len(cls_ids + token_ids_0 + sep_ids) * [0]
        if token_ids_1 is None:
            return first_segment
        return first_segment + len(token_ids_1 + sep_ids) * [1]

    def prepare_for_tokenization(self,
                                 text,
                                 is_split_into_words=False,
                                 **kwargs):
        """Prefix `text` with a space when it is pre-split or `add_prefix_space` is given; return it with the
        remaining kwargs."""
        wants_prefix_space = kwargs.pop('add_prefix_space', False)
        if is_split_into_words or wants_prefix_space:
            text = ' ' + text
        return (text, kwargs)

    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the SentencePiece model into `save_directory` and return the written path as a 1-tuple."""
        return self._tokenizer.save_pretrained(
            save_directory, filename_prefix=filename_prefix)
class SPMTokenizer:
    r"""
    Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file that contains the vocabulary. It must exist on disk.
        split_by_punct (`bool`, *optional*, defaults to `False`):
            When set, the text is split on punctuation and every chunk is encoded separately.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
    """

    def __init__(self,
                 vocab_file,
                 split_by_punct=False,
                 sp_model_kwargs: Optional[Dict[str, Any]] = None):
        self.split_by_punct = split_by_punct
        self.vocab_file = vocab_file
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
        if not os.path.exists(vocab_file):
            raise FileNotFoundError(f'{vocab_file} does not exist!')
        spm.load(vocab_file)
        bpe_vocab_size = spm.GetPieceSize()
        # Token map
        # <unk> 0+1
        # <s>   1+1
        # </s>  2+1
        # Query each piece once and derive the reverse mapping from the list.
        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
        self.vocab = {
            piece: index
            for index, piece in enumerate(self.ids_to_tokens)
        }
        self.spm = spm
        # Tokens registered through `add_special_token`. This must exist
        # before `add_special_token` / `part_of_whole_word` are called.
        self.special_tokens = []

    def __getstate__(self):
        """Return the picklable state; the SentencePiece processor is dropped and rebuilt on load."""
        state = self.__dict__.copy()
        state['spm'] = None
        return state

    def __setstate__(self, d):
        """Restore the state and reload the SentencePiece processor from `vocab_file`."""
        self.__dict__ = d

        # for backward compatibility with states saved before these
        # attributes existed
        if not hasattr(self, 'sp_model_kwargs'):
            self.sp_model_kwargs = {}
        if not hasattr(self, 'special_tokens'):
            self.special_tokens = []

        self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
        self.spm.Load(self.vocab_file)

    def tokenize(self, text):
        """Encode `text` into a list of SentencePiece pieces."""
        return self._encode_as_pieces(text)

    def convert_ids_to_tokens(self, ids):
        """Map every id in `ids` to its token string."""
        return [self.ids_to_tokens[i] for i in ids]

    def decode(self, tokens, start=-1, end=-1, raw_text=None):
        """Turn tokens back into text.

        Without `raw_text` the pieces are decoded by SentencePiece. With
        `raw_text`, the raw text is split into words and the words covering
        the token positions `start`..`end` are joined and returned.
        """
        if raw_text is None:
            return self.spm.decode_pieces(list(tokens))

        words = self.split_to_words(raw_text)
        word_tokens = [self.tokenize(w) for w in words]
        # token2words[t] is the index of the word that token t belongs to.
        token2words = [0] * len(tokens)
        tid = 0
        for word_index, pieces in enumerate(word_tokens):
            for _ in pieces:
                token2words[tid] = word_index
                tid += 1
        word_start = token2words[start]
        word_end = token2words[end] if end < len(tokens) else len(words)
        return ''.join(words[word_start:word_end])

    def add_special_token(self, token):
        """Register `token` as a special token and return its id.

        A token that is not yet in the vocabulary is appended at the end, so
        its id is the index it occupies in `ids_to_tokens`.
        """
        if token not in self.special_tokens:
            self.special_tokens.append(token)
            if token not in self.vocab:
                # The new id must equal the position the token gets in
                # `ids_to_tokens`; `len(self.vocab) - 1` would reuse the id
                # of the last existing piece.
                self.vocab[token] = len(self.ids_to_tokens)
                self.ids_to_tokens.append(token)
        return self.id(token)

    def part_of_whole_word(self, token, is_bos=False):
        """Return True when `token` continues a word rather than starting one."""
        if is_bos:
            return True
        first_char = token[0]
        if len(token) == 1 and _is_whitespace(first_char):
            return False
        if _is_control(first_char):
            return False
        if _is_punctuation(first_char):
            return False
        # Membership is checked against the registered special tokens (the
        # list, not the `add_special_token` method).
        if token in self.special_tokens:
            return False

        word_start = b'\xe2\x96\x81'.decode('utf-8')
        return not token.startswith(word_start)

    def pad(self):
        """Return the padding token string."""
        return '[PAD]'

    def bos(self):
        """Return the beginning-of-sequence token string."""
        return '[CLS]'

    def eos(self):
        """Return the end-of-sequence token string."""
        return '[SEP]'

    def unk(self):
        """Return the unknown token string."""
        return '[UNK]'

    def mask(self):
        """Return the mask token string."""
        return '[MASK]'

    def sym(self, id):
        """Return the token string stored at index `id`."""
        return self.ids_to_tokens[id]

    def id(self, sym):
        """Return the id of `sym`, or 1 when it is not in the vocabulary."""
        return self.vocab[sym] if sym in self.vocab else 1

    def _encode_as_pieces(self, text):
        """Encode `text` into pieces, splitting on punctuation first when `split_by_punct` is set."""
        text = convert_to_unicode(text)
        if not self.split_by_punct:
            return self.spm.encode(text, out_type=str)

        pieces = []
        for word in self._run_split_on_punc(text):
            pieces.extend(self.spm.encode(word, out_type=str))
        return pieces

    def split_to_words(self, text):
        """Split `text` into consecutive substrings aligned with word-initial SentencePiece pieces."""
        pieces = self._encode_as_pieces(text)
        word_start = b'\xe2\x96\x81'.decode('utf-8')
        words = []
        offset = 0
        prev_end = 0
        for i, p in enumerate(pieces):
            if p.startswith(word_start):
                # A word-initial piece closes the word collected so far.
                if offset > prev_end:
                    words.append(text[prev_end:offset])
                prev_end = offset
                w = p.replace(word_start, '')
            else:
                w = p

            try:
                s = text.index(w, offset)
            except ValueError:
                # The piece text does not occur after `offset`; advance by
                # one character and carry on.
                offset = offset + 1
                continue

            # Find the next piece with visible text.
            pn = ''
            k = i + 1
            while k < len(pieces):
                pn = pieces[k].replace(word_start, '')
                if len(pn) > 0:
                    break
                k += 1

            # If that next piece already shows up before the match, the match
            # found for the current piece is too far ahead; step one
            # character only.
            if len(pn) > 0 and pn in text[offset:s]:
                offset = offset + 1
            else:
                offset = s + len(w)

        if prev_end < offset:
            words.append(text[prev_end:offset])

        return words

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize('NFD', text)
        # After NFD normalization accents are separate `Mn` characters.
        return ''.join(
            char for char in text if unicodedata.category(char) != 'Mn')

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        output = []
        start_new_word = True
        for char in text:
            if _is_punctuation(char):
                # Every punctuation character becomes its own chunk.
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
        return [''.join(x) for x in output]

    def save_pretrained(self, path: str, filename_prefix: str = None):
        """Write the serialized SentencePiece model into directory `path`.

        Args:
            path: Directory that receives the model file.
            filename_prefix: Optional prefix joined to the file name with `-`.

        Returns:
            A 1-tuple holding the full path of the written file.
        """
        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
        if filename_prefix is not None:
            filename = filename_prefix + '-' + filename
        full_path = os.path.join(path, filename)
        with open(full_path, 'wb') as fs:
            fs.write(self.spm.serialized_model_proto())
        return (full_path, )
| def _is_whitespace(char): | |||||
| """Checks whether `chars` is a whitespace character.""" | |||||
| # \t, \n, and \r are technically control characters but we treat them | |||||
| # as whitespace since they are generally considered as such. | |||||
| if char == ' ' or char == '\t' or char == '\n' or char == '\r': | |||||
| return True | |||||
| cat = unicodedata.category(char) | |||||
| if cat == 'Zs': | |||||
| return True | |||||
| return False | |||||
| def _is_control(char): | |||||
| """Checks whether `chars` is a control character.""" | |||||
| # These are technically control characters but we count them as whitespace | |||||
| # characters. | |||||
| if char == '\t' or char == '\n' or char == '\r': | |||||
| return False | |||||
| cat = unicodedata.category(char) | |||||
| if cat.startswith('C'): | |||||
| return True | |||||
| return False | |||||
| def _is_punctuation(char): | |||||
| """Checks whether `chars` is a punctuation character.""" | |||||
| cp = ord(char) | |||||
| # We treat all non-letter/number ASCII as punctuation. | |||||
| # Characters such as "^", "$", and "`" are not in the Unicode | |||||
| # Punctuation class but we treat them as punctuation anyways, for | |||||
| # consistency. | |||||
| if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( | |||||
| cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): | |||||
| return True | |||||
| cat = unicodedata.category(char) | |||||
| if cat.startswith('P'): | |||||
| return True | |||||
| return False | |||||
def convert_to_unicode(text):
    """Return `text` as `str`.

    A `str` is returned unchanged; `bytes` are decoded as UTF-8 with
    undecodable bytes dropped. Any other type raises `ValueError`.
    """
    if isinstance(text, bytes):
        return text.decode('utf-8', 'ignore')
    if not isinstance(text, str):
        raise ValueError(f'Unsupported string type: {type(text)}')
    return text
| @@ -0,0 +1,241 @@ | |||||
| # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. | |||||
| # Copyright 2020 Microsoft and the HuggingFace Inc. team. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| """Fast Tokenization class for model DeBERTa.""" | |||||
| import os | |||||
| from shutil import copyfile | |||||
| from typing import Optional, Tuple | |||||
| from transformers.file_utils import is_sentencepiece_available | |||||
| from transformers.tokenization_utils_fast import PreTrainedTokenizerFast | |||||
| from modelscope.utils import logger as logging | |||||
# The slow tokenizer is only imported when sentencepiece is available;
# otherwise the name is bound to None.
if is_sentencepiece_available():
    from .tokenization_deberta_v2 import DebertaV2Tokenizer
else:
    DebertaV2Tokenizer = None
# NOTE(review): `logging` here is modelscope.utils.logger — confirm that its
# get_logger takes a logger name (not e.g. a log-file path) as first argument.
logger = logging.get_logger(__name__)
# File names for the SentencePiece model and the serialized fast tokenizer.
VOCAB_FILES_NAMES = {
    'vocab_file': 'spm.model',
    'tokenizer_file': 'tokenizer.json'
}
# No pretrained vocabulary locations, positional-embedding sizes or init
# configurations are registered in this module, so the maps are empty.
PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
PRETRAINED_INIT_CONFIGURATION = {}
class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
    r"""
    Fast DeBERTa-v2 tokenizer, the counterpart of `DebertaV2Tokenizer` (its `slow_tokenizer_class`).

    Args:
        vocab_file (`str`, *optional*):
            Path to the SentencePiece model file holding the vocabulary. Needed to be able to save the
            vocabulary for a slow tokenizer.
        tokenizer_file (`str`, *optional*):
            Path to a serialized fast tokenizer file, forwarded to the base class.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
        split_by_punct (`bool`, *optional*, defaults to `False`):
            Stored on the instance and forwarded to the base class.
        split_chinese (`bool`, *optional*, defaults to `True`):
            Stored on the instance and forwarded to the base class.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            Beginning-of-sequence token. Sequences built with special tokens start with `cls_token`, not with
            this token.
        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            End-of-sequence token. Sequences built with special tokens end with `sep_token`, not with this
            token.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token, used for tokens that are not in the vocabulary.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            Separator placed after every sequence when special tokens are added.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            Token used for padding, e.g. when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            Classifier token; the first token of a sequence built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            Token used for masking values in masked language modeling.
        **kwargs:
            Forwarded unchanged to `PreTrainedTokenizerFast.__init__()`. `sp_model_kwargs` is not a named
            parameter of this class; if supplied it travels through here (presumably for the slow tokenizer —
            verify).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = DebertaV2Tokenizer

    def __init__(self,
                 vocab_file=None,
                 tokenizer_file=None,
                 do_lower_case=False,
                 split_by_punct=False,
                 split_chinese=True,
                 bos_token='[CLS]',
                 eos_token='[SEP]',
                 unk_token='[UNK]',
                 sep_token='[SEP]',
                 pad_token='[PAD]',
                 cls_token='[CLS]',
                 mask_token='[MASK]',
                 **kwargs) -> None:
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            split_by_punct=split_by_punct,
            split_chinese=split_chinese,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        self.split_chinese = split_chinese
        self.vocab_file = vocab_file
        # Saving a slow-tokenizer vocabulary is only possible when a
        # vocabulary file was given.
        self.can_save_slow_tokenizer = bool(self.vocab_file)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one sequence, or a pair of sequences, with the special tokens.

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Ids of the second sequence, when encoding a pair.

        Returns:
            `List[int]`: The ids with the special tokens inserted.
        """
        head = [self.cls_token_id]
        tail = [self.sep_token_id]
        if token_ids_1 is None:
            return head + token_ids_0 + tail
        return head + token_ids_0 + tail + token_ids_1 + tail

    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        """
        Build a mask marking where special tokens sit (1) and where sequence tokens sit (0).

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Ids of the second sequence, when encoding a pair.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Set when the id lists already contain the special tokens; the base class implementation is
                used in that case.

        Returns:
            `List[int]`: A list of 0/1 flags, 1 for a special token and 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True)

        mask = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            mask = mask + ([0] * len(token_ids_1)) + [1]
        return mask

    def create_token_type_ids_from_sequences(self,
                                             token_ids_0,
                                             token_ids_1=None):
        """
        Build the token type ids for one sequence or a pair of sequences.

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        The first segment (including [CLS] and its [SEP]) is all 0s; the second segment (including its trailing
        [SEP]) is all 1s. Without `token_ids_1` only the 0s are returned.

        Args:
            token_ids_0 (`List[int]`):
                Ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Ids of the second sequence, when encoding a pair.

        Returns:
            `List[int]`: The token type ids.
        """
        sep_ids = [self.sep_token_id]
        cls_ids = [self.cls_token_id]
        first_segment = len(cls_ids + token_ids_0 + sep_ids) * [0]
        if token_ids_1 is None:
            return first_segment
        return first_segment + len(token_ids_1 + sep_ids) * [1]

    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the SentencePiece vocabulary file into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory that receives the file. When it is not a directory an error is logged and
                `None` is returned.
            filename_prefix (`str`, *optional*):
                Prefix joined to the file name with `-`.

        Returns:
            `Tuple[str]`: A 1-tuple with the path of the vocabulary file.

        Raises:
            ValueError: If the tokenizer was created without a vocabulary file.
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
                'tokenizer.')

        if not os.path.isdir(save_directory):
            logger.error(
                f'Vocabulary path ({save_directory}) should be a directory')
            return

        prefix = filename_prefix + '-' if filename_prefix else ''
        out_vocab_file = os.path.join(
            save_directory, prefix + VOCAB_FILES_NAMES['vocab_file'])

        # Skip the copy when source and destination are the same file.
        source_path = os.path.abspath(self.vocab_file)
        if source_path != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file, )
| @@ -339,5 +339,9 @@ class GPT3Model(PreTrainedModel): | |||||
| state_dict_file = os.path.join(pretrained_model_name_or_path, | state_dict_file = os.path.join(pretrained_model_name_or_path, | ||||
| ModelFile.TORCH_MODEL_BIN_FILE) | ModelFile.TORCH_MODEL_BIN_FILE) | ||||
| state_dict = torch.load(state_dict_file) | state_dict = torch.load(state_dict_file) | ||||
| state_dict = { | |||||
| k.replace('model.language_model', 'language_model'): v | |||||
| for k, v in state_dict.items() | |||||
| } | |||||
| model.load_state_dict(state_dict) | model.load_state_dict(state_dict) | ||||
| return model | return model | ||||
| @@ -6,6 +6,8 @@ from transformers import BertForMaskedLM as BertForMaskedLMTransformer | |||||
| from modelscope.metainfo import Models | from modelscope.metainfo import Models | ||||
| from modelscope.models.base import TorchModel | from modelscope.models.base import TorchModel | ||||
| from modelscope.models.builder import MODELS | from modelscope.models.builder import MODELS | ||||
| from modelscope.models.nlp.deberta_v2 import \ | |||||
| DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer | |||||
| from modelscope.models.nlp.structbert import SbertForMaskedLM | from modelscope.models.nlp.structbert import SbertForMaskedLM | ||||
| from modelscope.models.nlp.veco import \ | from modelscope.models.nlp.veco import \ | ||||
| VecoForMaskedLM as VecoForMaskedLMTransformer | VecoForMaskedLM as VecoForMaskedLMTransformer | ||||
| @@ -125,3 +127,40 @@ class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): | |||||
| VecoForMaskedLM).from_pretrained( | VecoForMaskedLM).from_pretrained( | ||||
| pretrained_model_name_or_path=model_dir, | pretrained_model_name_or_path=model_dir, | ||||
| model_dir=model_dir) | model_dir=model_dir) | ||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer):
    """Masked-language-model wrapper for DeBERTa v2.

    Combines ``deberta_v2.DebertaV2ForMaskedLM`` with ``TorchModel`` so that
    the model can be registered in ``MODELS`` for the fill-mask task.
    """

    def __init__(self, config, model_dir):
        # The lookup deliberately starts *after* TorchModel in the MRO; the
        # DeBERTa initialiser is then run explicitly with the model config.
        super(TorchModel, self).__init__(model_dir)
        DebertaV2ForMaskedLMTransformer.__init__(self, config)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                labels=None):
        """Run the DeBERTa v2 MLM forward pass and echo ``input_ids`` back.

        ``head_mask`` is accepted but is not forwarded to the underlying
        model. The returned object is whatever the parent ``forward``
        produces, with ``input_ids`` additionally stored under
        ``OutputKeys.INPUT_IDS``.
        """
        forwarded = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            labels=labels,
        )
        result = DebertaV2ForMaskedLMTransformer.forward(self, **forwarded)
        result[OutputKeys.INPUT_IDS] = input_ids
        return result

    @classmethod
    def _instantiate(cls, **kwargs):
        """Build the model via ``from_pretrained`` using ``kwargs['model_dir']``."""
        checkpoint_dir = kwargs.get('model_dir')
        loader = super(DebertaV2ForMaskedLMTransformer,
                       DebertaV2ForMaskedLM).from_pretrained
        return loader(
            pretrained_model_name_or_path=checkpoint_dir,
            model_dir=checkpoint_dir)
| @@ -592,11 +592,11 @@ class AbsSummarizer(PalmPreTrainedModel): # Model | |||||
| self.generator.dense.weight = self.decoder.embeddings.weight | self.generator.dense.weight = self.decoder.embeddings.weight | ||||
| if checkpoint is not None: | if checkpoint is not None: | ||||
| for key in list(checkpoint['model'].keys()): | |||||
| checkpoint['model'][key.replace('module.', | |||||
| '')] = checkpoint['model'][key] | |||||
| msg = self.load_state_dict(checkpoint['model'], strict=False) | |||||
| print(msg) | |||||
| if 'model' in checkpoint: | |||||
| checkpoint = checkpoint['model'] | |||||
| for key in list(checkpoint.keys()): | |||||
| checkpoint[key.replace('model.palm.', '')] = checkpoint[key] | |||||
| self.load_state_dict(checkpoint, strict=False) | |||||
| else: | else: | ||||
| for module in self.decoder.modules(): | for module in self.decoder.modules(): | ||||
| if isinstance(module, (nn.Linear, nn.Embedding)): | if isinstance(module, (nn.Linear, nn.Embedding)): | ||||
| @@ -734,7 +734,7 @@ class PalmForConditionalGeneration(PalmPreTrainedModel): | |||||
| return addict.Dict(loss=loss) | return addict.Dict(loss=loss) | ||||
| class Translator(nn.Module): | |||||
| class Translator(object): | |||||
| """ | """ | ||||
| Uses a model to translate a batch of sentences. | Uses a model to translate a batch of sentences. | ||||
| """ | """ | ||||
| @@ -1298,8 +1298,8 @@ class Translator(nn.Module): | |||||
| return results | return results | ||||
| def forward(self, input_ids: torch.Tensor, | |||||
| attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: | |||||
| def __call__(self, input_ids: torch.Tensor, | |||||
| attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: | |||||
| batch = self.Batch( | batch = self.Batch( | ||||
| batch_size=input_ids.size()[0], | batch_size=input_ids.size()[0], | ||||
| src=input_ids, | src=input_ids, | ||||
| @@ -0,0 +1,20 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
if TYPE_CHECKING:
    # Static analysers get the real symbol through an ordinary import.
    from .face_2d_keypoints_dataset import FaceKeypointDataset
else:
    import sys

    # Submodule name -> public names that submodule provides.
    _import_structure = {
        'face_2d_keypoints_dataset': ['FaceKeypointDataset'],
    }

    # At runtime this package's sys.modules entry is handed over to
    # LazyImportModule, constructed from the mapping above.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
| @@ -0,0 +1,13 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset | |||||
| from modelscope.metainfo import Datasets | |||||
| from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS | |||||
| from modelscope.utils.constant import Tasks | |||||
@TASK_DATASETS.register_module(
    group_key=Tasks.face_2d_keypoints,
    module_name=Datasets.Face2dKeypointsDataset,
)
class FaceKeypointDataset(_FaceKeypointDataset):
    """Face 2D keypoints dataset.

    Thin subclass of EasyCV's ``FaceKeypointDataset`` that adds no behaviour
    of its own; it exists so the dataset is registered in ``TASK_DATASETS``
    under the face-2d-keypoints task.
    """
| @@ -70,12 +70,12 @@ class MsIterableDataset(torch.utils.data.IterableDataset): | |||||
| for idx in range(iter_start, iter_end): | for idx in range(iter_start, iter_end): | ||||
| item_dict = self.dataset[idx] | item_dict = self.dataset[idx] | ||||
| res = { | res = { | ||||
| k: np.array(item_dict[k]) | |||||
| k: torch.tensor(item_dict[k]) | |||||
| for k in self.columns if k in self.retained_columns | for k in self.columns if k in self.retained_columns | ||||
| } | } | ||||
| for preprocessor in self.preprocessor_list: | for preprocessor in self.preprocessor_list: | ||||
| res.update({ | res.update({ | ||||
| k: np.array(v) | |||||
| k: torch.tensor(v) | |||||
| for k, v in preprocessor(item_dict).items() | for k, v in preprocessor(item_dict).items() | ||||
| if k in self.retained_columns | if k in self.retained_columns | ||||
| }) | }) | ||||
| @@ -574,14 +574,8 @@ class MsDataset: | |||||
| None | None | ||||
| """ | """ | ||||
| from modelscope.hub.api import HubApi | |||||
| _hub_api = HubApi() | |||||
| cookies = _hub_api.check_cookies_upload_data(use_cookies=True) | |||||
| _upload_manager = DatasetUploadManager( | _upload_manager = DatasetUploadManager( | ||||
| dataset_name=dataset_name, | |||||
| namespace=namespace, | |||||
| version=version, | |||||
| cookies=cookies) | |||||
| dataset_name=dataset_name, namespace=namespace, version=version) | |||||
| _upload_manager.upload(object_name, local_file_path) | _upload_manager.upload(object_name, local_file_path) | ||||
| @staticmethod | @staticmethod | ||||
| @@ -18,6 +18,12 @@ class OssUtilities: | |||||
| self.oss_dir = oss_config['Dir'] | self.oss_dir = oss_config['Dir'] | ||||
| self.oss_backup_dir = oss_config['BackupDir'] | self.oss_backup_dir = oss_config['BackupDir'] | ||||
| self.upload_resumable_tmp_store = '/tmp/modelscope/tmp_dataset' | |||||
| self.upload_multipart_threshold = 50 * 1024 * 1024 | |||||
| self.upload_part_size = 1 * 1024 * 1024 | |||||
| self.upload_num_threads = 4 | |||||
| self.upload_max_retries = 3 | |||||
| @staticmethod | @staticmethod | ||||
| def _percentage(consumed_bytes, total_bytes): | def _percentage(consumed_bytes, total_bytes): | ||||
| if total_bytes: | if total_bytes: | ||||
| @@ -42,21 +48,27 @@ class OssUtilities: | |||||
| progress_callback=self._percentage) | progress_callback=self._percentage) | ||||
| return local_path | return local_path | ||||
| def upload(self, oss_file_name: str, local_file_path: str) -> str: | |||||
| max_retries = 3 | |||||
| def upload(self, oss_object_name: str, local_file_path: str) -> str: | |||||
| retry_count = 0 | retry_count = 0 | ||||
| object_key = os.path.join(self.oss_dir, oss_file_name) | |||||
| object_key = os.path.join(self.oss_dir, oss_object_name) | |||||
| resumable_store = oss2.ResumableStore( | |||||
| root=self.upload_resumable_tmp_store) | |||||
| while True: | while True: | ||||
| try: | try: | ||||
| retry_count += 1 | retry_count += 1 | ||||
| self.bucket.put_object_from_file( | |||||
| oss2.resumable_upload( | |||||
| self.bucket, | |||||
| object_key, | object_key, | ||||
| local_file_path, | local_file_path, | ||||
| progress_callback=self._percentage) | |||||
| store=resumable_store, | |||||
| multipart_threshold=self.upload_multipart_threshold, | |||||
| part_size=self.upload_part_size, | |||||
| progress_callback=self._percentage, | |||||
| num_threads=self.upload_num_threads) | |||||
| break | break | ||||
| except Exception: | except Exception: | ||||
| if retry_count >= max_retries: | |||||
| if retry_count >= self.upload_max_retries: | |||||
| raise | raise | ||||
| return object_key | return object_key | ||||
| @@ -1,23 +1,21 @@ | |||||
| from http.cookiejar import CookieJar | |||||
| from .oss_utils import OssUtilities | from .oss_utils import OssUtilities | ||||
| class DatasetUploadManager(object): | class DatasetUploadManager(object): | ||||
| def __init__(self, dataset_name: str, namespace: str, version: str, | |||||
| cookies: CookieJar): | |||||
| def __init__(self, dataset_name: str, namespace: str, version: str): | |||||
| from modelscope.hub.api import HubApi | from modelscope.hub.api import HubApi | ||||
| api = HubApi() | |||||
| oss_config = api.get_dataset_access_config_session( | |||||
| cookies=cookies, | |||||
| _hub_api = HubApi() | |||||
| _cookies = _hub_api.check_cookies_upload_data(use_cookies=True) | |||||
| _oss_config = _hub_api.get_dataset_access_config_session( | |||||
| cookies=_cookies, | |||||
| dataset_name=dataset_name, | dataset_name=dataset_name, | ||||
| namespace=namespace, | namespace=namespace, | ||||
| revision=version) | revision=version) | ||||
| self.oss_utilities = OssUtilities(oss_config) | |||||
| self.oss_utilities = OssUtilities(_oss_config) | |||||
| def upload(self, oss_file_name: str, local_file_path: str) -> str: | |||||
| oss_object_key = self.oss_utilities.upload( | |||||
| oss_file_name=oss_file_name, local_file_path=local_file_path) | |||||
| return oss_object_key | |||||
| def upload(self, object_name: str, local_file_path: str) -> str: | |||||
| object_key = self.oss_utilities.upload( | |||||
| oss_object_name=object_name, local_file_path=local_file_path) | |||||
| return object_key | |||||
| @@ -35,6 +35,7 @@ class OutputKeys(object): | |||||
| UUID = 'uuid' | UUID = 'uuid' | ||||
| WORD = 'word' | WORD = 'word' | ||||
| KWS_LIST = 'kws_list' | KWS_LIST = 'kws_list' | ||||
| TIMESTAMPS = 'timestamps' | |||||
| SPLIT_VIDEO_NUM = 'split_video_num' | SPLIT_VIDEO_NUM = 'split_video_num' | ||||
| SPLIT_META_DICT = 'split_meta_dict' | SPLIT_META_DICT = 'split_meta_dict' | ||||
| @@ -56,6 +57,15 @@ TASK_OUTPUTS = { | |||||
| # } | # } | ||||
| Tasks.ocr_recognition: [OutputKeys.TEXT], | Tasks.ocr_recognition: [OutputKeys.TEXT], | ||||
| # face 2d keypoint result for single sample | |||||
| # { | |||||
| # "keypoints": [ | |||||
| # [x1, y1]*106 | |||||
| # ], | |||||
| # "poses": [pitch, roll, yaw] | |||||
| # } | |||||
| Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES], | |||||
| # face detection result for single sample | # face detection result for single sample | ||||
| # { | # { | ||||
| # "scores": [0.9, 0.1, 0.05, 0.05] | # "scores": [0.9, 0.1, 0.05, 0.05] | ||||
| @@ -75,6 +85,14 @@ TASK_OUTPUTS = { | |||||
| Tasks.face_detection: | Tasks.face_detection: | ||||
| [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], | [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], | ||||
| # facial expression recognition result for single sample | |||||
| # { | |||||
| # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02], | |||||
| # "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'] | |||||
| # } | |||||
| Tasks.facial_expression_recognition: | |||||
| [OutputKeys.SCORES, OutputKeys.LABELS], | |||||
| # face recognition result for single sample | # face recognition result for single sample | ||||
| # { | # { | ||||
| # "img_embedding": np.array with shape [1, D], | # "img_embedding": np.array with shape [1, D], | ||||
| @@ -201,6 +219,21 @@ TASK_OUTPUTS = { | |||||
| # } | # } | ||||
| Tasks.body_3d_keypoints: [OutputKeys.POSES], | Tasks.body_3d_keypoints: [OutputKeys.POSES], | ||||
| # 2D hand keypoints result for single sample | |||||
| # { | |||||
| # "keypoints": [ | |||||
| # [[x, y, score] * 21], | |||||
| # [[x, y, score] * 21], | |||||
| # [[x, y, score] * 21], | |||||
| # ], | |||||
| # "boxes": [ | |||||
| # [x1, y1, x2, y2], | |||||
| # [x1, y1, x2, y2], | |||||
| # [x1, y1, x2, y2], | |||||
| # ] | |||||
| # } | |||||
| Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES], | |||||
| # video single object tracking result for single video | # video single object tracking result for single video | ||||
| # { | # { | ||||
| # "boxes": [ | # "boxes": [ | ||||
| @@ -242,7 +275,20 @@ TASK_OUTPUTS = { | |||||
| # "output_img": np.ndarray with shape [height, width, 3] | # "output_img": np.ndarray with shape [height, width, 3] | ||||
| # } | # } | ||||
| Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG], | Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG], | ||||
| # text driven segmentation result for single sample | |||||
| # { | |||||
| # "masks": [ | |||||
| # np.array # 2D array containing only 0, 255 | |||||
| # ] | |||||
| # } | |||||
| Tasks.text_driven_segmentation: [OutputKeys.MASKS], | |||||
| # shop segmentation result for single sample | |||||
| # { | |||||
| # "masks": [ | |||||
| # np.array # 2D array containing only 0, 255 | |||||
| # ] | |||||
| # } | |||||
| Tasks.shop_segmentation: [OutputKeys.MASKS], | |||||
| # movie scene segmentation result for a single video | # movie scene segmentation result for a single video | ||||
| # { | # { | ||||
| # "split_video_num":3, | # "split_video_num":3, | ||||
| @@ -541,6 +587,19 @@ TASK_OUTPUTS = { | |||||
| # } | # } | ||||
| Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS], | Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS], | ||||
| # { | |||||
| # 'labels': ['吸烟', '打电话', '吸烟'], | |||||
| # 'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487], | |||||
| # 'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]], | |||||
| # 'timestamps': [1, 3, 5] | |||||
| # } | |||||
| Tasks.action_detection: [ | |||||
| OutputKeys.TIMESTAMPS, | |||||
| OutputKeys.LABELS, | |||||
| OutputKeys.SCORES, | |||||
| OutputKeys.BOXES, | |||||
| ], | |||||
| # { | # { | ||||
| # 'output': [ | # 'output': [ | ||||
| # [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509}, | # [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509}, | ||||
| @@ -551,6 +610,7 @@ TASK_OUTPUTS = { | |||||
| # {'label': '13421097', 'score': 2.75914817393641e-06}]] | # {'label': '13421097', 'score': 2.75914817393641e-06}]] | ||||
| # } | # } | ||||
| Tasks.faq_question_answering: [OutputKeys.OUTPUT], | Tasks.faq_question_answering: [OutputKeys.OUTPUT], | ||||
| # image person reid result for single sample | # image person reid result for single sample | ||||
| # { | # { | ||||
| # "img_embedding": np.array with shape [1, D], | # "img_embedding": np.array with shape [1, D], | ||||
| @@ -2,7 +2,6 @@ | |||||
| import os.path as osp | import os.path as osp | ||||
| from abc import ABC, abstractmethod | from abc import ABC, abstractmethod | ||||
| from contextlib import contextmanager | |||||
| from threading import Lock | from threading import Lock | ||||
| from typing import Any, Dict, Generator, List, Mapping, Union | from typing import Any, Dict, Generator, List, Mapping, Union | ||||
| @@ -71,6 +71,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { | |||||
| Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'), | Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'), | ||||
| Tasks.action_recognition: (Pipelines.action_recognition, | Tasks.action_recognition: (Pipelines.action_recognition, | ||||
| 'damo/cv_TAdaConv_action-recognition'), | 'damo/cv_TAdaConv_action-recognition'), | ||||
| Tasks.action_detection: (Pipelines.action_detection, | |||||
| 'damo/cv_ResNetC3D_action-detection_detection2d'), | |||||
| Tasks.live_category: (Pipelines.live_category, | Tasks.live_category: (Pipelines.live_category, | ||||
| 'damo/cv_resnet50_live-category'), | 'damo/cv_resnet50_live-category'), | ||||
| Tasks.video_category: (Pipelines.video_category, | Tasks.video_category: (Pipelines.video_category, | ||||
| @@ -97,10 +99,18 @@ DEFAULT_MODEL_FOR_PIPELINE = { | |||||
| 'damo/cv_hrnetv2w32_body-2d-keypoints_image'), | 'damo/cv_hrnetv2w32_body-2d-keypoints_image'), | ||||
| Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints, | Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints, | ||||
| 'damo/cv_canonical_body-3d-keypoints_video'), | 'damo/cv_canonical_body-3d-keypoints_video'), | ||||
| Tasks.hand_2d_keypoints: | |||||
| (Pipelines.hand_2d_keypoints, | |||||
| 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), | |||||
| Tasks.face_detection: (Pipelines.face_detection, | Tasks.face_detection: (Pipelines.face_detection, | ||||
| 'damo/cv_resnet_facedetection_scrfd10gkps'), | 'damo/cv_resnet_facedetection_scrfd10gkps'), | ||||
| Tasks.face_recognition: (Pipelines.face_recognition, | Tasks.face_recognition: (Pipelines.face_recognition, | ||||
| 'damo/cv_ir101_facerecognition_cfglint'), | 'damo/cv_ir101_facerecognition_cfglint'), | ||||
| Tasks.facial_expression_recognition: | |||||
| (Pipelines.facial_expression_recognition, | |||||
| 'damo/cv_vgg19_facial-expression-recognition_fer'), | |||||
| Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints, | |||||
| 'damo/cv_mobilenet_face-2d-keypoints_alignment'), | |||||
| Tasks.video_multi_modal_embedding: | Tasks.video_multi_modal_embedding: | ||||
| (Pipelines.video_multi_modal_embedding, | (Pipelines.video_multi_modal_embedding, | ||||
| 'damo/multi_modal_clip_vtretrival_msrvtt_53'), | 'damo/multi_modal_clip_vtretrival_msrvtt_53'), | ||||
| @@ -147,9 +157,14 @@ DEFAULT_MODEL_FOR_PIPELINE = { | |||||
| 'damo/cv_vitb_video-single-object-tracking_ostrack'), | 'damo/cv_vitb_video-single-object-tracking_ostrack'), | ||||
| Tasks.image_reid_person: (Pipelines.image_reid_person, | Tasks.image_reid_person: (Pipelines.image_reid_person, | ||||
| 'damo/cv_passvitb_image-reid-person_market'), | 'damo/cv_passvitb_image-reid-person_market'), | ||||
| Tasks.text_driven_segmentation: | |||||
| (Pipelines.text_driven_segmentation, | |||||
| 'damo/cv_vitl16_segmentation_text-driven-seg'), | |||||
| Tasks.movie_scene_segmentation: | Tasks.movie_scene_segmentation: | ||||
| (Pipelines.movie_scene_segmentation, | (Pipelines.movie_scene_segmentation, | ||||
| 'damo/cv_resnet50-bert_video-scene-segmentation_movienet') | |||||
| 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'), | |||||
| Tasks.shop_segmentation: (Pipelines.shop_segmentation, | |||||
| 'damo/cv_vitb16_segmentation_shop-seg'), | |||||
| } | } | ||||
| @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule | |||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||
| from .action_recognition_pipeline import ActionRecognitionPipeline | from .action_recognition_pipeline import ActionRecognitionPipeline | ||||
| from .action_detection_pipeline import ActionDetectionPipeline | |||||
| from .animal_recognition_pipeline import AnimalRecognitionPipeline | from .animal_recognition_pipeline import AnimalRecognitionPipeline | ||||
| from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline | from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline | ||||
| from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline | from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline | ||||
| from .hand_2d_keypoints_pipeline import Hand2DKeypointsPipeline | |||||
| from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline | from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline | ||||
| from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline | from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline | ||||
| from .crowd_counting_pipeline import CrowdCountingPipeline | from .crowd_counting_pipeline import CrowdCountingPipeline | ||||
| @@ -42,15 +44,21 @@ if TYPE_CHECKING: | |||||
| from .tinynas_classification_pipeline import TinynasClassificationPipeline | from .tinynas_classification_pipeline import TinynasClassificationPipeline | ||||
| from .video_category_pipeline import VideoCategoryPipeline | from .video_category_pipeline import VideoCategoryPipeline | ||||
| from .virtual_try_on_pipeline import VirtualTryonPipeline | from .virtual_try_on_pipeline import VirtualTryonPipeline | ||||
| from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline | |||||
| from .shop_segmentation_pipleline import ShopSegmentationPipeline | |||||
| from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline | |||||
| from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline | |||||
| from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline | from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline | ||||
| from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline | |||||
| from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'action_recognition_pipeline': ['ActionRecognitionPipeline'], | 'action_recognition_pipeline': ['ActionRecognitionPipeline'], | ||||
| 'action_detection_pipeline': ['ActionDetectionPipeline'], | |||||
| 'animal_recognition_pipeline': ['AnimalRecognitionPipeline'], | 'animal_recognition_pipeline': ['AnimalRecognitionPipeline'], | ||||
| 'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'], | 'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'], | ||||
| 'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'], | 'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'], | ||||
| 'hand_2d_keypoints_pipeline': ['Hand2DKeypointsPipeline'], | |||||
| 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], | 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], | ||||
| 'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'], | 'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'], | ||||
| 'crowd_counting_pipeline': ['CrowdCountingPipeline'], | 'crowd_counting_pipeline': ['CrowdCountingPipeline'], | ||||
| @@ -93,10 +101,18 @@ else: | |||||
| 'tinynas_classification_pipeline': ['TinynasClassificationPipeline'], | 'tinynas_classification_pipeline': ['TinynasClassificationPipeline'], | ||||
| 'video_category_pipeline': ['VideoCategoryPipeline'], | 'video_category_pipeline': ['VideoCategoryPipeline'], | ||||
| 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], | 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], | ||||
| 'easycv_pipeline': | |||||
| ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'], | |||||
| 'shop_segmentation_pipleline': ['ShopSegmentationPipeline'], | |||||
| 'easycv_pipeline': [ | |||||
| 'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline', | |||||
| 'Face2DKeypointsPipeline' | |||||
| ], | |||||
| 'text_driven_segmentation_pipeline': | |||||
| ['TextDrivenSegmentationPipeline'], | |||||
| 'movie_scene_segmentation_pipeline': | 'movie_scene_segmentation_pipeline': | ||||
| ['MovieSceneSegmentationPipeline'], | ['MovieSceneSegmentationPipeline'], | ||||
| 'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'], | |||||
| 'facial_expression_recognition_pipelin': | |||||
| ['FacialExpressionRecognitionPipeline'] | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -0,0 +1,63 @@ | |||||
| import math | |||||
| import os.path as osp | |||||
| from typing import Any, Dict | |||||
| from modelscope.metainfo import Pipelines | |||||
| from modelscope.models.cv.action_detection import ActionDetONNX | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input, Pipeline | |||||
| from modelscope.pipelines.builder import PIPELINES | |||||
| from modelscope.utils.config import Config | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
@PIPELINES.register_module(
    Tasks.action_detection, module_name=Pipelines.action_detection)
class ActionDetectionPipeline(Pipeline):
    """Pipeline that runs an ONNX action detector over a video."""

    def __init__(self, model: str, **kwargs):
        """
        use `model` to create an action detection pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)
        model_path = osp.join(self.model, ModelFile.ONNX_MODEL_FILE)
        logger.info(f'loading model from {model_path}')
        config_path = osp.join(self.model, ModelFile.CONFIGURATION)
        logger.info(f'loading config from {config_path}')
        self.cfg = Config.from_file(config_path)
        self.cfg.MODEL.model_file = model_path
        # From here on ``self.model`` is the detector object rather than the
        # value set by the base class.
        self.model = ActionDetONNX(self.model, self.cfg.MODEL,
                                   self.device_name)
        logger.info('load model done')

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Wrap the input string as ``{'video_name': input}``.

        Raises:
            TypeError: if ``input`` is not a ``str``.
        """
        if isinstance(input, str):
            video_name = input
        else:
            raise TypeError(f'input should be a str,'
                            f' but got {type(input)}')
        result = {'video_name': video_name}
        return result

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run the detector and flatten its per-prediction results.

        Each prediction contributes its ``actions`` labels, scores and boxes,
        and its ``timestamp`` is repeated once per label so that the four
        output lists stay aligned index by index.
        """
        preds = self.model.forward(input['video_name'])

        labels, scores, boxes, timestamps = [], [], [], []
        # Single pass with extend(): the previous ``sum(lists, [])`` idiom
        # re-copied the accumulated list for every prediction.
        for pred in preds:
            actions = pred['actions']
            labels.extend(actions['labels'])
            scores.extend(actions['scores'])
            boxes.extend(actions['boxes'])
            timestamps.extend([pred['timestamp']] * len(actions['labels']))

        out = {
            OutputKeys.TIMESTAMPS: timestamps,
            OutputKeys.LABELS: labels,
            OutputKeys.SCORES: scores,
            OutputKeys.BOXES: boxes
        }
        return out

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Return the forward output unchanged."""
        return inputs
| @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule | |||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||
| from .detection_pipeline import EasyCVDetectionPipeline | from .detection_pipeline import EasyCVDetectionPipeline | ||||
| from .segmentation_pipeline import EasyCVSegmentationPipeline | from .segmentation_pipeline import EasyCVSegmentationPipeline | ||||
| from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'detection_pipeline': ['EasyCVDetectionPipeline'], | 'detection_pipeline': ['EasyCVDetectionPipeline'], | ||||
| 'segmentation_pipeline': ['EasyCVSegmentationPipeline'] | |||||
| 'segmentation_pipeline': ['EasyCVSegmentationPipeline'], | |||||
| 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'] | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -0,0 +1,41 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from typing import Any | |||||
| from modelscope.metainfo import Pipelines | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.builder import PIPELINES | |||||
| from modelscope.preprocessors import LoadImage | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from .base import EasyCVPipeline | |||||
@PIPELINES.register_module(
    Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints)
class Face2DKeypointsPipeline(EasyCVPipeline):
    """EasyCV-based pipeline for face 2D keypoint detection."""

    def __init__(self,
                 model: str,
                 model_file_pattern=ModelFile.TORCH_MODEL_FILE,
                 *args,
                 **kwargs):
        """
        Args:
            model (str): model id on modelscope hub or local model path.
            model_file_pattern (str): model file pattern.
        """
        super(Face2DKeypointsPipeline, self).__init__(
            *args,
            model=model,
            model_file_pattern=model_file_pattern,
            **kwargs)

    def show_result(self, img, points, scale=2, save_path=None):
        """Delegate visualisation to the underlying predictor."""
        renderer = self.predict_op.show_result
        return renderer(img, points, scale, save_path)

    def __call__(self, inputs) -> Any:
        """Run the predictor and return keypoints and poses of the first result."""
        predictions = self.predict_op(inputs)
        # Only the very first entry of the nested prediction result is used.
        first = predictions[0][0]
        return {
            OutputKeys.KEYPOINTS: first['point'],
            OutputKeys.POSES: first['pose'],
        }
| @@ -0,0 +1,128 @@ | |||||
| import os.path as osp | |||||
| from typing import Any, Dict | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import PIL | |||||
| import torch | |||||
| from modelscope.metainfo import Pipelines | |||||
| from modelscope.models.cv.face_recognition.align_face import align_face | |||||
| from modelscope.models.cv.facial_expression_recognition import \ | |||||
| FacialExpressionRecognition | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines import pipeline | |||||
| from modelscope.pipelines.base import Input, Pipeline | |||||
| from modelscope.pipelines.builder import PIPELINES | |||||
| from modelscope.preprocessors import LoadImage | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
| @PIPELINES.register_module( | |||||
| Tasks.facial_expression_recognition, | |||||
| module_name=Pipelines.facial_expression_recognition) | |||||
| class FacialExpressionRecognitionPipeline(Pipeline): | |||||
| def __init__(self, model: str, **kwargs): | |||||
| """ | |||||
| use `model` to create a face detection pipeline for prediction | |||||
| Args: | |||||
| model: model id on modelscope hub. | |||||
| """ | |||||
| super().__init__(model=model, **kwargs) | |||||
| ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) | |||||
| logger.info(f'loading model from {ckpt_path}') | |||||
| device = torch.device( | |||||
| f'cuda:{0}' if torch.cuda.is_available() else 'cpu') | |||||
| fer = FacialExpressionRecognition(model_path=ckpt_path, device=device) | |||||
| self.fer = fer | |||||
| self.device = device | |||||
| logger.info('load model done') | |||||
| # face detect pipeline | |||||
| det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' | |||||
| self.face_detection = pipeline( | |||||
| Tasks.face_detection, model=det_model_id) | |||||
def _choose_face(self,
                 det_result,
                 min_face=10,
                 top_face=1,
                 center_face=False,
                 img=None):
    """Pick the main face out of a face-detection result.

    Boxes whose width or height is below ``min_face`` are dropped. Of the
    remaining boxes, the ``top_face`` largest by area become candidates.
    The largest candidate is returned, unless centre selection is active,
    in which case the candidate whose box centre is nearest to the image
    centre is returned.

    Args:
        det_result: output of the face detection pipeline; read through
            ``OutputKeys.BOXES`` and ``OutputKeys.KEYPOINTS``.
        min_face: minimum width and height of a valid face box.
        top_face: number of largest faces kept as candidates.
        center_face: choose the most centered face among the candidates;
            only takes effect when ``top_face > 1`` and more than one face
            passes the size filter.
        img: image the detection was run on; ``img.shape[0]`` is used as
            the height and ``img.shape[1]`` as the width. Only needed for
            centre selection. When omitted, centre selection is skipped
            and the largest face is returned.

    Returns:
        A ``(bbox, landmarks)`` tuple for the chosen face, or ``None`` when
        no face was detected or every face is smaller than ``min_face``.
    """
    bboxes = np.array(det_result[OutputKeys.BOXES])
    landmarks = np.array(det_result[OutputKeys.KEYPOINTS])
    if bboxes.shape[0] == 0:
        logger.info('Warning: No face detected!')
        return None

    # indices of faces that are large enough in both dimensions
    face_idx = [
        i for i, box in enumerate(bboxes)
        if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face
    ]
    if not face_idx:
        logger.info(
            f'Warning: Face size not enough, less than {min_face}x{min_face}!'
        )
        return None
    bboxes = bboxes[face_idx]
    landmarks = landmarks[face_idx]

    # candidates: the top_face largest boxes, sorted by ascending area,
    # so the last entry is the largest face
    area = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
    sort_idx = np.argsort(area)[-top_face:]
    main_idx = sort_idx[-1]

    if top_face > 1 and center_face and bboxes.shape[0] > 1:
        if img is None:
            # The image centre cannot be computed without the image; the
            # previous code referenced an undefined name here and crashed.
            logger.info(
                'Warning: center_face needs img, using the largest face!')
        else:
            center_x = img.shape[1] // 2
            center_y = img.shape[0] // 2
            candidates = bboxes[sort_idx]
            dist = np.square(
                (candidates[:, 0] + candidates[:, 2]) / 2
                - center_x) + np.square(
                    (candidates[:, 1] + candidates[:, 3]) / 2 - center_y)
            # argmin keeps the first minimum, i.e. the first candidate in
            # sort_idx order wins on ties
            main_idx = sort_idx[int(np.argmin(dist))]

    return bboxes[main_idx], landmarks[main_idx]
def preprocess(self, input: Input) -> Dict[str, Any]:
    """Detect the main face in ``input`` and return it aligned to 112x112.

    Returns:
        A dict with the single key ``'img'``: the aligned face as a float32
        array, or ``None`` when ``_choose_face`` finds no usable face.
    """
    # reverse the last (channel) axis
    # NOTE(review): presumably an RGB -> BGR swap; confirm against LoadImage.
    image = LoadImage.convert_to_ndarray(input)[:, :, ::-1]

    # the detector receives its own copy of the reversed view
    detections = self.face_detection(image.copy())
    chosen = self._choose_face(detections)
    if chosen is None:
        return {'img': None}

    _, keypoints = chosen
    aligned, _ = align_face(image, (112, 112), keypoints.reshape(5, 2))
    return {'img': aligned.astype(np.float32)}
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
    """Run ``self.fer`` on ``input`` and return its scores and labels.

    Returns:
        A dict mapping ``OutputKeys.SCORES`` and ``OutputKeys.LABELS`` to
        the first and second element of the model output, each converted
        with ``tolist()``.
    """
    prediction = self.fer(input)
    assert prediction is not None
    score_list = prediction[0].tolist()
    label_list = prediction[1].tolist()
    return {OutputKeys.SCORES: score_list, OutputKeys.LABELS: label_list}
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    """Return the forward output unchanged; no post-processing is applied."""
    return inputs
| @@ -0,0 +1,51 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os.path | |||||
| from modelscope.metainfo import Pipelines | |||||
| from modelscope.pipelines.builder import PIPELINES | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from .easycv_pipelines.base import EasyCVPipeline | |||||
@PIPELINES.register_module(
    Tasks.hand_2d_keypoints, module_name=Pipelines.hand_2d_keypoints)
class Hand2DKeypointsPipeline(EasyCVPipeline):
    """Pipeline for the hand pose keypoint task, backed by an EasyCV predictor."""

    def __init__(self,
                 model: str,
                 model_file_pattern=ModelFile.TORCH_MODEL_FILE,
                 *args,
                 **kwargs):
        """Create the pipeline.

        Args:
            model (str): model id on modelscope hub or local model path.
            model_file_pattern (str): model file pattern.
        """
        # Set before the base initializer runs because _build_predict_op
        # reads it.
        # NOTE(review): the raw `model` string is used as a directory below;
        # confirm how a hub id (as opposed to a local path) is handled.
        self.model_dir = model
        super().__init__(
            model=model,
            model_file_pattern=model_file_pattern,
            *args,
            **kwargs)

    def _build_predict_op(self):
        """Build the EasyCV predictor.

        Copies the ``DETECTION`` section of ``self.cfg`` into
        ``predictor_config['detection_predictor_config']``, resolving its
        model and config file paths against ``self.model_dir``, then builds
        the predictor from the resulting configuration.
        """
        from easycv.predictors.builder import build_predictor

        detection = self.cfg['DETECTION']
        detector_type = detection['type']
        detector_weights = os.path.join(self.model_dir,
                                        detection['model_path'])
        detector_config = os.path.join(self.model_dir,
                                       detection['config_file'])
        detector_threshold = detection['score_threshold']

        self.cfg.pipeline.predictor_config[
            'detection_predictor_config'] = {
                'type': detector_type,
                'model_path': detector_weights,
                'config_file': detector_config,
                'score_threshold': detector_threshold,
            }

        easycv_config = self._to_easycv_config()
        default_args = {
            'model_path': self.model_path,
            'config_file': easycv_config,
        }
        return build_predictor(self.cfg.pipeline.predictor_config,
                               default_args)
| @@ -149,6 +149,8 @@ class OCRDetectionPipeline(Pipeline): | |||||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | ||||
| rboxes = inputs['combined_rboxes'][0] | rboxes = inputs['combined_rboxes'][0] | ||||
| count = inputs['combined_counts'][0] | count = inputs['combined_counts'][0] | ||||
| if count == 0 or count < rboxes.shape[0]: | |||||
| raise Exception('modelscope error: No text detected') | |||||
| rboxes = rboxes[:count, :] | rboxes = rboxes[:count, :] | ||||
| # convert rboxes to polygons and find its coordinates on the original image | # convert rboxes to polygons and find its coordinates on the original image | ||||
| @@ -0,0 +1,58 @@ | |||||
| import os.path as osp | |||||
| from typing import Any, Dict | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import PIL | |||||
| import torch | |||||
| from modelscope.metainfo import Pipelines | |||||
| from modelscope.models.cv.face_detection import RetinaFaceDetection | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input, Pipeline | |||||
| from modelscope.pipelines.builder import PIPELINES | |||||
| from modelscope.preprocessors import LoadImage | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
@PIPELINES.register_module(
    Tasks.face_detection, module_name=Pipelines.retina_face_detection)
class RetinaFaceDetectionPipeline(Pipeline):
    """Face detection pipeline backed by ``RetinaFaceDetection``."""

    def __init__(self, model: str, **kwargs):
        """Use `model` to create a face detection pipeline for prediction.

        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)
        # NOTE(review): `model` is joined as a directory here; confirm it is
        # already a local path by the time this constructor runs.
        weights_file = osp.join(model, ModelFile.TORCH_MODEL_FILE)
        logger.info(f'loading model from {weights_file}')
        self.detector = RetinaFaceDetection(
            model_path=weights_file, device=self.device)
        logger.info('load model done')

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Load ``input`` as an ndarray and cast it to float32 under ``'img'``."""
        image = LoadImage.convert_to_ndarray(input).astype(np.float32)
        return {'img': image}

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run the detector and split its output into scores, boxes and keypoints.

        The first output element holds one row per face: columns 0-3 are
        returned as the box, column 4 as the score. The second element is
        returned as the keypoints. All values are converted with ``tolist()``.
        """
        detections = self.detector(input)
        assert detections is not None
        face_rows = detections[0]
        box_list = face_rows[:, :4].tolist()
        score_list = face_rows[:, 4].tolist()
        keypoint_list = detections[1].tolist()
        return {
            OutputKeys.SCORES: score_list,
            OutputKeys.BOXES: box_list,
            OutputKeys.KEYPOINTS: keypoint_list,
        }

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Return the forward output unchanged."""
        return inputs
| @@ -0,0 +1,51 @@ | |||||
| from typing import Any, Dict | |||||
| from modelscope.metainfo import Pipelines | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input, Pipeline | |||||
| from modelscope.pipelines.builder import PIPELINES | |||||
| from modelscope.preprocessors import LoadImage | |||||
| from modelscope.utils.constant import Tasks | |||||
@PIPELINES.register_module(
    Tasks.shop_segmentation, module_name=Pipelines.shop_segmentation)
class ShopSegmentationPipeline(Pipeline):
    """Segmentation pipeline that delegates each stage to ``self.model``."""

    def __init__(self, model: str, **kwargs):
        """Create the pipeline with automatic collation disabled.

        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, auto_collate=False, **kwargs)

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Load the image and run the model's own preprocessing.

        Returns:
            A dict holding the preprocessed tensor (``'img'``) and the four
            size values (``'ori_h'``, ``'ori_w'``, ``'crop_h'``,
            ``'crop_w'``) returned by ``self.model.preprocess``.
        """
        image = LoadImage.convert_to_ndarray(input)
        tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(image)
        return dict(
            img=tensor, ori_h=ori_h, ori_w=ori_w, crop_h=crop_h, crop_w=crop_w)

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run inference on ``input['img']`` and carry the size values along."""
        forwarded = {'data': self.model.inference(input['img'])}
        # postprocess needs the sizes, so pass them through untouched
        for key in ('ori_h', 'ori_w', 'crop_h', 'crop_w'):
            forwarded[key] = input[key]
        return forwarded

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Turn the raw inference output into masks under ``OutputKeys.MASKS``."""
        masks = self.model.postprocess(
            inputs['data'],
            inputs['crop_h'],
            inputs['crop_w'],
            inputs['ori_h'],
            inputs['ori_w'],
        )
        return {OutputKeys.MASKS: masks}
| @@ -0,0 +1,51 @@ | |||||
| from typing import Any, Dict | |||||
| from modelscope.metainfo import Pipelines | |||||
| from modelscope.outputs import OutputKeys | |||||
| from modelscope.pipelines.base import Input, Pipeline | |||||
| from modelscope.pipelines.builder import PIPELINES | |||||
| from modelscope.preprocessors import LoadImage | |||||
| from modelscope.utils.constant import Tasks | |||||
@PIPELINES.register_module(
    Tasks.text_driven_segmentation,
    module_name=Pipelines.text_driven_segmentation)
class TextDrivenSegmentationPipeline(Pipeline):
    """Segmentation pipeline driven by an image plus a text input."""

    def __init__(self, model: str, **kwargs):
        """Create the pipeline with automatic collation disabled.

        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, auto_collate=False, **kwargs)

    def preprocess(self, input: Dict) -> Dict[str, Any]:
        """Load ``input['image']``, preprocess it and attach ``input['text']``.

        Returns:
            A dict holding the preprocessed tensor (``'img'``), the four
            size values returned by ``self.model.preprocess`` and the
            untouched ``'text'`` entry.
        """
        image = LoadImage.convert_to_ndarray(input['image'])
        tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(image)
        return dict(
            img=tensor,
            ori_h=ori_h,
            ori_w=ori_w,
            crop_h=crop_h,
            crop_w=crop_w,
            text=input['text'])

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run inference on the image/text pair and carry the size values along."""
        forwarded = {'data': self.model.inference(input['img'], input['text'])}
        # postprocess needs the sizes, so pass them through untouched
        for key in ('ori_h', 'ori_w', 'crop_h', 'crop_w'):
            forwarded[key] = input[key]
        return forwarded

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Turn the raw inference output into masks under ``OutputKeys.MASKS``."""
        masks = self.model.postprocess(
            inputs['data'],
            inputs['crop_h'],
            inputs['crop_w'],
            inputs['ori_h'],
            inputs['ori_w'],
        )
        return {OutputKeys.MASKS: masks}