
merge with master

master
智丞, 3 years ago
parent commit 856bd858fd
71 changed files with 3578 additions and 164 deletions
  1. +3 -0 .gitattributes
  2. +0 -1 .gitignore
  3. +67 -0 Makefile.docker
  4. +0 -0 configs/examples/configuration.json
  5. +0 -0 configs/examples/configuration.py
  6. +0 -0 configs/examples/configuration.yaml
  7. +3 -0 data/test/images/image1.jpg
  8. +3 -0 data/test/images/image_matting.png
  9. +4 -0 docker/.dockerignore
  10. +53 -0 docker/pytorch.dockerfile
  11. +2 -0 docker/rcfiles/pip.conf.tsinghua
  12. +25 -0 docker/rcfiles/sources.list.aliyun
  13. +10 -0 docker/rcfiles/user.vimrc
  14. +12 -0 docker/scripts/install_libs.sh
  15. +1 -1 docs/source/conf.py
  16. +120 -3 docs/source/develop.md
  17. +1 -1 modelscope/models/__init__.py
  18. +0 -0 modelscope/models/audio/__init__.py
  19. +0 -0 modelscope/models/audio/layers/__init__.py
  20. +60 -0 modelscope/models/audio/layers/activations.py
  21. +78 -0 modelscope/models/audio/layers/affine_transform.py
  22. +178 -0 modelscope/models/audio/layers/deep_fsmn.py
  23. +50 -0 modelscope/models/audio/layers/layer_base.py
  24. +482 -0 modelscope/models/audio/layers/uni_deep_fsmn.py
  25. +0 -0 modelscope/models/audio/network/__init__.py
  26. +394 -0 modelscope/models/audio/network/loss.py
  27. +248 -0 modelscope/models/audio/network/modulation_loss.py
  28. +483 -0 modelscope/models/audio/network/se_net.py
  29. +17 -9 modelscope/models/base.py
  30. +2 -0 modelscope/models/nlp/__init__.py
  31. +88 -0 modelscope/models/nlp/sentence_similarity_model.py
  32. +17 -0 modelscope/models/nlp/sequence_classification_model.py
  33. +2 -2 modelscope/models/nlp/text_generation_model.py
  34. +57 -0 modelscope/models/nlp/token_classification_model.py
  35. +1 -1 modelscope/pipelines/__init__.py
  36. +1 -0 modelscope/pipelines/audio/__init__.py
  37. +160 -0 modelscope/pipelines/audio/linear_aec_pipeline.py
  38. +29 -1 modelscope/pipelines/base.py
  39. +8 -6 modelscope/pipelines/builder.py
  40. +3 -3 modelscope/pipelines/cv/image_matting_pipeline.py
  41. +5 -2 modelscope/pipelines/multi_modal/image_captioning.py
  42. +2 -0 modelscope/pipelines/nlp/__init__.py
  43. +65 -0 modelscope/pipelines/nlp/sentence_similarity_pipeline.py
  44. +16 -37 modelscope/pipelines/nlp/sequence_classification_pipeline.py
  45. +4 -5 modelscope/pipelines/nlp/text_generation_pipeline.py
  46. +71 -0 modelscope/pipelines/nlp/word_segmentation_pipeline.py
  47. +117 -0 modelscope/pipelines/outputs.py
  48. +21 -17 modelscope/pipelines/util.py
  49. +1 -1 modelscope/preprocessors/__init__.py
  50. +230 -0 modelscope/preprocessors/audio.py
  51. +1 -1 modelscope/preprocessors/image.py
  52. +78 -9 modelscope/preprocessors/nlp.py
  53. +6 -6 modelscope/utils/config.py
  54. +15 -13 modelscope/utils/constant.py
  55. +1 -1 modelscope/utils/registry.py
  56. +20 -0 modelscope/utils/test_utils.py
  57. +1 -0 requirements/docs.txt
  58. +3 -2 requirements/runtime.txt
  59. +2 -1 setup.cfg
  60. +8 -11 tests/pipelines/test_base.py
  61. +3 -4 tests/pipelines/test_image_captioning.py
  62. +11 -15 tests/pipelines/test_image_matting.py
  63. +3 -0 tests/pipelines/test_person_image_cartoon.py
  64. +67 -0 tests/pipelines/test_sentence_similarity.py
  65. +56 -0 tests/pipelines/test_speech_signal_process.py
  66. +6 -0 tests/pipelines/test_text_classification.py
  67. +7 -3 tests/pipelines/test_text_generation.py
  68. +62 -0 tests/pipelines/test_word_segmentation.py
  69. +20 -0 tests/preprocessors/test_image.py
  70. +9 -0 tests/run.py
  71. +5 -8 tests/utils/test_config.py

+3 -0 .gitattributes

@@ -0,0 +1,3 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text

+0 -1 .gitignore

@@ -104,7 +104,6 @@ venv.bak/
# mypy
.mypy_cache/

-data
.vscode
.idea




+67 -0 Makefile.docker

@@ -0,0 +1,67 @@
DOCKER_REGISTRY = registry.cn-shanghai.aliyuncs.com
DOCKER_ORG = modelscope
DOCKER_IMAGE = modelscope
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)

# CUDA_VERSION = 11.3
# CUDNN_VERSION = 8
BASE_RUNTIME = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# BASE_DEVEL = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
BASE_DEVEL = pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel


MODELSCOPE_VERSION = $(shell git describe --tags --always)

# Can be either official / dev
BUILD_TYPE = dev
BUILD_PROGRESS = auto
BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE)

EXTRA_DOCKER_BUILD_FLAGS ?= --network=host
# DOCKER_BUILD = DOCKER_BUILDKIT=1 \
# docker build \
# --progress=$(BUILD_PROGRESS) \
# $(EXTRA_DOCKER_BUILD_FLAGS) \
# --target $(BUILD_TYPE) \
# -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
# $(BUILD_ARGS) \
# -f docker/pytorch.dockerfile .
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
docker build \
$(EXTRA_DOCKER_BUILD_FLAGS) \
-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
$(BUILD_ARGS) \
-f docker/pytorch.dockerfile .
DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG)

.PHONY: all
all: devel-image

.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-image:
$(DOCKER_BUILD)

.PHONY: devel-push
devel-push: BASE_IMAGE := $(BASE_DEVEL)
devel-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-push:
$(DOCKER_PUSH)

.PHONY: runtime-image
runtime-image: BASE_IMAGE := $(BASE_RUNTIME)
runtime-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-image:
$(DOCKER_BUILD)
docker tag $(DOCKER_FULL_NAME):$(DOCKER_TAG) $(DOCKER_FULL_NAME):latest

.PHONY: runtime-push
runtime-push: BASE_IMAGE := $(BASE_RUNTIME)
runtime-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-push:
$(DOCKER_PUSH)

.PHONY: clean
clean:
-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))

configs/examples/config.json → configs/examples/configuration.json


configs/examples/config.py → configs/examples/configuration.py


configs/examples/config.yaml → configs/examples/configuration.yaml


+3 -0 data/test/images/image1.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
size 129862

+3 -0 data/test/images/image_matting.png

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621

+4 -0 docker/.dockerignore

@@ -0,0 +1,4 @@
*.sh
*.md
*.dockerfile
*.zip

+53 -0 docker/pytorch.dockerfile

@@ -0,0 +1,53 @@
# syntax = docker/dockerfile:experimental
#
# NOTE: To build this you will need a docker version > 18.06 with
# experimental enabled and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/

# ARG BASE_IMAGE=reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# FROM ${BASE_IMAGE} as dev-base

# FROM reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 as dev-base
FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel
# FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
# config pip source
RUN mkdir /root/.pip
COPY docker/rcfiles/pip.conf.tsinghua /root/.pip/pip.conf
COPY docker/rcfiles/sources.list.aliyun /etc/apt/sources.list

# Install essential Ubuntu packages
RUN apt-get update &&\
apt-get install -y software-properties-common \
build-essential \
git \
wget \
vim \
curl \
zip \
zlib1g-dev \
unzip \
pkg-config

# install modelscope and its python env
WORKDIR /opt/modelscope
COPY . .
RUN pip install -r requirements.txt
# RUN --mount=type=cache,target=/opt/ccache \
# python setup.py install

# opencv-python-headless conflicts with the opencv-python already installed
RUN python setup.py install \
&& pip uninstall -y opencv-python-headless

# prepare modelscope libs
COPY docker/scripts/install_libs.sh /tmp/
RUN bash /tmp/install_libs.sh && \
rm -rf /tmp/install_libs.sh

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/modelscope/lib64

WORKDIR /workspace

+2 -0 docker/rcfiles/pip.conf.tsinghua

@@ -0,0 +1,2 @@
[global]
index-url=https://pypi.tuna.tsinghua.edu.cn/simple

+25 -0 docker/rcfiles/sources.list.aliyun

@@ -0,0 +1,25 @@
deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic universe
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates universe

deb http://mirrors.aliyun.com/ubuntu/ bionic multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse

deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu bionic-security main restricted
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security main restricted
deb http://mirrors.aliyun.com/ubuntu bionic-security universe
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security universe
deb http://mirrors.aliyun.com/ubuntu bionic-security multiverse
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security multiverse

+10 -0 docker/rcfiles/user.vimrc

@@ -0,0 +1,10 @@
set nocompatible
set encoding=utf-8
set hlsearch
set smartindent
set ruler
set number
set ts=2
set sw=2
set expandtab
autocmd FileType make setlocal noexpandtab

+12 -0 docker/scripts/install_libs.sh

@@ -0,0 +1,12 @@
#!/bin/bash

set -eo pipefail

ModelScopeLib=/usr/local/modelscope/lib64

if [ ! -d /usr/local/modelscope ]; then
mkdir -p $ModelScopeLib
fi

# audio libs
wget "http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/libs/audio/libmitaec_pyio.so" -O ${ModelScopeLib}/libmitaec_pyio.so

+1 -1 docs/source/conf.py

@@ -76,7 +76,7 @@ exclude_patterns = ['build', 'Thumbs.db', '.DS_Store']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
-html_theme = 'sphinx_rtd_theme'
+html_theme = 'sphinx_book_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
html_theme_options = {}




+120 -3 docs/source/develop.md

@@ -34,13 +34,111 @@ make linter
```


## 2. Test

### 2.1 Test level

There are mainly three test levels:

* level 0: tests for the basic interfaces and functions of the framework, such as `tests/trainers/test_trainer_base.py`
* level 1: important functional tests that cover end-to-end workflows, such as `tests/pipelines/test_image_matting.py`
* level 2: scenario tests for all the implemented modules, such as models and pipelines in different algorithm fields.

The default test level is 0, which runs only level-0 cases. You can set the test level
via the environment variable `TEST_LEVEL`. For more details, refer to [test-doc](https://alidocs.dingtalk.com/i/nodes/mdvQnONayjBJKLXy1Bp38PY2MeXzp5o0?dontjump=true&nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA)


```bash
# run all tests
TEST_LEVEL=2 make test

# run important functional tests
TEST_LEVEL=1 make test

# run core UT and basic functional tests
make test
```


When writing test cases, you should assign a test level to each case using the
following code. If left unset, the test level defaults to 0 and the case will run in
every test stage.

File: `test_module.py`
```python
import unittest

from modelscope.utils.test_utils import test_level


class ImageCartoonTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        pass
```
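
To try a single gated case like the one above, one option (the path is a placeholder) is to set the level inline:

```bash
TEST_LEVEL=1 python tests/path/to/test_module.py
```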

### 2.2 Run tests

1. Run your own single test case to verify your self-implemented function. You can run your
test file directly; if it fails to run, please check whether the variable `TEST_LEVEL`
exists in the environment and unset it.
```bash
python tests/path/to/your_test.py
```

2. Remember to run the core tests locally before starting a code review; by default, only
test cases with level 0 are run.
```bash
make test
```

3. After you start a code review, CI tests will be triggered, which run test cases with level 1.

4. Daily regression tests run all test cases at 0:00 every day on the master branch.

### 2.3 Test data storage

As we need a lot of data for testing, including images, videos, and models, we use git-lfs
to store those large files.

1. Install git-lfs.
For macOS:
```bash
brew install git-lfs
git lfs install
```

For CentOS, please download the rpm package from the git-lfs GitHub releases [page](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
git lfs install
```

For Ubuntu:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```

2. Track your data types with git-lfs; for example, to track png files:
```bash
git lfs track "*.png"
```

3. Add your test files to the `data/test/` folder; you can create subdirectories as needed.
```bash
git add data/test/test.png
```

4. Commit your test data to the remote branch:
```bash
git commit -m "xxx"
```

To pull data from the remote repo, pull it the same way you pull regular git files.
```bash
git pull origin branch_name
```
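
If you are unsure whether your files are actually stored through git-lfs, a quick sanity check with standard git-lfs commands:

```bash
# patterns tracked via .gitattributes
git lfs track

# files currently stored through git-lfs
git lfs ls-files
```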





## Code Review


@@ -93,3 +191,22 @@ TODO
```bash
make whl
```

## Build docker

Build the develop image:
```bash
sudo make -f Makefile.docker devel-image
```

Push the develop image (for the password, please ask wenmeng.zwm):
```bash
sudo docker login --username=mass_test@test.aliyunid.com registry.cn-shanghai.aliyuncs.com
Password:
sudo make -f Makefile.docker devel-push
```

To build the runtime image, just replace `devel` with `runtime` in the commands above.
```bash
sudo make -f Makefile.docker runtime-image runtime-push
```
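
Once built, a typical smoke test is to start a shell inside the image; the tag below is an assumption, use the one printed by the build (it is derived from `git describe`):

```bash
sudo docker run --rm -it registry.cn-shanghai.aliyuncs.com/modelscope/modelscope:<version>-devel bash
```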

+1 -1 modelscope/models/__init__.py

@@ -2,4 +2,4 @@


from .base import Model
from .builder import MODELS, build_model
-from .nlp import BertForSequenceClassification
+from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity

+0 -0 modelscope/models/audio/__init__.py


+0 -0 modelscope/models/audio/layers/__init__.py


+60 -0 modelscope/models/audio/layers/activations.py

@@ -0,0 +1,60 @@
import torch.nn as nn

from .layer_base import LayerBase


class RectifiedLinear(LayerBase):

def __init__(self, input_dim, output_dim):
super(RectifiedLinear, self).__init__()
self.dim = input_dim
self.relu = nn.ReLU()

def forward(self, input):
return self.relu(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr


class LogSoftmax(LayerBase):

def __init__(self, input_dim, output_dim):
super(LogSoftmax, self).__init__()
self.dim = input_dim
self.ls = nn.LogSoftmax(dim=-1)  # explicit dim; the implicit default is deprecated

def forward(self, input):
return self.ls(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Softmax> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr


class Sigmoid(LayerBase):

def __init__(self, input_dim, output_dim):
super(Sigmoid, self).__init__()
self.dim = input_dim
self.sig = nn.Sigmoid()

def forward(self, input):
return self.sig(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Sigmoid> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr

+78 -0 modelscope/models/audio/layers/affine_transform.py

@@ -0,0 +1,78 @@
import numpy as np
import torch as th
import torch.nn as nn

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class AffineTransform(LayerBase):

def __init__(self, input_dim, output_dim):
super(AffineTransform, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.linear = nn.Linear(input_dim, output_dim)

def forward(self, input):
return self.linear(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
self.input_dim)
re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def to_raw_nnet(self, fid):
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
x.tofile(fid)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
x.tofile(fid)

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('AffineTransform format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(instr, '<BiasLearnRateCoef>')
if output is None:
raise Exception(
'AffineTransform format error for <BiasLearnRateCoef>')
instr, lr = output

output = expect_token_number(instr, '<MaxNorm>')
if output is None:
raise Exception('AffineTransform format error for <MaxNorm>')
instr, lr = output

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('AffineTransform format error for parsing matrix')
instr, mat = output

print(mat.shape)
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('AffineTransform format error for parsing matrix')
instr, mat = output
mat = np.squeeze(mat)
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))
return instr

+178 -0 modelscope/models/audio/layers/deep_fsmn.py

@@ -0,0 +1,178 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class DeepFsmn(LayerBase):

def __init__(self,
input_dim,
output_dim,
lorder=None,
rorder=None,
hidden_size=None,
layer_norm=False,
dropout=0):
super(DeepFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim

if lorder is None:
return

self.lorder = lorder
self.rorder = rorder
self.hidden_size = hidden_size
self.layer_norm = layer_norm

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.LayerNorm(hidden_size)
self.drop1 = nn.Dropout(p=dropout)
self.drop2 = nn.Dropout(p=dropout)
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1], [1, 1],
groups=output_dim,
bias=False)
self.conv2 = nn.Conv2d(
output_dim,
output_dim, [rorder, 1], [1, 1],
groups=output_dim,
bias=False)

def forward(self, input):

f1 = F.relu(self.linear(input))

f1 = self.drop1(f1)
if self.layer_norm:
f1 = self.norm(f1)

p1 = self.project(f1)

x = th.unsqueeze(p1, 1)
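# x: [B, 1, T, F] (layout as annotated in uni_deep_fsmn.py)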

x_per = x.permute(0, 3, 2, 1)
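# x_per: [B, F, T, 1]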

y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
yr = F.pad(x_per, [0, 0, 0, self.rorder])
yr = yr[:, :, 1:, :]

out = x_per + self.conv1(y) + self.conv2(yr)
out = self.drop2(out)

out1 = out.permute(0, 3, 2, 1)

return input + out1.squeeze()

def to_kaldi_nnet(self):
re_str = ''
re_str += '<UniDeepFsmn> %d %d\n'\
% (self.output_dim, self.input_dim)
re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n'\
% (1, self.hidden_size, self.lorder, 1)
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(
instr,
'<HidSize>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <HidSize>')
instr, hiddensize = output
self.hidden_size = int(hiddensize)

output = expect_token_number(
instr,
'<LOrder>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LOrder>')
instr, lorder = output
self.lorder = int(lorder)

output = expect_token_number(
instr,
'<LStride>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LStride>')
instr, lstride = output
self.lstride = lstride

output = expect_token_number(
instr,
'<MaxNorm>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <MaxNorm>')

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat1 = np.fliplr(mat.T).copy()
self.conv1 = nn.Conv2d(
self.output_dim,
self.output_dim, [self.lorder, 1], [1, 1],
groups=self.output_dim,
bias=False)
mat_th = th.from_numpy(mat1).type(th.FloatTensor)
mat_th = mat_th.unsqueeze(1)
mat_th = mat_th.unsqueeze(3)
self.conv1.weight = th.nn.Parameter(mat_th)

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output

self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
self.linear = nn.Linear(self.input_dim, self.hidden_size)

self.project.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

return instr

+50 -0 modelscope/models/audio/layers/layer_base.py

@@ -0,0 +1,50 @@
import abc
import re

import numpy as np
import torch.nn as nn


def expect_token_number(instr, token):
first_token = re.match(r'^\s*' + token, instr)
if first_token is None:
return None
instr = instr[first_token.end():]
lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr)
if lr is None:
return None
return instr[lr.end():], lr.groups()[0]


def expect_kaldi_matrix(instr):
pos2 = instr.find('[', 0)
pos3 = instr.find(']', pos2)
mat = []
for stt in instr[pos2 + 1:pos3].split('\n'):
tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ')
if tmp_mat.size > 0:
mat.append(tmp_mat)
return instr[pos3 + 1:], np.array(mat)
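

# Illustrative parse, assuming the Kaldi nnet1 text format consumed by load_kaldi_nnet:
#   rest, mat = expect_kaldi_matrix('[ 1 2\n3 4 ] <NextToken>')
#   # mat -> array([[1., 2.], [3., 4.]], dtype=float32); rest -> ' <NextToken>'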


def to_kaldi_matrix(np_mat):
"""
function that transform as str numpy mat to standard kaldi str matrix
:param np_mat: numpy mat
:return: str
"""
np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True)
out_str = str(np_mat)
out_str = out_str.replace('[', '')
out_str = out_str.replace(']', '')
return '[ %s ]\n' % out_str


class LayerBase(nn.Module, metaclass=abc.ABCMeta):

def __init__(self):
super(LayerBase, self).__init__()

@abc.abstractmethod
def to_kaldi_nnet(self):
pass

+482 -0 modelscope/models/audio/layers/uni_deep_fsmn.py

@@ -0,0 +1,482 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class SepConv(nn.Module):

def __init__(self,
in_channels,
filters,
out_channels,
kernel_size=(5, 2),
dilation=(1, 1)):
""" :param kernel_size (time, frequency)

"""
super(SepConv, self).__init__()
# depthwise + pointwise
self.dconv = nn.Conv2d(
in_channels,
in_channels * filters,
kernel_size,
dilation=dilation,
groups=in_channels)
self.pconv = nn.Conv2d(
in_channels * filters, out_channels, kernel_size=1)
self.padding = dilation[0] * (kernel_size[0] - 1)

def forward(self, input):
''' input: [B, C, T, F]
'''
x = F.pad(input, [0, 0, self.padding, 0])
x = self.dconv(x)
x = self.pconv(x)
return x


class Conv2d(nn.Module):

def __init__(self,
input_dim,
output_dim,
lorder=20,
rorder=0,
groups=1,
bias=False,
skip_connect=True):
super(Conv2d, self).__init__()
self.lorder = lorder
self.conv = nn.Conv2d(
input_dim, output_dim, [lorder, 1], groups=groups, bias=bias)
self.rorder = rorder
if self.rorder:
self.conv2 = nn.Conv2d(
input_dim, output_dim, [rorder, 1], groups=groups, bias=bias)
self.skip_connect = skip_connect

def forward(self, input):
# [B, 1, T, F]
x = th.unsqueeze(input, 1)
# [B, F, T, 1]
x_per = x.permute(0, 3, 2, 1)
y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
out = self.conv(y)
if self.rorder:
yr = F.pad(x_per, [0, 0, 0, self.rorder])
yr = yr[:, :, 1:, :]
out += self.conv2(yr)
out = out.permute(0, 3, 2, 1).squeeze(1)
if self.skip_connect:
out = out + input
return out


class SelfAttLayer(nn.Module):

def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
super(SelfAttLayer, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)

self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.att = nn.Linear(input_dim, lorder, bias=False)

def forward(self, input):

f1 = F.relu(self.linear(input))

p1 = self.project(f1)

x = th.unsqueeze(p1, 1)

x_per = x.permute(0, 3, 2, 1)

y = F.pad(x_per, [0, 0, self.lorder - 1, 0])

# z [B, F, T, lorder]
z = x_per
for i in range(1, self.lorder):
z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1)

# [B, T, lorder]
att = F.softmax(self.att(input), dim=-1)
att = th.unsqueeze(att, 1)
z = th.sum(z * att, axis=-1)

out1 = z.permute(0, 2, 1)

return input + out1


class TFFsmn(nn.Module):

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(TFFsmn, self).__init__()

self.skip_connect = skip_connect

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.Identity()
if layer_norm:
self.norm = nn.LayerNorm(input_dim)
self.act = nn.ReLU()
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)
dorder = 5
self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False)
self.padding_freq = dorder - 1

def forward(self, input):
return self.compute1(input)

def compute1(self, input):
''' linear-dconv-relu(norm)-linear-dconv
'''
x = self.linear(input)
# [B, 1, F, T]
x = th.unsqueeze(x, 1).permute(0, 1, 3, 2)
z = F.pad(x, [0, 0, self.padding_freq, 0])
z = self.conv2(z) + x
x = z.permute(0, 3, 2, 1).squeeze(-1)
x = self.act(x)
x = self.norm(x)
x = self.project(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()

return input + out


class CNNFsmn(nn.Module):
''' use cnn to reduce parameters
'''

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(CNNFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim
self.skip_connect = skip_connect

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)
self.act = nn.ReLU()
kernel_size = (3, 8)
stride = (1, 4)
self.conv = nn.Sequential(
nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0),
nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride))

self.dconv = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)

def forward(self, input):
return self.compute2(input)

def compute1(self, input):
''' linear-relu(norm)-conv2d-relu?-dconv
'''
# [B, T, F]
x = self.linear(input)
x = self.act(x)
x = th.unsqueeze(x, 1)
x = self.conv(x)
# [B, C, T, F] -> [B, 1, T, F]
b, c, t, f = x.shape
x = x.view([b, 1, t, -1])
x = x.permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.dconv(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
return input + out

def compute2(self, input):
''' conv2d-relu-linear-relu?-dconv
'''
x = th.unsqueeze(input, 1)
x = self.conv(x)
x = self.act(x)
# [B, C, T, F] -> [B, T, F]
b, c, t, f = x.shape
x = x.view([b, t, -1])
x = self.linear(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.dconv(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
return input + out


class UniDeepFsmn(LayerBase):

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(UniDeepFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim
self.skip_connect = skip_connect

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.Identity()
if layer_norm:
self.norm = nn.LayerNorm(input_dim)
self.act = nn.ReLU()
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)

def forward(self, input):
return self.compute1(input)

def compute1(self, input):
''' linear-relu(norm)-linear-dconv
'''
# [B, T, F]
x = self.linear(input)
x = self.act(x)
x = self.norm(x)
x = self.project(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()

return input + out

def compute2(self, input):
''' linear-dconv-linear-relu(norm)
'''
x = self.project(input)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
x = self.linear(out)
x = self.act(x)
x = self.norm(x)

return input + x

def compute3(self, input):
''' dconv-linear-relu(norm)-linear
'''
x = th.unsqueeze(input, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
x = self.linear(out)
x = self.act(x)
x = self.norm(x)
x = self.project(x)

return input + x

def to_kaldi_nnet(self):
re_str = ''
re_str += '<UniDeepFsmn> %d %d\n' \
% (self.output_dim, self.input_dim)
re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n' \
% (1, self.hidden_size, self.lorder, 1)
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def to_raw_nnet(self, fid):
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
x.tofile(fid)

proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
x.tofile(fid)

linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
x.tofile(fid)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
x.tofile(fid)

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(
instr,
'<HidSize>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <HidSize>')
instr, hiddensize = output
self.hidden_size = int(hiddensize)

output = expect_token_number(
instr,
'<LOrder>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LOrder>')
instr, lorder = output
self.lorder = int(lorder)

output = expect_token_number(
instr,
'<LStride>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LStride>')
instr, lstride = output
self.lstride = lstride

output = expect_token_number(
instr,
'<MaxNorm>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <MaxNorm>')

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat1 = np.fliplr(mat.T).copy()

self.conv1 = nn.Conv2d(
self.output_dim,
self.output_dim, [self.lorder, 1], [1, 1],
groups=self.output_dim,
bias=False)

mat_th = th.from_numpy(mat1).type(th.FloatTensor)
mat_th = mat_th.unsqueeze(1)
mat_th = mat_th.unsqueeze(3)
self.conv1.weight = th.nn.Parameter(mat_th)

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output

self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
self.linear = nn.Linear(self.input_dim, self.hidden_size)

self.project.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat = np.squeeze(mat)
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

return instr

+0 -0 modelscope/models/audio/network/__init__.py


+394 -0 modelscope/models/audio/network/loss.py

@@ -0,0 +1,394 @@
import torch
import torch.nn.functional as F

from .modulation_loss import (GaborSTRFConv, MelScale,
ModulationDomainLossModule)

EPS = 1e-8


def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1):
'''
stft: (batch, ..., 2) or complex(batch, ...)
y = x + n
'''
if torch.is_complex(mixed_spec):
yr, yi = mixed_spec.real, mixed_spec.imag
else:
yr, yi = mixed_spec[..., 0], mixed_spec[..., 1]
if torch.is_complex(clean_spec):
xr, xi = clean_spec.real, clean_spec.imag
else:
xr, xi = clean_spec[..., 0], clean_spec[..., 1]

if mask_type == 'iam':
ymag = torch.sqrt(yr**2 + yi**2)
xmag = torch.sqrt(xr**2 + xi**2)
iam = xmag / (ymag + EPS)
return torch.clamp(iam, 0, 1)

elif mask_type == 'psm':
ypow = yr**2 + yi**2
psm = (xr * yr + xi * yi) / (ypow + EPS)
return torch.clamp(psm, 0, 1)

elif mask_type == 'psmiam':
ypow = yr**2 + yi**2
psm = (xr * yr + xi * yi) / (ypow + EPS)
ymag = torch.sqrt(yr**2 + yi**2)
xmag = torch.sqrt(xr**2 + xi**2)
iam = xmag / (ymag + EPS)
psmiam = psm * iam
return torch.clamp(psmiam, 0, 1)

elif mask_type == 'crm':
ypow = yr**2 + yi**2
mr = (xr * yr + xi * yi) / (ypow + EPS)
mi = (xi * yr - xr * yi) / (ypow + EPS)
mr = torch.clamp(mr, -clip, clip)
mi = torch.clamp(mi, -clip, clip)
return mr, mi
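

# Illustrative usage (shapes assumed): with y_spec/x_spec from torch.stft,
# each of shape (batch, bins, frames, 2):
#   mask = compute_mask(y_spec, x_spec, mask_type='psmiam')  # (batch, bins, frames), in [0, 1]
#   mr, mi = compute_mask(y_spec, x_spec, mask_type='crm')   # complex mask parts, clipped to [-1, 1]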


def energy_vad(spec,
thdhigh=320 * 600 * 600 * 2,
thdlow=320 * 300 * 300 * 2,
int16=True):
'''Energy-based VAD, which should be accurate enough here.
spec: (batch, bins, frames, 2)
returns: (batch, frames) with values in {0, 0.5, 1}
'''
energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1)
# use a float tensor so the intermediate 0.5 level survives the assignment
vad = (energy > thdhigh).float()
idx = torch.logical_and(vad == 0, energy > thdlow)
vad[idx] = 0.5
return vad


def modulation_loss_init(n_fft):
gabor_strf_parameters = torch.load(
'./network/gabor_strf_parameters.pt')['state_dict']
gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

modulation_loss_module = ModulationDomainLossModule(
gabor_modulation_kernels.eval())
for param in modulation_loss_module.parameters():
param.requires_grad = False

stft2mel = MelScale(
n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda()

return modulation_loss_module, stft2mel


def mask_loss_function(
loss_func='psm_loss',
loss_type='mse', # ['mse', 'mae', 'comb']
mask_type='psmiam',
use_mod_loss=False,
use_wav2vec_loss=False,
n_fft=640,
hop_length=320,
EPS=1e-8,
weight=None):
if weight is not None:
print(f'Use loss weight: {weight}')
winlen = n_fft
window = torch.hamming_window(winlen, periodic=False)

def stft(x, return_complex=False):
# returns [batch, bins, frames, 2]
return torch.stft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
return_complex=return_complex)

def istft(x, slen):
return torch.istft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
length=slen)

def mask_loss(targets, masks, nframes):
''' [Batch, Time, Frequency]
'''
with torch.no_grad():
mask_for_loss = torch.ones_like(targets)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks = masks * mask_for_loss
targets = targets * mask_for_loss

if weight is None:
alpha = 1
else: # for aec ST
alpha = weight - targets

if loss_type == 'mse':
loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2))
elif loss_type == 'mae':
loss = torch.sum(alpha * torch.abs(targets - masks))
else: # mse(mask), mae(mask) approx 1:2
loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)
+ 0.1 * alpha * torch.abs(targets - masks))
loss /= torch.sum(nframes)
return loss

def spectrum_loss(targets, spec, nframes):
''' [Batch, Time, Frequency, 2]
'''
with torch.no_grad():
mask_for_loss = torch.ones_like(targets[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
xr = spec[..., 0] * mask_for_loss
xi = spec[..., 1] * mask_for_loss
yr = targets[..., 0] * mask_for_loss
yi = targets[..., 1] * mask_for_loss
xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss
ymag = torch.sqrt(targets[..., 0]**2
+ targets[..., 1]**2) * mask_for_loss

loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2))
loss2 = torch.sum(torch.pow(xmag - ymag, 2))

loss = (loss1 + loss2) / torch.sum(nframes)
return loss

def sa_loss_dlen(mixed, clean, masks, nframes):
yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768
xspec = stft(clean).permute([0, 2, 1, 3]) / 32768
with torch.no_grad():
mask_for_loss = torch.ones_like(xspec[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3)
xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15
emag = emag * mask_for_loss
xmag = xmag * mask_for_loss

loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes)
return loss

def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed)
clean_spec = stft(clean)
targets = compute_mask(mixed_spec, clean_spec, mask_type)
# [B, T, F]
targets = targets.permute(0, 2, 1)

loss = mask_loss(targets, masks, nframes)

if subtask is not None:
vadtargets = energy_vad(clean_spec)
with torch.no_grad():
mask_for_loss = torch.ones_like(targets[:, :, 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:] = 0
subtask = subtask[:, :, 0] * mask_for_loss
vadtargets = vadtargets * mask_for_loss

loss_vad = F.binary_cross_entropy(subtask, vadtargets)
return loss + loss_vad
return loss

def modulation_loss(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed, True)
clean_spec = stft(clean, True)
enhanced_mag = torch.abs(mixed_spec)
clean_mag = torch.abs(clean_spec)
with torch.no_grad():
mask_for_loss = torch.ones_like(clean_mag)
for idx, num in enumerate(nframes):
mask_for_loss[idx, :, num:] = 0
clean_mag = clean_mag * mask_for_loss
enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1])

# Covert to log-mel representation
# (B,T,#mel_channels)
clean_log_mel = torch.log(
torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8)
enhanced_log_mel = torch.log(
torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8)

alpha = compute_mask(mixed_spec, clean_spec, mask_type)
alpha = alpha.permute(0, 2, 1)
loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel,
alpha)
loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask)
# print(loss.item(), loss2.item()) #approx 1:4
loss = loss + loss2
return loss

def wav2vec_loss(mixed, clean, masks, nframes, subtask=None):
mixed /= 32768
clean /= 32768
mixed_spec = stft(mixed)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
est_clean = istft(estimate, clean.shape[1])
loss = wav2vec_loss_module(est_clean, clean)
return loss

def sisdr_loss_dlen(mixed,
clean,
masks,
nframes,
subtask=None,
zero_mean=True):
mixed_spec = stft(mixed)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
est_clean = istft(estimate, clean.shape[1])
flen = min(clean.shape[1], est_clean.shape[1])
clean = clean[:, :flen]
est_clean = est_clean[:, :flen]

# follow asteroid/losses/sdr.py
if zero_mean:
clean = clean - torch.mean(clean, dim=1, keepdim=True)
est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True)

dot = torch.sum(est_clean * clean, dim=1, keepdim=True)
s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS
scaled_clean = dot * clean / s_clean_energy
e_noise = est_clean - scaled_clean

# [batch]
sisdr = torch.sum(
scaled_clean**2, dim=1) / (
torch.sum(e_noise**2, dim=1) + EPS)
sisdr = -10 * torch.log10(sisdr + EPS)
loss = sisdr.mean()
return loss

def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed)
clean_spec = stft(clean)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)

dot_real = estimate[..., 0] * clean_spec[..., 0] + \
estimate[..., 1] * clean_spec[..., 1]
dot_imag = estimate[..., 0] * clean_spec[..., 1] - \
estimate[..., 1] * clean_spec[..., 0]
dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1)
s_clean_energy = clean_spec[..., 0] ** 2 + \
clean_spec[..., 1] ** 2 + EPS
scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3)
e_noise = estimate - scaled_clean

# [batch]
scaled_clean_energy = torch.sum(
scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1)
e_noise_energy = torch.sum(
e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1)
sisdr = torch.sum(
scaled_clean_energy, dim=1) / (
torch.sum(e_noise_energy, dim=1) + EPS)
sisdr = -10 * torch.log10(sisdr + EPS)
loss = sisdr.mean()
return loss

def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed).permute([0, 2, 1, 3])
clean_spec = stft(clean).permute([0, 2, 1, 3])
mixed_spec = mixed_spec / 32768
clean_spec = clean_spec / 32768
tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm')

D = int(masks.shape[2] / 2)
with torch.no_grad():
mask_for_loss = torch.ones_like(clean_spec[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
mr = masks[..., :D] * mask_for_loss
mi = masks[..., D:] * mask_for_loss
tgt_mr = tgt_mr * mask_for_loss
tgt_mi = tgt_mi * mask_for_loss

if weight is None:
alpha = 1
else:
alpha = weight - tgt_mr
# signal approximation
yr = mixed_spec[..., 0]
yi = mixed_spec[..., 1]
loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \
+ torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2))
# mask approximation
loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \
+ torch.sum(alpha * torch.pow(mi - tgt_mi, 2))
loss = 0.5 * (loss1 + loss2) / torch.sum(nframes)
return loss

def crm_miso_loss_dlen(mixed, clean, masks, nframes):
return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes)

def mimo_loss_dlen(mixed, clean, masks, nframes):
chs = mixed.shape[-1]
D = masks.shape[2] // chs
loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D],
nframes)
for ch in range(1, chs):
loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch],
masks[..., ch * D:ch * D + D], nframes)
loss = loss + loss1
return loss / chs

def spec_loss_dlen(mixed, clean, spec, nframes):
clean_spec = stft(clean).permute([0, 2, 1, 3])
clean_spec = clean_spec / 32768

D = spec.shape[2] // 2
spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]],
dim=-1)
loss = spectrum_loss(clean_spec, spec_est, nframes)
return loss

if loss_func == 'psm_vad_loss_dlen':
return psm_vad_loss_dlen
elif loss_func == 'sisdr_loss_dlen':
return sisdr_loss_dlen
elif loss_func == 'sisdr_freq_loss_dlen':
return sisdr_freq_loss_dlen
elif loss_func == 'crm_loss_dlen':
return crm_loss_dlen
elif loss_func == 'modulation_loss':
return modulation_loss
elif loss_func == 'wav2vec_loss':
return wav2vec_loss
elif loss_func == 'mimo_loss_dlen':
return mimo_loss_dlen
elif loss_func == 'spec_loss_dlen':
return spec_loss_dlen
elif loss_func == 'sa_loss_dlen':
return sa_loss_dlen
else:
print('Unsupported loss function: %s' % loss_func)
return None

+248 -0 modelscope/models/audio/network/modulation_loss.py

@@ -0,0 +1,248 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelScale


class ModulationDomainLossModule(torch.nn.Module):
"""Modulation-domain loss function developed in [1] for supervised speech enhancement

In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
as the input spectrogram representation.
Specific parameter details are in the paper and in the example below

Parameters
----------
modulation_kernels: nn.Module
Differentiable module that transforms a spectrogram representation to the modulation domain

modulation_domain = modulation_kernels(input_tf_representation)
Input Spectrogram representation (B, T, F) ---> |(M) modulation_kernels|--->Modulation Domain(B, M, T', F')

norm: boolean
Normalizes the modulation domain representation to be 0 mean across time

[1] T. Vuong, Y. Xia, and R. M. Stern, “A modulation-domain loss for neural-network-based
real-time speech enhancement,” accepted at ICASSP 2021, https://arxiv.org/abs/2102.07330


"""

def __init__(self, modulation_kernels, norm=True):
super(ModulationDomainLossModule, self).__init__()

self.modulation_kernels = modulation_kernels
self.mse = nn.MSELoss(reduction='none')  # 'reduce=False' is deprecated
self.norm = norm

def forward(self, enhanced_spect, clean_spect, weight=None):
"""Calculate modulation-domain loss
Args:
enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
Returns:
Tensor: Modulation-domain loss value.
"""

clean_mod = self.modulation_kernels(clean_spect)
enhanced_mod = self.modulation_kernels(enhanced_spect)

if self.norm:
mean_clean_mod = torch.mean(clean_mod, dim=2)
mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

clean_mod = clean_mod - mean_clean_mod.unsqueeze(2)
enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

if weight is None:
alpha = 1
else: # TF-mask weight
alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1)
mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha
mod_mse_loss = torch.mean(
torch.sum(mod_mse_loss, dim=(1, 2, 3))
/ torch.sum(clean_mod**2, dim=(1, 2, 3)))

return mod_mse_loss
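

# Illustrative usage, mirroring modulation_loss_init() in network/loss.py:
#   kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
#   mod_loss = ModulationDomainLossModule(kernels.eval())
#   loss = mod_loss(enhanced_log_mel, clean_log_mel)  # inputs: (B, #frames, #mel_channels)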


class ModulationDomainNCCLossModule(torch.nn.Module):
"""Modulation-domain loss function developed in [1] for supervised speech enhancement

# Based on: Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis

In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
as the input spectrogram representation.
Specific parameter details are in the paper and in the example below

Parameters
----------
modulation_kernels: nn.Module
Differentiable module that transforms a spectrogram representation to the modulation domain

modulation_domain = modulation_kernels(input_tf_representation)
Input Spectrogram representation(B, T, F) --- (M) modulation_kernels---> Modulation Domain(B, M, T', F')

[1]

"""

def __init__(self, modulation_kernels):
super(ModulationDomainNCCLossModule, self).__init__()

self.modulation_kernels = modulation_kernels
self.mse = nn.MSELoss(reduction='none')  # 'reduce=False' is deprecated

def forward(self, enhanced_spect, clean_spect):
"""Calculate modulation-domain loss
Args:
enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
Returns:
Tensor: Modulation-domain loss value.
"""

clean_mod = self.modulation_kernels(clean_spect)
enhanced_mod = self.modulation_kernels(enhanced_spect)
mean_clean_mod = torch.mean(clean_mod, dim=2)
mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2)
normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

inner_product = torch.sum(
normalized_clean * normalized_enhanced, dim=2)
normalized_denom = (torch.sum(
normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum(
normalized_enhanced * normalized_enhanced, dim=2))**.5

ncc = inner_product / normalized_denom
mod_mse_loss = torch.mean((ncc - 1.0)**2)

return mod_mse_loss


class GaborSTRFConv(nn.Module):
"""Gabor-STRF-based cross-correlation kernel."""

def __init__(self,
supn,
supk,
nkern,
rates=None,
scales=None,
norm_strf=True,
real_only=False):
"""Instantiate a Gabor-based STRF convolution layer.
Parameters
----------
supn: int
Time support in number of frames. Also the window length.
supk: int
Frequency support in number of channels. Also the window length.
nkern: int
Number of kernels, each with a learnable rate and scale.
rates: list of float, None
Initial values for temporal modulation.
scales: list of float, None
Initial values for spectral modulation.
norm_strf: Boolean
Normalize STRF kernels to be unit length
real_only: Boolean
If True, nkern REAL gabor-STRF kernels
If False, nkern//2 REAL and nkern//2 IMAGINARY gabor-STRF kernels
"""
super(GaborSTRFConv, self).__init__()
self.numN = supn
self.numK = supk
self.numKern = nkern
self.real_only = real_only
self.norm_strf = norm_strf

if not real_only:
nkern = nkern // 2

if supk % 2 == 0: # force odd number
supk += 1
self.supk = torch.arange(supk, dtype=torch.float32)
if supn % 2 == 0: # force odd number
supn += 1
self.supn = torch.arange(supn, dtype=self.supk.dtype)
self.padding = (supn // 2, supk // 2)
# Set up learnable parameters
# for param in (rates, scales):
# assert (not param) or len(param) == nkern
if not rates:
rates = torch.rand(nkern) * math.pi / 2.0
if not scales:
scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0

self.rates_ = nn.Parameter(torch.Tensor(rates))
self.scales_ = nn.Parameter(torch.Tensor(scales))

def strfs(self):
"""Make STRFs using the current parameters."""

if self.supn.device != self.rates_.device: # for first run
self.supn = self.supn.to(self.rates_.device)
self.supk = self.supk.to(self.rates_.device)
n0, k0 = self.padding

nwind = .5 - .5 * \
torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1))
kwind = .5 - .5 * \
torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1))

new_wind = torch.matmul((nwind).unsqueeze(-1), (kwind).unsqueeze(0))

n_n_0 = self.supn - n0
k_k_0 = self.supk - k0
n_mult = torch.matmul(
n_n_0.unsqueeze(1),
torch.ones((1, len(self.supk))).type(torch.FloatTensor).to(
self.rates_.device))
k_mult = torch.matmul(
torch.ones((len(self.supn),
1)).type(torch.FloatTensor).to(self.rates_.device),
k_k_0.unsqueeze(0))

inside = self.rates_.unsqueeze(1).unsqueeze(
1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult
real_strf = torch.cos(inside) * new_wind.unsqueeze(0)

if self.real_only:
final_strf = real_strf

else:
imag_strf = torch.sin(inside) * new_wind.unsqueeze(0)
final_strf = torch.cat([real_strf, imag_strf], dim=0)

if self.norm_strf:
final_strf = final_strf / (torch.sum(
final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5

return final_strf

def forward(self, sigspec):
"""Forward pass a batch of (real) spectra [Batch x Time x Frequency]."""
if len(sigspec.shape) == 2:  # expand batch dimension for a single example
sigspec = sigspec.unsqueeze(0)
strfs = self.strfs().unsqueeze(1).type_as(sigspec)
out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding)
return out

def __repr__(self):
"""Gabor filter"""
report = """
+++++ Gabor Filter Kernels [{}], supn[{}], supk[{}] real only [{}] norm strf [{}] +++++

""".format(self.numKern, self.numN, self.numK, self.real_only,
self.norm_strf)

return report

+483 -0 modelscope/models/audio/network/se_net.py

@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..layers.activations import RectifiedLinear, Sigmoid
from ..layers.affine_transform import AffineTransform
from ..layers.deep_fsmn import DeepFsmn
from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn


class MaskNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=128,
hidden_dim2=None,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(MaskNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)
if hidden_dim2 is None:
hidden_dim2 = hidden_dim

if rorder == 0:
repeats = [
UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim2,
dilation=dilation,
layer_norm=layer_norm,
dropout=dropout) for i in range(layers)
]
else:
repeats = [
DeepFsmn(
hidden_dim,
hidden_dim,
lorder,
rorder,
hidden_dim2,
layer_norm=layer_norm,
dropout=dropout) for i in range(layers)
]
self.deepfsmn = nn.Sequential(*repeats)

self.linear2 = AffineTransform(hidden_dim, outdim)

self.crm = crm
if self.crm:
self.sig = nn.Tanh()
else:
self.sig = Sigmoid(outdim, outdim)

self.vad = vad
if self.vad:
self.linear3 = AffineTransform(hidden_dim, 1)

self.layers = layers
self.linearout = linearout
if self.linearout and self.vad:
print('Warning: linearout together with vad is not supported')

def forward(self, feat, ctl=None):
x1 = self.linear1(feat)
x2 = self.relu(x1)
if ctl is not None:
ctl = min(ctl, self.layers - 1)
for i in range(ctl):
x2 = self.deepfsmn[i](x2)
mask = self.sig(self.linear2(x2))
if self.vad:
vad = torch.sigmoid(self.linear3(x2))
return mask, vad
else:
return mask
x3 = self.deepfsmn(x2)
if self.linearout:
return self.linear2(x3)
mask = self.sig(self.linear2(x3))
if self.vad:
vad = torch.sigmoid(self.linear3(x3))
return mask, vad
else:
return mask

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Nnet>\n'
re_str += self.linear1.to_kaldi_nnet()
re_str += self.relu.to_kaldi_nnet()
for dfsmn in self.deepfsmn:
re_str += dfsmn.to_kaldi_nnet()
re_str += self.linear2.to_kaldi_nnet()
re_str += self.sig.to_kaldi_nnet()
re_str += '</Nnet>\n'

return re_str

def to_raw_nnet(self, fid):
self.linear1.to_raw_nnet(fid)
for dfsmn in self.deepfsmn:
dfsmn.to_raw_nnet(fid)
self.linear2.to_raw_nnet(fid)


class StageNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
layers2=6,
hidden_dim=128,
lorder=20,
rorder=0,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(StageNet, self).__init__()

self.stage1 = nn.ModuleList()
self.stage2 = nn.ModuleList()
layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU())
self.stage1.append(layer)
for i in range(layers):
layer = UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim,
layer_norm=layer_norm,
dropout=dropout)
self.stage1.append(layer)
layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid())
self.stage1.append(layer)
# stage2
layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU())
self.stage2.append(layer)
for i in range(layers2):
layer = UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim,
layer_norm=layer_norm,
dropout=dropout)
self.stage2.append(layer)
layer = nn.Sequential(
nn.Linear(hidden_dim, outdim),
nn.Sigmoid() if not crm else nn.Tanh())
self.stage2.append(layer)
self.crm = crm
self.vad = vad
self.linearout = linearout
self.window = torch.hamming_window(640, periodic=False).cuda()
self.freezed = False

def freeze(self):
if not self.freezed:
for param in self.stage1.parameters():
param.requires_grad = False
self.freezed = True
            print('stage1 frozen')

def forward(self, feat, mixture, ctl=None):
if ctl == 'off':
x = feat
for i in range(len(self.stage1)):
x = self.stage1[i](x)
return x
else:
self.freeze()
x = feat
for i in range(len(self.stage1)):
x = self.stage1[i](x)

spec = torch.stft(
mixture / 32768,
640,
320,
640,
self.window,
center=False,
return_complex=True)
spec = torch.view_as_real(spec).permute([0, 2, 1, 3])
specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2)
est = x * specmag
y = torch.cat([est, feat], dim=-1)
for i in range(len(self.stage2)):
y = self.stage2[i](y)
return y


class Unet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
dims=[256] * 4,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(Unet, self).__init__()

self.linear1 = AffineTransform(indim, dims[0])
self.relu = RectifiedLinear(dims[0], dims[0])

self.encoder = nn.ModuleList()
self.decoder = nn.ModuleList()
for i in range(len(dims) - 1):
layer = nn.Sequential(
nn.Linear(dims[i], dims[i + 1]), nn.ReLU(),
nn.Linear(dims[i + 1], dims[i + 1], bias=False),
Conv2d(
dims[i + 1],
dims[i + 1],
lorder,
groups=dims[i + 1],
skip_connect=True))
self.encoder.append(layer)
for i in range(len(dims) - 1, 0, -1):
layer = nn.Sequential(
nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(),
nn.Linear(dims[i - 1], dims[i - 1], bias=False),
Conv2d(
dims[i - 1],
dims[i - 1],
lorder,
groups=dims[i - 1],
skip_connect=True))
self.decoder.append(layer)
self.tf = nn.ModuleList()
for i in range(layers - 2 * (len(dims) - 1)):
layer = nn.Sequential(
nn.Linear(dims[-1], dims[-1]), nn.ReLU(),
nn.Linear(dims[-1], dims[-1], bias=False),
Conv2d(
dims[-1],
dims[-1],
lorder,
groups=dims[-1],
skip_connect=True))
self.tf.append(layer)

self.linear2 = AffineTransform(dims[0], outdim)
self.crm = crm
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
self.vad = False
self.layers = layers
self.linearout = linearout

def forward(self, x, ctl=None):
x = self.linear1(x)
x = self.relu(x)

encoder_out = []
for i in range(len(self.encoder)):
x = self.encoder[i](x)
encoder_out.append(x)
for i in range(len(self.tf)):
x = self.tf[i](x)
for i in range(len(self.decoder)):
x = torch.cat([x, encoder_out[-1 - i]], dim=-1)
x = self.decoder[i](x)

x = self.linear2(x)
if self.linearout:
return x
return self.act(x)


class BranchNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=256,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(BranchNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)

self.convs = nn.ModuleList()
self.deepfsmn = nn.ModuleList()
self.FREQ = nn.ModuleList()
self.TIME = nn.ModuleList()
self.br1 = nn.ModuleList()
self.br2 = nn.ModuleList()
for i in range(layers):
'''
layer = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim, bias=False),
Conv2d(hidden_dim, hidden_dim, lorder,
groups=hidden_dim, skip_connect=True)
)
self.deepfsmn.append(layer)
'''
layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
self.FREQ.append(layer)
'''
layer = nn.GRU(hidden_dim, hidden_dim,
batch_first=True,
bidirectional=False)
self.TIME.append(layer)

layer = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim//2, bias=False),
Conv2d(hidden_dim//2, hidden_dim//2, lorder,
groups=hidden_dim//2, skip_connect=True)
)
self.br1.append(layer)
layer = nn.GRU(hidden_dim, hidden_dim//2,
batch_first=True,
bidirectional=False)
self.br2.append(layer)
'''

self.linear2 = AffineTransform(hidden_dim, outdim)
self.crm = crm
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
self.vad = False
self.layers = layers
self.linearout = linearout

def forward(self, x, ctl=None):
return self.forward_branch(x)

def forward_sepconv(self, x):
x = torch.unsqueeze(x, 1)
for i in range(len(self.convs)):
x = self.convs[i](x)
x = F.relu(x)
B, C, H, W = x.shape
x = x.permute(0, 2, 1, 3)
x = torch.reshape(x, [B, H, C * W])
x = self.linear1(x)
x = self.relu(x)
for i in range(self.layers):
x = self.deepfsmn[i](x) + x
x = self.linear2(x)
return self.act(x)

def forward_branch(self, x):
x = self.linear1(x)
x = self.relu(x)
for i in range(self.layers):
z = self.FREQ[i](x)
x = z + x
x = self.linear2(x)
if self.linearout:
return x
return self.act(x)


class TACNet(nn.Module):
    '''Transform-average-concatenate (TAC) module for ad-hoc arrays.
    '''

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=128,
lorder=20,
rorder=0,
crm=False,
vad=False,
linearout=False):
super(TACNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)

if rorder == 0:
repeats = [
UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim)
for i in range(layers)
]
else:
repeats = [
DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim)
for i in range(layers)
]
self.deepfsmn = nn.Sequential(*repeats)

self.ch_transform = nn.ModuleList([])
self.ch_average = nn.ModuleList([])
self.ch_concat = nn.ModuleList([])
for i in range(layers):
self.ch_transform.append(
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
self.ch_average.append(
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
self.ch_concat.append(
nn.Sequential(
nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU()))

self.linear2 = AffineTransform(hidden_dim, outdim)

self.crm = crm
if self.crm:
self.sig = nn.Tanh()
else:
self.sig = Sigmoid(outdim, outdim)

self.vad = vad
if self.vad:
self.linear3 = AffineTransform(hidden_dim, 1)

self.layers = layers
self.linearout = linearout
if self.linearout and self.vad:
            print('Warning: linearout together with vad is not supported')

def forward(self, feat, ctl=None):
B, T, F = feat.shape
# assume 4ch
ch = 4
zlist = []
for c in range(ch):
z = self.linear1(feat[..., c * (F // 4):(c + 1) * (F // 4)])
z = self.relu(z)
zlist.append(z)
for i in range(self.layers):
# forward
for c in range(ch):
zlist[c] = self.deepfsmn[i](zlist[c])

# transform
olist = []
for c in range(ch):
z = self.ch_transform[i](zlist[c])
olist.append(z)
# average
avg = 0
for c in range(ch):
avg = avg + olist[c]
avg = avg / ch
avg = self.ch_average[i](avg)
            # concatenate
for c in range(ch):
tac = torch.cat([olist[c], avg], dim=-1)
tac = self.ch_concat[i](tac)
zlist[c] = zlist[c] + tac

for c in range(ch):
zlist[c] = self.sig(self.linear2(zlist[c]))
mask = torch.cat(zlist, dim=-1)
return mask

def to_kaldi_nnet(self):
pass

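Taken together, MaskNet maps per-frame features through a linear front end, a stack of (Uni)DeepFsmn blocks, and a sigmoid (or tanh, when crm=True) output to a spectral mask. A minimal usage sketch, with assumed dimensions:

    # dimensions below are assumptions, not values from this commit
    net = MaskNet(indim=120, outdim=321, layers=9, hidden_dim=128)
    feat = torch.randn(4, 100, 120)   # [batch, frames, feature_dim]
    mask = net(feat)                  # [batch, frames, 321], sigmoid values in (0, 1)
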
+17 -9  modelscope/models/base.py

@@ -2,14 +2,13 @@

 import os.path as osp
 from abc import ABC, abstractmethod
-from typing import Dict, List, Tuple, Union
+from typing import Dict, Union

-from maas_hub.file_download import model_file_download
 from maas_hub.snapshot_download import snapshot_download

 from modelscope.models.builder import build_model
 from modelscope.utils.config import Config
-from modelscope.utils.constant import CONFIGFILE
+from modelscope.utils.constant import ModelFile
 from modelscope.utils.hub import get_model_cache_dir

 Tensor = Union['torch.Tensor', 'tf.Tensor']
@@ -21,16 +20,24 @@ class Model(ABC):
         self.model_dir = model_dir

     def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        return self.post_process(self.forward(input))
+        return self.postprocess(self.forward(input))

     @abstractmethod
     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         pass

-    def post_process(self, input: Dict[str, Tensor],
-                     **kwargs) -> Dict[str, Tensor]:
-        # model specific postprocess, implementation is optional
-        # will be called in Pipeline and evaluation loop(in the future)
+    def postprocess(self, input: Dict[str, Tensor],
+                    **kwargs) -> Dict[str, Tensor]:
+        """Model-specific postprocessing that converts raw model outputs
+        to the standard model outputs.
+
+        Args:
+            input: input data
+
+        Returns:
+            dict of results: a dict containing the outputs of the model,
+            each output under its standard output name.
+        """
         return input

     @classmethod
@@ -47,7 +54,8 @@ class Model(ABC):
         # raise ValueError(
         #     'Remote model repo {model_name_or_path} does not exists')

-        cfg = Config.from_file(osp.join(local_model_dir, CONFIGFILE))
+        cfg = Config.from_file(
+            osp.join(local_model_dir, ModelFile.CONFIGURATION))
         task_name = cfg.task
         model_cfg = cfg.model
         # TODO @wenmeng.zwm may need to manually initialize the model after building


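The postprocess rename keeps Model consistent with Pipeline: forward returns raw tensors and postprocess converts them to the standard output names. A minimal subclass sketch (class and field names are hypothetical):

    class MyModel(Model):

        def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
            # assumes a self.net module built in __init__
            return {'logits': self.net(input['input_ids'])}

        def postprocess(self, input: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
            # map raw outputs to the standard output names
            return {'scores': input['logits'].softmax(-1)}
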
+2 -0  modelscope/models/nlp/__init__.py

@@ -1,4 +1,6 @@
+from .sentence_similarity_model import *  # noqa F403
 from .sequence_classification_model import *  # noqa F403
 from .space.dialog_intent_prediction_model import *  # noqa F403
 from .space.dialog_modeling_model import *  # noqa F403
 from .text_generation_model import *  # noqa F403
+from .token_classification_model import *  # noqa F403

+88 -0  modelscope/models/nlp/sentence_similarity_model.py

@@ -0,0 +1,88 @@
import os
from typing import Any, Dict

import json
import numpy as np
import torch
from sofa import SbertModel
from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel
from torch import nn

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['SbertForSentenceSimilarity']


class SbertTextClassifier(SbertPreTrainedModel):

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.encoder = SbertModel(config, add_pooling_layer=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, input_ids=None, token_type_ids=None):
outputs = self.encoder(
input_ids,
token_type_ids=token_type_ids,
return_dict=None,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return logits


@MODELS.register_module(
Tasks.sentence_similarity,
module_name=r'sbert-base-chinese-sentence-similarity')
class SbertForSentenceSimilarity(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the sentence similarity model from the `model_dir` path.

Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

self.model = SbertTextClassifier.from_pretrained(
model_dir, num_labels=2)
self.model.eval()
self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.id2label = {idx: name for name, idx in self.label_mapping.items()}

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
                    'predictions': array([1]),  # label: 0=negative, 1=positive
                    'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
                    'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32)  # raw scores
}
"""
input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
token_type_ids = torch.tensor(
input['token_type_ids'], dtype=torch.long)
with torch.no_grad():
logits = self.model(input_ids, token_type_ids)
probs = logits.softmax(-1).numpy()
pred = logits.argmax(-1).numpy()
logits = logits.numpy()
res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
return res

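forward consumes the preprocessor's output rather than raw text; a sketch of the expected input layout (the token ids are made up):

    inputs = {
        'input_ids': [[101, 791, 1921, 102, 791, 1921, 102]],   # [CLS] s1 [SEP] s2 [SEP]
        'token_type_ids': [[0, 0, 0, 0, 1, 1, 1]],              # 0 = sentence1, 1 = sentence2
    }
    res = model.forward(inputs)
    # {'predictions': array([...]), 'probabilities': array([...]), 'logits': array([...])}
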
+17 -0  modelscope/models/nlp/sequence_classification_model.py

@@ -1,5 +1,7 @@
+import os
 from typing import Any, Dict

+import json
 import numpy as np

 from modelscope.utils.constant import Tasks
@@ -34,6 +36,11 @@ class BertForSequenceClassification(Model):
                     ('token_type_ids', torch.LongTensor)],
         output_keys=['predictions', 'probabilities', 'logits'])

+        self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
+        with open(self.label_path) as f:
+            self.label_mapping = json.load(f)
+        self.id2label = {idx: name for name, idx in self.label_mapping.items()}
+
     def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
         """return the result by the model
@@ -50,3 +57,13 @@ class BertForSequenceClassification(Model):
         }
         """
         return self.model.predict(input)
+
+    def postprocess(self, inputs: Dict[str, np.ndarray],
+                    **kwargs) -> Dict[str, np.ndarray]:
+        # N x num_classes
+        probs = inputs['probabilities']
+        result = {
+            'probs': probs,
+        }
+        return result

+2 -2  modelscope/models/nlp/text_generation_model.py

@@ -4,11 +4,11 @@ from modelscope.utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS

-__all__ = ['PalmForTextGenerationModel']
+__all__ = ['PalmForTextGeneration']


 @MODELS.register_module(Tasks.text_generation, module_name=r'palm')
-class PalmForTextGenerationModel(Model):
+class PalmForTextGeneration(Model):

     def __init__(self, model_dir: str, *args, **kwargs):
         """initialize the text generation model from the `model_dir` path.


+57 -0  modelscope/models/nlp/token_classification_model.py

@@ -0,0 +1,57 @@
import os
from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']


@MODELS.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class StructBertForTokenClassification(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the word segmentation model from the `model_dir` path.

Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir
self.model = SbertForTokenClassification.from_pretrained(
self.model_dir)
self.config = SbertConfig.from_pretrained(self.model_dir)

def forward(self, input: Dict[str,
Any]) -> Dict[str, Union[str, np.ndarray]]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, Union[str,np.ndarray]]: results
Example:
{
                    'predictions': array([1, 4]),  # token label ids
                    'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32),  # raw scores
                    'text': '今天',
}
"""
input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
output = self.model(input_ids)
logits = output.logits
pred = torch.argmax(logits[0], dim=-1)
pred = pred.numpy()

rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
return rst

+1 -1  modelscope/pipelines/__init__.py

@@ -1,4 +1,4 @@
-from .audio import *  # noqa F403
+from .audio import LinearAECPipeline
 from .base import Pipeline
 from .builder import pipeline
 from .cv import *  # noqa F403


+1 -0  modelscope/pipelines/audio/__init__.py

@@ -0,0 +1 @@
from .linear_aec_pipeline import LinearAECPipeline

+160 -0  modelscope/pipelines/audio/linear_aec_pipeline.py

@@ -0,0 +1,160 @@
import importlib
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import yaml

from modelscope.preprocessors.audio import LinearAECAndFbank
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Pipeline
from ..builder import PIPELINES

FEATURE_MVN = 'feature.DEY.mvn.txt'

CONFIG_YAML = 'dey_mini.yaml'


def initialize_config(module_cfg):
r"""According to config items, load specific module dynamically with params.
1. Load the module corresponding to the "module" param.
2. Call function (or instantiate class) corresponding to the "main" param.
    3. Send the param (in "args") into the function (or class) when calling (or instantiating).

Args:
module_cfg (dict): config items, eg:
{
"module": "models.model",
"main": "Model",
"args": {...}
}

Returns:
the module loaded.
"""
module = importlib.import_module(module_cfg['module'])
return getattr(module, module_cfg['main'])(**module_cfg['args'])


@PIPELINES.register_module(
Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k')
class LinearAECPipeline(Pipeline):
r"""AEC Inference Pipeline only support 16000 sample rate.

When invoke the class with pipeline.__call__(), you should provide two params:
Dict[str, Any]
the path of wav files,eg:{
"nearend_mic": "/your/data/near_end_mic_audio.wav",
"farend_speech": "/your/data/far_end_speech_audio.wav"}
output_path (str, optional): "/your/output/audio_after_aec.wav"
the file path to write generate audio.
"""

def __init__(self, model):
r"""
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model)
self.use_cuda = torch.cuda.is_available()
with open(
os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f:
self.config = yaml.full_load(f.read())
self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN)
self._init_model()
self.preprocessor = LinearAECAndFbank(self.config['io'])

n_fft = self.config['loss']['args']['n_fft']
hop_length = self.config['loss']['args']['hop_length']
winlen = n_fft
window = torch.hamming_window(winlen, periodic=False)

def stft(x):
return torch.stft(
x,
n_fft,
hop_length,
winlen,
center=False,
window=window.to(x.device),
return_complex=False)

def istft(x, slen):
return torch.istft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
length=slen)

self.stft = stft
self.istft = istft

def _init_model(self):
checkpoint = torch.load(
os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE),
map_location='cpu')
self.model = initialize_config(self.config['nnet'])
if self.use_cuda:
self.model = self.model.cuda()
self.model.load_state_dict(checkpoint)

def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
r"""The AEC process.

Args:
inputs: dict={'feature': Tensor, 'base': Tensor}
'feature' feature of input audio.
'base' the base audio to mask.

Returns:
dict:
{
'output_pcm': generated audio array
}
"""
output_data = self._process(inputs['feature'], inputs['base'])
return {'output_pcm': output_data}

def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
r"""The post process. Will save audio to file, if the output_path is given.

Args:
inputs: dict:
{
'output_pcm': generated audio array
}
kwargs: accept 'output_path' which is the path to write generated audio

Returns:
dict:
{
'output_pcm': generated audio array
}
"""
if 'output_path' in kwargs.keys():
wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE,
inputs['output_pcm'].astype(np.int16))
inputs['output_pcm'] = inputs['output_pcm'] / 32768.0
return inputs

def _process(self, fbanks, mixture):
if self.use_cuda:
fbanks = fbanks.cuda()
mixture = mixture.cuda()
if self.model.vad:
with torch.no_grad():
masks, vad = self.model(fbanks.unsqueeze(0))
masks = masks.permute([2, 1, 0])
else:
with torch.no_grad():
masks = self.model(fbanks.unsqueeze(0))
masks = masks.permute([2, 1, 0])
spectrum = self.stft(mixture)
masked_spec = spectrum * masks
masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy()
return masked_sig

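End to end, the pipeline reads the two wav files, runs the linear AEC front end plus fbank extraction in the preprocessor, masks the STFT of the base signal, and optionally writes the result. A usage sketch (the hub model id and file paths are placeholders):

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    aec = pipeline(Tasks.speech_signal_process, model='damo/speech_dfsmn_aec_psm_16k')
    result = aec({'nearend_mic': 'nearend_mic.wav',
                  'farend_speech': 'farend_speech.wav'},
                 output_path='aec_out.wav')
    # result['output_pcm'] holds the enhanced waveform scaled to [-1, 1]
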
+29 -1  modelscope/pipelines/base.py

@@ -12,10 +12,11 @@ from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.hub import get_model_cache_dir
 from modelscope.utils.logger import get_logger
+from .outputs import TASK_OUTPUTS
 from .util import is_model_name

 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, PyDataset, Dict, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, PyDataset, Dict, tuple, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]

 output_keys = [
@@ -106,8 +107,25 @@ class Pipeline(ABC):
         out = self.preprocess(input, **post_kwargs)
         out = self.forward(out)
         out = self.postprocess(out, **post_kwargs)
+        self._check_output(out)
         return out

+    def _check_output(self, input):
+        # this attribute is dynamically attached by the registry
+        # when the class is registered under a task name
+        task_name = self.group_key
+        if task_name not in TASK_OUTPUTS:
+            logger.warning(f'task {task_name} output keys are missing')
+            return
+        output_keys = TASK_OUTPUTS[task_name]
+        missing_keys = []
+        for k in output_keys:
+            if k not in input:
+                missing_keys.append(k)
+        if len(missing_keys) > 0:
+            raise ValueError(f'expected output keys are {output_keys}, '
+                             f'but {missing_keys} are missing')
+
     def preprocess(self, inputs: Input) -> Dict[str, Any]:
         """ Provide default implementation based on preprocess_cfg and user can reimplement it
         """
@@ -125,4 +143,14 @@

     @abstractmethod
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """ If the current pipeline supports model reuse, common postprocess
+        code should be written here.
+
+        Args:
+            inputs: input data
+
+        Returns:
+            dict of results: a dict containing the outputs of the model,
+            each output under its standard output name.
+        """
         raise NotImplementedError('postprocess')

+8 -6  modelscope/pipelines/builder.py

@@ -3,21 +3,23 @@
 import os.path as osp
 from typing import List, Union

+import json
+from maas_hub.file_download import model_file_download
+
 from modelscope.models.base import Model
 from modelscope.utils.config import Config, ConfigDict
-from modelscope.utils.constant import CONFIGFILE, Tasks
+from modelscope.utils.constant import Tasks
 from modelscope.utils.registry import Registry, build_from_cfg
 from .base import Pipeline
+from .util import is_model_name

 PIPELINES = Registry('pipelines')

 DEFAULT_MODEL_FOR_PIPELINE = {
     # TaskName: (pipeline_module_name, model_repo)
-    Tasks.image_matting: ('image-matting', 'damo/image-matting-person'),
+    Tasks.word_segmentation:
+    ('structbert-chinese-word-segmentation',
+     'damo/nlp_structbert_word-segmentation_chinese-base'),
+    Tasks.sentence_similarity:
+    ('sbert-base-chinese-sentence-similarity',
+     'damo/nlp_structbert_sentence-similarity_chinese-base'),
+    Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
     Tasks.text_classification:
     ('bert-sentiment-analysis', 'damo/bert-base-sst2'),
     Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'),


+3 -3  modelscope/pipelines/cv/image_matting_pipeline.py

@@ -1,5 +1,5 @@
 import os.path as osp
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict

 import cv2
 import numpy as np
@@ -7,7 +7,7 @@ import PIL

 from modelscope.pipelines.base import Input
 from modelscope.preprocessors import load_image
-from modelscope.utils.constant import TF_GRAPH_FILE, Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.logger import get_logger
 from ..base import Pipeline
 from ..builder import PIPELINES
@@ -24,7 +24,7 @@ class ImageMattingPipeline(Pipeline):
         import tensorflow as tf
         if tf.__version__ >= '2.0':
             tf = tf.compat.v1
-        model_path = osp.join(self.model, TF_GRAPH_FILE)
+        model_path = osp.join(self.model, ModelFile.TF_GRAPH_FILE)

         config = tf.ConfigProto(allow_soft_placement=True)
         config.gpu_options.allow_growth = True


+5 -2  modelscope/pipelines/multi_modal/image_captioning.py

@@ -84,8 +84,11 @@ class ImageCaptionPipeline(Pipeline):
             s = torch.cat([s, self.eos_item])
             return s

-        patch_image = self.patch_resize_transform(
-            load_image(input)).unsqueeze(0)
+        if isinstance(input, Image.Image):
+            patch_image = self.patch_resize_transform(input).unsqueeze(0)
+        else:
+            patch_image = self.patch_resize_transform(
+                load_image(input)).unsqueeze(0)
         patch_mask = torch.tensor([True])
         text = 'what does the image describe?'
         src_text = encode_text(


+2 -0  modelscope/pipelines/nlp/__init__.py

@@ -1,4 +1,6 @@
+from .sentence_similarity_pipeline import *  # noqa F403
 from .sequence_classification_pipeline import *  # noqa F403
 from .space.dialog_intent_prediction_pipeline import *  # noqa F403
 from .space.dialog_modeling_pipeline import *  # noqa F403
 from .text_generation_pipeline import *  # noqa F403
+from .word_segmentation_pipeline import *  # noqa F403

+65 -0  modelscope/pipelines/nlp/sentence_similarity_pipeline.py

@@ -0,0 +1,65 @@
import os
import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...models import Model
from ..base import Input, Pipeline
from ..builder import PIPELINES

__all__ = ['SentenceSimilarityPipeline']


@PIPELINES.register_module(
Tasks.sentence_similarity,
module_name=r'sbert-base-chinese-sentence-similarity')
class SentenceSimilarityPipeline(Pipeline):

def __init__(self,
model: Union[SbertForSentenceSimilarity, str],
preprocessor: SequenceClassificationPreprocessor = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction

Args:
model (SbertForSentenceSimilarity): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
"""
assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \
'model must be a single str or SbertForSentenceSimilarity'
sc_model = model if isinstance(
model,
SbertForSentenceSimilarity) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = SequenceClassificationPreprocessor(
sc_model.model_dir,
first_sequence='first_sequence',
second_sequence='second_sequence')
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

        assert hasattr(self.model, 'id2label'), \
            'id2label map should be initialized in the init function.'

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
            inputs (Dict[str, Any]): outputs of the model's forward pass

Returns:
Dict[str, str]: the prediction results
"""

probs = inputs['probabilities'][0]
num_classes = probs.shape[0]
top_indices = np.argpartition(probs, -num_classes)[-num_classes:]
cls_ids = top_indices[np.argsort(-probs[top_indices], axis=-1)]
probs = probs[cls_ids].tolist()
cls_names = [self.model.id2label[cid] for cid in cls_ids]
b = 0
return {'scores': probs[b], 'labels': cls_names[b]}

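Wired to the default model registered in builder.py above, the pipeline takes a sentence pair as a tuple (the tuple input type was added to Input in pipelines/base.py); output values below are illustrative:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    p = pipeline(Tasks.sentence_similarity,
                 model='damo/nlp_structbert_sentence-similarity_chinese-base')
    print(p(('今天天气不错', '今天天气很好')))   # e.g. {'scores': 0.92, 'labels': '1'}
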
+16 -37  modelscope/pipelines/nlp/sequence_classification_pipeline.py

@@ -41,50 +41,29 @@ class SequenceClassificationPipeline(Pipeline):
             second_sequence=None)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

-        from easynlp.utils import io
-        self.label_path = os.path.join(sc_model.model_dir,
-                                       'label_mapping.json')
-        with io.open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.label_id_to_name = {
-            idx: name
-            for name, idx in self.label_mapping.items()
-        }
+        assert hasattr(self.model, 'id2label'), \
+            'id2label map should be initialized in the init function.'

-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self,
+                    inputs: Dict[str, Any],
+                    topk: int = 5) -> Dict[str, str]:
         """process the prediction results

         Args:
-            inputs (Dict[str, Any]): _description_
+            inputs (Dict[str, Any]): input data dict
+            topk (int): return the top k classification results.

         Returns:
             Dict[str, str]: the prediction results
         """
-        probs = inputs['probabilities']
-        logits = inputs['logits']
-        predictions = np.argsort(-probs, axis=-1)
-        preds = predictions[0]
-        b = 0
-        new_result = list()
-        for pred in preds:
-            new_result.append({
-                'pred': self.label_id_to_name[pred],
-                'prob': float(probs[b][pred]),
-                'logit': float(logits[b][pred])
-            })
-        new_results = list()
-        new_results.append({
-            'id':
-            inputs['id'][b] if 'id' in inputs else str(uuid.uuid4()),
-            'output':
-            new_result,
-            'predictions':
-            new_result[0]['pred'],
-            'probabilities':
-            ','.join([str(t) for t in inputs['probabilities'][b]]),
-            'logits':
-            ','.join([str(t) for t in inputs['logits'][b]])
-        })
-
-        return new_results[0]
+        # N x num_classes np.ndarray
+        probs = inputs['probs'][0]
+        num_classes = probs.shape[0]
+        topk = min(topk, num_classes)
+        top_indices = np.argpartition(probs, -topk)[-topk:]
+        cls_ids = top_indices[np.argsort(probs[top_indices])]
+        probs = probs[cls_ids].tolist()
+        cls_names = [self.model.id2label[cid] for cid in cls_ids]
+
+        return {'scores': probs, 'labels': cls_names}

+4 -5  modelscope/pipelines/nlp/text_generation_pipeline.py

@@ -1,7 +1,7 @@
 from typing import Dict, Optional, Union

 from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.models.nlp import PalmForTextGeneration
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
 from ..base import Pipeline, Tensor
@@ -14,7 +14,7 @@ __all__ = ['TextGenerationPipeline']
 class TextGenerationPipeline(Pipeline):

     def __init__(self,
-                 model: Union[PalmForTextGenerationModel, str],
+                 model: Union[PalmForTextGeneration, str],
                  preprocessor: Optional[TextGenerationPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp text generation pipeline for prediction
@@ -24,8 +24,7 @@ class TextGenerationPipeline(Pipeline):
             preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
         """
         sc_model = model if isinstance(
-            model,
-            PalmForTextGenerationModel) else Model.from_pretrained(model)
+            model, PalmForTextGeneration) else Model.from_pretrained(model)
         if preprocessor is None:
             preprocessor = TextGenerationPreprocessor(
                 sc_model.model_dir,
@@ -56,4 +55,4 @@ class TextGenerationPipeline(Pipeline):
                       '').split('[SEP]')[0].replace('[CLS]',
                                                     '').replace('[SEP]',
                                                                 '').replace('[UNK]', '')
-        return {'pred_string': pred_string}
+        return {'text': pred_string}

+71 -0  modelscope/pipelines/nlp/word_segmentation_pipeline.py

@@ -0,0 +1,71 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['WordSegmentationPipeline']


@PIPELINES.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class WordSegmentationPipeline(Pipeline):

def __init__(self,
model: Union[StructBertForTokenClassification, str],
preprocessor: Optional[TokenClassifcationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction

Args:
model (StructBertForTokenClassification): a model instance
preprocessor (TokenClassifcationPreprocessor): a preprocessor instance
"""
model = model if isinstance(
model,
StructBertForTokenClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TokenClassifcationPreprocessor(model.model_dir)
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
self.config = model.config
self.id2label = self.config.id2label

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
            inputs (Dict[str, Any]): outputs of the model's forward pass

Returns:
Dict[str, str]: the prediction results
"""

pred_list = inputs['predictions']
labels = []
for pre in pred_list:
labels.append(self.id2label[pre])
labels = labels[1:-1]
chunks = []
chunk = ''
assert len(inputs['text']) == len(labels)
for token, label in zip(inputs['text'], labels):
if label[0] == 'B' or label[0] == 'I':
chunk += token
else:
chunk += token
chunks.append(chunk)
chunk = ''
if chunk:
chunks.append(chunk)
seg_result = ' '.join(chunks)
rst = {
'output': seg_result,
}
return rst

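The postprocess loop is a simple BIES-style chunker: B and I labels extend the current chunk, any other label (E or S) closes it. A worked example with assumed label predictions:

    labels = ['B', 'E', 'B', 'E', 'B', 'E']   # assumed output for '今天天气不错'
    text = '今天天气不错'
    chunks, chunk = [], ''
    for token, label in zip(text, labels):
        chunk += token
        if label[0] not in ('B', 'I'):        # E/S closes the chunk
            chunks.append(chunk)
            chunk = ''
    print(' '.join(chunks))                   # -> 今天 天气 不错
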
+117 -0  modelscope/pipelines/outputs.py

@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from modelscope.utils.constant import Tasks

TASK_OUTPUTS = {

# ============ vision tasks ===================

# image classification result for single sample
# {
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.image_classification: ['scores', 'labels'],
Tasks.image_tagging: ['scores', 'labels'],

# object detection result for single sample
# {
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ],
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.object_detection: ['scores', 'labels', 'boxes'],

# instance segmentation result for single sample
# {
# "masks": [
# np.array in bgr channel order
# ],
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.image_segmentation: ['scores', 'labels', 'boxes'],

# image generation/editing/matting result for single sample
# {
# "output_png": np.array with shape(h, w, 4)
# for matting or (h, w, 3) for general purpose
# }
Tasks.image_editing: ['output_png'],
Tasks.image_matting: ['output_png'],
Tasks.image_generation: ['output_png'],

# pose estimation result for single sample
# {
# "poses": np.array with shape [num_pose, num_keypoint, 3],
# each keypoint is a array [x, y, score]
# "boxes": np.array with shape [num_pose, 4], each box is
# [x1, y1, x2, y2]
# }
Tasks.pose_estimation: ['poses', 'boxes'],

# ============ nlp tasks ===================

# text classification result for single sample
# {
# "labels": ["happy", "sad", "calm", "angry"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.text_classification: ['scores', 'labels'],

# text generation result for single sample
# {
# "text": "this is text generated by a model."
# }
Tasks.text_generation: ['text'],

# word segmentation result for single sample
# {
# "output": "今天 天气 不错 , 适合 出去 游玩"
# }
Tasks.word_segmentation: ['output'],

# sentence similarity result for single sample
# {
# "labels": "1",
# "scores": 0.9
# }
Tasks.sentence_similarity: ['scores', 'labels'],

# ============ audio tasks ===================

# audio processed for single file in PCM format
# {
# "output_pcm": np.array with shape(samples,) and dtype float32
# }
Tasks.speech_signal_process: ['output_pcm'],

# ============ multi-modal tasks ===================

# image caption result for single sample
# {
# "caption": "this is an image caption text."
# }
Tasks.image_captioning: ['caption'],

# visual grounding result for single sample
# {
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.visual_grounding: ['boxes', 'scores'],

# text_to_image result for a single sample
# {
# "image": np.ndarray with shape [height, width, 3]
# }
Tasks.text_to_image_synthesis: ['image']
}

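Pipeline._check_output (pipelines/base.py above) consults this table after every call; a sketch of the check it performs:

    from modelscope.pipelines.outputs import TASK_OUTPUTS
    from modelscope.utils.constant import Tasks

    out = {'output': '今天 天气 不错'}
    missing = [k for k in TASK_OUTPUTS[Tasks.word_segmentation] if k not in out]
    assert not missing   # otherwise the pipeline raises ValueError
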
+21 -17  modelscope/pipelines/util.py

@@ -1,12 +1,23 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 import os.path as osp
 from typing import List, Union

-import json
 from maas_hub.file_download import model_file_download

-from modelscope.utils.constant import CONFIGFILE
+from modelscope.utils.config import Config
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def is_config_has_model(cfg_file):
+    try:
+        cfg = Config.from_file(cfg_file)
+        return hasattr(cfg, 'model')
+    except Exception as e:
+        logger.error(f'parse config file {cfg_file} failed: {e}')
+        return False


 def is_model_name(model: Union[str, List]):
@@ -15,24 +26,17 @@ def is_model_name(model: Union[str, List]):

     def is_model_name_impl(model):
         if osp.exists(model):
-            if osp.exists(osp.join(model, CONFIGFILE)):
-                return True
+            cfg_file = osp.join(model, ModelFile.CONFIGURATION)
+            if osp.exists(cfg_file):
+                return is_config_has_model(cfg_file)
             else:
                 return False
         else:
-            # try:
-            #     cfg_file = model_file_download(model, CONFIGFILE)
-            # except Exception:
-            #     cfg_file = None
-            # TODO @wenmeng.zwm use exception instead of
-            # following tricky logic
-            cfg_file = model_file_download(model, CONFIGFILE)
-            with open(cfg_file, 'r') as infile:
-                cfg = json.load(infile)
-            if 'Code' in cfg:
+            try:
+                cfg_file = model_file_download(model, ModelFile.CONFIGURATION)
+                return is_config_has_model(cfg_file)
+            except Exception:
                 return False
-            else:
-                return True

     if isinstance(model, str):
         return is_model_name_impl(model)


+1 -1  modelscope/preprocessors/__init__.py

@@ -1,10 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+from .audio import LinearAECAndFbank
 from .base import Preprocessor
 from .builder import PREPROCESSORS, build_preprocessor
 from .common import Compose
 from .image import LoadImage, load_image
 from .nlp import *  # noqa F403
-from .nlp import TextGenerationPreprocessor
 from .space.dialog_intent_prediction_preprocessor import *  # noqa F403
 from .space.dialog_modeling_preprocessor import *  # noqa F403

+230 -0  modelscope/preprocessors/audio.py

@@ -0,0 +1,230 @@
import ctypes
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import torchaudio.compliance.kaldi as kaldi
from numpy.ctypeslib import ndpointer

from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


def load_wav(path):
samp_rate, data = wav.read(path)
return np.float32(data), samp_rate


def load_library(libaec):
libaec_in_cwd = os.path.join('.', libaec)
if os.path.exists(libaec_in_cwd):
libaec = libaec_in_cwd
mitaec = ctypes.cdll.LoadLibrary(libaec)
fe_process = mitaec.fe_process_inst
fe_process.argtypes = [
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
]
return fe_process


def do_linear_aec(fe_process, mic, ref, int16range=True):
mic = np.float32(mic)
ref = np.float32(ref)
if len(mic) > len(ref):
mic = mic[:len(ref)]
out_mic = np.zeros_like(mic)
out_linear = np.zeros_like(mic)
out_echo = np.zeros_like(mic)
out_ref = np.zeros_like(mic)
if int16range:
mic /= 32768
ref /= 32768
fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
# out_ref not in use here
if int16range:
out_mic *= 32768
out_linear *= 32768
out_echo *= 32768
return out_mic, out_ref, out_linear, out_echo


def load_kaldi_feature_transform(filename):
fp = open(filename, 'r')
all_str = fp.read()
pos1 = all_str.find('AddShift')
pos2 = all_str.find('[', pos1)
pos3 = all_str.find(']', pos2)
mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
pos1 = all_str.find('Rescale')
pos2 = all_str.find('[', pos1)
pos3 = all_str.find(']', pos2)
scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
fp.close()
return mean, scale


class Feature:
r"""Extract feat from one utterance.
"""

def __init__(self,
fbank_config,
feat_type='spec',
mvn_file=None,
cuda=False):
r"""

Args:
fbank_config (dict):
feat_type (str):
raw: do nothing
fbank: use kaldi.fbank
spec: Real/Imag
logpow: log(1+|x|^2)
mvn_file (str): the path of data file for mean variance normalization
cuda:
"""
self.fbank_config = fbank_config
self.feat_type = feat_type
self.n_fft = fbank_config['frame_length'] * fbank_config[
'sample_frequency'] // 1000
self.hop_length = fbank_config['frame_shift'] * fbank_config[
'sample_frequency'] // 1000
self.window = torch.hamming_window(self.n_fft, periodic=False)

self.mvn = False
if mvn_file is not None and os.path.exists(mvn_file):
print(f'loading mvn file: {mvn_file}')
shift, scale = load_kaldi_feature_transform(mvn_file)
self.shift = torch.from_numpy(shift)
self.scale = torch.from_numpy(scale)
self.mvn = True
if cuda:
self.window = self.window.cuda()
if self.mvn:
self.shift = self.shift.cuda()
self.scale = self.scale.cuda()

def compute(self, utt):
r"""

Args:
utt: in [-32768, 32767] range

Returns:
[..., T, F]
"""
if self.feat_type == 'raw':
return utt
elif self.feat_type == 'fbank':
if len(utt.shape) == 1:
utt = utt.unsqueeze(0)
feat = kaldi.fbank(utt, **self.fbank_config)
elif self.feat_type == 'spec':
spec = torch.stft(
utt / 32768,
self.n_fft,
self.hop_length,
self.n_fft,
self.window,
center=False,
return_complex=True)
feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2)
elif self.feat_type == 'logpow':
spec = torch.stft(
utt,
self.n_fft,
self.hop_length,
self.n_fft,
self.window,
center=False,
return_complex=True)
abspow = torch.abs(spec)**2
feat = torch.log(1 + abspow).permute(-1, -2)
return feat

def normalize(self, feat):
if self.mvn:
feat = feat + self.shift
feat = feat * self.scale
return feat


@PREPROCESSORS.register_module(Fields.audio)
class LinearAECAndFbank:
SAMPLE_RATE = 16000

def __init__(self, io_config):
self.trunc_length = 7200 * self.SAMPLE_RATE
self.linear_aec_delay = io_config['linear_aec_delay']
self.feature = Feature(io_config['fbank_config'],
io_config['feat_type'], io_config['mvn'])
self.mitaec = load_library(io_config['mitaec_library'])
self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
""" linear filtering the near end mic and far end audio, then extract the feature
:param data: dict with two keys and correspond audios: "nearend_mic" and "farend_speech"
:return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature"
"""
# read files
nearend_mic, fs = load_wav(data['nearend_mic'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
farend_speech, fs = load_wav(data['farend_speech'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
if 'nearend_speech' in data:
nearend_speech, fs = load_wav(data['nearend_speech'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
else:
nearend_speech = np.zeros_like(nearend_mic)

out_mic, out_ref, out_linear, out_echo = do_linear_aec(
self.mitaec, nearend_mic, farend_speech)
# fix 20ms linear aec delay by delaying the target speech
extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
nearend_speech = np.concatenate([extra_zeros, nearend_speech])
# truncate files to the same length
flen = min(
len(out_mic), len(out_ref), len(out_linear), len(out_echo),
len(nearend_speech))
fstart = 0
flen = min(flen, self.trunc_length)
nearend_mic, out_ref, out_linear, out_echo, nearend_speech = (
out_mic[fstart:flen], out_ref[fstart:flen],
out_linear[fstart:flen], out_echo[fstart:flen],
nearend_speech[fstart:flen])

# extract features (frames, [mic, linear, ref, aes?])
feat = torch.FloatTensor()

nearend_mic = torch.from_numpy(np.float32(nearend_mic))
fbank_nearend_mic = self.feature.compute(nearend_mic)
feat = torch.cat([feat, fbank_nearend_mic], dim=1)

out_linear = torch.from_numpy(np.float32(out_linear))
fbank_out_linear = self.feature.compute(out_linear)
feat = torch.cat([feat, fbank_out_linear], dim=1)

out_echo = torch.from_numpy(np.float32(out_echo))
fbank_out_echo = self.feature.compute(out_echo)
feat = torch.cat([feat, fbank_out_echo], dim=1)

# feature transform
feat = self.feature.normalize(feat)

# prepare target
if nearend_speech is not None:
nearend_speech = torch.from_numpy(np.float32(nearend_speech))

if self.mask_on_mic:
base = nearend_mic
else:
base = out_linear
out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
return out_data

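The Feature helper can also be exercised on its own; a sketch with an assumed kaldi-style fbank config:

    fbank_config = dict(frame_length=25, frame_shift=10,
                        sample_frequency=16000, num_mel_bins=80)   # assumed values
    feature = Feature(fbank_config, feat_type='fbank')
    utt = torch.randn(16000) * 32768    # one second of (random) audio at int16 scale
    feat = feature.compute(utt)         # [frames, num_mel_bins]
    feat = feature.normalize(feat)      # identity unless an mvn file was loaded
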
+1 -1  modelscope/preprocessors/image.py

@@ -9,7 +9,7 @@ from modelscope.utils.constant import Fields
 from .builder import PREPROCESSORS


-@PREPROCESSORS.register_module(Fields.image)
+@PREPROCESSORS.register_module(Fields.cv)
 class LoadImage:
     """Load an image from file or url.
     Added or updated keys are "filename", "img", "img_shape",


+78 -9  modelscope/preprocessors/nlp.py

@@ -11,8 +11,8 @@ from .base import Preprocessor
 from .builder import PREPROCESSORS

 __all__ = [
-    'Tokenize',
-    'SequenceClassificationPreprocessor',
+    'Tokenize', 'SequenceClassificationPreprocessor',
+    'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor'
 ]


@@ -31,7 +31,7 @@ class Tokenize(Preprocessor):

 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=r'bert-sentiment-analysis')
+    Fields.nlp, module_name=r'bert-sequence-classification')
 class SequenceClassificationPreprocessor(Preprocessor):

     def __init__(self, model_dir: str, *args, **kwargs):
@@ -51,21 +51,42 @@ class SequenceClassificationPreprocessor(Preprocessor):
         self.sequence_length = kwargs.pop('sequence_length', 128)

         self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
+        print(f'this is the tokenizer {self.tokenizer}')

-    @type_assert(object, str)
-    def __call__(self, data: str) -> Dict[str, Any]:
+    @type_assert(object, (str, tuple))
+    def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
         """process the raw input data

         Args:
-            data (str): a sentence
-                Example:
-                    'you are so handsome.'
+            data (str or tuple):
+                sentence1 (str): a sentence
+                    Example:
+                        'you are so handsome.'
+                or
+                (sentence1, sentence2)
+                    sentence1 (str): a sentence
+                        Example:
+                            'you are so handsome.'
+                    sentence2 (str): a sentence
+                        Example:
+                            'you are so beautiful.'

         Returns:
             Dict[str, Any]: the preprocessed data
         """

-        new_data = {self.first_sequence: data}
+        if not isinstance(data, tuple):
+            data = (
+                data,
+                None,
+            )
+
+        sentence1, sentence2 = data
+        new_data = {
+            self.first_sequence: sentence1,
+            self.second_sequence: sentence2
+        }

         # preprocess the data for the model input

         rst = {
@@ -150,3 +171,51 @@ class TextGenerationPreprocessor(Preprocessor):
         rst['token_type_ids'].append(feature['token_type_ids'])

         return {k: torch.tensor(v) for k, v in rst.items()}
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=r'bert-token-classification')
+class TokenClassifcationPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """preprocess the data via the vocab.txt from the `model_dir` path
+
+        Args:
+            model_dir (str): model path
+        """
+        super().__init__(*args, **kwargs)
+
+        from sofa import SbertTokenizer
+        self.model_dir: str = model_dir
+        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)
+
+    @type_assert(object, str)
+    def __call__(self, data: str) -> Dict[str, Any]:
+        """process the raw input data
+
+        Args:
+            data (str): a sentence
+                Example:
+                    'you are so handsome.'
+
+        Returns:
+            Dict[str, Any]: the preprocessed data
+        """
+        # preprocess the data for the model input
+        text = data.replace(' ', '').strip()
+        tokens = []
+        for token in text:
+            token = self.tokenizer.tokenize(token)
+            tokens.extend(token)
+        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+        input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
+        attention_mask = [1] * len(input_ids)
+        token_type_ids = [0] * len(input_ids)
+        return {
+            'text': text,
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'token_type_ids': token_type_ids
+        }

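The token-classification preprocessor tokenizes character by character, so each Chinese character gets exactly one label slot. A sketch of its output (path and ids are illustrative):

    preprocessor = TokenClassifcationPreprocessor('/path/to/model')   # placeholder path
    out = preprocessor('今天 天气 不错')
    # spaces are stripped before tokenization:
    # {'text': '今天天气不错',
    #  'input_ids': [101, ..., 102],   # with [CLS]/[SEP] added
    #  'attention_mask': [1, 1, ...],
    #  'token_type_ids': [0, 0, ...]}
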
+6 -6  modelscope/utils/config.py

@@ -74,17 +74,17 @@ class Config:
         {'c': [1, 2, 3], 'd': 'dd'}
         >>> cfg.b.d
         'dd'
-        >>> cfg = Config.from_file('configs/examples/config.json')
+        >>> cfg = Config.from_file('configs/examples/configuration.json')
         >>> cfg.filename
-        'configs/examples/config.json'
+        'configs/examples/configuration.json'
         >>> cfg.b
         {'c': [1, 2, 3], 'd': 'dd'}
-        >>> cfg = Config.from_file('configs/examples/config.py')
+        >>> cfg = Config.from_file('configs/examples/configuration.py')
         >>> cfg.filename
-        "configs/examples/config.py"
-        >>> cfg = Config.from_file('configs/examples/config.yaml')
+        "configs/examples/configuration.py"
+        >>> cfg = Config.from_file('configs/examples/configuration.yaml')
         >>> cfg.filename
-        "configs/examples/config.yaml"
+        "configs/examples/configuration.yaml"
         """

     @staticmethod


+15 -13  modelscope/utils/constant.py

@@ -4,8 +4,8 @@
 class Fields(object):
     """ Names for different application fields
     """
-    image = 'image'
-    video = 'video'
+    # image = 'image'
+    # video = 'video'
     cv = 'cv'
     nlp = 'nlp'
     audio = 'audio'
@@ -30,7 +30,9 @@ class Tasks(object):
     image_matting = 'image-matting'

     # nlp tasks
+    word_segmentation = 'word-segmentation'
     sentiment_analysis = 'sentiment-analysis'
+    sentence_similarity = 'sentence-similarity'
     text_classification = 'text-classification'
     relation_extraction = 'relation-extraction'
     zero_shot = 'zero-shot'
@@ -52,7 +54,7 @@ class Tasks(object):
     text_to_speech = 'text-to-speech'
     speech_signal_process = 'speech-signal-process'

-    # multi-media
+    # multi-modal tasks
     image_captioning = 'image-captioning'
     visual_grounding = 'visual-grounding'
     text_to_image_synthesis = 'text-to-image-synthesis'
@@ -73,16 +75,16 @@ class Hubs(object):
     huggingface = 'huggingface'


-# configuration filename
-# in order to avoid conflict with huggingface
-# config file we use maas_config instead
-CONFIGFILE = 'maas_config.json'
+class ModelFile(object):
+    CONFIGURATION = 'configuration.json'
+    README = 'README.md'
+    TF_SAVED_MODEL_FILE = 'saved_model.pb'
+    TF_GRAPH_FILE = 'tf_graph.pb'
+    TF_CHECKPOINT_FOLDER = 'tf_ckpts'
+    TF_CKPT_PREFIX = 'ckpt-'
+    TORCH_MODEL_FILE = 'pytorch_model.pt'
+    TORCH_MODEL_BIN_FILE = 'pytorch_model.bin'

-README_FILE = 'README.md'
-TF_SAVED_MODEL_FILE = 'saved_model.pb'
-TF_GRAPH_FILE = 'tf_graph.pb'
-TF_CHECKPOINT_FOLDER = 'tf_ckpts'
-TF_CHECKPOINT_FILE = 'checkpoint'
-TORCH_MODEL_FILE = 'pytorch_model.bin'
 TENSORFLOW = 'tensorflow'
 PYTORCH = 'pytorch'

+1 -1  modelscope/utils/registry.py

@@ -1,7 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright (c) Alibaba, Inc. and its affiliates.


import inspect import inspect
from email.policy import default


from modelscope.utils.logger import get_logger from modelscope.utils.logger import get_logger


@@ -70,6 +69,7 @@ class Registry(object):
f'{self._name}[{group_key}]') f'{self._name}[{group_key}]')


self._modules[group_key][module_name] = module_cls self._modules[group_key][module_name] = module_cls
module_cls.group_key = group_key


if module_name in self._modules[default_group]: if module_name in self._modules[default_group]:
if id(self._modules[default_group][module_name]) == id(module_cls): if id(self._modules[default_group][module_name]) == id(module_cls):
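The added line makes every registered class remember which task group it was registered under. A minimal sketch, assuming Registry is constructed with a name and following the register_module usage shown in tests/pipelines/test_base.py below:

    from modelscope.utils.registry import Registry

    DUMMY = Registry('dummy')  # hypothetical registry for illustration

    @DUMMY.register_module(group_key='dummy-task', module_name='my-module')
    class MyModule:
        pass

    assert MyModule.group_key == 'dummy-task'  # attached during registration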


+ 20
- 0
modelscope/utils/test_utils.py

@@ -0,0 +1,20 @@
#!/usr/bin/env python
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'


def test_level():
    global TEST_LEVEL
    if TEST_LEVEL_STR in os.environ:
        TEST_LEVEL = int(os.environ[TEST_LEVEL_STR])

    return TEST_LEVEL


def set_test_level(level: int):
    global TEST_LEVEL
    TEST_LEVEL = level
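This helper backs the @unittest.skipUnless decorators added throughout the test diffs below: the TEST_LEVEL environment variable (or set_test_level) decides how much of the suite runs, with 2 as the permissive default. A minimal sketch of the gating pattern:

    import os
    import unittest

    from modelscope.utils.test_utils import test_level

    # Equivalent to launching the suite with: TEST_LEVEL=1 python tests/run.py
    os.environ['TEST_LEVEL'] = '1'


    class DemoTest(unittest.TestCase):

        @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
        def test_expensive(self):
            pass  # executed only when TEST_LEVEL >= 2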

+ 1
- 0
requirements/docs.txt

@@ -1,6 +1,7 @@
 docutils==0.16.0
 recommonmark
 sphinx==4.0.2
+sphinx-book-theme
 sphinx-copybutton
 sphinx_markdown_tables
 sphinx_rtd_theme==0.5.2

+ 3
- 2
requirements/runtime.txt

@@ -1,12 +1,13 @@
 addict
 datasets
 easydict
-https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
+https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.2.dev0-py3-none-any.whl
 numpy
 opencv-python-headless
-Pillow
+Pillow>=6.2.0
 pyyaml
 requests
+scipy
 tokenizers<=0.10.3
 transformers<=4.16.2
 yapf

+ 2
- 1
setup.cfg

@@ -11,6 +11,7 @@ default_section = THIRDPARTY
 BASED_ON_STYLE = pep8
 BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
 SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
+SPLIT_BEFORE_ARITHMETIC_OPERATOR = true

 [codespell]
 skip = *.ipynb
@@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
 [flake8]
 select = B,C,E,F,P,T4,W,B9
 max-line-length = 120
-ignore = F401,F821
+ignore = F401,F821,W503
 exclude = docs/src,*.pyi,.git
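The two changes are paired: with SPLIT_BEFORE_ARITHMETIC_OPERATOR enabled, yapf prefers breaking long lines before binary operators, which is exactly what flake8's W503 warning flags, so W503 is added to the ignore list to keep the formatter and the linter from contradicting each other.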

+ 8
- 11
tests/pipelines/test_base.py

@@ -35,9 +35,10 @@ class CustomPipelineTest(unittest.TestCase):
             CustomPipeline1()

     def test_custom(self):
+        dummy_task = 'dummy-task'

         @PIPELINES.register_module(
-            group_key=Tasks.image_tagging, module_name='custom-image')
+            group_key=dummy_task, module_name='custom-image')
         class CustomImagePipeline(Pipeline):

             def __init__(self,
@@ -67,32 +68,28 @@ class CustomPipelineTest(unittest.TestCase):
                 outputs['filename'] = inputs['url']
                 img = inputs['img']
                 new_image = img.resize((img.width // 2, img.height // 2))
-                outputs['resize_image'] = np.array(new_image)
-                outputs['dummy_result'] = 'dummy_result'
+                outputs['output_png'] = np.array(new_image)
                 return outputs

             def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
                 return inputs

         self.assertTrue('custom-image' in PIPELINES.modules[default_group])
-        add_default_pipeline_info(Tasks.image_tagging, 'custom-image')
+        add_default_pipeline_info(dummy_task, 'custom-image', overwrite=True)
         pipe = pipeline(pipeline_name='custom-image')
-        pipe2 = pipeline(Tasks.image_tagging)
+        pipe2 = pipeline(dummy_task)
         self.assertTrue(type(pipe) is type(pipe2))

-        img_url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.' \
-            'aliyuncs.com/data/test/images/image1.jpg'
+        img_url = 'data/test/images/image1.jpg'
         output = pipe(img_url)
         self.assertEqual(output['filename'], img_url)
-        self.assertEqual(output['resize_image'].shape, (318, 512, 3))
-        self.assertEqual(output['dummy_result'], 'dummy_result')
+        self.assertEqual(output['output_png'].shape, (318, 512, 3))

         outputs = pipe([img_url for i in range(4)])
         self.assertEqual(len(outputs), 4)
         for out in outputs:
             self.assertEqual(out['filename'], img_url)
-            self.assertEqual(out['resize_image'].shape, (318, 512, 3))
-            self.assertEqual(out['dummy_result'], 'dummy_result')
+            self.assertEqual(out['output_png'].shape, (318, 512, 3))


 if __name__ == '__main__':


+ 3
- 4
tests/pipelines/test_image_captioning.py

@@ -7,11 +7,12 @@ import unittest
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class ImageCaptionTest(unittest.TestCase):

-    @unittest.skip('skip long test')
+    @unittest.skip('skip before model is restored in model hub')
     def test_run(self):
         model = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_large_best_clean.pt'

@@ -26,9 +27,7 @@ class ImageCaptionTest(unittest.TestCase):
         img_captioning = pipeline(
             Tasks.image_captioning, model=ofile.name, bpe_dir=bpe_dir)

-        result = img_captioning(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_captioning('data/test/images/image_matting.png')
         print(result['caption'])






+ 11
- 15
tests/pipelines/test_image_matting.py

@@ -9,14 +9,15 @@ import cv2
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
 from modelscope.pydatasets import PyDataset
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level


 class ImageMattingTest(unittest.TestCase):

     def setUp(self) -> None:
-        self.model_id = 'damo/cv_unet_image-matting_damo'
+        self.model_id = 'damo/cv_unet_image-matting'
         # switch to False if downloading everytime is not desired
         purge_cache = True
         if purge_cache:
@@ -28,20 +29,17 @@ class ImageMattingTest(unittest.TestCase):
         model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs' \
                      '.com/data/test/maas/image_matting/matting_person.pb'
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model_file = osp.join(tmp_dir, 'matting_person.pb')
+            model_file = osp.join(tmp_dir, ModelFile.TF_GRAPH_FILE)
             with open(model_file, 'wb') as ofile:
                 ofile.write(File.read(model_path))
             img_matting = pipeline(Tasks.image_matting, model=tmp_dir)

-            result = img_matting(
-                'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-            )
+            result = img_matting('data/test/images/image_matting.png')
             cv2.imwrite('result.png', result['output_png'])

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_dataset(self):
-        input_location = [
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        ]
+        input_location = ['data/test/images/image_matting.png']
         # alternatively:
         # input_location = '/dir/to/images'

@@ -52,21 +50,19 @@ class ImageMattingTest(unittest.TestCase):
         cv2.imwrite('result.png', next(result)['output_png'])
         print(f'Output written to {osp.abspath("result.png")}')

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)

-        result = img_matting(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_matting('data/test/images/image_matting.png')
         cv2.imwrite('result.png', result['output_png'])
         print(f'Output written to {osp.abspath("result.png")}')

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_matting = pipeline(Tasks.image_matting)

-        result = img_matting(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_matting('data/test/images/image_matting.png')
         cv2.imwrite('result.png', result['output_png'])
         print(f'Output written to {osp.abspath("result.png")}')




+ 3
- 0
tests/pipelines/test_person_image_cartoon.py

@@ -8,6 +8,7 @@ import cv2
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class ImageCartoonTest(unittest.TestCase):
@@ -36,10 +37,12 @@ class ImageCartoonTest(unittest.TestCase):
         img_cartoon = pipeline(Tasks.image_generation, model=model_dir)
         self.pipeline_inference(img_cartoon, self.test_image)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub(self):
         img_cartoon = pipeline(Tasks.image_generation, model=self.model_id)
         self.pipeline_inference(img_cartoon, self.test_image)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_cartoon = pipeline(Tasks.image_generation)
         self.pipeline_inference(img_cartoon, self.test_image)


+ 67
- 0
tests/pipelines/test_sentence_similarity.py

@@ -0,0 +1,67 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import shutil
import unittest

from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.pipelines import SentenceSimilarityPipeline, pipeline
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class SentenceSimilarityTest(unittest.TestCase):
    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
    sentence1 = '今天气温比昨天高么?'
    sentence2 = '今天湿度比昨天高么?'

    def setUp(self) -> None:
        # switch to False if downloading everytime is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run(self):
        cache_path = snapshot_download(self.model_id)
        tokenizer = SequenceClassificationPreprocessor(cache_path)
        model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer)
        pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer)
        pipeline2 = pipeline(
            Tasks.sentence_similarity, model=model, preprocessor=tokenizer)
        print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
              f'pipeline1: {pipeline1(input=(self.sentence1, self.sentence2))}')
        print()
        print(
            f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
            f'pipeline2: {pipeline2(input=(self.sentence1, self.sentence2))}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.sentence_similarity,
            model=model,
            preprocessor=tokenizer)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.sentence_similarity, model=self.model_id)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.sentence_similarity)
        print(pipeline_ins(input=(self.sentence1, self.sentence2)))


if __name__ == '__main__':
    unittest.main()

+ 56
- 0
tests/pipelines/test_speech_signal_process.py

@@ -0,0 +1,56 @@
import os.path
import shutil
import unittest

from modelscope.fileio import File
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir

NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav'
FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav'
NEAREND_MIC_FILE = 'nearend_mic.wav'
FAREND_SPEECH_FILE = 'farend_speech.wav'

AEC_LIB_URL = 'http://isv-data.oss-cn-hangzhou.aliyuncs.com/ics%2FMaaS%2FAEC%2Flib%2Flibmitaec_pyio.so' \
              '?Expires=1664085465&OSSAccessKeyId=LTAIxjQyZNde90zh&Signature=Y7gelmGEsQAJRK4yyHSYMrdWizk%3D'
AEC_LIB_FILE = 'libmitaec_pyio.so'


def download(remote_path, local_path):
    local_dir = os.path.dirname(local_path)
    if len(local_dir) > 0:
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
    with open(local_path, 'wb') as ofile:
        ofile.write(File.read(remote_path))


class SpeechSignalProcessTest(unittest.TestCase):

    def setUp(self) -> None:
        self.model_id = 'damo/speech_dfsmn_aec_psm_16k'
        # switch to False if downloading everytime is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)
        # A temporary hack to provide the C++ lib. Download it first.
        download(AEC_LIB_URL, AEC_LIB_FILE)

    def test_run(self):
        download(NEAREND_MIC_URL, NEAREND_MIC_FILE)
        download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE)
        input = {
            'nearend_mic': NEAREND_MIC_FILE,
            'farend_speech': FAREND_SPEECH_FILE
        }
        aec = pipeline(
            Tasks.speech_signal_process,
            model=self.model_id,
            pipeline_name='speech_dfsmn_aec_psm_16k')
        aec(input, output_path='output.wav')


if __name__ == '__main__':
    unittest.main()

+ 6
- 0
tests/pipelines/test_text_classification.py

@@ -12,6 +12,7 @@ from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level


 class SequenceClassificationTest(unittest.TestCase):
@@ -43,6 +44,7 @@ class SequenceClassificationTest(unittest.TestCase):
             break
         print(r)

+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
         model_url = 'https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com' \
             '/release/easynlp_modelzoo/alibaba-pai/bert-base-sst2.zip'
@@ -67,6 +69,7 @@ class SequenceClassificationTest(unittest.TestCase):
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         print(pipeline2('Hello world!'))

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         preprocessor = SequenceClassificationPreprocessor(
@@ -77,6 +80,7 @@ class SequenceClassificationTest(unittest.TestCase):
             preprocessor=preprocessor)
         self.predict(pipeline_ins)

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
@@ -85,6 +89,7 @@ class SequenceClassificationTest(unittest.TestCase):
             'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
         self.printDataset(result)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
@@ -92,6 +97,7 @@ class SequenceClassificationTest(unittest.TestCase):
             'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
         self.printDataset(result)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_dataset(self):
         model = Model.from_pretrained(self.model_id)
         preprocessor = SequenceClassificationPreprocessor(


+ 7
- 3
tests/pipelines/test_text_generation.py

@@ -4,10 +4,11 @@ import unittest
 from maas_hub.snapshot_download import snapshot_download

 from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.models.nlp import PalmForTextGeneration
 from modelscope.pipelines import TextGenerationPipeline, pipeline
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class TextGenerationTest(unittest.TestCase):
@@ -15,12 +16,12 @@ class TextGenerationTest(unittest.TestCase):
     input1 = "今日天气类型='晴'&温度变化趋势='大幅上升'&最低气温='28℃'&最高气温='31℃'&体感='湿热'"
     input2 = "今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'"

-    @unittest.skip('skip temporarily to save test time')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
         cache_path = snapshot_download(self.model_id)
         preprocessor = TextGenerationPreprocessor(
             cache_path, first_sequence='sentence', second_sequence=None)
-        model = PalmForTextGenerationModel(
+        model = PalmForTextGeneration(
             cache_path, tokenizer=preprocessor.tokenizer)
         pipeline1 = TextGenerationPipeline(model, preprocessor)
         pipeline2 = pipeline(
@@ -29,6 +30,7 @@ class TextGenerationTest(unittest.TestCase):
         print()
         print(f'input: {self.input2}\npipeline2: {pipeline2(self.input2)}')

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         preprocessor = TextGenerationPreprocessor(
@@ -37,11 +39,13 @@ class TextGenerationTest(unittest.TestCase):
             task=Tasks.text_generation, model=model, preprocessor=preprocessor)
         print(pipeline_ins(self.input1))

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.text_generation, model=self.model_id)
         print(pipeline_ins(self.input2))

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.text_generation)
         print(pipeline_ins(self.input2))


+ 62
- 0
tests/pipelines/test_word_segmentation.py

@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import shutil
import unittest

from maas_hub.snapshot_download import snapshot_download

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.pipelines import WordSegmentationPipeline, pipeline
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.test_utils import test_level


class WordSegmentationTest(unittest.TestCase):
    model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
    sentence = '今天天气不错,适合出去游玩'

    def setUp(self) -> None:
        # switch to False if downloading everytime is not desired
        purge_cache = True
        if purge_cache:
            shutil.rmtree(
                get_model_cache_dir(self.model_id), ignore_errors=True)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_by_direct_model_download(self):
        cache_path = snapshot_download(self.model_id)
        tokenizer = TokenClassifcationPreprocessor(cache_path)
        model = StructBertForTokenClassification(
            cache_path, tokenizer=tokenizer)
        pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
        pipeline2 = pipeline(
            Tasks.word_segmentation, model=model, preprocessor=tokenizer)
        print(f'sentence: {self.sentence}\n'
              f'pipeline1: {pipeline1(input=self.sentence)}')
        print()
        print(f'pipeline2: {pipeline2(input=self.sentence)}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_from_modelhub(self):
        model = Model.from_pretrained(self.model_id)
        tokenizer = TokenClassifcationPreprocessor(model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_name(self):
        pipeline_ins = pipeline(
            task=Tasks.word_segmentation, model=self.model_id)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_default_model(self):
        pipeline_ins = pipeline(task=Tasks.word_segmentation)
        print(pipeline_ins(input=self.sentence))


if __name__ == '__main__':
    unittest.main()

+ 20
- 0
tests/preprocessors/test_image.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import unittest

import PIL

from modelscope.preprocessors import load_image
from modelscope.utils.logger import get_logger


class ImagePreprocessorTest(unittest.TestCase):

    def test_load(self):
        img = load_image('data/test/images/image_matting.png')
        self.assertTrue(isinstance(img, PIL.Image.Image))
        self.assertEqual(img.size, (948, 533))


if __name__ == '__main__':
    unittest.main()

+ 9
- 0
tests/run.py

@@ -7,6 +7,11 @@ import sys
 import unittest
 from fnmatch import fnmatch

+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import set_test_level, test_level
+
+logger = get_logger()
+

 def gather_test_cases(test_dir, pattern, list_tests):
     case_list = []
@@ -49,5 +54,9 @@ if __name__ == '__main__':
         '--pattern', default='test_*.py', help='test file pattern')
     parser.add_argument(
         '--test_dir', default='tests', help='directory to be tested')
+    parser.add_argument(
+        '--level', type=int, default=0, help='2 -- all, 1 -- p1, 0 -- p0')
     args = parser.parse_args()
+    set_test_level(args.level)
+    logger.info(f'TEST LEVEL: {test_level()}')
     main(args)
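With this hook, the level is chosen per run, e.g. python tests/run.py --level 2 to execute everything including the slow model-download cases, while the default of 0 keeps a run down to the p0 cases. (The type=int on --level is a small fix: without it, argparse would hand set_test_level a string and the test_level() >= N comparisons in the decorators would fail.)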

+ 5
- 8
tests/utils/test_config.py

@@ -1,11 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import argparse
-import os.path as osp
 import tempfile
 import unittest
-from pathlib import Path

-from modelscope.fileio import dump, load
 from modelscope.utils.config import Config

 obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
@@ -14,25 +11,25 @@ obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
 class ConfigTest(unittest.TestCase):

     def test_json(self):
-        config_file = 'configs/examples/config.json'
+        config_file = 'configs/examples/configuration.json'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_yaml(self):
-        config_file = 'configs/examples/config.yaml'
+        config_file = 'configs/examples/configuration.yaml'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_py(self):
-        config_file = 'configs/examples/config.py'
+        config_file = 'configs/examples/configuration.py'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_dump(self):
-        config_file = 'configs/examples/config.py'
+        config_file = 'configs/examples/configuration.py'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])
@@ -53,7 +50,7 @@ class ConfigTest(unittest.TestCase):
         self.assertEqual(yaml_str, infile.read())

     def test_to_dict(self):
-        config_file = 'configs/examples/config.json'
+        config_file = 'configs/examples/configuration.json'
         cfg = Config.from_file(config_file)
         d = cfg.to_dict()
         print(d)

