From e3bffedb87e25d35c061b231f794d96a314149c5 Mon Sep 17 00:00:00 2001
From: "bin.xue"
Date: Thu, 28 Jul 2022 22:59:57 +0800
Subject: [PATCH] [to #42322933] aec pipeline: switch the C++ library
 dependency to MinDAEC
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9563105
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* use the MinDAEC package instead of loading the C++ library via ctypes.cdll

* feat: the ANS pipeline can accept bytes as input; adjust the processing
  order to reduce the amount of computation
---
 modelscope/preprocessors/__init__.py          |  1 +
 modelscope/preprocessors/audio.py             | 78 ++++++-------
 requirements/audio.txt                        |  1 +
 tests/pipelines/test_speech_signal_process.py |  3 -
 4 files changed, 27 insertions(+), 56 deletions(-)

diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index fd4dd4c5..1aba9107 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -32,6 +32,7 @@ else:
         'base': ['Preprocessor'],
         'builder': ['PREPROCESSORS', 'build_preprocessor'],
         'common': ['Compose'],
+        'audio': ['LinearAECAndFbank'],
         'asr': ['WavToScp'],
         'video': ['ReadVideoData'],
         'image': [
diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py
index bb10c89c..cdee968b 100644
--- a/modelscope/preprocessors/audio.py
+++ b/modelscope/preprocessors/audio.py
@@ -1,58 +1,15 @@
-import ctypes
+import io
 import os
 from typing import Any, Dict
 
 import numpy as np
 import scipy.io.wavfile as wav
 import torch
-from numpy.ctypeslib import ndpointer
 
 from modelscope.utils.constant import Fields
 from .builder import PREPROCESSORS
 
 
-def load_wav(path):
-    samp_rate, data = wav.read(path)
-    return np.float32(data), samp_rate
-
-
-def load_library(libaec):
-    libaec_in_cwd = os.path.join('.', libaec)
-    if os.path.exists(libaec_in_cwd):
-        libaec = libaec_in_cwd
-    mitaec = ctypes.cdll.LoadLibrary(libaec)
-    fe_process = mitaec.fe_process_inst
-    fe_process.argtypes = [
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
-    ]
-    return fe_process
-
-
-def do_linear_aec(fe_process, mic, ref, int16range=True):
-    mic = np.float32(mic)
-    ref = np.float32(ref)
-    if len(mic) > len(ref):
-        mic = mic[:len(ref)]
-    out_mic = np.zeros_like(mic)
-    out_linear = np.zeros_like(mic)
-    out_echo = np.zeros_like(mic)
-    out_ref = np.zeros_like(mic)
-    if int16range:
-        mic /= 32768
-        ref /= 32768
-    fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
-    # out_ref not in use here
-    if int16range:
-        out_mic *= 32768
-        out_linear *= 32768
-        out_echo *= 32768
-    return out_mic, out_ref, out_linear, out_echo
-
-
 def load_kaldi_feature_transform(filename):
     fp = open(filename, 'r')
     all_str = fp.read()
@@ -162,11 +119,12 @@ class LinearAECAndFbank:
     SAMPLE_RATE = 16000
 
     def __init__(self, io_config):
+        import MinDAEC
         self.trunc_length = 7200 * self.SAMPLE_RATE
         self.linear_aec_delay = io_config['linear_aec_delay']
         self.feature = Feature(io_config['fbank_config'],
                                io_config['feat_type'], io_config['mvn'])
-        self.mitaec = load_library(io_config['mitaec_library'])
+        self.mitaec = MinDAEC.load()
        self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
@@ -175,18 +133,15 @@ class LinearAECAndFbank:
         :return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature"
         """
         # read files
-        nearend_mic, fs = load_wav(data['nearend_mic'])
-        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
-        farend_speech, fs = load_wav(data['farend_speech'])
-        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
+        nearend_mic, fs = self.load_wav(data['nearend_mic'])
+        farend_speech, fs = self.load_wav(data['farend_speech'])
         if 'nearend_speech' in data:
-            nearend_speech, fs = load_wav(data['nearend_speech'])
-            assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
+            nearend_speech, fs = self.load_wav(data['nearend_speech'])
         else:
             nearend_speech = np.zeros_like(nearend_mic)
 
-        out_mic, out_ref, out_linear, out_echo = do_linear_aec(
-            self.mitaec, nearend_mic, farend_speech)
+        out_mic, out_ref, out_linear, out_echo = self.mitaec.do_linear_aec(
+            nearend_mic, farend_speech)
         # fix 20ms linear aec delay by delaying the target speech
         extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
         nearend_speech = np.concatenate([extra_zeros, nearend_speech])
@@ -229,3 +184,20 @@ class LinearAECAndFbank:
             base = out_linear
         out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
         return out_data
+
+    @staticmethod
+    def load_wav(inputs):
+        import librosa
+        if isinstance(inputs, bytes):
+            inputs = io.BytesIO(inputs)
+        elif isinstance(inputs, str):
+            pass
+        else:
+            raise TypeError(f'Unsupported input type: {type(inputs)}.')
+        sample_rate, data = wav.read(inputs)
+        if len(data.shape) > 1:
+            raise ValueError('modelscope error: The audio must be mono.')
+        if sample_rate != LinearAECAndFbank.SAMPLE_RATE:
+            data = librosa.resample(np.float32(data), orig_sr=sample_rate,
+                                    target_sr=LinearAECAndFbank.SAMPLE_RATE)
+        return data.astype(np.float32), LinearAECAndFbank.SAMPLE_RATE
diff --git a/requirements/audio.txt b/requirements/audio.txt
index b1d9e2c3..3bd0d8af 100644
--- a/requirements/audio.txt
+++ b/requirements/audio.txt
@@ -8,6 +8,7 @@ kwsbp
 librosa
 lxml
 matplotlib
+MinDAEC
 nara_wpe
 nltk
 # numpy requirements should be declared with tensorflow 1.15 but not here
diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py
index f911c0eb..edc7f34d 100644
--- a/tests/pipelines/test_speech_signal_process.py
+++ b/tests/pipelines/test_speech_signal_process.py
@@ -13,9 +13,6 @@ FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/
 NEAREND_MIC_FILE = 'nearend_mic.wav'
 FAREND_SPEECH_FILE = 'farend_speech.wav'
 
-AEC_LIB_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/dependencies/ics_MaaS_AEC_lib_libmitaec_pyio.so'
-AEC_LIB_FILE = 'libmitaec_pyio.so'
-
 NOISE_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ANS/sample_audio/speech_with_noise.wav'
 NOISE_SPEECH_FILE = 'speech_with_noise.wav'
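
Below is a minimal usage sketch, not part of the patch itself, of the two
pieces this change introduces: the LinearAECAndFbank.load_wav helper, which
accepts either a file path or raw wav bytes and returns mono float32 samples
at 16 kHz, and the MinDAEC-based linear AEC call that replaces the old ctypes
library. The wav file names are placeholders, and the MinDAEC package (added
to requirements/audio.txt above) is assumed to be installed.

    import numpy as np

    import MinDAEC
    from modelscope.preprocessors.audio import LinearAECAndFbank

    # Path input: decoded with scipy.io.wavfile, resampled to 16 kHz if needed.
    nearend_mic, sr = LinearAECAndFbank.load_wav('nearend_mic.wav')

    # Bytes input: the same helper accepts a wav file already read into memory.
    with open('farend_speech.wav', 'rb') as f:
        farend_speech, sr = LinearAECAndFbank.load_wav(f.read())

    assert sr == LinearAECAndFbank.SAMPLE_RATE
    assert nearend_mic.dtype == np.float32

    # Linear AEC through the MinDAEC package, mirroring the preprocessor above.
    mitaec = MinDAEC.load()
    out_mic, out_ref, out_linear, out_echo = mitaec.do_linear_aec(
        nearend_mic, farend_speech)

Because load_wav resamples and converts to float32 internally, the per-call
sample-rate assertions in __call__ are no longer needed, which is why they are
removed in the hunk above.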