From e3bffedb87e25d35c061b231f794d96a314149c5 Mon Sep 17 00:00:00 2001
From: "bin.xue"
Date: Thu, 28 Jul 2022 22:59:57 +0800
Subject: [PATCH] [to #42322933] aec pipeline: switch the C++ library
 dependency to MinDAEC
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9563105
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* use the MinDAEC package instead of loading the C++ library via ctypes.cdll

* feat: the ANS pipeline can accept bytes as input; adjust the processing
  order to reduce the amount of computation
---
 modelscope/preprocessors/__init__.py          |  1 +
 modelscope/preprocessors/audio.py             | 78 ++++++-------
 requirements/audio.txt                        |  1 +
 tests/pipelines/test_speech_signal_process.py |  3 -
 4 files changed, 27 insertions(+), 56 deletions(-)

diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index fd4dd4c5..1aba9107 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -32,6 +32,7 @@ else:
         'base': ['Preprocessor'],
         'builder': ['PREPROCESSORS', 'build_preprocessor'],
         'common': ['Compose'],
+        'audio': ['LinearAECAndFbank'],
         'asr': ['WavToScp'],
         'video': ['ReadVideoData'],
         'image': [
diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py
index bb10c89c..cdee968b 100644
--- a/modelscope/preprocessors/audio.py
+++ b/modelscope/preprocessors/audio.py
@@ -1,58 +1,15 @@
-import ctypes
+import io
 import os
 from typing import Any, Dict
 
 import numpy as np
 import scipy.io.wavfile as wav
 import torch
-from numpy.ctypeslib import ndpointer
 
 from modelscope.utils.constant import Fields
 from .builder import PREPROCESSORS
 
 
-def load_wav(path):
-    samp_rate, data = wav.read(path)
-    return np.float32(data), samp_rate
-
-
-def load_library(libaec):
-    libaec_in_cwd = os.path.join('.', libaec)
-    if os.path.exists(libaec_in_cwd):
-        libaec = libaec_in_cwd
-    mitaec = ctypes.cdll.LoadLibrary(libaec)
-    fe_process = mitaec.fe_process_inst
-    fe_process.argtypes = [
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
-        ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
-    ]
-    return fe_process
-
-
-def do_linear_aec(fe_process, mic, ref, int16range=True):
-    mic = np.float32(mic)
-    ref = np.float32(ref)
-    if len(mic) > len(ref):
-        mic = mic[:len(ref)]
-    out_mic = np.zeros_like(mic)
-    out_linear = np.zeros_like(mic)
-    out_echo = np.zeros_like(mic)
-    out_ref = np.zeros_like(mic)
-    if int16range:
-        mic /= 32768
-        ref /= 32768
-    fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
-    # out_ref not in use here
-    if int16range:
-        out_mic *= 32768
-        out_linear *= 32768
-        out_echo *= 32768
-    return out_mic, out_ref, out_linear, out_echo
-
-
 def load_kaldi_feature_transform(filename):
     fp = open(filename, 'r')
     all_str = fp.read()
@@ -162,11 +119,12 @@ class LinearAECAndFbank:
     SAMPLE_RATE = 16000
 
     def __init__(self, io_config):
+        import MinDAEC
         self.trunc_length = 7200 * self.SAMPLE_RATE
         self.linear_aec_delay = io_config['linear_aec_delay']
         self.feature = Feature(io_config['fbank_config'],
                                io_config['feat_type'], io_config['mvn'])
-        self.mitaec = load_library(io_config['mitaec_library'])
+        self.mitaec = MinDAEC.load()
        self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
@@ -175,18 +133,15 @@ class LinearAECAndFbank:
         :return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature"
         """
         # read files
-        nearend_mic, fs = load_wav(data['nearend_mic'])
-        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
-        farend_speech, fs = load_wav(data['farend_speech'])
-        assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
+        nearend_mic, fs = self.load_wav(data['nearend_mic'])
+        farend_speech, fs = self.load_wav(data['farend_speech'])
         if 'nearend_speech' in data:
-            nearend_speech, fs = load_wav(data['nearend_speech'])
-            assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
+            nearend_speech, fs = self.load_wav(data['nearend_speech'])
         else:
             nearend_speech = np.zeros_like(nearend_mic)
 
-        out_mic, out_ref, out_linear, out_echo = do_linear_aec(
-            self.mitaec, nearend_mic, farend_speech)
+        out_mic, out_ref, out_linear, out_echo = self.mitaec.do_linear_aec(
+            nearend_mic, farend_speech)
         # fix 20ms linear aec delay by delaying the target speech
         extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
         nearend_speech = np.concatenate([extra_zeros, nearend_speech])
@@ -229,3 +184,20 @@ class LinearAECAndFbank:
             base = out_linear
         out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
         return out_data
+
+    @staticmethod
+    def load_wav(inputs):
+        import librosa
+        if isinstance(inputs, bytes):
+            inputs = io.BytesIO(inputs)
+        elif isinstance(inputs, str):
+            pass
+        else:
+            raise TypeError(f'Unsupported input type: {type(inputs)}.')
+        sample_rate, data = wav.read(inputs)
+        if len(data.shape) > 1:
+            raise ValueError('modelscope error: The audio must be mono.')
+        if sample_rate != LinearAECAndFbank.SAMPLE_RATE:
+            data = librosa.resample(np.float32(data), orig_sr=sample_rate,
+                                    target_sr=LinearAECAndFbank.SAMPLE_RATE)
+        return data.astype(np.float32), LinearAECAndFbank.SAMPLE_RATE
diff --git a/requirements/audio.txt b/requirements/audio.txt
index b1d9e2c3..3bd0d8af 100644
--- a/requirements/audio.txt
+++ b/requirements/audio.txt
@@ -8,6 +8,7 @@ kwsbp
 librosa
 lxml
 matplotlib
+MinDAEC
 nara_wpe
 nltk
 # numpy requirements should be declared with tensorflow 1.15 but not here
diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py
index f911c0eb..edc7f34d 100644
--- a/tests/pipelines/test_speech_signal_process.py
+++ b/tests/pipelines/test_speech_signal_process.py
@@ -13,9 +13,6 @@ FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/
 NEAREND_MIC_FILE = 'nearend_mic.wav'
 FAREND_SPEECH_FILE = 'farend_speech.wav'
 
-AEC_LIB_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/dependencies/ics_MaaS_AEC_lib_libmitaec_pyio.so'
-AEC_LIB_FILE = 'libmitaec_pyio.so'
-
 NOISE_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ANS/sample_audio/speech_with_noise.wav'
 NOISE_SPEECH_FILE = 'speech_with_noise.wav'
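
Below is a minimal usage sketch, not part of the patch itself, of the two
pieces this change introduces: the LinearAECAndFbank.load_wav helper, which
accepts either a file path or raw wav bytes and returns mono float32 samples
at 16 kHz, and the MinDAEC-based linear AEC call that replaces the old ctypes
library. The wav file names are placeholders, and the MinDAEC package (added
to requirements/audio.txt above) is assumed to be installed.

    import numpy as np

    import MinDAEC
    from modelscope.preprocessors.audio import LinearAECAndFbank

    # Path input: decoded with scipy.io.wavfile, resampled to 16 kHz if needed.
    nearend_mic, sr = LinearAECAndFbank.load_wav('nearend_mic.wav')

    # Bytes input: the same helper accepts a wav file already read into memory.
    with open('farend_speech.wav', 'rb') as f:
        farend_speech, sr = LinearAECAndFbank.load_wav(f.read())

    assert sr == LinearAECAndFbank.SAMPLE_RATE
    assert nearend_mic.dtype == np.float32

    # Linear AEC through the MinDAEC package, mirroring the preprocessor above.
    mitaec = MinDAEC.load()
    out_mic, out_ref, out_linear, out_echo = mitaec.do_linear_aec(
        nearend_mic, farend_speech)

Because load_wav resamples and converts to float32 internally, the per-call
sample-rate assertions in __call__ are no longer needed, which is why they are
removed in the hunk above.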