diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py
index c788e783..db23b06f 100644
--- a/modelscope/pipelines/audio/asr_inference_pipeline.py
+++ b/modelscope/pipelines/audio/asr_inference_pipeline.py
@@ -110,6 +110,7 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
             'sampled_lengths': 'seq2seq/sampled_lengths',
             'lang': 'zh-cn',
             'code_base': inputs['code_base'],
+            'mode': inputs['mode'],
             'fs': {
                 'audio_fs': inputs['audio_fs'],
                 'model_fs': 16000
@@ -233,15 +234,16 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
     def run_inference(self, cmd):
         asr_result = []
         if self.framework == Frameworks.torch and cmd['code_base'] == 'funasr':
-            from funasr.bin import asr_inference_paraformer_modelscope
+            if cmd['mode'] == 'asr':
+                from funasr.bin import asr_inference_modelscope as asr_inference
+            else:
+                from funasr.bin import asr_inference_paraformer_modelscope as asr_inference
 
-            if hasattr(asr_inference_paraformer_modelscope, 'set_parameters'):
-                asr_inference_paraformer_modelscope.set_parameters(
-                    sample_rate=cmd['fs'])
-                asr_inference_paraformer_modelscope.set_parameters(
-                    language=cmd['lang'])
+            if hasattr(asr_inference, 'set_parameters'):
+                asr_inference.set_parameters(sample_rate=cmd['fs'])
+                asr_inference.set_parameters(language=cmd['lang'])
 
-            asr_result = asr_inference_paraformer_modelscope.asr_inference(
+            asr_result = asr_inference.asr_inference(
                 batch_size=cmd['batch_size'],
                 maxlenratio=cmd['maxlenratio'],
                 minlenratio=cmd['minlenratio'],
diff --git a/modelscope/preprocessors/asr.py b/modelscope/preprocessors/asr.py
index 1537b137..a06c9134 100644
--- a/modelscope/preprocessors/asr.py
+++ b/modelscope/preprocessors/asr.py
@@ -103,6 +103,12 @@ class WavToScp(Preprocessor):
         else:
             code_base = None
         inputs['code_base'] = code_base
+        # decoding mode
+        if 'mode' in inputs['model_config']:
+            mode = inputs['model_config']['mode']
+        else:
+            mode = None
+        inputs['mode'] = mode
 
         if inputs['model_type'] == Frameworks.torch:
             assert inputs['model_config'].__contains__(
@@ -111,8 +117,6 @@
                 'am_model_config'), 'am_model_config does not exist'
             assert inputs['model_config'].__contains__(
                 'asr_model_config'), 'asr_model_config does not exist'
-            assert inputs['model_config'].__contains__(
-                'asr_model_wav_config'), 'asr_model_wav_config does not exist'
 
             am_model_config: str = os.path.join(
                 inputs['model_workspace'],
@@ -127,9 +131,14 @@
             assert os.path.exists(
                 asr_model_config), 'asr_model_config does not exist'
 
-            asr_model_wav_config: str = os.path.join(
-                inputs['model_workspace'],
-                inputs['model_config']['asr_model_wav_config'])
+            if 'asr_model_wav_config' in inputs['model_config']:
+                asr_model_wav_config: str = os.path.join(
+                    inputs['model_workspace'],
+                    inputs['model_config']['asr_model_wav_config'])
+            else:
+                asr_model_wav_config: str = os.path.join(
+                    inputs['model_workspace'],
+                    inputs['model_config']['asr_model_config'])
             assert os.path.exists(
                 asr_model_wav_config), 'asr_model_wav_config does not exist'
 
diff --git a/requirements/audio.txt b/requirements/audio.txt
index bef3764b..44b8c6a0 100644
--- a/requirements/audio.txt
+++ b/requirements/audio.txt
@@ -1,6 +1,6 @@
 easyasr>=0.0.2
 espnet==202204
-funasr>=0.1.0
+funasr>=0.1.3
 h5py
 inflect
 keras
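Note (not part of the patch): the two behavioural changes above are the optional 'mode' field read from a model card's configuration and the asr_model_wav_config fallback. The sketch below mirrors that logic in isolation; resolve_decoding_config, select_asr_entry_point, and their arguments are illustrative names, not ModelScope API.

# Standalone sketch of the WavToScp and run_inference changes, assuming
# model_config is the parsed configuration.json of a model card. Names
# are hypothetical; only the logic mirrors the patch.
import os


def resolve_decoding_config(model_workspace: str, model_config: dict) -> dict:
    # 'mode' is optional; models published before this change simply lack
    # it, so the pipeline sees None and keeps the paraformer code path.
    mode = model_config.get('mode')
    # A model card may omit asr_model_wav_config, in which case the main
    # asr_model_config is reused for wav input.
    wav_config = model_config.get('asr_model_wav_config',
                                  model_config['asr_model_config'])
    return {
        'mode': mode,
        'asr_model_config': os.path.join(model_workspace,
                                         model_config['asr_model_config']),
        'asr_model_wav_config': os.path.join(model_workspace, wav_config),
    }


def select_asr_entry_point(mode):
    # Mirrors run_inference: mode == 'asr' (presumably the new conformer
    # recipes) selects funasr's generic entry point; anything else,
    # including mode=None, stays on the paraformer entry point.
    if mode == 'asr':
        from funasr.bin import asr_inference_modelscope as asr_inference
    else:
        from funasr.bin import asr_inference_paraformer_modelscope as asr_inference
    return asr_inference
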
diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py
index b6532868..57e0ea5d 100644
--- a/tests/pipelines/test_automatic_speech_recognition.py
+++ b/tests/pipelines/test_automatic_speech_recognition.py
@@ -217,6 +217,41 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase,
             'damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline',
             'wav_path': 'data/test/audios/asr_example_id.wav'
         },
+        {
+            'model_id':
+            'damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
     ]
 
     def setUp(self) -> None:
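For reference, a minimal end-to-end sketch (again not part of the patch) exercising one of the newly listed models through the public pipeline API; the model id and wav path are copied verbatim from the test table above, and audio_in is assumed to be the input keyword the ASR pipeline accepts.

# Run one of the newly added models through the ModelScope pipeline API.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')

rec_result = inference_pipeline(audio_in='data/test/audios/asr_example_id.wav')
print(rec_result)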