From 5ac448f5c7ef1c781dd1772fe31484fc746e6a77 Mon Sep 17 00:00:00 2001
From: "shichen.fsc"
Date: Wed, 27 Jul 2022 14:49:24 +0800
Subject: [PATCH] [to #42322933] simplify asr inference code, and remove
 disk-write behavior

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9410174
---
 .../generic_automatic_speech_recognition.py   |    4 +-
 modelscope/outputs.py                         |    8 +-
 modelscope/pipelines/audio/__init__.py        |    2 +-
 modelscope/pipelines/audio/asr/__init__.py    |    0
 .../audio/asr/asr_engine/__init__.py          |    0
 .../audio/asr/asr_engine/asr_env_checking.py  |   21 -
 .../asr_inference_paraformer_espnet.py        |  690 --------
 .../audio/asr/asr_engine/common/__init__.py   |    0
 .../audio/asr/asr_engine/common/asr_utils.py  |  193 ---
 .../audio/asr/asr_engine/espnet/__init__.py   |    0
 .../asr/asr_engine/espnet/asr/__init__.py     |    0
 .../asr_engine/espnet/asr/decoder/__init__.py |    0
 .../espnet/asr/decoder/transformer_decoder.py |  757 ---------
 .../asr_engine/espnet/asr/encoder/__init__.py |    0
 .../espnet/asr/encoder/conformer_encoder.py   |  710 --------
 .../espnet/asr/encoder/sanm_encoder.py        |  500 ------
 .../asr/asr_engine/espnet/asr/espnet_model.py | 1131 -------------
 .../espnet/asr/espnet_model_paraformer.py     | 1444 -----------------
 .../espnet/asr/frontend/__init__.py           |    0
 .../espnet/asr/frontend/wav_frontend.py       |  113 --
 .../espnet/asr/streaming_utilis/__init__.py   |    0
 .../asr/streaming_utilis/chunk_utilis.py      |  321 ----
 .../asr/asr_engine/espnet/nets/__init__.py    |    0
 .../espnet/nets/pytorch_backend/__init__.py   |    0
 .../pytorch_backend/cif_utils/__init__.py     |    0
 .../nets/pytorch_backend/cif_utils/cif.py     |  250 ---
 .../pytorch_backend/transformer/__init__.py   |    0
 .../pytorch_backend/transformer/attention.py  |  680 --------
 .../transformer/encoder_layer.py              |  239 ---
 .../asr/asr_engine/espnet/tasks/__init__.py   |    0
 .../audio/asr/asr_engine/espnet/tasks/asr.py  |  890 ----------
 .../audio/asr/asr_inference_pipeline.py       |  223 ---
 .../pipelines/audio/asr_inference_pipeline.py |  213 +++
 modelscope/preprocessors/asr.py               |  287 ++--
 requirements/audio.txt                        |    1 +
 .../test_automatic_speech_recognition.py      |  338 ++--
 36 files changed, 587 insertions(+), 8428 deletions(-)
 delete mode 100644 modelscope/pipelines/audio/asr/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/asr_env_checking.py
 delete mode 100755 modelscope/pipelines/audio/asr/asr_engine/asr_inference_paraformer_espnet.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/common/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/common/asr_utils.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/decoder/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/decoder/transformer_decoder.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/conformer_encoder.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/sanm_encoder.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/espnet_model.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/espnet_model_paraformer.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/frontend/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/frontend/wav_frontend.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/streaming_utilis/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/asr/streaming_utilis/chunk_utilis.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/nets/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/cif_utils/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/cif_utils/cif.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/attention.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/encoder_layer.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/tasks/__init__.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_engine/espnet/tasks/asr.py
 delete mode 100644 modelscope/pipelines/audio/asr/asr_inference_pipeline.py
 create mode 100644 modelscope/pipelines/audio/asr_inference_pipeline.py

diff --git a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py
index b057a8b7..5213fdd1 100644
--- a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py
+++ b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py
@@ -20,8 +20,10 @@ class GenericAutomaticSpeechRecognition(Model):
         Args:
             model_dir (str): the model path.
             am_model_name (str): the am model name from configuration.json
+            model_config (Dict[str, Any]): the detail config about model from configuration.json
         """
-
+        super().__init__(model_dir, am_model_name, model_config, *args,
+                         **kwargs)
         self.model_cfg = {
             # the recognition model dir path
             'model_workspace': model_dir,
diff --git a/modelscope/outputs.py b/modelscope/outputs.py
index a8948763..ed2d680d 100644
--- a/modelscope/outputs.py
+++ b/modelscope/outputs.py
@@ -312,5 +312,11 @@ TASK_OUTPUTS = {
     # {
     #     "text": "this is the text generated by a model."
     # }
-    Tasks.visual_question_answering: [OutputKeys.TEXT]
+    Tasks.visual_question_answering: [OutputKeys.TEXT],
+
+    # auto_speech_recognition result for a single sample
+    # {
+    #     "text": "每天都要快乐喔"
+    # }
+    Tasks.auto_speech_recognition: [OutputKeys.TEXT]
 }
diff --git a/modelscope/pipelines/audio/__init__.py b/modelscope/pipelines/audio/__init__.py
index 84a593b8..a5dd178a 100644
--- a/modelscope/pipelines/audio/__init__.py
+++ b/modelscope/pipelines/audio/__init__.py
@@ -3,7 +3,7 @@
 from modelscope.utils.error import TENSORFLOW_IMPORT_ERROR
 
 try:
-    from .asr.asr_inference_pipeline import AutomaticSpeechRecognitionPipeline
+    from .asr_inference_pipeline import AutomaticSpeechRecognitionPipeline
     from .kws_kwsbp_pipeline import *  # noqa F403
     from .linear_aec_pipeline import LinearAECPipeline
 except ModuleNotFoundError as e:
diff --git a/modelscope/pipelines/audio/asr/__init__.py b/modelscope/pipelines/audio/asr/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modelscope/pipelines/audio/asr/asr_engine/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modelscope/pipelines/audio/asr/asr_engine/asr_env_checking.py b/modelscope/pipelines/audio/asr/asr_engine/asr_env_checking.py
deleted file mode 100644
index 81c41737..00000000
--- a/modelscope/pipelines/audio/asr/asr_engine/asr_env_checking.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import ssl
-
-import nltk
-
-try:
-    _create_unverified_https_context = ssl._create_unverified_context
-except AttributeError:
-    pass
-else:
-    ssl._create_default_https_context = _create_unverified_https_context
-
-try:
-    nltk.data.find('taggers/averaged_perceptron_tagger')
-except LookupError:
-    nltk.download(
-        'averaged_perceptron_tagger', halt_on_error=False, raise_on_error=True)
-
-try:
-    nltk.data.find('corpora/cmudict')
-except LookupError:
-    nltk.download('cmudict', halt_on_error=False, raise_on_error=True)
diff --git a/modelscope/pipelines/audio/asr/asr_engine/asr_inference_paraformer_espnet.py b/modelscope/pipelines/audio/asr/asr_engine/asr_inference_paraformer_espnet.py
deleted file mode 100755
index befb7a01..00000000
--- a/modelscope/pipelines/audio/asr/asr_engine/asr_inference_paraformer_espnet.py
+++ /dev/null
@@ -1,690 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Part of the implementation is borrowed from espnet/espnet.
- -import argparse -import logging -import sys -import time -from pathlib import Path -from typing import Any, Optional, Sequence, Tuple, Union - -import numpy as np -import torch -from espnet2.asr.transducer.beam_search_transducer import BeamSearchTransducer -from espnet2.asr.transducer.beam_search_transducer import \ - ExtendedHypothesis as ExtTransHypothesis # noqa: H301 -from espnet2.asr.transducer.beam_search_transducer import \ - Hypothesis as TransHypothesis -from espnet2.fileio.datadir_writer import DatadirWriter -from espnet2.tasks.lm import LMTask -from espnet2.text.build_tokenizer import build_tokenizer -from espnet2.text.token_id_converter import TokenIDConverter -from espnet2.torch_utils.device_funcs import to_device -from espnet2.torch_utils.set_all_random_seed import set_all_random_seed -from espnet2.utils import config_argparse -from espnet2.utils.types import str2bool, str2triple_str, str_or_none -from espnet.nets.batch_beam_search import BatchBeamSearch -from espnet.nets.batch_beam_search_online_sim import BatchBeamSearchOnlineSim -from espnet.nets.beam_search import BeamSearch, Hypothesis -from espnet.nets.pytorch_backend.transformer.subsampling import \ - TooShortUttError -from espnet.nets.scorer_interface import BatchScorerInterface -from espnet.nets.scorers.ctc import CTCPrefixScorer -from espnet.nets.scorers.length_bonus import LengthBonus -from espnet.utils.cli_utils import get_commandline_args -from typeguard import check_argument_types - -from .espnet.asr.frontend.wav_frontend import WavFrontend -from .espnet.tasks.asr import ASRTaskNAR as ASRTask - - -class Speech2Text: - - def __init__(self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - transducer_conf: dict = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - ngram_scorer: str = 'full', - ngram_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, - device: str = 'cpu', - maxlenratio: float = 0.0, - minlenratio: float = 0.0, - batch_size: int = 1, - dtype: str = 'float32', - beam_size: int = 20, - ctc_weight: float = 0.5, - lm_weight: float = 1.0, - ngram_weight: float = 0.9, - penalty: float = 0.0, - nbest: int = 1, - streaming: bool = False, - frontend_conf: dict = None): - assert check_argument_types() - - # 1. Build ASR model - scorers = {} - asr_model, asr_train_args = ASRTask.build_model_from_file( - asr_train_config, asr_model_file, device) - if asr_model.frontend is None and frontend_conf is not None: - frontend = WavFrontend(**frontend_conf) - asr_model.frontend = frontend - asr_model.to(dtype=getattr(torch, dtype)).eval() - - decoder = asr_model.decoder - - ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) - token_list = asr_model.token_list - scorers.update( - decoder=decoder, - ctc=ctc, - length_bonus=LengthBonus(len(token_list)), - ) - - # 2. Build Language model - if lm_train_config is not None: - lm, lm_train_args = LMTask.build_model_from_file( - lm_train_config, lm_file, device) - scorers['lm'] = lm.lm - - # 3. Build ngram model - if ngram_file is not None: - if ngram_scorer == 'full': - from espnet.nets.scorers.ngram import NgramFullScorer - - ngram = NgramFullScorer(ngram_file, token_list) - else: - from espnet.nets.scorers.ngram import NgramPartScorer - - ngram = NgramPartScorer(ngram_file, token_list) - else: - ngram = None - scorers['ngram'] = ngram - - # 4. 
Build BeamSearch object - if asr_model.use_transducer_decoder: - beam_search_transducer = BeamSearchTransducer( - decoder=asr_model.decoder, - joint_network=asr_model.joint_network, - beam_size=beam_size, - lm=scorers['lm'] if 'lm' in scorers else None, - lm_weight=lm_weight, - **transducer_conf, - ) - beam_search = None - else: - beam_search_transducer = None - - weights = dict( - decoder=1.0 - ctc_weight, - ctc=ctc_weight, - lm=lm_weight, - ngram=ngram_weight, - length_bonus=penalty, - ) - beam_search = BeamSearch( - beam_size=beam_size, - weights=weights, - scorers=scorers, - sos=asr_model.sos, - eos=asr_model.eos, - vocab_size=len(token_list), - token_list=token_list, - pre_beam_score_key=None if ctc_weight == 1.0 else 'full', - ) - - # TODO(karita): make all scorers batchfied - if batch_size == 1: - non_batch = [ - k for k, v in beam_search.full_scorers.items() - if not isinstance(v, BatchScorerInterface) - ] - if len(non_batch) == 0: - if streaming: - beam_search.__class__ = BatchBeamSearchOnlineSim - beam_search.set_streaming_config(asr_train_config) - logging.info( - 'BatchBeamSearchOnlineSim implementation is selected.' - ) - else: - beam_search.__class__ = BatchBeamSearch - else: - logging.warning( - f'As non-batch scorers {non_batch} are found, ' - f'fall back to non-batch implementation.') - - beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() - for scorer in scorers.values(): - if isinstance(scorer, torch.nn.Module): - scorer.to( - device=device, dtype=getattr(torch, dtype)).eval() - - # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text - if token_type is None: - token_type = asr_train_args.token_type - if bpemodel is None: - bpemodel = asr_train_args.bpemodel - - if token_type is None: - tokenizer = None - elif token_type == 'bpe': - if bpemodel is not None: - tokenizer = build_tokenizer( - token_type=token_type, bpemodel=bpemodel) - else: - tokenizer = None - else: - tokenizer = build_tokenizer(token_type=token_type) - converter = TokenIDConverter(token_list=token_list) - - self.asr_model = asr_model - self.asr_train_args = asr_train_args - self.converter = converter - self.tokenizer = tokenizer - self.beam_search = beam_search - self.beam_search_transducer = beam_search_transducer - self.maxlenratio = maxlenratio - self.minlenratio = minlenratio - self.device = device - self.dtype = dtype - self.nbest = nbest - - @torch.no_grad() - def __call__(self, speech: Union[torch.Tensor, np.ndarray]): - """Inference - - Args: - data: Input speech data - Returns: - text, token, token_int, hyp - """ - - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - # data: (Nsamples,) -> (1, Nsamples) - speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) - # lengths: (1,) - lengths = speech.new_full([1], - dtype=torch.long, - fill_value=speech.size(1)) - batch = {'speech': speech, 'speech_lengths': lengths} - - # a. To device - batch = to_device(batch, device=self.device) - - # b. 
Forward Encoder - enc, enc_len = self.asr_model.encode(**batch) - if isinstance(enc, tuple): - enc = enc[0] - assert len(enc) == 1, len(enc) - - predictor_outs = self.asr_model.calc_predictor(enc, enc_len) - pre_acoustic_embeds, pre_token_length = predictor_outs[ - 0], predictor_outs[1] - pre_token_length = torch.tensor([pre_acoustic_embeds.size(1)], - device=pre_acoustic_embeds.device) - decoder_outs = self.asr_model.cal_decoder_with_predictor( - enc, enc_len, pre_acoustic_embeds, pre_token_length) - decoder_out = decoder_outs[0] - - yseq = decoder_out.argmax(dim=-1) - score = decoder_out.max(dim=-1)[0] - score = torch.sum(score, dim=-1) - # pad with mask tokens to ensure compatibility with sos/eos tokens - yseq = torch.tensor( - [self.asr_model.sos] + yseq.tolist()[0] + [self.asr_model.eos], - device=yseq.device) - nbest_hyps = [Hypothesis(yseq=yseq, score=score)] - - results = [] - for hyp in nbest_hyps: - assert isinstance(hyp, (Hypothesis, TransHypothesis)), type(hyp) - - # remove sos/eos and get results - last_pos = None if self.asr_model.use_transducer_decoder else -1 - if isinstance(hyp.yseq, list): - token_int = hyp.yseq[1:last_pos] - else: - token_int = hyp.yseq[1:last_pos].tolist() - - # remove blank symbol id, which is assumed to be 0 - token_int = list(filter(lambda x: x != 0, token_int)) - - # Change integer-ids to tokens - token = self.converter.ids2tokens(token_int) - - if self.tokenizer is not None: - text = self.tokenizer.tokens2text(token) - else: - text = None - - results.append((text, token, token_int, hyp, speech.size(1))) - - return results - - @staticmethod - def from_pretrained( - model_tag: Optional[str] = None, - **kwargs: Optional[Any], - ): - """Build Speech2Text instance from the pretrained model. - - Args: - model_tag (Optional[str]): Model tag of the pretrained models. - Currently, the tags of espnet_model_zoo are supported. - - Returns: - Speech2Text: Speech2Text instance. - - """ - if model_tag is not None: - try: - from espnet_model_zoo.downloader import ModelDownloader - - except ImportError: - logging.error( - '`espnet_model_zoo` is not installed. 
' - 'Please install via `pip install -U espnet_model_zoo`.') - raise - d = ModelDownloader() - kwargs.update(**d.download_and_unpack(model_tag)) - - return Speech2Text(**kwargs) - - -def inference( - output_dir: str, - maxlenratio: float, - minlenratio: float, - batch_size: int, - dtype: str, - beam_size: int, - ngpu: int, - seed: int, - ctc_weight: float, - lm_weight: float, - ngram_weight: float, - penalty: float, - nbest: int, - num_workers: int, - log_level: Union[int, str], - data_path_and_name_and_type: Sequence[Tuple[str, str, str]], - key_file: Optional[str], - asr_train_config: Optional[str], - asr_model_file: Optional[str], - lm_train_config: Optional[str], - lm_file: Optional[str], - word_lm_train_config: Optional[str], - word_lm_file: Optional[str], - ngram_file: Optional[str], - model_tag: Optional[str], - token_type: Optional[str], - bpemodel: Optional[str], - allow_variable_data_keys: bool, - transducer_conf: Optional[dict], - streaming: bool, - frontend_conf: dict = None, -): - assert check_argument_types() - if batch_size > 1: - raise NotImplementedError('batch decoding is not implemented') - if word_lm_train_config is not None: - raise NotImplementedError('Word LM is not implemented') - if ngpu > 1: - raise NotImplementedError('only single GPU decoding is supported') - - logging.basicConfig( - level=log_level, - format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', - ) - - if ngpu >= 1: - device = 'cuda' - else: - device = 'cpu' - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2text - speech2text_kwargs = dict( - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - transducer_conf=transducer_conf, - lm_train_config=lm_train_config, - lm_file=lm_file, - ngram_file=ngram_file, - token_type=token_type, - bpemodel=bpemodel, - device=device, - maxlenratio=maxlenratio, - minlenratio=minlenratio, - dtype=dtype, - beam_size=beam_size, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - ngram_weight=ngram_weight, - penalty=penalty, - nbest=nbest, - streaming=streaming, - frontend_conf=frontend_conf, - ) - speech2text = Speech2Text.from_pretrained( - model_tag=model_tag, - **speech2text_kwargs, - ) - - # 3. 
Build data-iterator - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, - False), - collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - forward_time_total = 0.0 - length_total = 0.0 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - with DatadirWriter(output_dir) as writer: - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f'{len(keys)} != {_bs}' - batch = { - k: v[0] - for k, v in batch.items() if not k.endswith('_lengths') - } - - # N-best list of (text, token, token_int, hyp_object) - - try: - time_beg = time.time() - results = speech2text(**batch) - time_end = time.time() - forward_time = time_end - time_beg - length = results[0][-1] - results = [results[0][:-1]] - forward_time_total += forward_time - length_total += length - except TooShortUttError as e: - logging.warning(f'Utterance {keys} {e}') - hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) - results = [[' ', [''], [2], hyp]] * nbest - - # Only supporting batch_size==1 - key = keys[0] - for n, (text, token, token_int, - hyp) in zip(range(1, nbest + 1), results): - # Create a directory: outdir/{n}best_recog - ibest_writer = writer[f'{n}best_recog'] - - # Write the result to each file - ibest_writer['token'][key] = ' '.join(token) - ibest_writer['token_int'][key] = ' '.join(map(str, token_int)) - ibest_writer['score'][key] = str(hyp.score) - - if text is not None: - ibest_writer['text'][key] = text - - logging.info( - 'decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}' - .format(length_total, forward_time_total, - 100 * forward_time_total / length_total)) - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description='ASR Decoding', - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - '--log_level', - type=lambda x: x.upper(), - default='INFO', - choices=('CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'), - help='The verbose level of logging', - ) - - parser.add_argument('--output_dir', type=str, required=True) - parser.add_argument( - '--ngpu', - type=int, - default=0, - help='The number of gpus. 
0 indicates CPU mode', - ) - parser.add_argument('--seed', type=int, default=0, help='Random seed') - parser.add_argument( - '--dtype', - default='float32', - choices=['float16', 'float32', 'float64'], - help='Data type', - ) - parser.add_argument( - '--num_workers', - type=int, - default=1, - help='The number of workers used for DataLoader', - ) - - group = parser.add_argument_group('Input data related') - group.add_argument( - '--data_path_and_name_and_type', - type=str2triple_str, - required=True, - action='append', - ) - group.add_argument('--key_file', type=str_or_none) - group.add_argument( - '--allow_variable_data_keys', type=str2bool, default=False) - - group = parser.add_argument_group('The model configuration related') - group.add_argument( - '--asr_train_config', - type=str, - help='ASR training configuration', - ) - group.add_argument( - '--asr_model_file', - type=str, - help='ASR model parameter file', - ) - group.add_argument( - '--lm_train_config', - type=str, - help='LM training configuration', - ) - group.add_argument( - '--lm_file', - type=str, - help='LM parameter file', - ) - group.add_argument( - '--word_lm_train_config', - type=str, - help='Word LM training configuration', - ) - group.add_argument( - '--word_lm_file', - type=str, - help='Word LM parameter file', - ) - group.add_argument( - '--ngram_file', - type=str, - help='N-gram parameter file', - ) - group.add_argument( - '--model_tag', - type=str, - help='Pretrained model tag. If specify this option, *_train_config and ' - '*_file will be overwritten', - ) - - group = parser.add_argument_group('Beam-search related') - group.add_argument( - '--batch_size', - type=int, - default=1, - help='The batch size for inference', - ) - group.add_argument( - '--nbest', type=int, default=1, help='Output N-best hypotheses') - group.add_argument('--beam_size', type=int, default=20, help='Beam size') - group.add_argument( - '--penalty', type=float, default=0.0, help='Insertion penalty') - group.add_argument( - '--maxlenratio', - type=float, - default=0.0, - help='Input length ratio to obtain max output length. ' - 'If maxlenratio=0.0 (default), it uses a end-detect ' - 'function ' - 'to automatically find maximum hypothesis lengths.' - 'If maxlenratio<0.0, its absolute value is interpreted' - 'as a constant max output length', - ) - group.add_argument( - '--minlenratio', - type=float, - default=0.0, - help='Input length ratio to obtain min output length', - ) - group.add_argument( - '--ctc_weight', - type=float, - default=0.5, - help='CTC weight in joint decoding', - ) - group.add_argument( - '--lm_weight', type=float, default=1.0, help='RNNLM weight') - group.add_argument( - '--ngram_weight', type=float, default=0.9, help='ngram weight') - group.add_argument('--streaming', type=str2bool, default=False) - - group.add_argument( - '--frontend_conf', - default=None, - help='', - ) - - group = parser.add_argument_group('Text converter related') - group.add_argument( - '--token_type', - type=str_or_none, - default=None, - choices=['char', 'bpe', None], - help='The token type for ASR model. ' - 'If not given, refers from the training args', - ) - group.add_argument( - '--bpemodel', - type=str_or_none, - default=None, - help='The model path of sentencepiece. 
' - 'If not given, refers from the training args', - ) - group.add_argument( - '--transducer_conf', - default=None, - help='The keyword arguments for transducer beam search.', - ) - - return parser - - -def asr_inference( - output_dir: str, - maxlenratio: float, - minlenratio: float, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - data_path_and_name_and_type: Sequence[Tuple[str, str, str]], - asr_train_config: Optional[str], - asr_model_file: Optional[str], - nbest: int = 1, - num_workers: int = 1, - log_level: Union[int, str] = 'INFO', - batch_size: int = 1, - dtype: str = 'float32', - seed: int = 0, - key_file: Optional[str] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - word_lm_file: Optional[str] = None, - ngram_file: Optional[str] = None, - ngram_weight: float = 0.9, - model_tag: Optional[str] = None, - token_type: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - transducer_conf: Optional[dict] = None, - streaming: bool = False, - frontend_conf: dict = None, -): - inference( - output_dir=output_dir, - maxlenratio=maxlenratio, - minlenratio=minlenratio, - batch_size=batch_size, - dtype=dtype, - beam_size=beam_size, - ngpu=ngpu, - seed=seed, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - ngram_weight=ngram_weight, - penalty=penalty, - nbest=nbest, - num_workers=num_workers, - log_level=log_level, - data_path_and_name_and_type=data_path_and_name_and_type, - key_file=key_file, - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - lm_train_config=lm_train_config, - lm_file=lm_file, - word_lm_train_config=word_lm_train_config, - word_lm_file=word_lm_file, - ngram_file=ngram_file, - model_tag=model_tag, - token_type=token_type, - bpemodel=bpemodel, - allow_variable_data_keys=allow_variable_data_keys, - transducer_conf=transducer_conf, - streaming=streaming, - frontend_conf=frontend_conf) - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop('config', None) - inference(**kwargs) - - -if __name__ == '__main__': - main() diff --git a/modelscope/pipelines/audio/asr/asr_engine/common/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/common/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/common/asr_utils.py b/modelscope/pipelines/audio/asr/asr_engine/common/asr_utils.py deleted file mode 100644 index 0d9a5f43..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/common/asr_utils.py +++ /dev/null @@ -1,193 +0,0 @@ -import os -from typing import Any, Dict, List - -import numpy as np - - -def type_checking(wav_path: str, - recog_type: str = None, - audio_format: str = None, - workspace: str = None): - assert os.path.exists(wav_path), f'wav_path:{wav_path} does not exist' - - r_recog_type = recog_type - r_audio_format = audio_format - r_workspace = workspace - r_wav_path = wav_path - - if r_workspace is None or len(r_workspace) == 0: - r_workspace = os.path.join(os.getcwd(), '.tmp') - - if r_recog_type is None: - if os.path.isfile(wav_path): - if wav_path.endswith('.wav') or wav_path.endswith('.WAV'): - r_recog_type = 'wav' - r_audio_format = 'wav' - - elif os.path.isdir(wav_path): - dir_name = os.path.basename(wav_path) - if 'test' in dir_name: - r_recog_type = 'test' - elif 'dev' in dir_name: - r_recog_type = 'dev' - 
elif 'train' in dir_name: - r_recog_type = 'train' - - if r_audio_format is None: - if find_file_by_ends(wav_path, '.ark'): - r_audio_format = 'kaldi_ark' - elif find_file_by_ends(wav_path, '.wav') or find_file_by_ends( - wav_path, '.WAV'): - r_audio_format = 'wav' - - if r_audio_format == 'kaldi_ark' and r_recog_type != 'wav': - # datasets with kaldi_ark file - r_wav_path = os.path.abspath(os.path.join(r_wav_path, '../')) - elif r_audio_format == 'wav' and r_recog_type != 'wav': - # datasets with waveform files - r_wav_path = os.path.abspath(os.path.join(r_wav_path, '../../')) - - return r_recog_type, r_audio_format, r_workspace, r_wav_path - - -def find_file_by_ends(dir_path: str, ends: str): - dir_files = os.listdir(dir_path) - for file in dir_files: - file_path = os.path.join(dir_path, file) - if os.path.isfile(file_path): - if file_path.endswith(ends): - return True - elif os.path.isdir(file_path): - if find_file_by_ends(file_path, ends): - return True - - return False - - -def compute_wer(hyp_text_path: str, ref_text_path: str) -> Dict[str, Any]: - assert os.path.exists(hyp_text_path), 'hyp_text does not exist' - assert os.path.exists(ref_text_path), 'ref_text does not exist' - - rst = { - 'Wrd': 0, - 'Corr': 0, - 'Ins': 0, - 'Del': 0, - 'Sub': 0, - 'Snt': 0, - 'Err': 0.0, - 'S.Err': 0.0, - 'wrong_words': 0, - 'wrong_sentences': 0 - } - - with open(ref_text_path, 'r', encoding='utf-8') as r: - r_lines = r.readlines() - - with open(hyp_text_path, 'r', encoding='utf-8') as h: - h_lines = h.readlines() - - for r_line in r_lines: - r_line_item = r_line.split() - r_key = r_line_item[0] - r_sentence = r_line_item[1] - for h_line in h_lines: - # find sentence from hyp text - if r_key in h_line: - h_line_item = h_line.split() - h_sentence = h_line_item[1] - out_item = compute_wer_by_line(h_sentence, r_sentence) - rst['Wrd'] += out_item['nwords'] - rst['Corr'] += out_item['cor'] - rst['wrong_words'] += out_item['wrong'] - rst['Ins'] += out_item['ins'] - rst['Del'] += out_item['del'] - rst['Sub'] += out_item['sub'] - rst['Snt'] += 1 - if out_item['wrong'] > 0: - rst['wrong_sentences'] += 1 - - break - - if rst['Wrd'] > 0: - rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2) - if rst['Snt'] > 0: - rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2) - - return rst - - -def compute_wer_by_line(hyp: list, ref: list) -> Dict[str, Any]: - len_hyp = len(hyp) - len_ref = len(ref) - cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int16) - - ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8) - - for i in range(len_hyp + 1): - cost_matrix[i][0] = i - for j in range(len_ref + 1): - cost_matrix[0][j] = j - - for i in range(1, len_hyp + 1): - for j in range(1, len_ref + 1): - if hyp[i - 1] == ref[j - 1]: - cost_matrix[i][j] = cost_matrix[i - 1][j - 1] - else: - substitution = cost_matrix[i - 1][j - 1] + 1 - insertion = cost_matrix[i - 1][j] + 1 - deletion = cost_matrix[i][j - 1] + 1 - - compare_val = [substitution, insertion, deletion] - - min_val = min(compare_val) - operation_idx = compare_val.index(min_val) + 1 - cost_matrix[i][j] = min_val - ops_matrix[i][j] = operation_idx - - match_idx = [] - i = len_hyp - j = len_ref - rst = { - 'nwords': len_hyp, - 'cor': 0, - 'wrong': 0, - 'ins': 0, - 'del': 0, - 'sub': 0 - } - while i >= 0 or j >= 0: - i_idx = max(0, i) - j_idx = max(0, j) - - if ops_matrix[i_idx][j_idx] == 0: # correct - if i - 1 >= 0 and j - 1 >= 0: - match_idx.append((j - 1, i - 1)) - rst['cor'] += 1 - - i -= 1 - j -= 1 - - elif 
ops_matrix[i_idx][j_idx] == 2: # insert - i -= 1 - rst['ins'] += 1 - - elif ops_matrix[i_idx][j_idx] == 3: # delete - j -= 1 - rst['del'] += 1 - - elif ops_matrix[i_idx][j_idx] == 1: # substitute - i -= 1 - j -= 1 - rst['sub'] += 1 - - if i < 0 and j >= 0: - rst['del'] += 1 - elif j < 0 and i >= 0: - rst['ins'] += 1 - - match_idx.reverse() - wrong_cnt = cost_matrix[len_hyp][len_ref] - rst['wrong'] = wrong_cnt - - return rst diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/decoder/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/decoder/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/decoder/transformer_decoder.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/decoder/transformer_decoder.py deleted file mode 100644 index e1435db1..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/decoder/transformer_decoder.py +++ /dev/null @@ -1,757 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. -"""Decoder definition.""" -from typing import Any, List, Sequence, Tuple - -import torch -from espnet2.asr.decoder.abs_decoder import AbsDecoder -from espnet.nets.pytorch_backend.nets_utils import make_pad_mask -from espnet.nets.pytorch_backend.transformer.attention import \ - MultiHeadedAttention -from espnet.nets.pytorch_backend.transformer.decoder_layer import DecoderLayer -from espnet.nets.pytorch_backend.transformer.dynamic_conv import \ - DynamicConvolution -from espnet.nets.pytorch_backend.transformer.dynamic_conv2d import \ - DynamicConvolution2D -from espnet.nets.pytorch_backend.transformer.embedding import \ - PositionalEncoding -from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm -from espnet.nets.pytorch_backend.transformer.lightconv import \ - LightweightConvolution -from espnet.nets.pytorch_backend.transformer.lightconv2d import \ - LightweightConvolution2D -from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask -from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import \ - PositionwiseFeedForward # noqa: H301 -from espnet.nets.pytorch_backend.transformer.repeat import repeat -from espnet.nets.scorer_interface import BatchScorerInterface -from typeguard import check_argument_types - - -class BaseTransformerDecoder(AbsDecoder, BatchScorerInterface): - """Base class of Transfomer decoder module. - - Args: - vocab_size: output dim - encoder_output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the number of units of position-wise feed forward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - self_attention_dropout_rate: dropout rate for attention - input_layer: input layer type - use_output_layer: whether to use output layer - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: whether to use layer_norm before the first block - concat_after: whether to concat attention layer's input and output - if True, additional linear will be applied. - i.e. 
x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. - i.e. x -> x + att(x) - """ - - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - input_layer: str = 'embed', - use_output_layer: bool = True, - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - ): - assert check_argument_types() - super().__init__() - attention_dim = encoder_output_size - - if input_layer == 'embed': - self.embed = torch.nn.Sequential( - torch.nn.Embedding(vocab_size, attention_dim), - pos_enc_class(attention_dim, positional_dropout_rate), - ) - elif input_layer == 'linear': - self.embed = torch.nn.Sequential( - torch.nn.Linear(vocab_size, attention_dim), - torch.nn.LayerNorm(attention_dim), - torch.nn.Dropout(dropout_rate), - torch.nn.ReLU(), - pos_enc_class(attention_dim, positional_dropout_rate), - ) - else: - raise ValueError( - f"only 'embed' or 'linear' is supported: {input_layer}") - - self.normalize_before = normalize_before - if self.normalize_before: - self.after_norm = LayerNorm(attention_dim) - if use_output_layer: - self.output_layer = torch.nn.Linear(attention_dim, vocab_size) - else: - self.output_layer = None - - # Must set by the inheritance - self.decoders = None - - def forward( - self, - hs_pad: torch.Tensor, - hlens: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Forward decoder. - - Args: - hs_pad: encoded memory, float32 (batch, maxlen_in, feat) - hlens: (batch) - ys_in_pad: - input token ids, int64 (batch, maxlen_out) - if input_layer == "embed" - input tensor (batch, maxlen_out, #mels) in the other cases - ys_in_lens: (batch) - Returns: - (tuple): tuple containing: - - x: decoded token score before softmax (batch, maxlen_out, token) - if use_output_layer is True, - olens: (batch, ) - """ - tgt = ys_in_pad - # tgt_mask: (B, 1, L) - tgt_mask = (~make_pad_mask(ys_in_lens)[:, None, :]).to(tgt.device) - # m: (1, L, L) - m = subsequent_mask( - tgt_mask.size(-1), device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m - - memory = hs_pad - memory_mask = ( - ~make_pad_mask(hlens, maxlen=memory.size(1)))[:, None, :].to( - memory.device) - # Padding for Longformer - if memory_mask.shape[-1] != memory.shape[1]: - padlen = memory.shape[1] - memory_mask.shape[-1] - memory_mask = torch.nn.functional.pad(memory_mask, (0, padlen), - 'constant', False) - - x = self.embed(tgt) - x, tgt_mask, memory, memory_mask = self.decoders( - x, tgt_mask, memory, memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.output_layer is not None: - x = self.output_layer(x) - - olens = tgt_mask.sum(1) - return x, olens - - def forward_one_step( - self, - tgt: torch.Tensor, - tgt_mask: torch.Tensor, - memory: torch.Tensor, - cache: List[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """Forward one step. - - Args: - tgt: input token ids, int64 (batch, maxlen_out) - tgt_mask: input token mask, (batch, maxlen_out) - dtype=torch.uint8 in PyTorch 1.2- - dtype=torch.bool in PyTorch 1.2+ (include 1.2) - memory: encoded memory, float32 (batch, maxlen_in, feat) - cache: cached output list of (batch, max_time_out-1, size) - Returns: - y, cache: NN output value and cache per `self.decoders`. 
- y.shape` is (batch, maxlen_out, token) - """ - x = self.embed(tgt) - if cache is None: - cache = [None] * len(self.decoders) - new_cache = [] - for c, decoder in zip(cache, self.decoders): - x, tgt_mask, memory, memory_mask = decoder( - x, tgt_mask, memory, None, cache=c) - new_cache.append(x) - - if self.normalize_before: - y = self.after_norm(x[:, -1]) - else: - y = x[:, -1] - if self.output_layer is not None: - y = torch.log_softmax(self.output_layer(y), dim=-1) - - return y, new_cache - - def score(self, ys, state, x): - """Score.""" - ys_mask = subsequent_mask(len(ys), device=x.device).unsqueeze(0) - logp, state = self.forward_one_step( - ys.unsqueeze(0), ys_mask, x.unsqueeze(0), cache=state) - return logp.squeeze(0), state - - def batch_score(self, ys: torch.Tensor, states: List[Any], - xs: torch.Tensor) -> Tuple[torch.Tensor, List[Any]]: - """Score new token batch. - - Args: - ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). - states (List[Any]): Scorer states for prefix tokens. - xs (torch.Tensor): - The encoder feature that generates ys (n_batch, xlen, n_feat). - - Returns: - tuple[torch.Tensor, List[Any]]: Tuple of - batchfied scores for next token with shape of `(n_batch, n_vocab)` - and next state list for ys. - - """ - # merge states - n_batch = len(ys) - n_layers = len(self.decoders) - if states[0] is None: - batch_state = None - else: - # transpose state of [batch, layer] into [layer, batch] - batch_state = [ - torch.stack([states[b][i] for b in range(n_batch)]) - for i in range(n_layers) - ] - - # batch decoding - ys_mask = subsequent_mask(ys.size(-1), device=xs.device).unsqueeze(0) - logp, states = self.forward_one_step( - ys, ys_mask, xs, cache=batch_state) - - # transpose state of [layer, batch] into [batch, layer] - state_list = [[states[i][b] for i in range(n_layers)] - for b in range(n_batch)] - return logp, state_list - - -class TransformerDecoder(BaseTransformerDecoder): - - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = 'embed', - use_output_layer: bool = True, - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - dropout_rate=dropout_rate, - positional_dropout_rate=positional_dropout_rate, - input_layer=input_layer, - use_output_layer=use_output_layer, - pos_enc_class=pos_enc_class, - normalize_before=normalize_before, - ) - - attention_dim = encoder_output_size - self.decoders = repeat( - num_blocks, - lambda lnum: DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ), - ) - - -class ParaformerDecoder(TransformerDecoder): - - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - 
input_layer: str = 'embed', - use_output_layer: bool = True, - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - ): - assert check_argument_types() - super().__init__( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - dropout_rate=dropout_rate, - positional_dropout_rate=positional_dropout_rate, - input_layer=input_layer, - use_output_layer=use_output_layer, - pos_enc_class=pos_enc_class, - normalize_before=normalize_before, - ) - - attention_dim = encoder_output_size - self.decoders = repeat( - num_blocks, - lambda lnum: DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ), - ) - - def forward( - self, - hs_pad: torch.Tensor, - hlens: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Forward decoder. - - Args: - hs_pad: encoded memory, float32 (batch, maxlen_in, feat) - hlens: (batch) - ys_in_pad: - input token ids, int64 (batch, maxlen_out) - if input_layer == "embed" - input tensor (batch, maxlen_out, #mels) in the other cases - ys_in_lens: (batch) - Returns: - (tuple): tuple containing: - - x: decoded token score before softmax (batch, maxlen_out, token) - if use_output_layer is True, - olens: (batch, ) - """ - tgt = ys_in_pad - # tgt_mask: (B, 1, L) - tgt_mask = (~make_pad_mask(ys_in_lens)[:, None, :]).to(tgt.device) - # m: (1, L, L) - # m = subsequent_mask(tgt_mask.size(-1), device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - # tgt_mask = tgt_mask & m - - memory = hs_pad - memory_mask = ( - ~make_pad_mask(hlens, maxlen=memory.size(1)))[:, None, :].to( - memory.device) - # Padding for Longformer - if memory_mask.shape[-1] != memory.shape[1]: - padlen = memory.shape[1] - memory_mask.shape[-1] - memory_mask = torch.nn.functional.pad(memory_mask, (0, padlen), - 'constant', False) - - # x = self.embed(tgt) - x = tgt - x, tgt_mask, memory, memory_mask = self.decoders( - x, tgt_mask, memory, memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.output_layer is not None: - x = self.output_layer(x) - - olens = tgt_mask.sum(1) - return x, olens - - -class ParaformerDecoderBertEmbed(TransformerDecoder): - - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = 'embed', - use_output_layer: bool = True, - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - embeds_id: int = 2, - ): - assert check_argument_types() - super().__init__( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - dropout_rate=dropout_rate, - positional_dropout_rate=positional_dropout_rate, - input_layer=input_layer, - use_output_layer=use_output_layer, - pos_enc_class=pos_enc_class, - normalize_before=normalize_before, - ) - - attention_dim = encoder_output_size - self.decoders = repeat( - embeds_id, - lambda lnum: DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - 
MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ), - ) - if embeds_id == num_blocks: - self.decoders2 = None - else: - self.decoders2 = repeat( - num_blocks - embeds_id, - lambda lnum: DecoderLayer( - attention_dim, - MultiHeadedAttention(attention_heads, attention_dim, - self_attention_dropout_rate), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ), - ) - - def forward( - self, - hs_pad: torch.Tensor, - hlens: torch.Tensor, - ys_in_pad: torch.Tensor, - ys_in_lens: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Forward decoder. - - Args: - hs_pad: encoded memory, float32 (batch, maxlen_in, feat) - hlens: (batch) - ys_in_pad: - input token ids, int64 (batch, maxlen_out) - if input_layer == "embed" - input tensor (batch, maxlen_out, #mels) in the other cases - ys_in_lens: (batch) - Returns: - (tuple): tuple containing: - - x: decoded token score before softmax (batch, maxlen_out, token) - if use_output_layer is True, - olens: (batch, ) - """ - tgt = ys_in_pad - # tgt_mask: (B, 1, L) - tgt_mask = (~make_pad_mask(ys_in_lens)[:, None, :]).to(tgt.device) - # m: (1, L, L) - # m = subsequent_mask(tgt_mask.size(-1), device=tgt_mask.device).unsqueeze(0) - # tgt_mask: (B, L, L) - # tgt_mask = tgt_mask & m - - memory = hs_pad - memory_mask = ( - ~make_pad_mask(hlens, maxlen=memory.size(1)))[:, None, :].to( - memory.device) - # Padding for Longformer - if memory_mask.shape[-1] != memory.shape[1]: - padlen = memory.shape[1] - memory_mask.shape[-1] - memory_mask = torch.nn.functional.pad(memory_mask, (0, padlen), - 'constant', False) - - # x = self.embed(tgt) - x = tgt - x, tgt_mask, memory, memory_mask = self.decoders( - x, tgt_mask, memory, memory_mask) - embeds_outputs = x - if self.decoders2 is not None: - x, tgt_mask, memory, memory_mask = self.decoders2( - x, tgt_mask, memory, memory_mask) - if self.normalize_before: - x = self.after_norm(x) - if self.output_layer is not None: - x = self.output_layer(x) - - olens = tgt_mask.sum(1) - return x, olens, embeds_outputs - - -class LightweightConvolutionTransformerDecoder(BaseTransformerDecoder): - - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = 'embed', - use_output_layer: bool = True, - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - conv_wshare: int = 4, - conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), - conv_usebias: int = False, - ): - assert check_argument_types() - if len(conv_kernel_length) != num_blocks: - raise ValueError( - 'conv_kernel_length must have equal number of values to num_blocks: ' - f'{len(conv_kernel_length)} != {num_blocks}') - super().__init__( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - dropout_rate=dropout_rate, - positional_dropout_rate=positional_dropout_rate, - input_layer=input_layer, - use_output_layer=use_output_layer, - pos_enc_class=pos_enc_class, - normalize_before=normalize_before, - ) - - attention_dim = 
encoder_output_size - self.decoders = repeat( - num_blocks, - lambda lnum: DecoderLayer( - attention_dim, - LightweightConvolution( - wshare=conv_wshare, - n_feat=attention_dim, - dropout_rate=self_attention_dropout_rate, - kernel_size=conv_kernel_length[lnum], - use_kernel_mask=True, - use_bias=conv_usebias, - ), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ), - ) - - -class LightweightConvolution2DTransformerDecoder(BaseTransformerDecoder): - - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = 'embed', - use_output_layer: bool = True, - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - conv_wshare: int = 4, - conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), - conv_usebias: int = False, - ): - assert check_argument_types() - if len(conv_kernel_length) != num_blocks: - raise ValueError( - 'conv_kernel_length must have equal number of values to num_blocks: ' - f'{len(conv_kernel_length)} != {num_blocks}') - super().__init__( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - dropout_rate=dropout_rate, - positional_dropout_rate=positional_dropout_rate, - input_layer=input_layer, - use_output_layer=use_output_layer, - pos_enc_class=pos_enc_class, - normalize_before=normalize_before, - ) - - attention_dim = encoder_output_size - self.decoders = repeat( - num_blocks, - lambda lnum: DecoderLayer( - attention_dim, - LightweightConvolution2D( - wshare=conv_wshare, - n_feat=attention_dim, - dropout_rate=self_attention_dropout_rate, - kernel_size=conv_kernel_length[lnum], - use_kernel_mask=True, - use_bias=conv_usebias, - ), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ), - ) - - -class DynamicConvolutionTransformerDecoder(BaseTransformerDecoder): - - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = 'embed', - use_output_layer: bool = True, - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - conv_wshare: int = 4, - conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), - conv_usebias: int = False, - ): - assert check_argument_types() - if len(conv_kernel_length) != num_blocks: - raise ValueError( - 'conv_kernel_length must have equal number of values to num_blocks: ' - f'{len(conv_kernel_length)} != {num_blocks}') - super().__init__( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - dropout_rate=dropout_rate, - positional_dropout_rate=positional_dropout_rate, - input_layer=input_layer, - use_output_layer=use_output_layer, - pos_enc_class=pos_enc_class, - normalize_before=normalize_before, - ) - attention_dim = encoder_output_size - - self.decoders = repeat( - num_blocks, - 
lambda lnum: DecoderLayer( - attention_dim, - DynamicConvolution( - wshare=conv_wshare, - n_feat=attention_dim, - dropout_rate=self_attention_dropout_rate, - kernel_size=conv_kernel_length[lnum], - use_kernel_mask=True, - use_bias=conv_usebias, - ), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ), - ) - - -class DynamicConvolution2DTransformerDecoder(BaseTransformerDecoder): - - def __init__( - self, - vocab_size: int, - encoder_output_size: int, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - self_attention_dropout_rate: float = 0.0, - src_attention_dropout_rate: float = 0.0, - input_layer: str = 'embed', - use_output_layer: bool = True, - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - conv_wshare: int = 4, - conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), - conv_usebias: int = False, - ): - assert check_argument_types() - if len(conv_kernel_length) != num_blocks: - raise ValueError( - 'conv_kernel_length must have equal number of values to num_blocks: ' - f'{len(conv_kernel_length)} != {num_blocks}') - super().__init__( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - dropout_rate=dropout_rate, - positional_dropout_rate=positional_dropout_rate, - input_layer=input_layer, - use_output_layer=use_output_layer, - pos_enc_class=pos_enc_class, - normalize_before=normalize_before, - ) - attention_dim = encoder_output_size - - self.decoders = repeat( - num_blocks, - lambda lnum: DecoderLayer( - attention_dim, - DynamicConvolution2D( - wshare=conv_wshare, - n_feat=attention_dim, - dropout_rate=self_attention_dropout_rate, - kernel_size=conv_kernel_length[lnum], - use_kernel_mask=True, - use_bias=conv_usebias, - ), - MultiHeadedAttention(attention_heads, attention_dim, - src_attention_dropout_rate), - PositionwiseFeedForward(attention_dim, linear_units, - dropout_rate), - dropout_rate, - normalize_before, - concat_after, - ), - ) diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/conformer_encoder.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/conformer_encoder.py deleted file mode 100644 index 463852e9..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/conformer_encoder.py +++ /dev/null @@ -1,710 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. 
-"""Conformer encoder definition.""" - -import logging -from typing import List, Optional, Tuple, Union - -import torch -from espnet2.asr.ctc import CTC -from espnet2.asr.encoder.abs_encoder import AbsEncoder -from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule -from espnet.nets.pytorch_backend.conformer.encoder_layer import EncoderLayer -from espnet.nets.pytorch_backend.nets_utils import (get_activation, - make_pad_mask) -from espnet.nets.pytorch_backend.transformer.embedding import \ - LegacyRelPositionalEncoding # noqa: H301 -from espnet.nets.pytorch_backend.transformer.embedding import \ - PositionalEncoding # noqa: H301 -from espnet.nets.pytorch_backend.transformer.embedding import \ - RelPositionalEncoding # noqa: H301 -from espnet.nets.pytorch_backend.transformer.embedding import \ - ScaledPositionalEncoding # noqa: H301 -from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm -from espnet.nets.pytorch_backend.transformer.multi_layer_conv import ( - Conv1dLinear, MultiLayeredConv1d) -from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import \ - PositionwiseFeedForward # noqa: H301 -from espnet.nets.pytorch_backend.transformer.repeat import repeat -from espnet.nets.pytorch_backend.transformer.subsampling import ( - Conv2dSubsampling, Conv2dSubsampling2, Conv2dSubsampling6, - Conv2dSubsampling8, TooShortUttError, check_short_utt) -from typeguard import check_argument_types - -from ...nets.pytorch_backend.transformer.attention import \ - LegacyRelPositionMultiHeadedAttention # noqa: H301 -from ...nets.pytorch_backend.transformer.attention import \ - MultiHeadedAttention # noqa: H301 -from ...nets.pytorch_backend.transformer.attention import \ - RelPositionMultiHeadedAttention # noqa: H301 -from ...nets.pytorch_backend.transformer.attention import ( - LegacyRelPositionMultiHeadedAttentionSANM, - RelPositionMultiHeadedAttentionSANM) - - -class ConformerEncoder(AbsEncoder): - """Conformer encoder module. - - Args: - input_size (int): Input dimension. - output_size (int): Dimension of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - attention_dropout_rate (float): Dropout rate in attention. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - input_layer (Union[str, torch.nn.Module]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. - If True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - If False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - rel_pos_type (str): Whether to use the latest relative positional encoding or - the legacy one. The legacy relative positional encoding will be deprecated - in the future. More Details can be found in - https://github.com/espnet/espnet/pull/2816. - encoder_pos_enc_layer_type (str): Encoder positional encoding layer type. - encoder_attn_layer_type (str): Encoder attention layer type. - activation_type (str): Encoder activation function type. - macaron_style (bool): Whether to use macaron style for positionwise layer. 
- use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. - padding_idx (int): Padding idx for input_layer=embed. - - """ - - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = 'conv2d', - normalize_before: bool = True, - concat_after: bool = False, - positionwise_layer_type: str = 'linear', - positionwise_conv_kernel_size: int = 3, - macaron_style: bool = False, - rel_pos_type: str = 'legacy', - pos_enc_layer_type: str = 'rel_pos', - selfattention_layer_type: str = 'rel_selfattn', - activation_type: str = 'swish', - use_cnn_module: bool = True, - zero_triu: bool = False, - cnn_module_kernel: int = 31, - padding_idx: int = -1, - interctc_layer_idx: List[int] = [], - interctc_use_conditioning: bool = False, - stochastic_depth_rate: Union[float, List[float]] = 0.0, - ): - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if rel_pos_type == 'legacy': - if pos_enc_layer_type == 'rel_pos': - pos_enc_layer_type = 'legacy_rel_pos' - if selfattention_layer_type == 'rel_selfattn': - selfattention_layer_type = 'legacy_rel_selfattn' - elif rel_pos_type == 'latest': - assert selfattention_layer_type != 'legacy_rel_selfattn' - assert pos_enc_layer_type != 'legacy_rel_pos' - else: - raise ValueError('unknown rel_pos_type: ' + rel_pos_type) - - activation = get_activation(activation_type) - if pos_enc_layer_type == 'abs_pos': - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == 'scaled_abs_pos': - pos_enc_class = ScaledPositionalEncoding - elif pos_enc_layer_type == 'rel_pos': - assert selfattention_layer_type == 'rel_selfattn' - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == 'legacy_rel_pos': - assert selfattention_layer_type == 'legacy_rel_selfattn' - pos_enc_class = LegacyRelPositionalEncoding - else: - raise ValueError('unknown pos_enc_layer: ' + pos_enc_layer_type) - - if input_layer == 'linear': - self.embed = torch.nn.Sequential( - torch.nn.Linear(input_size, output_size), - torch.nn.LayerNorm(output_size), - torch.nn.Dropout(dropout_rate), - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d': - self.embed = Conv2dSubsampling( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d2': - self.embed = Conv2dSubsampling2( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d6': - self.embed = Conv2dSubsampling6( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d8': - self.embed = Conv2dSubsampling8( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'embed': - self.embed = torch.nn.Sequential( - torch.nn.Embedding( - input_size, output_size, padding_idx=padding_idx), - pos_enc_class(output_size, positional_dropout_rate), - ) - elif isinstance(input_layer, torch.nn.Module): - self.embed = torch.nn.Sequential( - input_layer, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer is None: - 
self.embed = torch.nn.Sequential( - pos_enc_class(output_size, positional_dropout_rate)) - else: - raise ValueError('unknown input_layer: ' + input_layer) - self.normalize_before = normalize_before - if positionwise_layer_type == 'linear': - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - elif positionwise_layer_type == 'conv1d': - positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = ( - output_size, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, - ) - elif positionwise_layer_type == 'conv1d-linear': - positionwise_layer = Conv1dLinear - positionwise_layer_args = ( - output_size, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, - ) - else: - raise NotImplementedError('Support only linear or conv1d.') - - if selfattention_layer_type == 'selfattn': - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - elif selfattention_layer_type == 'legacy_rel_selfattn': - assert pos_enc_layer_type == 'legacy_rel_pos' - encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - elif selfattention_layer_type == 'rel_selfattn': - assert pos_enc_layer_type == 'rel_pos' - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - zero_triu, - ) - else: - raise ValueError('unknown encoder_attn_layer: ' - + selfattention_layer_type) - - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation) - - if isinstance(stochastic_depth_rate, float): - stochastic_depth_rate = [stochastic_depth_rate] * num_blocks - - if len(stochastic_depth_rate) != num_blocks: - raise ValueError( - f'Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) ' - f'should be equal to num_blocks ({num_blocks})') - - self.encoders = repeat( - num_blocks, - lambda lnum: EncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer(*positionwise_layer_args) - if macaron_style else None, - convolution_layer(*convolution_layer_args) - if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - stochastic_depth_rate[lnum], - ), - ) - if self.normalize_before: - self.after_norm = LayerNorm(output_size) - - self.interctc_layer_idx = interctc_layer_idx - if len(interctc_layer_idx) > 0: - assert 0 < min(interctc_layer_idx) and max( - interctc_layer_idx) < num_blocks - self.interctc_use_conditioning = interctc_use_conditioning - self.conditioning_layer = None - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs_pad: torch.Tensor, - ilens: torch.Tensor, - prev_states: torch.Tensor = None, - ctc: CTC = None, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - """Calculate forward propagation. - - Args: - xs_pad (torch.Tensor): Input tensor (#batch, L, input_size). - ilens (torch.Tensor): Input length (#batch). - prev_states (torch.Tensor): Not to be used now. - - Returns: - torch.Tensor: Output tensor (#batch, L, output_size). - torch.Tensor: Output length (#batch). - torch.Tensor: Not to be used now. 
- - """ - masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) - - if (isinstance(self.embed, Conv2dSubsampling) - or isinstance(self.embed, Conv2dSubsampling2) - or isinstance(self.embed, Conv2dSubsampling6) - or isinstance(self.embed, Conv2dSubsampling8)): - short_status, limit_size = check_short_utt(self.embed, - xs_pad.size(1)) - if short_status: - raise TooShortUttError( - f'has {xs_pad.size(1)} frames and is too short for subsampling ' - + # noqa: * - f'(it needs more than {limit_size} frames), return empty results', # noqa: * - xs_pad.size(1), - limit_size) # noqa: * - xs_pad, masks = self.embed(xs_pad, masks) - else: - xs_pad = self.embed(xs_pad) - - intermediate_outs = [] - if len(self.interctc_layer_idx) == 0: - xs_pad, masks = self.encoders(xs_pad, masks) - else: - for layer_idx, encoder_layer in enumerate(self.encoders): - xs_pad, masks = encoder_layer(xs_pad, masks) - - if layer_idx + 1 in self.interctc_layer_idx: - encoder_out = xs_pad - if isinstance(encoder_out, tuple): - encoder_out = encoder_out[0] - - # intermediate outputs are also normalized - if self.normalize_before: - encoder_out = self.after_norm(encoder_out) - - intermediate_outs.append((layer_idx + 1, encoder_out)) - - if self.interctc_use_conditioning: - ctc_out = ctc.softmax(encoder_out) - - if isinstance(xs_pad, tuple): - x, pos_emb = xs_pad - x = x + self.conditioning_layer(ctc_out) - xs_pad = (x, pos_emb) - else: - xs_pad = xs_pad + self.conditioning_layer(ctc_out) - - if isinstance(xs_pad, tuple): - xs_pad = xs_pad[0] - if self.normalize_before: - xs_pad = self.after_norm(xs_pad) - - olens = masks.squeeze(1).sum(1) - if len(intermediate_outs) > 0: - return (xs_pad, intermediate_outs), olens, None - return xs_pad, olens, None - - -class SANMEncoder_v2(AbsEncoder): - """Conformer encoder module. - - Args: - input_size (int): Input dimension. - output_size (int): Dimension of attention. - attention_heads (int): The number of heads of multi head attention. - linear_units (int): The number of units of position-wise feed forward. - num_blocks (int): The number of decoder blocks. - dropout_rate (float): Dropout rate. - attention_dropout_rate (float): Dropout rate in attention. - positional_dropout_rate (float): Dropout rate after adding positional encoding. - input_layer (Union[str, torch.nn.Module]): Input layer type. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. - If True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - If False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. - rel_pos_type (str): Whether to use the latest relative positional encoding or - the legacy one. The legacy relative positional encoding will be deprecated - in the future. More Details can be found in - https://github.com/espnet/espnet/pull/2816. - encoder_pos_enc_layer_type (str): Encoder positional encoding layer type. - encoder_attn_layer_type (str): Encoder attention layer type. - activation_type (str): Encoder activation function type. - macaron_style (bool): Whether to use macaron style for positionwise layer. - use_cnn_module (bool): Whether to use convolution module. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel (int): Kernerl size of convolution module. 
- padding_idx (int): Padding idx for input_layer=embed. - - """ - - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: str = 'conv2d', - normalize_before: bool = True, - concat_after: bool = False, - positionwise_layer_type: str = 'linear', - positionwise_conv_kernel_size: int = 3, - macaron_style: bool = False, - rel_pos_type: str = 'legacy', - pos_enc_layer_type: str = 'rel_pos', - selfattention_layer_type: str = 'rel_selfattn', - activation_type: str = 'swish', - use_cnn_module: bool = False, - sanm_shfit: int = 0, - zero_triu: bool = False, - cnn_module_kernel: int = 31, - padding_idx: int = -1, - interctc_layer_idx: List[int] = [], - interctc_use_conditioning: bool = False, - stochastic_depth_rate: Union[float, List[float]] = 0.0, - ): - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if rel_pos_type == 'legacy': - if pos_enc_layer_type == 'rel_pos': - pos_enc_layer_type = 'legacy_rel_pos' - if selfattention_layer_type == 'rel_selfattn': - selfattention_layer_type = 'legacy_rel_selfattn' - if selfattention_layer_type == 'rel_selfattnsanm': - selfattention_layer_type = 'legacy_rel_selfattnsanm' - - elif rel_pos_type == 'latest': - assert selfattention_layer_type != 'legacy_rel_selfattn' - assert pos_enc_layer_type != 'legacy_rel_pos' - else: - raise ValueError('unknown rel_pos_type: ' + rel_pos_type) - - activation = get_activation(activation_type) - if pos_enc_layer_type == 'abs_pos': - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == 'scaled_abs_pos': - pos_enc_class = ScaledPositionalEncoding - elif pos_enc_layer_type == 'rel_pos': - # assert selfattention_layer_type == "rel_selfattn" - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == 'legacy_rel_pos': - # assert selfattention_layer_type == "legacy_rel_selfattn" - pos_enc_class = LegacyRelPositionalEncoding - logging.warning( - 'Using legacy_rel_pos and it will be deprecated in the future.' 
- ) - else: - raise ValueError('unknown pos_enc_layer: ' + pos_enc_layer_type) - - if input_layer == 'linear': - self.embed = torch.nn.Sequential( - torch.nn.Linear(input_size, output_size), - torch.nn.LayerNorm(output_size), - torch.nn.Dropout(dropout_rate), - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d': - self.embed = Conv2dSubsampling( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d2': - self.embed = Conv2dSubsampling2( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d6': - self.embed = Conv2dSubsampling6( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d8': - self.embed = Conv2dSubsampling8( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'embed': - self.embed = torch.nn.Sequential( - torch.nn.Embedding( - input_size, output_size, padding_idx=padding_idx), - pos_enc_class(output_size, positional_dropout_rate), - ) - elif isinstance(input_layer, torch.nn.Module): - self.embed = torch.nn.Sequential( - input_layer, - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer is None: - self.embed = torch.nn.Sequential( - pos_enc_class(output_size, positional_dropout_rate)) - else: - raise ValueError('unknown input_layer: ' + input_layer) - self.normalize_before = normalize_before - if positionwise_layer_type == 'linear': - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - activation, - ) - elif positionwise_layer_type == 'conv1d': - positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = ( - output_size, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, - ) - elif positionwise_layer_type == 'conv1d-linear': - positionwise_layer = Conv1dLinear - positionwise_layer_args = ( - output_size, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, - ) - else: - raise NotImplementedError('Support only linear or conv1d.') - - if selfattention_layer_type == 'selfattn': - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - elif selfattention_layer_type == 'legacy_rel_selfattn': - assert pos_enc_layer_type == 'legacy_rel_pos' - encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - logging.warning( - 'Using legacy_rel_selfattn and it will be deprecated in the future.' - ) - - elif selfattention_layer_type == 'legacy_rel_selfattnsanm': - assert pos_enc_layer_type == 'legacy_rel_pos' - encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttentionSANM - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - logging.warning( - 'Using legacy_rel_selfattn and it will be deprecated in the future.' 
- ) - - elif selfattention_layer_type == 'rel_selfattn': - assert pos_enc_layer_type == 'rel_pos' - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - zero_triu, - ) - elif selfattention_layer_type == 'rel_selfattnsanm': - assert pos_enc_layer_type == 'rel_pos' - encoder_selfattn_layer = RelPositionMultiHeadedAttentionSANM - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - zero_triu, - cnn_module_kernel, - sanm_shfit, - ) - else: - raise ValueError('unknown encoder_attn_layer: ' - + selfattention_layer_type) - - convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation) - - if isinstance(stochastic_depth_rate, float): - stochastic_depth_rate = [stochastic_depth_rate] * num_blocks - - if len(stochastic_depth_rate) != num_blocks: - raise ValueError( - f'Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) ' - f'should be equal to num_blocks ({num_blocks})') - - self.encoders = repeat( - num_blocks, - lambda lnum: EncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer(*positionwise_layer_args) - if macaron_style else None, - convolution_layer(*convolution_layer_args) - if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - stochastic_depth_rate[lnum], - ), - ) - if self.normalize_before: - self.after_norm = LayerNorm(output_size) - - self.interctc_layer_idx = interctc_layer_idx - if len(interctc_layer_idx) > 0: - assert 0 < min(interctc_layer_idx) and max( - interctc_layer_idx) < num_blocks - self.interctc_use_conditioning = interctc_use_conditioning - self.conditioning_layer = None - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs_pad: torch.Tensor, - ilens: torch.Tensor, - prev_states: torch.Tensor = None, - ctc: CTC = None, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - """Calculate forward propagation. - - Args: - xs_pad (torch.Tensor): Input tensor (#batch, L, input_size). - ilens (torch.Tensor): Input length (#batch). - prev_states (torch.Tensor): Not to be used now. - - Returns: - torch.Tensor: Output tensor (#batch, L, output_size). - torch.Tensor: Output length (#batch). - torch.Tensor: Not to be used now. 
- - """ - masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) - - if (isinstance(self.embed, Conv2dSubsampling) - or isinstance(self.embed, Conv2dSubsampling2) - or isinstance(self.embed, Conv2dSubsampling6) - or isinstance(self.embed, Conv2dSubsampling8)): - short_status, limit_size = check_short_utt(self.embed, - xs_pad.size(1)) - if short_status: - raise TooShortUttError( - f'has {xs_pad.size(1)} frames and is too short for subsampling ' - + # noqa: * - f'(it needs more than {limit_size} frames), return empty results', - xs_pad.size(1), - limit_size) # noqa: * - xs_pad, masks = self.embed(xs_pad, masks) - else: - xs_pad = self.embed(xs_pad) - - intermediate_outs = [] - if len(self.interctc_layer_idx) == 0: - xs_pad, masks = self.encoders(xs_pad, masks) - else: - for layer_idx, encoder_layer in enumerate(self.encoders): - xs_pad, masks = encoder_layer(xs_pad, masks) - - if layer_idx + 1 in self.interctc_layer_idx: - encoder_out = xs_pad - if isinstance(encoder_out, tuple): - encoder_out = encoder_out[0] - - # intermediate outputs are also normalized - if self.normalize_before: - encoder_out = self.after_norm(encoder_out) - - intermediate_outs.append((layer_idx + 1, encoder_out)) - - if self.interctc_use_conditioning: - ctc_out = ctc.softmax(encoder_out) - - if isinstance(xs_pad, tuple): - x, pos_emb = xs_pad - x = x + self.conditioning_layer(ctc_out) - xs_pad = (x, pos_emb) - else: - xs_pad = xs_pad + self.conditioning_layer(ctc_out) - - if isinstance(xs_pad, tuple): - xs_pad = xs_pad[0] - if self.normalize_before: - xs_pad = self.after_norm(xs_pad) - - olens = masks.squeeze(1).sum(1) - if len(intermediate_outs) > 0: - return (xs_pad, intermediate_outs), olens, None - return xs_pad, olens, None diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/sanm_encoder.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/sanm_encoder.py deleted file mode 100644 index 92e51b2e..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/encoder/sanm_encoder.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. -"""Transformer encoder definition.""" - -import logging -from typing import List, Optional, Sequence, Tuple, Union - -import torch -from espnet2.asr.ctc import CTC -from espnet2.asr.encoder.abs_encoder import AbsEncoder -from espnet.nets.pytorch_backend.nets_utils import make_pad_mask -from espnet.nets.pytorch_backend.transformer.embedding import \ - PositionalEncoding -from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm -from espnet.nets.pytorch_backend.transformer.multi_layer_conv import ( - Conv1dLinear, MultiLayeredConv1d) -from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import \ - PositionwiseFeedForward # noqa: H301 -from espnet.nets.pytorch_backend.transformer.repeat import repeat -from espnet.nets.pytorch_backend.transformer.subsampling import ( - Conv2dSubsampling, Conv2dSubsampling2, Conv2dSubsampling6, - Conv2dSubsampling8, TooShortUttError, check_short_utt) -from typeguard import check_argument_types - -from ...asr.streaming_utilis.chunk_utilis import overlap_chunk -from ...nets.pytorch_backend.transformer.attention import ( - MultiHeadedAttention, MultiHeadedAttentionSANM) -from ...nets.pytorch_backend.transformer.encoder_layer import ( - EncoderLayer, EncoderLayerChunk) - - -class SANMEncoder(AbsEncoder): - """Transformer encoder module. 
- - Args: - input_size: input dim - output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the number of units of position-wise feed forward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - attention_dropout_rate: dropout rate in attention - positional_dropout_rate: dropout rate after adding positional encoding - input_layer: input layer type - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: whether to use layer_norm before the first block - concat_after: whether to concat attention layer's input and output - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. - i.e. x -> x + att(x) - positionwise_layer_type: linear of conv1d - positionwise_conv_kernel_size: kernel size of positionwise conv1d layer - padding_idx: padding_idx for input_layer=embed - """ - - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: Optional[str] = 'conv2d', - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - positionwise_layer_type: str = 'linear', - positionwise_conv_kernel_size: int = 1, - padding_idx: int = -1, - interctc_layer_idx: List[int] = [], - interctc_use_conditioning: bool = False, - kernel_size: int = 11, - sanm_shfit: int = 0, - selfattention_layer_type: str = 'sanm', - ): - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if input_layer == 'linear': - self.embed = torch.nn.Sequential( - torch.nn.Linear(input_size, output_size), - torch.nn.LayerNorm(output_size), - torch.nn.Dropout(dropout_rate), - torch.nn.ReLU(), - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d': - self.embed = Conv2dSubsampling(input_size, output_size, - dropout_rate) - elif input_layer == 'conv2d2': - self.embed = Conv2dSubsampling2(input_size, output_size, - dropout_rate) - elif input_layer == 'conv2d6': - self.embed = Conv2dSubsampling6(input_size, output_size, - dropout_rate) - elif input_layer == 'conv2d8': - self.embed = Conv2dSubsampling8(input_size, output_size, - dropout_rate) - elif input_layer == 'embed': - self.embed = torch.nn.Sequential( - torch.nn.Embedding( - input_size, output_size, padding_idx=padding_idx), - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer is None: - if input_size == output_size: - self.embed = None - else: - self.embed = torch.nn.Linear(input_size, output_size) - else: - raise ValueError('unknown input_layer: ' + input_layer) - self.normalize_before = normalize_before - if positionwise_layer_type == 'linear': - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - ) - elif positionwise_layer_type == 'conv1d': - positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = ( - output_size, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, - ) - elif positionwise_layer_type == 'conv1d-linear': - positionwise_layer = Conv1dLinear - positionwise_layer_args = ( - output_size, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, - ) - else: - raise NotImplementedError('Support only linear or conv1d.') - - if selfattention_layer_type 
== 'selfattn': - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - elif selfattention_layer_type == 'sanm': - encoder_selfattn_layer = MultiHeadedAttentionSANM - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - kernel_size, - sanm_shfit, - ) - - self.encoders = repeat( - num_blocks, - lambda lnum: EncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - dropout_rate, - normalize_before, - concat_after, - ), - ) - if self.normalize_before: - self.after_norm = LayerNorm(output_size) - - self.interctc_layer_idx = interctc_layer_idx - if len(interctc_layer_idx) > 0: - assert 0 < min(interctc_layer_idx) and max( - interctc_layer_idx) < num_blocks - self.interctc_use_conditioning = interctc_use_conditioning - self.conditioning_layer = None - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs_pad: torch.Tensor, - ilens: torch.Tensor, - prev_states: torch.Tensor = None, - ctc: CTC = None, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - """Embed positions in tensor. - - Args: - xs_pad: input tensor (B, L, D) - ilens: input length (B) - prev_states: Not to be used now. - Returns: - position embedded tensor and mask - """ - masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) - - if self.embed is None: - xs_pad = xs_pad - elif (isinstance(self.embed, Conv2dSubsampling) - or isinstance(self.embed, Conv2dSubsampling2) - or isinstance(self.embed, Conv2dSubsampling6) - or isinstance(self.embed, Conv2dSubsampling8)): - short_status, limit_size = check_short_utt(self.embed, - xs_pad.size(1)) - if short_status: - raise TooShortUttError( - f'has {xs_pad.size(1)} frames and is too short for subsampling ' - + # noqa: * - f'(it needs more than {limit_size} frames), return empty results', - xs_pad.size(1), - limit_size, - ) - xs_pad, masks = self.embed(xs_pad, masks) - else: - xs_pad = self.embed(xs_pad) - - intermediate_outs = [] - if len(self.interctc_layer_idx) == 0: - xs_pad, masks = self.encoders(xs_pad, masks) - else: - for layer_idx, encoder_layer in enumerate(self.encoders): - xs_pad, masks = encoder_layer(xs_pad, masks) - - if layer_idx + 1 in self.interctc_layer_idx: - encoder_out = xs_pad - - # intermediate outputs are also normalized - if self.normalize_before: - encoder_out = self.after_norm(encoder_out) - - intermediate_outs.append((layer_idx + 1, encoder_out)) - - if self.interctc_use_conditioning: - ctc_out = ctc.softmax(encoder_out) - xs_pad = xs_pad + self.conditioning_layer(ctc_out) - - if self.normalize_before: - xs_pad = self.after_norm(xs_pad) - - olens = masks.squeeze(1).sum(1) - if len(intermediate_outs) > 0: - return (xs_pad, intermediate_outs), olens, None - return xs_pad, olens, None - - -class SANMEncoderChunk(AbsEncoder): - """Transformer encoder module. 
- - Args: - input_size: input dim - output_size: dimension of attention - attention_heads: the number of heads of multi head attention - linear_units: the number of units of position-wise feed forward - num_blocks: the number of decoder blocks - dropout_rate: dropout rate - attention_dropout_rate: dropout rate in attention - positional_dropout_rate: dropout rate after adding positional encoding - input_layer: input layer type - pos_enc_class: PositionalEncoding or ScaledPositionalEncoding - normalize_before: whether to use layer_norm before the first block - concat_after: whether to concat attention layer's input and output - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. - i.e. x -> x + att(x) - positionwise_layer_type: linear of conv1d - positionwise_conv_kernel_size: kernel size of positionwise conv1d layer - padding_idx: padding_idx for input_layer=embed - """ - - def __init__( - self, - input_size: int, - output_size: int = 256, - attention_heads: int = 4, - linear_units: int = 2048, - num_blocks: int = 6, - dropout_rate: float = 0.1, - positional_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.0, - input_layer: Optional[str] = 'conv2d', - pos_enc_class=PositionalEncoding, - normalize_before: bool = True, - concat_after: bool = False, - positionwise_layer_type: str = 'linear', - positionwise_conv_kernel_size: int = 1, - padding_idx: int = -1, - interctc_layer_idx: List[int] = [], - interctc_use_conditioning: bool = False, - kernel_size: int = 11, - sanm_shfit: int = 0, - selfattention_layer_type: str = 'sanm', - chunk_size: Union[int, Sequence[int]] = (16, ), - stride: Union[int, Sequence[int]] = (10, ), - pad_left: Union[int, Sequence[int]] = (0, ), - encoder_att_look_back_factor: Union[int, Sequence[int]] = (1, ), - ): - assert check_argument_types() - super().__init__() - self._output_size = output_size - - if input_layer == 'linear': - self.embed = torch.nn.Sequential( - torch.nn.Linear(input_size, output_size), - torch.nn.LayerNorm(output_size), - torch.nn.Dropout(dropout_rate), - torch.nn.ReLU(), - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer == 'conv2d': - self.embed = Conv2dSubsampling(input_size, output_size, - dropout_rate) - elif input_layer == 'conv2d2': - self.embed = Conv2dSubsampling2(input_size, output_size, - dropout_rate) - elif input_layer == 'conv2d6': - self.embed = Conv2dSubsampling6(input_size, output_size, - dropout_rate) - elif input_layer == 'conv2d8': - self.embed = Conv2dSubsampling8(input_size, output_size, - dropout_rate) - elif input_layer == 'embed': - self.embed = torch.nn.Sequential( - torch.nn.Embedding( - input_size, output_size, padding_idx=padding_idx), - pos_enc_class(output_size, positional_dropout_rate), - ) - elif input_layer is None: - if input_size == output_size: - self.embed = None - else: - self.embed = torch.nn.Linear(input_size, output_size) - else: - raise ValueError('unknown input_layer: ' + input_layer) - self.normalize_before = normalize_before - if positionwise_layer_type == 'linear': - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = ( - output_size, - linear_units, - dropout_rate, - ) - elif positionwise_layer_type == 'conv1d': - positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = ( - output_size, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, - ) - elif positionwise_layer_type == 'conv1d-linear': - positionwise_layer = Conv1dLinear - 
positionwise_layer_args = ( - output_size, - linear_units, - positionwise_conv_kernel_size, - dropout_rate, - ) - else: - raise NotImplementedError('Support only linear or conv1d.') - - if selfattention_layer_type == 'selfattn': - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - ) - elif selfattention_layer_type == 'sanm': - encoder_selfattn_layer = MultiHeadedAttentionSANM - encoder_selfattn_layer_args = ( - attention_heads, - output_size, - attention_dropout_rate, - kernel_size, - sanm_shfit, - ) - - self.encoders = repeat( - num_blocks, - lambda lnum: EncoderLayerChunk( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - dropout_rate, - normalize_before, - concat_after, - ), - ) - if self.normalize_before: - self.after_norm = LayerNorm(output_size) - - self.interctc_layer_idx = interctc_layer_idx - if len(interctc_layer_idx) > 0: - assert 0 < min(interctc_layer_idx) and max( - interctc_layer_idx) < num_blocks - self.interctc_use_conditioning = interctc_use_conditioning - self.conditioning_layer = None - shfit_fsmn = (kernel_size - 1) // 2 - self.overlap_chunk_cls = overlap_chunk( - chunk_size=chunk_size, - stride=stride, - pad_left=pad_left, - shfit_fsmn=shfit_fsmn, - encoder_att_look_back_factor=encoder_att_look_back_factor, - ) - - def output_size(self) -> int: - return self._output_size - - def forward( - self, - xs_pad: torch.Tensor, - ilens: torch.Tensor, - prev_states: torch.Tensor = None, - ctc: CTC = None, - ind: int = 0, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - """Embed positions in tensor. - - Args: - xs_pad: input tensor (B, L, D) - ilens: input length (B) - prev_states: Not to be used now. 
- Returns: - position embedded tensor and mask - """ - masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) - - if self.embed is None: - xs_pad = xs_pad - elif (isinstance(self.embed, Conv2dSubsampling) - or isinstance(self.embed, Conv2dSubsampling2) - or isinstance(self.embed, Conv2dSubsampling6) - or isinstance(self.embed, Conv2dSubsampling8)): - short_status, limit_size = check_short_utt(self.embed, - xs_pad.size(1)) - if short_status: - raise TooShortUttError( - f'has {xs_pad.size(1)} frames and is too short for subsampling ' - + # noqa: * - f'(it needs more than {limit_size} frames), return empty results', - xs_pad.size(1), - limit_size, - ) - xs_pad, masks = self.embed(xs_pad, masks) - else: - xs_pad = self.embed(xs_pad) - - mask_shfit_chunk, mask_att_chunk_encoder = None, None - if self.overlap_chunk_cls is not None: - ilens = masks.squeeze(1).sum(1) - chunk_outs = self.overlap_chunk_cls.gen_chunk_mask(ilens, ind) - xs_pad, ilens = self.overlap_chunk_cls.split_chunk( - xs_pad, ilens, chunk_outs=chunk_outs) - masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) - mask_shfit_chunk = self.overlap_chunk_cls.get_mask_shfit_chunk( - chunk_outs, xs_pad.device, xs_pad.size(0), dtype=xs_pad.dtype) - mask_att_chunk_encoder = self.overlap_chunk_cls.get_mask_att_chunk_encoder( - chunk_outs, xs_pad.device, xs_pad.size(0), dtype=xs_pad.dtype) - - intermediate_outs = [] - if len(self.interctc_layer_idx) == 0: - xs_pad, masks, _, _, _ = self.encoders(xs_pad, masks, None, - mask_shfit_chunk, - mask_att_chunk_encoder) - else: - for layer_idx, encoder_layer in enumerate(self.encoders): - xs_pad, masks, _, _, _ = encoder_layer(xs_pad, masks, None, - mask_shfit_chunk, - mask_att_chunk_encoder) - - if layer_idx + 1 in self.interctc_layer_idx: - encoder_out = xs_pad - - # intermediate outputs are also normalized - if self.normalize_before: - encoder_out = self.after_norm(encoder_out) - - intermediate_outs.append((layer_idx + 1, encoder_out)) - - if self.interctc_use_conditioning: - ctc_out = ctc.softmax(encoder_out) - xs_pad = xs_pad + self.conditioning_layer(ctc_out) - - if self.normalize_before: - xs_pad = self.after_norm(xs_pad) - - if self.overlap_chunk_cls is not None: - xs_pad, olens = self.overlap_chunk_cls.remove_chunk( - xs_pad, ilens, chunk_outs) - if len(intermediate_outs) > 0: - return (xs_pad, intermediate_outs), olens, None - return xs_pad, olens, None diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/espnet_model.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/espnet_model.py deleted file mode 100644 index 6f5b3688..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/espnet_model.py +++ /dev/null @@ -1,1131 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. 
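# [Editor's note] The file removed below held ESPnetASRModel and AEDStreaming, both
# CTC-attention hybrid encoder-decoder models. Their training objective, as written in the
# deleted forward(), interpolates the two branch losses:
#     loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att
# with the pure-CTC and pure-attention cases handled separately. The sketch below restates
# just that interpolation for reference; the function name and example tensors are
# illustrative and are not part of the patch.

import torch

def hybrid_ctc_att_loss(loss_ctc: torch.Tensor, loss_att: torch.Tensor,
                        ctc_weight: float) -> torch.Tensor:
    # Mirrors the branch structure of the deleted forward().
    if ctc_weight == 0.0:
        return loss_att          # attention-only training
    if ctc_weight == 1.0:
        return loss_ctc          # CTC-only training (decoder unused)
    return ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att

# Example: ctc_weight=0.3, loss_ctc=3.2, loss_att=1.1 -> 0.3*3.2 + 0.7*1.1 = 1.73
print(hybrid_ctc_att_loss(torch.tensor(3.2), torch.tensor(1.1), 0.3))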
-import logging -from contextlib import contextmanager -from distutils.version import LooseVersion -from typing import Dict, List, Optional, Tuple, Union - -import torch -from espnet2.asr.ctc import CTC -from espnet2.asr.decoder.abs_decoder import AbsDecoder -from espnet2.asr.encoder.abs_encoder import AbsEncoder -from espnet2.asr.frontend.abs_frontend import AbsFrontend -from espnet2.asr.postencoder.abs_postencoder import AbsPostEncoder -from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder -from espnet2.asr.specaug.abs_specaug import AbsSpecAug -from espnet2.asr.transducer.error_calculator import ErrorCalculatorTransducer -from espnet2.asr.transducer.utils import get_transducer_task_io -from espnet2.layers.abs_normalize import AbsNormalize -from espnet2.torch_utils.device_funcs import force_gatherable -from espnet2.train.abs_espnet_model import AbsESPnetModel -from espnet.nets.e2e_asr_common import ErrorCalculator -from espnet.nets.pytorch_backend.nets_utils import th_accuracy -from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos -from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import \ - LabelSmoothingLoss # noqa: H301 -from typeguard import check_argument_types - -from .streaming_utilis.chunk_utilis import sequence_mask - -if LooseVersion(torch.__version__) >= LooseVersion('1.6.0'): - from torch.cuda.amp import autocast -else: - # Nothing to do if torch<1.6.0 - @contextmanager - def autocast(enabled=True): - yield - - -class ESPnetASRModel(AbsESPnetModel): - """CTC-attention hybrid Encoder-Decoder model""" - - def __init__( - self, - vocab_size: int, - token_list: Union[Tuple[str, ...], List[str]], - frontend: Optional[AbsFrontend], - specaug: Optional[AbsSpecAug], - normalize: Optional[AbsNormalize], - preencoder: Optional[AbsPreEncoder], - encoder: AbsEncoder, - postencoder: Optional[AbsPostEncoder], - decoder: AbsDecoder, - ctc: CTC, - joint_network: Optional[torch.nn.Module], - ctc_weight: float = 0.5, - interctc_weight: float = 0.0, - ignore_id: int = -1, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - report_cer: bool = True, - report_wer: bool = True, - sym_space: str = '', - sym_blank: str = '', - extract_feats_in_collect_stats: bool = True, - ): - assert check_argument_types() - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - assert 0.0 <= interctc_weight < 1.0, interctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.blank_id = 0 - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.interctc_weight = interctc_weight - self.token_list = token_list.copy() - - self.frontend = frontend - self.specaug = specaug - self.normalize = normalize - self.preencoder = preencoder - self.postencoder = postencoder - self.encoder = encoder - - if not hasattr(self.encoder, 'interctc_use_conditioning'): - self.encoder.interctc_use_conditioning = False - if self.encoder.interctc_use_conditioning: - self.encoder.conditioning_layer = torch.nn.Linear( - vocab_size, self.encoder.output_size()) - - self.use_transducer_decoder = joint_network is not None - - self.error_calculator = None - - if self.use_transducer_decoder: - # from warprnnt_pytorch import RNNTLoss - from warp_rnnt import rnnt_loss as RNNTLoss - - self.decoder = decoder - self.joint_network = joint_network - - self.criterion_transducer = RNNTLoss - - if report_cer or report_wer: - self.error_calculator_trans = 
ErrorCalculatorTransducer( - decoder, - joint_network, - token_list, - sym_space, - sym_blank, - report_cer=report_cer, - report_wer=report_wer, - ) - else: - self.error_calculator_trans = None - - if self.ctc_weight != 0: - self.error_calculator = ErrorCalculator( - token_list, sym_space, sym_blank, report_cer, - report_wer) - else: - # we set self.decoder = None in the CTC mode since - # self.decoder parameters were never used and PyTorch complained - # and threw an Exception in the multi-GPU experiment. - # thanks Jeff Farris for pointing out the issue. - if ctc_weight == 1.0: - self.decoder = None - else: - self.decoder = decoder - - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - if report_cer or report_wer: - self.error_calculator = ErrorCalculator( - token_list, sym_space, sym_blank, report_cer, report_wer) - - if ctc_weight == 0.0: - self.ctc = None - else: - self.ctc = ctc - - self.extract_feats_in_collect_stats = extract_feats_in_collect_stats - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] - == # noqa: * - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - batch_size = speech.shape[0] - - # for data-parallel - text = text[:, :text_lengths.max()] - - # 1. Encoder - encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) - intermediate_outs = None - if isinstance(encoder_out, tuple): - intermediate_outs = encoder_out[1] - encoder_out = encoder_out[0] - - loss_att, acc_att, cer_att, wer_att = None, None, None, None - loss_ctc, cer_ctc = None, None - loss_transducer, cer_transducer, wer_transducer = None, None, None - stats = dict() - - # 1. CTC branch - if self.ctc_weight != 0.0: - loss_ctc, cer_ctc = self._calc_ctc_loss(encoder_out, - encoder_out_lens, text, - text_lengths) - - # Collect CTC branch stats - stats['loss_ctc'] = loss_ctc.detach( - ) if loss_ctc is not None else None - stats['cer_ctc'] = cer_ctc - - # Intermediate CTC (optional) - loss_interctc = 0.0 - if self.interctc_weight != 0.0 and intermediate_outs is not None: - for layer_idx, intermediate_out in intermediate_outs: - # we assume intermediate_out has the same length & padding - # as those of encoder_out - loss_ic, cer_ic = self._calc_ctc_loss(intermediate_out, - encoder_out_lens, text, - text_lengths) - loss_interctc = loss_interctc + loss_ic - - # Collect Intermedaite CTC stats - stats['loss_interctc_layer{}'.format(layer_idx)] = ( - loss_ic.detach() if loss_ic is not None else None) - stats['cer_interctc_layer{}'.format(layer_idx)] = cer_ic - - loss_interctc = loss_interctc / len(intermediate_outs) - - # calculate whole encoder loss - loss_ctc = (1 - self.interctc_weight - ) * loss_ctc + self.interctc_weight * loss_interctc - - if self.use_transducer_decoder: - # 2a. 
Transducer decoder branch - ( - loss_transducer, - cer_transducer, - wer_transducer, - ) = self._calc_transducer_loss( - encoder_out, - encoder_out_lens, - text, - ) - - if loss_ctc is not None: - loss = loss_transducer + (self.ctc_weight * loss_ctc) - else: - loss = loss_transducer - - # Collect Transducer branch stats - stats['loss_transducer'] = ( - loss_transducer.detach() - if loss_transducer is not None else None) - stats['cer_transducer'] = cer_transducer - stats['wer_transducer'] = wer_transducer - - else: - # 2b. Attention decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att, cer_att, wer_att = self._calc_att_loss( - encoder_out, encoder_out_lens, text, text_lengths) - - # 3. CTC-Att loss definition - if self.ctc_weight == 0.0: - loss = loss_att - elif self.ctc_weight == 1.0: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + ( - 1 - self.ctc_weight) * loss_att - - # Collect Attn branch stats - stats['loss_att'] = loss_att.detach( - ) if loss_att is not None else None - stats['acc'] = acc_att - stats['cer'] = cer_att - stats['wer'] = wer_att - - # Collect total loss stats - # TODO(wjm): needed to be checked - # TODO(wjm): same problem: https://github.com/espnet/espnet/issues/4136 - # FIXME(wjm): for logger error when accum_grad > 1 - # stats["loss"] = loss.detach() - stats['loss'] = torch.clone(loss.detach()) - - # force_gatherable: to-device and to-tensor if scalar for DataParallel - loss, stats, weight = force_gatherable((loss, stats, batch_size), - loss.device) - return loss, stats, weight - - def collect_feats( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, torch.Tensor]: - if self.extract_feats_in_collect_stats: - feats, feats_lengths = self._extract_feats(speech, speech_lengths) - else: - # Generate dummy stats if extract_feats_in_collect_stats is False - logging.warning( - 'Generating dummy stats for feats and feats_lengths, ' - 'because encoder_conf.extract_feats_in_collect_stats is ' - f'{self.extract_feats_in_collect_stats}') - feats, feats_lengths = speech, speech_lengths - return {'feats': feats, 'feats_lengths': feats_lengths} - - def encode( - self, speech: torch.Tensor, - speech_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Frontend + Encoder. Note that this method is used by asr_inference.py - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - """ - with autocast(False): - # 1. Extract feats - feats, feats_lengths = self._extract_feats(speech, speech_lengths) - - # 2. Data augmentation - if self.specaug is not None and self.training: - feats, feats_lengths = self.specaug(feats, feats_lengths) - - # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN - if self.normalize is not None: - feats, feats_lengths = self.normalize(feats, feats_lengths) - - # Pre-encoder, e.g. used for raw input data - if self.preencoder is not None: - feats, feats_lengths = self.preencoder(feats, feats_lengths) - - # 4. Forward encoder - # feats: (Batch, Length, Dim) - # -> encoder_out: (Batch, Length2, Dim2) - if self.encoder.interctc_use_conditioning: - encoder_out, encoder_out_lens, _ = self.encoder( - feats, feats_lengths, ctc=self.ctc) - else: - encoder_out, encoder_out_lens, _ = self.encoder( - feats, feats_lengths) - intermediate_outs = None - if isinstance(encoder_out, tuple): - intermediate_outs = encoder_out[1] - encoder_out = encoder_out[0] - - # Post-encoder, e.g. 
NLU - if self.postencoder is not None: - encoder_out, encoder_out_lens = self.postencoder( - encoder_out, encoder_out_lens) - - assert encoder_out.size(0) == speech.size(0), ( - encoder_out.size(), - speech.size(0), - ) - assert encoder_out.size(1) <= encoder_out_lens.max(), ( - encoder_out.size(), - encoder_out_lens.max(), - ) - - if intermediate_outs is not None: - return (encoder_out, intermediate_outs), encoder_out_lens - - return encoder_out, encoder_out_lens - - def _extract_feats( - self, speech: torch.Tensor, - speech_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - assert speech_lengths.dim() == 1, speech_lengths.shape - - # for data-parallel - speech = speech[:, :speech_lengths.max()] - - if self.frontend is not None: - # Frontend - # e.g. STFT and Feature extract - # data_loader may send time-domain signal in this case - # speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim) - feats, feats_lengths = self.frontend(speech, speech_lengths) - else: - # No frontend and no feature extract - feats, feats_lengths = speech, speech_lengths - return feats, feats_lengths - - def nll( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> torch.Tensor: - """Compute negative log likelihood(nll) from transformer-decoder - - Normally, this function is called in batchify_nll. - - Args: - encoder_out: (Batch, Length, Dim) - encoder_out_lens: (Batch,) - ys_pad: (Batch, Length) - ys_pad_lens: (Batch,) - """ - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) # [batch, seqlen, dim] - batch_size = decoder_out.size(0) - decoder_num_class = decoder_out.size(2) - # nll: negative log-likelihood - nll = torch.nn.functional.cross_entropy( - decoder_out.view(-1, decoder_num_class), - ys_out_pad.view(-1), - ignore_index=self.ignore_id, - reduction='none', - ) - nll = nll.view(batch_size, -1) - nll = nll.sum(dim=1) - assert nll.size(0) == batch_size - return nll - - def batchify_nll( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - batch_size: int = 100, - ): - """Compute negative log likelihood(nll) from transformer-decoder - - To avoid OOM, this fuction seperate the input into batches. - Then call nll for each batch and combine and return results. 
- Args: - encoder_out: (Batch, Length, Dim) - encoder_out_lens: (Batch,) - ys_pad: (Batch, Length) - ys_pad_lens: (Batch,) - batch_size: int, samples each batch contain when computing nll, - you may change this to avoid OOM or increase - GPU memory usage - """ - total_num = encoder_out.size(0) - if total_num <= batch_size: - nll = self.nll(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) - else: - nll = [] - start_idx = 0 - while True: - end_idx = min(start_idx + batch_size, total_num) - batch_encoder_out = encoder_out[start_idx:end_idx, :, :] - batch_encoder_out_lens = encoder_out_lens[start_idx:end_idx] - batch_ys_pad = ys_pad[start_idx:end_idx, :] - batch_ys_pad_lens = ys_pad_lens[start_idx:end_idx] - batch_nll = self.nll( - batch_encoder_out, - batch_encoder_out_lens, - batch_ys_pad, - batch_ys_pad_lens, - ) - nll.append(batch_nll) - start_idx = end_idx - if start_idx == total_num: - break - nll = torch.cat(nll) - assert nll.size(0) == total_num - return nll - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) - - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - - # Compute cer/wer using attention-decoder - if self.training or self.error_calculator is None: - cer_att, wer_att = None, None - else: - ys_hat = decoder_out.argmax(dim=-1) - cer_att, wer_att = self.error_calculator(ys_hat.cpu(), - ys_pad.cpu()) - - return loss_att, acc_att, cer_att, wer_att - - def _calc_ctc_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - # Calc CTC loss - loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) - - # Calc CER using CTC - cer_ctc = None - if not self.training and self.error_calculator is not None: - ys_hat = self.ctc.argmax(encoder_out).data - cer_ctc = self.error_calculator( - ys_hat.cpu(), ys_pad.cpu(), is_ctc=True) - return loss_ctc, cer_ctc - - def _calc_transducer_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - labels: torch.Tensor, - ): - """Compute Transducer loss. - - Args: - encoder_out: Encoder output sequences. (B, T, D_enc) - encoder_out_lens: Encoder output sequences lengths. (B,) - labels: Label ID sequences. (B, L) - - Return: - loss_transducer: Transducer loss value. - cer_transducer: Character error rate for Transducer. - wer_transducer: Word Error Rate for Transducer. 
- - """ - decoder_in, target, t_len, u_len = get_transducer_task_io( - labels, - encoder_out_lens, - ignore_id=self.ignore_id, - blank_id=self.blank_id, - ) - - self.decoder.set_device(encoder_out.device) - decoder_out = self.decoder(decoder_in) - - joint_out = self.joint_network( - encoder_out.unsqueeze(2), decoder_out.unsqueeze(1)) - - loss_transducer = self.criterion_transducer( - joint_out, - target, - t_len, - u_len, - reduction='sum', - ) - - cer_transducer, wer_transducer = None, None - if not self.training and self.error_calculator_trans is not None: - cer_transducer, wer_transducer = self.error_calculator_trans( - encoder_out, target) - - return loss_transducer, cer_transducer, wer_transducer - - -class AEDStreaming(AbsESPnetModel): - """CTC-attention hybrid Encoder-Decoder model""" - - def __init__( - self, - vocab_size: int, - token_list: Union[Tuple[str, ...], List[str]], - frontend: Optional[AbsFrontend], - specaug: Optional[AbsSpecAug], - normalize: Optional[AbsNormalize], - preencoder: Optional[AbsPreEncoder], - encoder: AbsEncoder, - postencoder: Optional[AbsPostEncoder], - decoder: AbsDecoder, - ctc: CTC, - joint_network: Optional[torch.nn.Module], - ctc_weight: float = 0.5, - interctc_weight: float = 0.0, - ignore_id: int = -1, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - report_cer: bool = True, - report_wer: bool = True, - sym_space: str = '', - sym_blank: str = '', - extract_feats_in_collect_stats: bool = True, - predictor=None, - predictor_weight: float = 0.0, - ): - assert check_argument_types() - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - assert 0.0 <= interctc_weight < 1.0, interctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.blank_id = 0 - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.interctc_weight = interctc_weight - self.token_list = token_list.copy() - - self.frontend = frontend - self.specaug = specaug - self.normalize = normalize - self.preencoder = preencoder - self.postencoder = postencoder - self.encoder = encoder - - if not hasattr(self.encoder, 'interctc_use_conditioning'): - self.encoder.interctc_use_conditioning = False - if self.encoder.interctc_use_conditioning: - self.encoder.conditioning_layer = torch.nn.Linear( - vocab_size, self.encoder.output_size()) - - self.use_transducer_decoder = joint_network is not None - - self.error_calculator = None - - if self.use_transducer_decoder: - # from warprnnt_pytorch import RNNTLoss - from warp_rnnt import rnnt_loss as RNNTLoss - - self.decoder = decoder - self.joint_network = joint_network - - self.criterion_transducer = RNNTLoss - - if report_cer or report_wer: - self.error_calculator_trans = ErrorCalculatorTransducer( - decoder, - joint_network, - token_list, - sym_space, - sym_blank, - report_cer=report_cer, - report_wer=report_wer, - ) - else: - self.error_calculator_trans = None - - if self.ctc_weight != 0: - self.error_calculator = ErrorCalculator( - token_list, sym_space, sym_blank, report_cer, - report_wer) - else: - # we set self.decoder = None in the CTC mode since - # self.decoder parameters were never used and PyTorch complained - # and threw an Exception in the multi-GPU experiment. - # thanks Jeff Farris for pointing out the issue. 
- if ctc_weight == 1.0: - self.decoder = None - else: - self.decoder = decoder - - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - if report_cer or report_wer: - self.error_calculator = ErrorCalculator( - token_list, sym_space, sym_blank, report_cer, report_wer) - - if ctc_weight == 0.0: - self.ctc = None - else: - self.ctc = ctc - - self.extract_feats_in_collect_stats = extract_feats_in_collect_stats - self.predictor = predictor - self.predictor_weight = predictor_weight - self.criterion_pre = torch.nn.L1Loss() - self.step_cur = 0 - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] - == # noqa: * - text_lengths.shape[0]), (speech.shape, speech_lengths.shape, - text.shape, text_lengths.shape) - batch_size = speech.shape[0] - - # for data-parallel - text = text[:, :text_lengths.max()] - - # 1. Encoder - encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) - intermediate_outs = None - if isinstance(encoder_out, tuple): - intermediate_outs = encoder_out[1] - encoder_out = encoder_out[0] - - loss_att, acc_att, cer_att, wer_att = None, None, None, None - loss_ctc, cer_ctc = None, None - loss_transducer, cer_transducer, wer_transducer = None, None, None - stats = dict() - - # 1. CTC branch - if self.ctc_weight != 0.0: - loss_ctc, cer_ctc = self._calc_ctc_loss(encoder_out, - encoder_out_lens, text, - text_lengths) - - # Collect CTC branch stats - stats['loss_ctc'] = loss_ctc.detach( - ) if loss_ctc is not None else None - stats['cer_ctc'] = cer_ctc - - # Intermediate CTC (optional) - loss_interctc = 0.0 - if self.interctc_weight != 0.0 and intermediate_outs is not None: - for layer_idx, intermediate_out in intermediate_outs: - # we assume intermediate_out has the same length & padding - # as those of encoder_out - loss_ic, cer_ic = self._calc_ctc_loss(intermediate_out, - encoder_out_lens, text, - text_lengths) - loss_interctc = loss_interctc + loss_ic - - # Collect Intermedaite CTC stats - stats['loss_interctc_layer{}'.format(layer_idx)] = ( - loss_ic.detach() if loss_ic is not None else None) - stats['cer_interctc_layer{}'.format(layer_idx)] = cer_ic - - loss_interctc = loss_interctc / len(intermediate_outs) - - # calculate whole encoder loss - loss_ctc = (1 - self.interctc_weight - ) * loss_ctc + self.interctc_weight * loss_interctc - - if self.use_transducer_decoder: - # 2a. Transducer decoder branch - ( - loss_transducer, - cer_transducer, - wer_transducer, - ) = self._calc_transducer_loss( - encoder_out, - encoder_out_lens, - text, - ) - - if loss_ctc is not None: - loss = loss_transducer + (self.ctc_weight * loss_ctc) - else: - loss = loss_transducer - - # Collect Transducer branch stats - stats['loss_transducer'] = ( - loss_transducer.detach() - if loss_transducer is not None else None) - stats['cer_transducer'] = cer_transducer - stats['wer_transducer'] = wer_transducer - - else: - # 2b. 
Attention decoder branch - if self.ctc_weight != 1.0: - loss_att, acc_att, cer_att, wer_att = self._calc_att_loss( - encoder_out, encoder_out_lens, text, text_lengths) - - # 3. CTC-Att loss definition - if self.ctc_weight == 0.0: - loss = loss_att - elif self.ctc_weight == 1.0: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + ( - 1 - self.ctc_weight) * loss_att - - # Collect Attn branch stats - stats['loss_att'] = loss_att.detach( - ) if loss_att is not None else None - stats['acc'] = acc_att - stats['cer'] = cer_att - stats['wer'] = wer_att - - # Collect total loss stats - # TODO(wjm): needed to be checked - # TODO(wjm): same problem: https://github.com/espnet/espnet/issues/4136 - # FIXME(wjm): for logger error when accum_grad > 1 - # stats["loss"] = loss.detach() - stats['loss'] = torch.clone(loss.detach()) - - # force_gatherable: to-device and to-tensor if scalar for DataParallel - loss, stats, weight = force_gatherable((loss, stats, batch_size), - loss.device) - return loss, stats, weight - - def collect_feats( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, torch.Tensor]: - if self.extract_feats_in_collect_stats: - feats, feats_lengths = self._extract_feats(speech, speech_lengths) - else: - # Generate dummy stats if extract_feats_in_collect_stats is False - logging.warning( - 'Generating dummy stats for feats and feats_lengths, ' - 'because encoder_conf.extract_feats_in_collect_stats is ' - f'{self.extract_feats_in_collect_stats}') - feats, feats_lengths = speech, speech_lengths - return {'feats': feats, 'feats_lengths': feats_lengths} - - def encode( - self, speech: torch.Tensor, - speech_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Frontend + Encoder. Note that this method is used by asr_inference.py - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - """ - with autocast(False): - # 1. Extract feats - feats, feats_lengths = self._extract_feats(speech, speech_lengths) - - # 2. Data augmentation - if self.specaug is not None and self.training: - feats, feats_lengths = self.specaug(feats, feats_lengths) - - # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN - if self.normalize is not None: - feats, feats_lengths = self.normalize(feats, feats_lengths) - - # Pre-encoder, e.g. used for raw input data - if self.preencoder is not None: - feats, feats_lengths = self.preencoder(feats, feats_lengths) - - # 4. Forward encoder - # feats: (Batch, Length, Dim) - # -> encoder_out: (Batch, Length2, Dim2) - if self.encoder.interctc_use_conditioning: - encoder_out, encoder_out_lens, _ = self.encoder( - feats, feats_lengths, ctc=self.ctc) - else: - encoder_out, encoder_out_lens, _ = self.encoder( - feats, feats_lengths) - intermediate_outs = None - if isinstance(encoder_out, tuple): - intermediate_outs = encoder_out[1] - encoder_out = encoder_out[0] - - # Post-encoder, e.g. 
NLU - if self.postencoder is not None: - encoder_out, encoder_out_lens = self.postencoder( - encoder_out, encoder_out_lens) - - assert encoder_out.size(0) == speech.size(0), ( - encoder_out.size(), - speech.size(0), - ) - assert encoder_out.size(1) <= encoder_out_lens.max(), ( - encoder_out.size(), - encoder_out_lens.max(), - ) - - if intermediate_outs is not None: - return (encoder_out, intermediate_outs), encoder_out_lens - - return encoder_out, encoder_out_lens - - def _extract_feats( - self, speech: torch.Tensor, - speech_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - assert speech_lengths.dim() == 1, speech_lengths.shape - - # for data-parallel - speech = speech[:, :speech_lengths.max()] - - if self.frontend is not None: - # Frontend - # e.g. STFT and Feature extract - # data_loader may send time-domain signal in this case - # speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim) - feats, feats_lengths = self.frontend(speech, speech_lengths) - else: - # No frontend and no feature extract - feats, feats_lengths = speech, speech_lengths - return feats, feats_lengths - - def nll( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> torch.Tensor: - """Compute negative log likelihood(nll) from transformer-decoder - - Normally, this function is called in batchify_nll. - - Args: - encoder_out: (Batch, Length, Dim) - encoder_out_lens: (Batch,) - ys_pad: (Batch, Length) - ys_pad_lens: (Batch,) - """ - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) # [batch, seqlen, dim] - batch_size = decoder_out.size(0) - decoder_num_class = decoder_out.size(2) - # nll: negative log-likelihood - nll = torch.nn.functional.cross_entropy( - decoder_out.view(-1, decoder_num_class), - ys_out_pad.view(-1), - ignore_index=self.ignore_id, - reduction='none', - ) - nll = nll.view(batch_size, -1) - nll = nll.sum(dim=1) - assert nll.size(0) == batch_size - return nll - - def batchify_nll( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - batch_size: int = 100, - ): - """Compute negative log likelihood(nll) from transformer-decoder - - To avoid OOM, this fuction seperate the input into batches. - Then call nll for each batch and combine and return results. 
- Args: - encoder_out: (Batch, Length, Dim) - encoder_out_lens: (Batch,) - ys_pad: (Batch, Length) - ys_pad_lens: (Batch,) - batch_size: int, samples each batch contain when computing nll, - you may change this to avoid OOM or increase - GPU memory usage - """ - total_num = encoder_out.size(0) - if total_num <= batch_size: - nll = self.nll(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) - else: - nll = [] - start_idx = 0 - while True: - end_idx = min(start_idx + batch_size, total_num) - batch_encoder_out = encoder_out[start_idx:end_idx, :, :] - batch_encoder_out_lens = encoder_out_lens[start_idx:end_idx] - batch_ys_pad = ys_pad[start_idx:end_idx, :] - batch_ys_pad_lens = ys_pad_lens[start_idx:end_idx] - batch_nll = self.nll( - batch_encoder_out, - batch_encoder_out_lens, - batch_ys_pad, - batch_ys_pad_lens, - ) - nll.append(batch_nll) - start_idx = end_idx - if start_idx == total_num: - break - nll = torch.cat(nll) - assert nll.size(0) == total_num - return nll - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) - - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - - # Compute cer/wer using attention-decoder - if self.training or self.error_calculator is None: - cer_att, wer_att = None, None - else: - ys_hat = decoder_out.argmax(dim=-1) - cer_att, wer_att = self.error_calculator(ys_hat.cpu(), - ys_pad.cpu()) - - return loss_att, acc_att, cer_att, wer_att - - def _calc_att_predictor_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - encoder_out_mask = sequence_mask( - encoder_out_lens, - maxlen=encoder_out.size(1), - dtype=encoder_out.dtype, - device=encoder_out.device)[:, None, :] - # logging.info( - # "encoder_out_mask size: {}".format(encoder_out_mask.size())) - pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor( - encoder_out, - ys_out_pad, - encoder_out_mask, - ignore_id=self.ignore_id, - target_label_length=ys_in_lens) - - # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) - - # 2. 
Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - - # Compute cer/wer using attention-decoder - if self.training or self.error_calculator is None: - cer_att, wer_att = None, None - else: - ys_hat = decoder_out.argmax(dim=-1) - cer_att, wer_att = self.error_calculator(ys_hat.cpu(), - ys_pad.cpu()) - - return loss_att, acc_att, cer_att, wer_att - - def _calc_ctc_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - # Calc CTC loss - loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) - - # Calc CER using CTC - cer_ctc = None - if not self.training and self.error_calculator is not None: - ys_hat = self.ctc.argmax(encoder_out).data - cer_ctc = self.error_calculator( - ys_hat.cpu(), ys_pad.cpu(), is_ctc=True) - return loss_ctc, cer_ctc - - def _calc_transducer_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - labels: torch.Tensor, - ): - """Compute Transducer loss. - - Args: - encoder_out: Encoder output sequences. (B, T, D_enc) - encoder_out_lens: Encoder output sequences lengths. (B,) - labels: Label ID sequences. (B, L) - - Return: - loss_transducer: Transducer loss value. - cer_transducer: Character error rate for Transducer. - wer_transducer: Word Error Rate for Transducer. - - """ - decoder_in, target, t_len, u_len = get_transducer_task_io( - labels, - encoder_out_lens, - ignore_id=self.ignore_id, - blank_id=self.blank_id, - ) - - self.decoder.set_device(encoder_out.device) - decoder_out = self.decoder(decoder_in) - - joint_out = self.joint_network( - encoder_out.unsqueeze(2), decoder_out.unsqueeze(1)) - - loss_transducer = self.criterion_transducer( - joint_out, - target, - t_len, - u_len, - reduction='sum', - ) - - cer_transducer, wer_transducer = None, None - if not self.training and self.error_calculator_trans is not None: - cer_transducer, wer_transducer = self.error_calculator_trans( - encoder_out, target) - - return loss_transducer, cer_transducer, wer_transducer diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/espnet_model_paraformer.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/espnet_model_paraformer.py deleted file mode 100644 index 9b3ac624..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/espnet_model_paraformer.py +++ /dev/null @@ -1,1444 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. 
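Both the ESPnet model deleted above and the Paraformer model deleted below carry the same batchify_nll helper described in their docstrings: split the batch into pieces of at most batch_size utterances, score each piece, and concatenate the per-utterance negative log-likelihoods. A minimal standalone sketch of that pattern, with an externally supplied nll_fn standing in for the model's nll method (the function and argument names here are illustrative, not part of the removed API):

import torch

def batchify_nll_sketch(encoder_out: torch.Tensor,
                        encoder_out_lens: torch.Tensor,
                        ys_pad: torch.Tensor,
                        ys_pad_lens: torch.Tensor,
                        nll_fn,
                        batch_size: int = 100) -> torch.Tensor:
    """Chunk-and-concatenate scoring, mirroring the deleted batchify_nll."""
    total_num = encoder_out.size(0)
    if total_num <= batch_size:
        return nll_fn(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
    nll_chunks = []
    for start_idx in range(0, total_num, batch_size):
        end_idx = min(start_idx + batch_size, total_num)
        # Score one slice of the batch at a time to bound peak GPU memory.
        nll_chunks.append(
            nll_fn(encoder_out[start_idx:end_idx],
                   encoder_out_lens[start_idx:end_idx],
                   ys_pad[start_idx:end_idx],
                   ys_pad_lens[start_idx:end_idx]))
    nll = torch.cat(nll_chunks)
    assert nll.size(0) == total_num
    return nll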
-import logging -from contextlib import contextmanager -from distutils.version import LooseVersion -from typing import Dict, List, Optional, Tuple, Union - -import torch -from espnet2.asr.ctc import CTC -from espnet2.asr.decoder.abs_decoder import AbsDecoder -from espnet2.asr.encoder.abs_encoder import AbsEncoder -from espnet2.asr.espnet_model import ESPnetASRModel -from espnet2.asr.frontend.abs_frontend import AbsFrontend -from espnet2.asr.postencoder.abs_postencoder import AbsPostEncoder -from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder -from espnet2.asr.specaug.abs_specaug import AbsSpecAug -from espnet2.asr.transducer.error_calculator import ErrorCalculatorTransducer -from espnet2.asr.transducer.utils import get_transducer_task_io -from espnet2.layers.abs_normalize import AbsNormalize -from espnet2.torch_utils.device_funcs import force_gatherable -from espnet2.train.abs_espnet_model import AbsESPnetModel -from espnet.nets.e2e_asr_common import ErrorCalculator -from espnet.nets.pytorch_backend.nets_utils import make_pad_mask, th_accuracy -from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos -from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import \ - LabelSmoothingLoss # noqa: H301 -from typeguard import check_argument_types - -from ...espnet.nets.pytorch_backend.cif_utils.cif import \ - CIF_Model as cif_predictor - -if LooseVersion(torch.__version__) >= LooseVersion('1.6.0'): - from torch.cuda.amp import autocast -else: - # Nothing to do if torch<1.6.0 - @contextmanager - def autocast(enabled=True): - yield - - -class Paraformer(AbsESPnetModel): - """CTC-attention hybrid Encoder-Decoder model""" - - def __init__( - self, - vocab_size: int, - token_list: Union[Tuple[str, ...], List[str]], - frontend: Optional[AbsFrontend], - specaug: Optional[AbsSpecAug], - normalize: Optional[AbsNormalize], - preencoder: Optional[AbsPreEncoder], - encoder: AbsEncoder, - postencoder: Optional[AbsPostEncoder], - decoder: AbsDecoder, - ctc: CTC, - joint_network: Optional[torch.nn.Module], - ctc_weight: float = 0.5, - interctc_weight: float = 0.0, - ignore_id: int = -1, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - report_cer: bool = True, - report_wer: bool = True, - sym_space: str = '', - sym_blank: str = '', - extract_feats_in_collect_stats: bool = True, - predictor=None, - predictor_weight: float = 0.0, - glat_context_p: float = 0.2, - ): - assert check_argument_types() - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - assert 0.0 <= interctc_weight < 1.0, interctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.blank_id = 0 - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.interctc_weight = interctc_weight - self.token_list = token_list.copy() - - self.frontend = frontend - self.specaug = specaug - self.normalize = normalize - self.preencoder = preencoder - self.postencoder = postencoder - self.encoder = encoder - - if not hasattr(self.encoder, 'interctc_use_conditioning'): - self.encoder.interctc_use_conditioning = False - if self.encoder.interctc_use_conditioning: - self.encoder.conditioning_layer = torch.nn.Linear( - vocab_size, self.encoder.output_size()) - - self.use_transducer_decoder = joint_network is not None - - self.error_calculator = None - - if self.use_transducer_decoder: - # from warprnnt_pytorch import RNNTLoss - from warp_rnnt import rnnt_loss as RNNTLoss - - 
self.decoder = decoder - self.joint_network = joint_network - - self.criterion_transducer = RNNTLoss - - if report_cer or report_wer: - self.error_calculator_trans = ErrorCalculatorTransducer( - decoder, - joint_network, - token_list, - sym_space, - sym_blank, - report_cer=report_cer, - report_wer=report_wer, - ) - else: - self.error_calculator_trans = None - - if self.ctc_weight != 0: - self.error_calculator = ErrorCalculator( - token_list, sym_space, sym_blank, report_cer, - report_wer) - else: - # we set self.decoder = None in the CTC mode since - # self.decoder parameters were never used and PyTorch complained - # and threw an Exception in the multi-GPU experiment. - # thanks Jeff Farris for pointing out the issue. - if ctc_weight == 1.0: - self.decoder = None - else: - self.decoder = decoder - - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - if report_cer or report_wer: - self.error_calculator = ErrorCalculator( - token_list, sym_space, sym_blank, report_cer, report_wer) - - if ctc_weight == 0.0: - self.ctc = None - else: - self.ctc = ctc - - self.extract_feats_in_collect_stats = extract_feats_in_collect_stats - self.predictor = predictor - self.predictor_weight = predictor_weight - self.glat_context_p = glat_context_p - self.criterion_pre = torch.nn.L1Loss() - self.step_cur = 0 - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == text_lengths.shape[0]), \ - (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape) - batch_size = speech.shape[0] - self.step_cur += 1 - # for data-parallel - text = text[:, :text_lengths.max()] - speech = speech[:, :speech_lengths.max(), :] - - # 1. Encoder - encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) - intermediate_outs = None - if isinstance(encoder_out, tuple): - intermediate_outs = encoder_out[1] - encoder_out = encoder_out[0] - - loss_att, acc_att, cer_att, wer_att = None, None, None, None - loss_ctc, cer_ctc = None, None - loss_transducer, cer_transducer, wer_transducer = None, None, None - loss_pre = None - stats = dict() - - # 1. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc, cer_ctc = self._calc_ctc_loss(encoder_out, - encoder_out_lens, text, - text_lengths) - - # Collect CTC branch stats - stats['loss_ctc'] = loss_ctc.detach( - ) if loss_ctc is not None else None - stats['cer_ctc'] = cer_ctc - - # Intermediate CTC (optional) - loss_interctc = 0.0 - if self.interctc_weight != 0.0 and intermediate_outs is not None: - for layer_idx, intermediate_out in intermediate_outs: - # we assume intermediate_out has the same length & padding - # as those of encoder_out - loss_ic, cer_ic = self._calc_ctc_loss(intermediate_out, - encoder_out_lens, text, - text_lengths) - loss_interctc = loss_interctc + loss_ic - - # Collect Intermedaite CTC stats - stats['loss_interctc_layer{}'.format(layer_idx)] = ( - loss_ic.detach() if loss_ic is not None else None) - stats['cer_interctc_layer{}'.format(layer_idx)] = cer_ic - - loss_interctc = loss_interctc / len(intermediate_outs) - - # calculate whole encoder loss - loss_ctc = (1 - self.interctc_weight - ) * loss_ctc + self.interctc_weight * loss_interctc - - if self.use_transducer_decoder: - # 2a. Transducer decoder branch - ( - loss_transducer, - cer_transducer, - wer_transducer, - ) = self._calc_transducer_loss( - encoder_out, - encoder_out_lens, - text, - ) - - if loss_ctc is not None: - loss = loss_transducer + (self.ctc_weight * loss_ctc) - else: - loss = loss_transducer - - # Collect Transducer branch stats - stats['loss_transducer'] = ( - loss_transducer.detach() - if loss_transducer is not None else None) - stats['cer_transducer'] = cer_transducer - stats['wer_transducer'] = wer_transducer - - else: - # 2b. Attention decoder branch - if self.ctc_weight != 1.0: - - loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss( - encoder_out, encoder_out_lens, text, text_lengths) - - # 3. 
CTC-Att loss definition - if self.ctc_weight == 0.0: - loss = loss_att - elif self.ctc_weight == 1.0: - loss = loss_ctc - else: - loss = self.ctc_weight * loss_ctc + ( - 1 - self.ctc_weight - ) * loss_att + loss_pre * self.predictor_weight - - # Collect Attn branch stats - stats['loss_att'] = loss_att.detach( - ) if loss_att is not None else None - stats['acc'] = acc_att - stats['cer'] = cer_att - stats['wer'] = wer_att - stats['loss_pre'] = loss_pre.detach().cpu( - ) if loss_pre is not None else None - - # Collect total loss stats - # TODO(wjm): needed to be checked - # TODO(wjm): same problem: https://github.com/espnet/espnet/issues/4136 - # FIXME(wjm): for logger error when accum_grad > 1 - # stats["loss"] = loss.detach() - stats['loss'] = torch.clone(loss.detach()) - - # force_gatherable: to-device and to-tensor if scalar for DataParallel - loss, stats, weight = force_gatherable((loss, stats, batch_size), - loss.device) - return loss, stats, weight - - def collect_feats( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, torch.Tensor]: - if self.extract_feats_in_collect_stats: - feats, feats_lengths = self._extract_feats(speech, speech_lengths) - else: - # Generate dummy stats if extract_feats_in_collect_stats is False - logging.warning( - 'Generating dummy stats for feats and feats_lengths, ' - 'because encoder_conf.extract_feats_in_collect_stats is ' - f'{self.extract_feats_in_collect_stats}') - feats, feats_lengths = speech, speech_lengths - return {'feats': feats, 'feats_lengths': feats_lengths} - - def encode( - self, speech: torch.Tensor, - speech_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Frontend + Encoder. Note that this method is used by asr_inference.py - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - """ - with autocast(False): - # 1. Extract feats - feats, feats_lengths = self._extract_feats(speech, speech_lengths) - - # 2. Data augmentation - if self.specaug is not None and self.training: - feats, feats_lengths = self.specaug(feats, feats_lengths) - - # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN - if self.normalize is not None: - feats, feats_lengths = self.normalize(feats, feats_lengths) - - # Pre-encoder, e.g. used for raw input data - if self.preencoder is not None: - feats, feats_lengths = self.preencoder(feats, feats_lengths) - - # 4. Forward encoder - # feats: (Batch, Length, Dim) - # -> encoder_out: (Batch, Length2, Dim2) - if self.encoder.interctc_use_conditioning: - encoder_out, encoder_out_lens, _ = self.encoder( - feats, feats_lengths, ctc=self.ctc) - else: - encoder_out, encoder_out_lens, _ = self.encoder( - feats, feats_lengths) - intermediate_outs = None - if isinstance(encoder_out, tuple): - intermediate_outs = encoder_out[1] - encoder_out = encoder_out[0] - - # Post-encoder, e.g. 
NLU - if self.postencoder is not None: - encoder_out, encoder_out_lens = self.postencoder( - encoder_out, encoder_out_lens) - - assert encoder_out.size(0) == speech.size(0), ( - encoder_out.size(), - speech.size(0), - ) - assert encoder_out.size(1) <= encoder_out_lens.max(), ( - encoder_out.size(), - encoder_out_lens.max(), - ) - - if intermediate_outs is not None: - return (encoder_out, intermediate_outs), encoder_out_lens - - return encoder_out, encoder_out_lens - - def calc_predictor(self, encoder_out, encoder_out_lens): - - encoder_out_mask = (~make_pad_mask( - encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( - encoder_out.device) - pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor( - encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id) - return pre_acoustic_embeds, pre_token_length - - def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, - sematic_embeds, ys_pad_lens): - - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, - sematic_embeds, ys_pad_lens) - decoder_out = torch.log_softmax(decoder_out, dim=-1) - return decoder_out, ys_pad_lens - - def _extract_feats( - self, speech: torch.Tensor, - speech_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - assert speech_lengths.dim() == 1, speech_lengths.shape - - # for data-parallel - speech = speech[:, :speech_lengths.max()] - if self.frontend is not None: - # Frontend - # e.g. STFT and Feature extract - # data_loader may send time-domain signal in this case - # speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim) - feats, feats_lengths = self.frontend(speech, speech_lengths) - else: - # No frontend and no feature extract - feats, feats_lengths = speech, speech_lengths - return feats, feats_lengths - - def nll( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> torch.Tensor: - """Compute negative log likelihood(nll) from transformer-decoder - - Normally, this function is called in batchify_nll. - - Args: - encoder_out: (Batch, Length, Dim) - encoder_out_lens: (Batch,) - ys_pad: (Batch, Length) - ys_pad_lens: (Batch,) - """ - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) # [batch, seqlen, dim] - batch_size = decoder_out.size(0) - decoder_num_class = decoder_out.size(2) - # nll: negative log-likelihood - nll = torch.nn.functional.cross_entropy( - decoder_out.view(-1, decoder_num_class), - ys_out_pad.view(-1), - ignore_index=self.ignore_id, - reduction='none', - ) - nll = nll.view(batch_size, -1) - nll = nll.sum(dim=1) - assert nll.size(0) == batch_size - return nll - - def batchify_nll( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - batch_size: int = 100, - ): - """Compute negative log likelihood(nll) from transformer-decoder - - To avoid OOM, this fuction seperate the input into batches. - Then call nll for each batch and combine and return results. 
- Args: - encoder_out: (Batch, Length, Dim) - encoder_out_lens: (Batch,) - ys_pad: (Batch, Length) - ys_pad_lens: (Batch,) - batch_size: int, samples each batch contain when computing nll, - you may change this to avoid OOM or increase - GPU memory usage - """ - total_num = encoder_out.size(0) - if total_num <= batch_size: - nll = self.nll(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) - else: - nll = [] - start_idx = 0 - while True: - end_idx = min(start_idx + batch_size, total_num) - batch_encoder_out = encoder_out[start_idx:end_idx, :, :] - batch_encoder_out_lens = encoder_out_lens[start_idx:end_idx] - batch_ys_pad = ys_pad[start_idx:end_idx, :] - batch_ys_pad_lens = ys_pad_lens[start_idx:end_idx] - batch_nll = self.nll( - batch_encoder_out, - batch_encoder_out_lens, - batch_ys_pad, - batch_ys_pad_lens, - ) - nll.append(batch_nll) - start_idx = end_idx - if start_idx == total_num: - break - nll = torch.cat(nll) - assert nll.size(0) == total_num - return nll - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - encoder_out_mask = (~make_pad_mask( - encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( - encoder_out.device) - pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor( - encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id) - - # 0. sampler - decoder_out_1st = None - if self.glat_context_p > 0.0: - if self.step_cur < 2: - logging.info( - 'enable sampler in paraformer, glat_context_p: {}'.format( - self.glat_context_p)) - sematic_embeds, decoder_out_1st = self.sampler( - encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, - pre_acoustic_embeds) - else: - if self.step_cur < 2: - logging.info( - 'disable sampler in paraformer, glat_context_p: {}'.format( - self.glat_context_p)) - sematic_embeds = pre_acoustic_embeds - - # 1. Forward decoder - decoder_outs = self.decoder(encoder_out, encoder_out_lens, - sematic_embeds, ys_pad_lens) - decoder_out, _ = decoder_outs[0], decoder_outs[1] - - if decoder_out_1st is None: - decoder_out_1st = decoder_out - # 2. 
Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_pad) - acc_att = th_accuracy( - decoder_out_1st.view(-1, self.vocab_size), - ys_pad, - ignore_label=self.ignore_id, - ) - loss_pre = self.criterion_pre( - ys_pad_lens.type_as(pre_token_length), pre_token_length) - - # Compute cer/wer using attention-decoder - if self.training or self.error_calculator is None: - cer_att, wer_att = None, None - else: - ys_hat = decoder_out_1st.argmax(dim=-1) - cer_att, wer_att = self.error_calculator(ys_hat.cpu(), - ys_pad.cpu()) - - return loss_att, acc_att, cer_att, wer_att, loss_pre - - def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, - pre_acoustic_embeds): - - tgt_mask = (~make_pad_mask(ys_pad_lens, - maxlen=ys_pad_lens.max())[:, :, None]).to( - ys_pad.device) - ys_pad *= tgt_mask[:, :, 0] - ys_pad_embed = self.decoder.embed(ys_pad) - with torch.no_grad(): - decoder_outs = self.decoder(encoder_out, encoder_out_lens, - pre_acoustic_embeds, ys_pad_lens) - decoder_out, _ = decoder_outs[0], decoder_outs[1] - pred_tokens = decoder_out.argmax(-1) - nonpad_positions = ys_pad.ne(self.ignore_id) - seq_lens = (nonpad_positions).sum(1) - same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1) - input_mask = torch.ones_like(nonpad_positions) - bsz, seq_len = ys_pad.size() - for li in range(bsz): - target_num = (((seq_lens[li] - same_num[li].sum()).float()) - * self.glat_context_p).long() - if target_num > 0: - input_mask[li].scatter_( - dim=0, - index=torch.randperm(seq_lens[li])[:target_num].cuda(), - value=0) - input_mask = input_mask.eq(1) - input_mask = input_mask.masked_fill(~nonpad_positions, False) - input_mask_expand_dim = input_mask.unsqueeze(2).to( - pre_acoustic_embeds.device) - - sematic_embeds = pre_acoustic_embeds.masked_fill( - ~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill( - input_mask_expand_dim, 0) - return sematic_embeds * tgt_mask, decoder_out * tgt_mask - - def _calc_att_loss_ar( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) - - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - - # Compute cer/wer using attention-decoder - if self.training or self.error_calculator is None: - cer_att, wer_att = None, None - else: - ys_hat = decoder_out.argmax(dim=-1) - cer_att, wer_att = self.error_calculator(ys_hat.cpu(), - ys_pad.cpu()) - - return loss_att, acc_att, cer_att, wer_att - - def _calc_ctc_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - # Calc CTC loss - loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) - - # Calc CER using CTC - cer_ctc = None - if not self.training and self.error_calculator is not None: - ys_hat = self.ctc.argmax(encoder_out).data - cer_ctc = self.error_calculator( - ys_hat.cpu(), ys_pad.cpu(), is_ctc=True) - return loss_ctc, cer_ctc - - def _calc_transducer_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - labels: torch.Tensor, - ): - """Compute Transducer loss. - - Args: - encoder_out: Encoder output sequences. 
(B, T, D_enc) - encoder_out_lens: Encoder output sequences lengths. (B,) - labels: Label ID sequences. (B, L) - - Return: - loss_transducer: Transducer loss value. - cer_transducer: Character error rate for Transducer. - wer_transducer: Word Error Rate for Transducer. - - """ - decoder_in, target, t_len, u_len = get_transducer_task_io( - labels, - encoder_out_lens, - ignore_id=self.ignore_id, - blank_id=self.blank_id, - ) - - self.decoder.set_device(encoder_out.device) - decoder_out = self.decoder(decoder_in) - - joint_out = self.joint_network( - encoder_out.unsqueeze(2), decoder_out.unsqueeze(1)) - - loss_transducer = self.criterion_transducer( - joint_out, - target, - t_len, - u_len, - reduction='sum', - ) - - cer_transducer, wer_transducer = None, None - if not self.training and self.error_calculator_trans is not None: - cer_transducer, wer_transducer = self.error_calculator_trans( - encoder_out, target) - - return loss_transducer, cer_transducer, wer_transducer - - -class ParaformerBertEmbed(AbsESPnetModel): - """CTC-attention hybrid Encoder-Decoder model""" - - def __init__( - self, - vocab_size: int, - token_list: Union[Tuple[str, ...], List[str]], - frontend: Optional[AbsFrontend], - specaug: Optional[AbsSpecAug], - normalize: Optional[AbsNormalize], - preencoder: Optional[AbsPreEncoder], - encoder: AbsEncoder, - postencoder: Optional[AbsPostEncoder], - decoder: AbsDecoder, - ctc: CTC, - joint_network: Optional[torch.nn.Module], - ctc_weight: float = 0.5, - interctc_weight: float = 0.0, - ignore_id: int = -1, - lsm_weight: float = 0.0, - length_normalized_loss: bool = False, - report_cer: bool = True, - report_wer: bool = True, - sym_space: str = '', - sym_blank: str = '', - extract_feats_in_collect_stats: bool = True, - predictor: cif_predictor = None, - predictor_weight: float = 0.0, - glat_context_p: float = 0.2, - embed_dims: int = 768, - embeds_loss_weight: float = 0.0, - ): - assert check_argument_types() - assert 0.0 <= ctc_weight <= 1.0, ctc_weight - assert 0.0 <= interctc_weight < 1.0, interctc_weight - - super().__init__() - # note that eos is the same as sos (equivalent ID) - self.blank_id = 0 - self.sos = vocab_size - 1 - self.eos = vocab_size - 1 - self.vocab_size = vocab_size - self.ignore_id = ignore_id - self.ctc_weight = ctc_weight - self.interctc_weight = interctc_weight - self.token_list = token_list.copy() - - self.frontend = frontend - self.specaug = specaug - self.normalize = normalize - self.preencoder = preencoder - self.postencoder = postencoder - self.encoder = encoder - - if not hasattr(self.encoder, 'interctc_use_conditioning'): - self.encoder.interctc_use_conditioning = False - if self.encoder.interctc_use_conditioning: - self.encoder.conditioning_layer = torch.nn.Linear( - vocab_size, self.encoder.output_size()) - - self.use_transducer_decoder = joint_network is not None - - self.error_calculator = None - - if self.use_transducer_decoder: - # from warprnnt_pytorch import RNNTLoss - from warp_rnnt import rnnt_loss as RNNTLoss - - self.decoder = decoder - self.joint_network = joint_network - - self.criterion_transducer = RNNTLoss - - if report_cer or report_wer: - self.error_calculator_trans = ErrorCalculatorTransducer( - decoder, - joint_network, - token_list, - sym_space, - sym_blank, - report_cer=report_cer, - report_wer=report_wer, - ) - else: - self.error_calculator_trans = None - - if self.ctc_weight != 0: - self.error_calculator = ErrorCalculator( - token_list, sym_space, sym_blank, report_cer, - report_wer) - else: - # we set self.decoder = None 
in the CTC mode since - # self.decoder parameters were never used and PyTorch complained - # and threw an Exception in the multi-GPU experiment. - # thanks Jeff Farris for pointing out the issue. - if ctc_weight == 1.0: - self.decoder = None - else: - self.decoder = decoder - - self.criterion_att = LabelSmoothingLoss( - size=vocab_size, - padding_idx=ignore_id, - smoothing=lsm_weight, - normalize_length=length_normalized_loss, - ) - - if report_cer or report_wer: - self.error_calculator = ErrorCalculator( - token_list, sym_space, sym_blank, report_cer, report_wer) - - if ctc_weight == 0.0: - self.ctc = None - else: - self.ctc = ctc - - self.extract_feats_in_collect_stats = extract_feats_in_collect_stats - self.predictor = predictor - self.predictor_weight = predictor_weight - self.glat_context_p = glat_context_p - self.criterion_pre = torch.nn.L1Loss() - self.step_cur = 0 - self.pro_nn = torch.nn.Linear(encoder.output_size(), embed_dims) - self.cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6) - self.embeds_loss_weight = embeds_loss_weight - self.length_normalized_loss = length_normalized_loss - - def forward( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - embed: torch.Tensor = None, - embed_lengths: torch.Tensor = None, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: - """Frontend + Encoder + Decoder + Calc loss - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - text: (Batch, Length) - text_lengths: (Batch,) - """ - assert text_lengths.dim() == 1, text_lengths.shape - # Check that batch_size is unified - assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == text_lengths.shape[0]), \ - (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape) - batch_size = speech.shape[0] - self.step_cur += 1 - # for data-parallel - text = text[:, :text_lengths.max()] - speech = speech[:, :speech_lengths.max(), :] - if embed is not None: - embed = embed[:, :embed_lengths.max(), :] - - # 1. Encoder - encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) - intermediate_outs = None - if isinstance(encoder_out, tuple): - intermediate_outs = encoder_out[1] - encoder_out = encoder_out[0] - - loss_att, acc_att, cer_att, wer_att = None, None, None, None - loss_ctc, cer_ctc = None, None - loss_transducer, cer_transducer, wer_transducer = None, None, None - loss_pre = None - cos_loss = None - stats = dict() - - # 1. 
CTC branch - if self.ctc_weight != 0.0: - loss_ctc, cer_ctc = self._calc_ctc_loss(encoder_out, - encoder_out_lens, text, - text_lengths) - - # Collect CTC branch stats - stats['loss_ctc'] = loss_ctc.detach( - ) if loss_ctc is not None else None - stats['cer_ctc'] = cer_ctc - - # Intermediate CTC (optional) - loss_interctc = 0.0 - - if self.interctc_weight != 0.0 and intermediate_outs is not None: - for layer_idx, intermediate_out in intermediate_outs: - # we assume intermediate_out has the same length & padding - # as those of encoder_out - loss_ic, cer_ic = self._calc_ctc_loss(intermediate_out, - encoder_out_lens, text, - text_lengths) - loss_interctc = loss_interctc + loss_ic - - # Collect Intermedaite CTC stats - stats['loss_interctc_layer{}'.format(layer_idx)] = ( - loss_ic.detach() if loss_ic is not None else None) - stats['cer_interctc_layer{}'.format(layer_idx)] = cer_ic - - loss_interctc = loss_interctc / len(intermediate_outs) - - # calculate whole encoder loss - loss_ctc = (1 - self.interctc_weight - ) * loss_ctc + self.interctc_weight * loss_interctc - - if self.use_transducer_decoder: - # 2a. Transducer decoder branch - ( - loss_transducer, - cer_transducer, - wer_transducer, - ) = self._calc_transducer_loss( - encoder_out, - encoder_out_lens, - text, - ) - - if loss_ctc is not None: - loss = loss_transducer + (self.ctc_weight * loss_ctc) - else: - loss = loss_transducer - - # Collect Transducer branch stats - stats['loss_transducer'] = ( - loss_transducer.detach() - if loss_transducer is not None else None) - stats['cer_transducer'] = cer_transducer - stats['wer_transducer'] = wer_transducer - - else: - # 2b. Attention decoder branch - if self.ctc_weight != 1.0: - - if embed is None or self.embeds_loss_weight <= 0.0: - loss_ret = self._calc_att_loss(encoder_out, - encoder_out_lens, text, - text_lengths) - loss_att, acc_att, cer_att, wer_att, loss_pre = loss_ret[ - 0], loss_ret[1], loss_ret[2], loss_ret[3], loss_ret[4] - else: - loss_ret = self._calc_att_loss_embed( - encoder_out, encoder_out_lens, text, text_lengths, - embed, embed_lengths) - loss_att, acc_att, cer_att, wer_att, loss_pre = loss_ret[ - 0], loss_ret[1], loss_ret[2], loss_ret[3], loss_ret[4] - embeds_outputs = None - if len(loss_ret) > 5: - embeds_outputs = loss_ret[5] - if embeds_outputs is not None: - cos_loss = self._calc_embed_loss( - text, text_lengths, embed, embed_lengths, - embeds_outputs) - # 3. 
CTC-Att loss definition - if self.ctc_weight == 0.0: - loss = loss_att - elif self.ctc_weight == 1.0: - loss = loss_ctc - elif self.embeds_loss_weight > 0.0: - loss = self.ctc_weight * loss_ctc + ( - 1 - self.ctc_weight - ) * loss_att + loss_pre * self.predictor_weight + cos_loss * self.embeds_loss_weight - else: - loss = self.ctc_weight * loss_ctc + ( - 1 - self.ctc_weight - ) * loss_att + loss_pre * self.predictor_weight - - # Collect Attn branch stats - stats['loss_att'] = loss_att.detach( - ) if loss_att is not None else None - stats['acc'] = acc_att - stats['cer'] = cer_att - stats['wer'] = wer_att - stats['loss_pre'] = loss_pre.detach().cpu( - ) if loss_pre is not None else None - stats['cos_loss'] = cos_loss.detach().cpu( - ) if cos_loss is not None else None - - # Collect total loss stats - # TODO(wjm): needed to be checked - # TODO(wjm): same problem: https://github.com/espnet/espnet/issues/4136 - # FIXME(wjm): for logger error when accum_grad > 1 - # stats["loss"] = loss.detach() - stats['loss'] = torch.clone(loss.detach()) - - # force_gatherable: to-device and to-tensor if scalar for DataParallel - loss, stats, weight = force_gatherable((loss, stats, batch_size), - loss.device) - return loss, stats, weight - - def _calc_embed_loss( - self, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - embed: torch.Tensor = None, - embed_lengths: torch.Tensor = None, - embeds_outputs: torch.Tensor = None, - ): - embeds_outputs = self.pro_nn(embeds_outputs) - tgt_mask = (~make_pad_mask(ys_pad_lens, - maxlen=ys_pad_lens.max())[:, :, None]).to( - ys_pad.device) - embeds_outputs *= tgt_mask # b x l x d - embed *= tgt_mask # b x l x d - cos_loss = 1.0 - self.cos(embeds_outputs, embed) - cos_loss *= tgt_mask.squeeze(2) - if self.length_normalized_loss: - token_num_total = torch.sum(tgt_mask) - else: - token_num_total = tgt_mask.size()[0] - cos_loss_total = torch.sum(cos_loss) - cos_loss = cos_loss_total / token_num_total - return cos_loss - - def collect_feats( - self, - speech: torch.Tensor, - speech_lengths: torch.Tensor, - text: torch.Tensor, - text_lengths: torch.Tensor, - ) -> Dict[str, torch.Tensor]: - if self.extract_feats_in_collect_stats: - feats, feats_lengths = self._extract_feats(speech, speech_lengths) - else: - # Generate dummy stats if extract_feats_in_collect_stats is False - logging.warning( - 'Generating dummy stats for feats and feats_lengths, ' - 'because encoder_conf.extract_feats_in_collect_stats is ' - f'{self.extract_feats_in_collect_stats}') - feats, feats_lengths = speech, speech_lengths - return {'feats': feats, 'feats_lengths': feats_lengths} - - def encode( - self, speech: torch.Tensor, - speech_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Frontend + Encoder. Note that this method is used by asr_inference.py - - Args: - speech: (Batch, Length, ...) - speech_lengths: (Batch, ) - """ - with autocast(False): - # 1. Extract feats - feats, feats_lengths = self._extract_feats(speech, speech_lengths) - - # 2. Data augmentation - if self.specaug is not None and self.training: - feats, feats_lengths = self.specaug(feats, feats_lengths) - - # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN - if self.normalize is not None: - feats, feats_lengths = self.normalize(feats, feats_lengths) - - # Pre-encoder, e.g. used for raw input data - if self.preencoder is not None: - feats, feats_lengths = self.preencoder(feats, feats_lengths) - - # 4. 
Forward encoder - # feats: (Batch, Length, Dim) - # -> encoder_out: (Batch, Length2, Dim2) - if self.encoder.interctc_use_conditioning: - encoder_out, encoder_out_lens, _ = self.encoder( - feats, feats_lengths, ctc=self.ctc) - else: - encoder_out, encoder_out_lens, _ = self.encoder( - feats, feats_lengths) - intermediate_outs = None - if isinstance(encoder_out, tuple): - intermediate_outs = encoder_out[1] - encoder_out = encoder_out[0] - - # Post-encoder, e.g. NLU - if self.postencoder is not None: - encoder_out, encoder_out_lens = self.postencoder( - encoder_out, encoder_out_lens) - - assert encoder_out.size(0) == speech.size(0), ( - encoder_out.size(), - speech.size(0), - ) - assert encoder_out.size(1) <= encoder_out_lens.max(), ( - encoder_out.size(), - encoder_out_lens.max(), - ) - - if intermediate_outs is not None: - return (encoder_out, intermediate_outs), encoder_out_lens - - return encoder_out, encoder_out_lens - - def calc_predictor(self, encoder_out, encoder_out_lens): - - encoder_out_mask = (~make_pad_mask( - encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( - encoder_out.device) - # logging.info( - # "encoder_out_mask size: {}".format(encoder_out_mask.size())) - pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor( - encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id) - return pre_acoustic_embeds, pre_token_length - - def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, - sematic_embeds, ys_pad_lens): - - decoder_outs = self.decoder(encoder_out, encoder_out_lens, - sematic_embeds, ys_pad_lens) - decoder_out = decoder_outs[0] - decoder_out = torch.log_softmax(decoder_out, dim=-1) - return decoder_out, ys_pad_lens - - def _extract_feats( - self, speech: torch.Tensor, - speech_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - assert speech_lengths.dim() == 1, speech_lengths.shape - - # for data-parallel - speech = speech[:, :speech_lengths.max()] - - if self.frontend is not None: - # Frontend - # e.g. STFT and Feature extract - # data_loader may send time-domain signal in this case - # speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim) - feats, feats_lengths = self.frontend(speech, speech_lengths) - else: - # No frontend and no feature extract - feats, feats_lengths = speech, speech_lengths - return feats, feats_lengths - - def nll( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ) -> torch.Tensor: - """Compute negative log likelihood(nll) from transformer-decoder - - Normally, this function is called in batchify_nll. - - Args: - encoder_out: (Batch, Length, Dim) - encoder_out_lens: (Batch,) - ys_pad: (Batch, Length) - ys_pad_lens: (Batch,) - """ - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # 1. 
Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) # [batch, seqlen, dim] - batch_size = decoder_out.size(0) - decoder_num_class = decoder_out.size(2) - # nll: negative log-likelihood - nll = torch.nn.functional.cross_entropy( - decoder_out.view(-1, decoder_num_class), - ys_out_pad.view(-1), - ignore_index=self.ignore_id, - reduction='none', - ) - nll = nll.view(batch_size, -1) - nll = nll.sum(dim=1) - assert nll.size(0) == batch_size - return nll - - def batchify_nll( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - batch_size: int = 100, - ): - """Compute negative log likelihood(nll) from transformer-decoder - - To avoid OOM, this fuction seperate the input into batches. - Then call nll for each batch and combine and return results. - Args: - encoder_out: (Batch, Length, Dim) - encoder_out_lens: (Batch,) - ys_pad: (Batch, Length) - ys_pad_lens: (Batch,) - batch_size: int, samples each batch contain when computing nll, - you may change this to avoid OOM or increase - GPU memory usage - """ - total_num = encoder_out.size(0) - if total_num <= batch_size: - nll = self.nll(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) - else: - nll = [] - start_idx = 0 - while True: - end_idx = min(start_idx + batch_size, total_num) - batch_encoder_out = encoder_out[start_idx:end_idx, :, :] - batch_encoder_out_lens = encoder_out_lens[start_idx:end_idx] - batch_ys_pad = ys_pad[start_idx:end_idx, :] - batch_ys_pad_lens = ys_pad_lens[start_idx:end_idx] - batch_nll = self.nll( - batch_encoder_out, - batch_encoder_out_lens, - batch_ys_pad, - batch_ys_pad_lens, - ) - nll.append(batch_nll) - start_idx = end_idx - if start_idx == total_num: - break - nll = torch.cat(nll) - assert nll.size(0) == total_num - return nll - - def _calc_att_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - encoder_out_mask = (~make_pad_mask( - encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( - encoder_out.device) - pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor( - encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id) - - # 0. sampler - decoder_out_1st = None - if self.glat_context_p > 0.0: - if self.step_cur < 2: - logging.info( - 'enable sampler in paraformer, glat_context_p: {}'.format( - self.glat_context_p)) - sematic_embeds, decoder_out_1st = self.sampler( - encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, - pre_acoustic_embeds) - else: - if self.step_cur < 2: - logging.info( - 'disable sampler in paraformer, glat_context_p: {}'.format( - self.glat_context_p)) - sematic_embeds = pre_acoustic_embeds - - # 1. Forward decoder - decoder_outs = self.decoder(encoder_out, encoder_out_lens, - sematic_embeds, ys_pad_lens) - decoder_out, _ = decoder_outs[0], decoder_outs[1] - - if decoder_out_1st is None: - decoder_out_1st = decoder_out - # 2. 
Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_pad) - acc_att = th_accuracy( - decoder_out_1st.view(-1, self.vocab_size), - ys_pad, - ignore_label=self.ignore_id, - ) - loss_pre = self.criterion_pre( - ys_pad_lens.type_as(pre_token_length), pre_token_length) - - # Compute cer/wer using attention-decoder - if self.training or self.error_calculator is None: - cer_att, wer_att = None, None - else: - ys_hat = decoder_out_1st.argmax(dim=-1) - cer_att, wer_att = self.error_calculator(ys_hat.cpu(), - ys_pad.cpu()) - - return loss_att, acc_att, cer_att, wer_att, loss_pre - - def _calc_att_loss_embed( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - embed: torch.Tensor = None, - embed_lengths: torch.Tensor = None, - ): - encoder_out_mask = (~make_pad_mask( - encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( - encoder_out.device) - pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor( - encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id) - - # 0. sampler - decoder_out_1st = None - if self.glat_context_p > 0.0: - if self.step_cur < 2: - logging.info( - 'enable sampler in paraformer, glat_context_p: {}'.format( - self.glat_context_p)) - sematic_embeds, decoder_out_1st = self.sampler( - encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, - pre_acoustic_embeds) - else: - if self.step_cur < 2: - logging.info( - 'disable sampler in paraformer, glat_context_p: {}'.format( - self.glat_context_p)) - sematic_embeds = pre_acoustic_embeds - - # 1. Forward decoder - decoder_outs = self.decoder(encoder_out, encoder_out_lens, - sematic_embeds, ys_pad_lens) - decoder_out, _ = decoder_outs[0], decoder_outs[1] - if len(decoder_outs) > 2: - embeds_outputs = decoder_outs[2] - if decoder_out_1st is None: - decoder_out_1st = decoder_out - # 2. 
Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_pad) - acc_att = th_accuracy( - decoder_out_1st.view(-1, self.vocab_size), - ys_pad, - ignore_label=self.ignore_id, - ) - loss_pre = self.criterion_pre( - ys_pad_lens.type_as(pre_token_length), pre_token_length) - - # Compute cer/wer using attention-decoder - if self.training or self.error_calculator is None: - cer_att, wer_att = None, None - else: - ys_hat = decoder_out_1st.argmax(dim=-1) - cer_att, wer_att = self.error_calculator(ys_hat.cpu(), - ys_pad.cpu()) - - return loss_att, acc_att, cer_att, wer_att, loss_pre, embeds_outputs - - def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, - pre_acoustic_embeds): - - tgt_mask = (~make_pad_mask(ys_pad_lens, - maxlen=ys_pad_lens.max())[:, :, None]).to( - ys_pad.device) - ys_pad *= tgt_mask[:, :, 0] - ys_pad_embed = self.decoder.embed(ys_pad) - with torch.no_grad(): - decoder_outs = self.decoder(encoder_out, encoder_out_lens, - pre_acoustic_embeds, ys_pad_lens) - decoder_out, _ = decoder_outs[0], decoder_outs[1] - pred_tokens = decoder_out.argmax(-1) - nonpad_positions = ys_pad.ne(self.ignore_id) - seq_lens = (nonpad_positions).sum(1) - same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1) - input_mask = torch.ones_like(nonpad_positions) - bsz, seq_len = ys_pad.size() - for li in range(bsz): - target_num = (((seq_lens[li] - same_num[li].sum()).float()) - * self.glat_context_p).long() - if target_num > 0: - input_mask[li].scatter_( - dim=0, - index=torch.randperm(seq_lens[li])[:target_num].cuda(), - value=0) - input_mask = input_mask.eq(1) - input_mask = input_mask.masked_fill(~nonpad_positions, False) - input_mask_expand_dim = input_mask.unsqueeze(2).to( - pre_acoustic_embeds.device) - - sematic_embeds = pre_acoustic_embeds.masked_fill( - ~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill( - input_mask_expand_dim, 0) - return sematic_embeds * tgt_mask, decoder_out * tgt_mask - - def _calc_att_loss_ar( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, - self.ignore_id) - ys_in_lens = ys_pad_lens + 1 - - # 1. Forward decoder - decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, - ys_in_lens) - - # 2. Compute attention loss - loss_att = self.criterion_att(decoder_out, ys_out_pad) - acc_att = th_accuracy( - decoder_out.view(-1, self.vocab_size), - ys_out_pad, - ignore_label=self.ignore_id, - ) - - # Compute cer/wer using attention-decoder - if self.training or self.error_calculator is None: - cer_att, wer_att = None, None - else: - ys_hat = decoder_out.argmax(dim=-1) - cer_att, wer_att = self.error_calculator(ys_hat.cpu(), - ys_pad.cpu()) - - return loss_att, acc_att, cer_att, wer_att - - def _calc_ctc_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ys_pad: torch.Tensor, - ys_pad_lens: torch.Tensor, - ): - # Calc CTC loss - loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) - - # Calc CER using CTC - cer_ctc = None - if not self.training and self.error_calculator is not None: - ys_hat = self.ctc.argmax(encoder_out).data - cer_ctc = self.error_calculator( - ys_hat.cpu(), ys_pad.cpu(), is_ctc=True) - return loss_ctc, cer_ctc - - def _calc_transducer_loss( - self, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - labels: torch.Tensor, - ): - """Compute Transducer loss. - - Args: - encoder_out: Encoder output sequences. 
(B, T, D_enc) - encoder_out_lens: Encoder output sequences lengths. (B,) - labels: Label ID sequences. (B, L) - - Return: - loss_transducer: Transducer loss value. - cer_transducer: Character error rate for Transducer. - wer_transducer: Word Error Rate for Transducer. - - """ - decoder_in, target, t_len, u_len = get_transducer_task_io( - labels, - encoder_out_lens, - ignore_id=self.ignore_id, - blank_id=self.blank_id, - ) - - self.decoder.set_device(encoder_out.device) - decoder_out = self.decoder(decoder_in) - - joint_out = self.joint_network( - encoder_out.unsqueeze(2), decoder_out.unsqueeze(1)) - - loss_transducer = self.criterion_transducer( - joint_out, - target, - t_len, - u_len, - reduction='sum', - ) - - cer_transducer, wer_transducer = None, None - if not self.training and self.error_calculator_trans is not None: - cer_transducer, wer_transducer = self.error_calculator_trans( - encoder_out, target) - - return loss_transducer, cer_transducer, wer_transducer diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/frontend/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/frontend/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/frontend/wav_frontend.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/frontend/wav_frontend.py deleted file mode 100644 index 1adc24f1..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/frontend/wav_frontend.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. - -import copy -from typing import Optional, Tuple, Union - -import humanfriendly -import numpy as np -import torch -import torchaudio -import torchaudio.compliance.kaldi as kaldi -from espnet2.asr.frontend.abs_frontend import AbsFrontend -from espnet2.layers.log_mel import LogMel -from espnet2.layers.stft import Stft -from espnet2.utils.get_default_kwargs import get_default_kwargs -from espnet.nets.pytorch_backend.frontends.frontend import Frontend -from typeguard import check_argument_types - - -class WavFrontend(AbsFrontend): - """Conventional frontend structure for ASR. 
- - Stft -> WPE -> MVDR-Beamformer -> Power-spec -> Mel-Fbank -> CMVN - """ - - def __init__( - self, - fs: Union[int, str] = 16000, - n_fft: int = 512, - win_length: int = 400, - hop_length: int = 160, - window: Optional[str] = 'hamming', - center: bool = True, - normalized: bool = False, - onesided: bool = True, - n_mels: int = 80, - fmin: int = None, - fmax: int = None, - htk: bool = False, - frontend_conf: Optional[dict] = get_default_kwargs(Frontend), - apply_stft: bool = True, - ): - assert check_argument_types() - super().__init__() - if isinstance(fs, str): - fs = humanfriendly.parse_size(fs) - - # Deepcopy (In general, dict shouldn't be used as default arg) - frontend_conf = copy.deepcopy(frontend_conf) - self.hop_length = hop_length - self.win_length = win_length - self.window = window - self.fs = fs - - if apply_stft: - self.stft = Stft( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - center=center, - window=window, - normalized=normalized, - onesided=onesided, - ) - else: - self.stft = None - self.apply_stft = apply_stft - - if frontend_conf is not None: - self.frontend = Frontend(idim=n_fft // 2 + 1, **frontend_conf) - else: - self.frontend = None - - self.logmel = LogMel( - fs=fs, - n_fft=n_fft, - n_mels=n_mels, - fmin=fmin, - fmax=fmax, - htk=htk, - ) - self.n_mels = n_mels - self.frontend_type = 'default' - - def output_size(self) -> int: - return self.n_mels - - def forward( - self, input: torch.Tensor, - input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - - sample_frequency = self.fs - num_mel_bins = self.n_mels - frame_length = self.win_length * 1000 / sample_frequency - frame_shift = self.hop_length * 1000 / sample_frequency - - waveform = input * (1 << 15) - - mat = kaldi.fbank( - waveform, - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=1.0, - energy_floor=0.0, - window_type=self.window, - sample_frequency=sample_frequency) - - input_feats = mat[None, :] - feats_lens = torch.randn(1) - feats_lens.fill_(input_feats.shape[1]) - - return input_feats, feats_lens diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/streaming_utilis/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/streaming_utilis/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/streaming_utilis/chunk_utilis.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/streaming_utilis/chunk_utilis.py deleted file mode 100644 index e9f1b785..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/asr/streaming_utilis/chunk_utilis.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. 
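The WavFrontend removed just above reduces to a single Kaldi-compatible fbank call from torchaudio. A minimal sketch of that feature extraction, assuming a mono float waveform in [-1, 1]; the wrapper name and defaults are illustrative, while the scaling and fbank arguments follow the deleted forward method:

import torch
import torchaudio.compliance.kaldi as kaldi

def fbank_features(waveform: torch.Tensor,
                   fs: int = 16000,
                   n_mels: int = 80,
                   win_length: int = 400,
                   hop_length: int = 160) -> torch.Tensor:
    """waveform: (1, num_samples) float tensor in [-1, 1]."""
    scaled = waveform * (1 << 15)  # kaldi.fbank expects 16-bit-range samples
    feats = kaldi.fbank(
        scaled,
        num_mel_bins=n_mels,
        frame_length=win_length * 1000 / fs,  # window length in ms
        frame_shift=hop_length * 1000 / fs,   # hop length in ms
        dither=1.0,
        energy_floor=0.0,
        window_type='hamming',
        sample_frequency=fs)
    return feats  # (num_frames, n_mels)

# e.g. fbank_features(torch.randn(1, 16000))  # one second of audio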
-import logging -import math - -import numpy as np -import torch -from espnet.nets.pytorch_backend.nets_utils import make_pad_mask - -from ...nets.pytorch_backend.cif_utils.cif import \ - cif_predictor as cif_predictor - -np.set_printoptions(threshold=np.inf) -torch.set_printoptions(profile='full', precision=100000, linewidth=None) - - -def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device='cpu'): - if maxlen is None: - maxlen = lengths.max() - row_vector = torch.arange(0, maxlen, 1) - matrix = torch.unsqueeze(lengths, dim=-1) - mask = row_vector < matrix - - return mask.type(dtype).to(device) - - -class overlap_chunk(): - - def __init__( - self, - chunk_size: tuple = (16, ), - stride: tuple = (10, ), - pad_left: tuple = (0, ), - encoder_att_look_back_factor: tuple = (1, ), - shfit_fsmn: int = 0, - ): - self.chunk_size, self.stride, self.pad_left, self.encoder_att_look_back_factor \ - = chunk_size, stride, pad_left, encoder_att_look_back_factor - self.shfit_fsmn = shfit_fsmn - self.x_add_mask = None - self.x_rm_mask = None - self.x_len = None - self.mask_shfit_chunk = None - self.mask_chunk_predictor = None - self.mask_att_chunk_encoder = None - self.mask_shift_att_chunk_decoder = None - self.chunk_size_cur, self.stride_cur, self.pad_left_cur, self.encoder_att_look_back_factor_cur \ - = None, None, None, None - - def get_chunk_size(self, ind: int = 0): - # with torch.no_grad: - chunk_size, stride, pad_left, encoder_att_look_back_factor = self.chunk_size[ - ind], self.stride[ind], self.pad_left[ - ind], self.encoder_att_look_back_factor[ind] - self.chunk_size_cur, self.stride_cur, self.pad_left_cur, - self.encoder_att_look_back_factor_cur, self.chunk_size_pad_shift_cur \ - = chunk_size, stride, pad_left, encoder_att_look_back_factor, chunk_size + self.shfit_fsmn - return self.chunk_size_cur, self.stride_cur, self.pad_left_cur, self.encoder_att_look_back_factor_cur - - def gen_chunk_mask(self, x_len, ind=0, num_units=1, num_units_predictor=1): - - with torch.no_grad(): - x_len = x_len.cpu().numpy() - x_len_max = x_len.max() - - chunk_size, stride, pad_left, encoder_att_look_back_factor = self.get_chunk_size( - ind) - shfit_fsmn = self.shfit_fsmn - chunk_size_pad_shift = chunk_size + shfit_fsmn - - chunk_num_batch = np.ceil(x_len / stride).astype(np.int32) - x_len_chunk = ( - chunk_num_batch - 1 - ) * chunk_size_pad_shift + shfit_fsmn + pad_left + 0 + x_len - ( - chunk_num_batch - 1) * stride - x_len_chunk = x_len_chunk.astype(x_len.dtype) - x_len_chunk_max = x_len_chunk.max() - - chunk_num = int(math.ceil(x_len_max / stride)) - dtype = np.int32 - max_len_for_x_mask_tmp = max(chunk_size, x_len_max) - x_add_mask = np.zeros([0, max_len_for_x_mask_tmp], dtype=dtype) - x_rm_mask = np.zeros([max_len_for_x_mask_tmp, 0], dtype=dtype) - mask_shfit_chunk = np.zeros([0, num_units], dtype=dtype) - mask_chunk_predictor = np.zeros([0, num_units_predictor], - dtype=dtype) - mask_shift_att_chunk_decoder = np.zeros([0, 1], dtype=dtype) - mask_att_chunk_encoder = np.zeros( - [0, chunk_num * chunk_size_pad_shift], dtype=dtype) - for chunk_ids in range(chunk_num): - # x_mask add - fsmn_padding = np.zeros((shfit_fsmn, max_len_for_x_mask_tmp), - dtype=dtype) - x_mask_cur = np.diag(np.ones(chunk_size, dtype=np.float32)) - x_mask_pad_left = np.zeros((chunk_size, chunk_ids * stride), - dtype=dtype) - x_mask_pad_right = np.zeros( - (chunk_size, max_len_for_x_mask_tmp), dtype=dtype) - x_cur_pad = np.concatenate( - [x_mask_pad_left, x_mask_cur, x_mask_pad_right], axis=1) - x_cur_pad = 
x_cur_pad[:chunk_size, :max_len_for_x_mask_tmp] - x_add_mask_fsmn = np.concatenate([fsmn_padding, x_cur_pad], - axis=0) - x_add_mask = np.concatenate([x_add_mask, x_add_mask_fsmn], - axis=0) - - # x_mask rm - fsmn_padding = np.zeros((max_len_for_x_mask_tmp, shfit_fsmn), - dtype=dtype) - x_mask_cur = np.diag(np.ones(stride, dtype=dtype)) - x_mask_right = np.zeros((stride, chunk_size - stride), - dtype=dtype) - x_mask_cur = np.concatenate([x_mask_cur, x_mask_right], axis=1) - x_mask_cur_pad_top = np.zeros((chunk_ids * stride, chunk_size), - dtype=dtype) - x_mask_cur_pad_bottom = np.zeros( - (max_len_for_x_mask_tmp, chunk_size), dtype=dtype) - x_rm_mask_cur = np.concatenate( - [x_mask_cur_pad_top, x_mask_cur, x_mask_cur_pad_bottom], - axis=0) - x_rm_mask_cur = x_rm_mask_cur[:max_len_for_x_mask_tmp, : - chunk_size] - x_rm_mask_cur_fsmn = np.concatenate( - [fsmn_padding, x_rm_mask_cur], axis=1) - x_rm_mask = np.concatenate([x_rm_mask, x_rm_mask_cur_fsmn], - axis=1) - - # fsmn_padding_mask - pad_shfit_mask = np.zeros([shfit_fsmn, num_units], dtype=dtype) - ones_1 = np.ones([chunk_size, num_units], dtype=dtype) - mask_shfit_chunk_cur = np.concatenate([pad_shfit_mask, ones_1], - axis=0) - mask_shfit_chunk = np.concatenate( - [mask_shfit_chunk, mask_shfit_chunk_cur], axis=0) - - # predictor mask - zeros_1 = np.zeros( - [shfit_fsmn + pad_left, num_units_predictor], dtype=dtype) - ones_2 = np.ones([stride, num_units_predictor], dtype=dtype) - zeros_3 = np.zeros( - [chunk_size - stride - pad_left, num_units_predictor], - dtype=dtype) - ones_zeros = np.concatenate([ones_2, zeros_3], axis=0) - mask_chunk_predictor_cur = np.concatenate( - [zeros_1, ones_zeros], axis=0) - mask_chunk_predictor = np.concatenate( - [mask_chunk_predictor, mask_chunk_predictor_cur], axis=0) - - # encoder att mask - zeros_1_top = np.zeros( - [shfit_fsmn, chunk_num * chunk_size_pad_shift], - dtype=dtype) - - zeros_2_num = max(chunk_ids - encoder_att_look_back_factor, 0) - zeros_2 = np.zeros( - [chunk_size, zeros_2_num * chunk_size_pad_shift], - dtype=dtype) - - encoder_att_look_back_num = max(chunk_ids - zeros_2_num, 0) - zeros_2_left = np.zeros([chunk_size, shfit_fsmn], dtype=dtype) - ones_2_mid = np.ones([stride, stride], dtype=dtype) - zeros_2_bottom = np.zeros([chunk_size - stride, stride], - dtype=dtype) - zeros_2_right = np.zeros([chunk_size, chunk_size - stride], - dtype=dtype) - ones_2 = np.concatenate([ones_2_mid, zeros_2_bottom], axis=0) - ones_2 = np.concatenate([zeros_2_left, ones_2, zeros_2_right], - axis=1) - ones_2 = np.tile(ones_2, [1, encoder_att_look_back_num]) - - zeros_3_left = np.zeros([chunk_size, shfit_fsmn], dtype=dtype) - ones_3_right = np.ones([chunk_size, chunk_size], dtype=dtype) - ones_3 = np.concatenate([zeros_3_left, ones_3_right], axis=1) - - zeros_remain_num = max(chunk_num - 1 - chunk_ids, 0) - zeros_remain = np.zeros( - [chunk_size, zeros_remain_num * chunk_size_pad_shift], - dtype=dtype) - - ones2_bottom = np.concatenate( - [zeros_2, ones_2, ones_3, zeros_remain], axis=1) - mask_att_chunk_encoder_cur = np.concatenate( - [zeros_1_top, ones2_bottom], axis=0) - mask_att_chunk_encoder = np.concatenate( - [mask_att_chunk_encoder, mask_att_chunk_encoder_cur], - axis=0) - - # decoder fsmn_shift_att_mask - zeros_1 = np.zeros([shfit_fsmn, 1]) - ones_1 = np.ones([chunk_size, 1]) - mask_shift_att_chunk_decoder_cur = np.concatenate( - [zeros_1, ones_1], axis=0) - mask_shift_att_chunk_decoder = np.concatenate( - [ - mask_shift_att_chunk_decoder, - mask_shift_att_chunk_decoder_cur - ], - vaxis=0) # noqa: * 
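# At this point all per-chunk masks for chunk index `chunk_ids` have been appended:
# x_add_mask scatters the original frames into the padded, overlapping chunk layout
# (with shfit_fsmn rows of FSMN padding per chunk), x_rm_mask gathers chunked frames
# back onto the original timeline, mask_shfit_chunk zeroes the FSMN padding frames,
# mask_chunk_predictor keeps only the stride portion of each chunk for the CIF
# predictor, mask_att_chunk_encoder restricts encoder self-attention to the current
# chunk plus at most encoder_att_look_back_factor previous chunks, and
# mask_shift_att_chunk_decoder zeroes the FSMN padding positions seen by the decoder.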
- - self.x_add_mask = x_add_mask[:x_len_chunk_max, :x_len_max] - self.x_len_chunk = x_len_chunk - self.x_rm_mask = x_rm_mask[:x_len_max, :x_len_chunk_max] - self.x_len = x_len - self.mask_shfit_chunk = mask_shfit_chunk[:x_len_chunk_max, :] - self.mask_chunk_predictor = mask_chunk_predictor[: - x_len_chunk_max, :] - self.mask_att_chunk_encoder = mask_att_chunk_encoder[: - x_len_chunk_max, : - x_len_chunk_max] - self.mask_shift_att_chunk_decoder = mask_shift_att_chunk_decoder[: - x_len_chunk_max, :] - - return (self.x_add_mask, self.x_len_chunk, self.x_rm_mask, self.x_len, - self.mask_shfit_chunk, self.mask_chunk_predictor, - self.mask_att_chunk_encoder, self.mask_shift_att_chunk_decoder) - - def split_chunk(self, x, x_len, chunk_outs): - """ - :param x: (b, t, d) - :param x_length: (b) - :param ind: int - :return: - """ - x = x[:, :x_len.max(), :] - b, t, d = x.size() - x_len_mask = (~make_pad_mask(x_len, maxlen=t)).to(x.device) - x *= x_len_mask[:, :, None] - - x_add_mask = self.get_x_add_mask(chunk_outs, x.device, dtype=x.dtype) - x_len_chunk = self.get_x_len_chunk( - chunk_outs, x_len.device, dtype=x_len.dtype) - x = torch.transpose(x, 1, 0) - x = torch.reshape(x, [t, -1]) - x_chunk = torch.mm(x_add_mask, x) - x_chunk = torch.reshape(x_chunk, [-1, b, d]).transpose(1, 0) - - return x_chunk, x_len_chunk - - def remove_chunk(self, x_chunk, x_len_chunk, chunk_outs): - x_chunk = x_chunk[:, :x_len_chunk.max(), :] - b, t, d = x_chunk.size() - x_len_chunk_mask = (~make_pad_mask(x_len_chunk, maxlen=t)).to( - x_chunk.device) - x_chunk *= x_len_chunk_mask[:, :, None] - - x_rm_mask = self.get_x_rm_mask( - chunk_outs, x_chunk.device, dtype=x_chunk.dtype) - x_len = self.get_x_len( - chunk_outs, x_len_chunk.device, dtype=x_len_chunk.dtype) - x_chunk = torch.transpose(x_chunk, 1, 0) - x_chunk = torch.reshape(x_chunk, [t, -1]) - x = torch.mm(x_rm_mask, x_chunk) - x = torch.reshape(x, [-1, b, d]).transpose(1, 0) - - return x, x_len - - def get_x_add_mask(self, chunk_outs, device, idx=0, dtype=torch.float32): - x = chunk_outs[idx] - x = torch.from_numpy(x).type(dtype).to(device) - return x.detach() - - def get_x_len_chunk(self, chunk_outs, device, idx=1, dtype=torch.float32): - x = chunk_outs[idx] - x = torch.from_numpy(x).type(dtype).to(device) - return x.detach() - - def get_x_rm_mask(self, chunk_outs, device, idx=2, dtype=torch.float32): - x = chunk_outs[idx] - x = torch.from_numpy(x).type(dtype).to(device) - return x.detach() - - def get_x_len(self, chunk_outs, device, idx=3, dtype=torch.float32): - x = chunk_outs[idx] - x = torch.from_numpy(x).type(dtype).to(device) - return x.detach() - - def get_mask_shfit_chunk(self, - chunk_outs, - device, - batch_size=1, - num_units=1, - idx=4, - dtype=torch.float32): - x = chunk_outs[idx] - x = np.tile(x[None, :, :, ], [batch_size, 1, num_units]) - x = torch.from_numpy(x).type(dtype).to(device) - return x.detach() - - def get_mask_chunk_predictor(self, - chunk_outs, - device, - batch_size=1, - num_units=1, - idx=5, - dtype=torch.float32): - x = chunk_outs[idx] - x = np.tile(x[None, :, :, ], [batch_size, 1, num_units]) - x = torch.from_numpy(x).type(dtype).to(device) - return x.detach() - - def get_mask_att_chunk_encoder(self, - chunk_outs, - device, - batch_size=1, - idx=6, - dtype=torch.float32): - x = chunk_outs[idx] - x = np.tile(x[None, :, :, ], [batch_size, 1, 1]) - x = torch.from_numpy(x).type(dtype).to(device) - return x.detach() - - def get_mask_shift_att_chunk_decoder(self, - chunk_outs, - device, - batch_size=1, - idx=7, - dtype=torch.float32): - x = 
chunk_outs[idx] - x = np.tile(x[None, None, :, 0], [batch_size, 1, 1]) - x = torch.from_numpy(x).type(dtype).to(device) - return x.detach() diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/cif_utils/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/cif_utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/cif_utils/cif.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/cif_utils/cif.py deleted file mode 100644 index 9381fb98..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/cif_utils/cif.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. -import logging - -import numpy as np -import torch -from espnet.nets.pytorch_backend.nets_utils import make_pad_mask -from torch import nn - - -class CIF_Model(nn.Module): - - def __init__(self, idim, l_order, r_order, threshold=1.0, dropout=0.1): - super(CIF_Model, self).__init__() - - self.pad = nn.ConstantPad1d((l_order, r_order), 0) - self.cif_conv1d = nn.Conv1d( - idim, idim, l_order + r_order + 1, groups=idim) - self.cif_output = nn.Linear(idim, 1) - self.dropout = torch.nn.Dropout(p=dropout) - self.threshold = threshold - - def forward(self, hidden, target_label=None, mask=None, ignore_id=-1): - h = hidden - context = h.transpose(1, 2) - queries = self.pad(context) - memory = self.cif_conv1d(queries) - output = memory + context - output = self.dropout(output) - output = output.transpose(1, 2) - output = torch.relu(output) - output = self.cif_output(output) - alphas = torch.sigmoid(output) - if mask is not None: - alphas = alphas * mask.transpose(-1, -2).float() - alphas = alphas.squeeze(-1) - if target_label is not None: - target_length = (target_label != ignore_id).float().sum(-1) - else: - target_length = None - cif_length = alphas.sum(-1) - if target_label is not None: - alphas *= (target_length / cif_length)[:, None].repeat( - 1, alphas.size(1)) - cif_output, cif_peak = cif(hidden, alphas, self.threshold) - return cif_output, cif_length, target_length, cif_peak - - def gen_frame_alignments(self, - alphas: torch.Tensor = None, - memory_sequence_length: torch.Tensor = None, - is_training: bool = True, - dtype: torch.dtype = torch.float32): - batch_size, maximum_length = alphas.size() - int_type = torch.int32 - token_num = torch.round(torch.sum(alphas, dim=1)).type(int_type) - - max_token_num = torch.max(token_num).item() - - alphas_cumsum = torch.cumsum(alphas, dim=1) - alphas_cumsum = torch.floor(alphas_cumsum).type(int_type) - alphas_cumsum = torch.tile(alphas_cumsum[:, None, :], - [1, max_token_num, 1]) - - index = torch.ones([batch_size, max_token_num], dtype=int_type) - index = torch.cumsum(index, dim=1) - index = torch.tile(index[:, :, None], [1, 1, maximum_length]) - - index_div = torch.floor(torch.divide(alphas_cumsum, - index)).type(int_type) - index_div_bool_zeros = index_div.eq(0) - 
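# index_div is floor(cumsum(alphas) / token_index), so a zero entry means the
# accumulated firing weight has not yet reached that token; counting those zeros
# along the time axis (plus one) gives, for each token, the frame at which the
# CIF predictor fires, which the remainder of this method converts into a hard
# frame-to-token alignment matrix.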
index_div_bool_zeros_count = torch.sum( - index_div_bool_zeros, dim=-1) + 1 - index_div_bool_zeros_count = torch.clip(index_div_bool_zeros_count, 0, - memory_sequence_length.max()) - token_num_mask = (~make_pad_mask(token_num, maxlen=max_token_num)).to( - token_num.device) - index_div_bool_zeros_count *= token_num_mask - - index_div_bool_zeros_count_tile = torch.tile( - index_div_bool_zeros_count[:, :, None], [1, 1, maximum_length]) - ones = torch.ones_like(index_div_bool_zeros_count_tile) - zeros = torch.zeros_like(index_div_bool_zeros_count_tile) - ones = torch.cumsum(ones, dim=2) - cond = index_div_bool_zeros_count_tile == ones - index_div_bool_zeros_count_tile = torch.where(cond, zeros, ones) - - index_div_bool_zeros_count_tile_bool = index_div_bool_zeros_count_tile.type( - torch.bool) - index_div_bool_zeros_count_tile = 1 - index_div_bool_zeros_count_tile_bool.type( - int_type) - index_div_bool_zeros_count_tile_out = torch.sum( - index_div_bool_zeros_count_tile, dim=1) - index_div_bool_zeros_count_tile_out = index_div_bool_zeros_count_tile_out.type( - int_type) - predictor_mask = (~make_pad_mask( - memory_sequence_length, - maxlen=memory_sequence_length.max())).type(int_type).to( - memory_sequence_length.device) # noqa: * - index_div_bool_zeros_count_tile_out = index_div_bool_zeros_count_tile_out * predictor_mask - return index_div_bool_zeros_count_tile_out.detach( - ), index_div_bool_zeros_count.detach() - - -class cif_predictor(nn.Module): - - def __init__(self, idim, l_order, r_order, threshold=1.0, dropout=0.1): - super(cif_predictor, self).__init__() - - self.pad = nn.ConstantPad1d((l_order, r_order), 0) - self.cif_conv1d = nn.Conv1d( - idim, idim, l_order + r_order + 1, groups=idim) - self.cif_output = nn.Linear(idim, 1) - self.dropout = torch.nn.Dropout(p=dropout) - self.threshold = threshold - - def forward(self, - hidden, - target_label=None, - mask=None, - ignore_id=-1, - mask_chunk_predictor=None, - target_label_length=None): - h = hidden - context = h.transpose(1, 2) - queries = self.pad(context) - memory = self.cif_conv1d(queries) - output = memory + context - output = self.dropout(output) - output = output.transpose(1, 2) - output = torch.relu(output) - output = self.cif_output(output) - alphas = torch.sigmoid(output) - if mask is not None: - alphas = alphas * mask.transpose(-1, -2).float() - if mask_chunk_predictor is not None: - alphas = alphas * mask_chunk_predictor - alphas = alphas.squeeze(-1) - if target_label_length is not None: - target_length = target_label_length - elif target_label is not None: - target_length = (target_label != ignore_id).float().sum(-1) - else: - target_length = None - token_num = alphas.sum(-1) - if target_length is not None: - alphas *= (target_length / token_num)[:, None].repeat( - 1, alphas.size(1)) - acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold) - return acoustic_embeds, token_num, alphas, cif_peak - - def gen_frame_alignments(self, - alphas: torch.Tensor = None, - memory_sequence_length: torch.Tensor = None, - is_training: bool = True, - dtype: torch.dtype = torch.float32): - batch_size, maximum_length = alphas.size() - int_type = torch.int32 - token_num = torch.round(torch.sum(alphas, dim=1)).type(int_type) - - max_token_num = torch.max(token_num).item() - - alphas_cumsum = torch.cumsum(alphas, dim=1) - alphas_cumsum = torch.floor(alphas_cumsum).type(int_type) - alphas_cumsum = torch.tile(alphas_cumsum[:, None, :], - [1, max_token_num, 1]) - - index = torch.ones([batch_size, max_token_num], dtype=int_type) - index = 
torch.cumsum(index, dim=1) - index = torch.tile(index[:, :, None], [1, 1, maximum_length]) - - index_div = torch.floor(torch.divide(alphas_cumsum, - index)).type(int_type) - index_div_bool_zeros = index_div.eq(0) - index_div_bool_zeros_count = torch.sum( - index_div_bool_zeros, dim=-1) + 1 - index_div_bool_zeros_count = torch.clip(index_div_bool_zeros_count, 0, - memory_sequence_length.max()) - token_num_mask = (~make_pad_mask(token_num, maxlen=max_token_num)).to( - token_num.device) - index_div_bool_zeros_count *= token_num_mask - - index_div_bool_zeros_count_tile = torch.tile( - index_div_bool_zeros_count[:, :, None], [1, 1, maximum_length]) - ones = torch.ones_like(index_div_bool_zeros_count_tile) - zeros = torch.zeros_like(index_div_bool_zeros_count_tile) - ones = torch.cumsum(ones, dim=2) - cond = index_div_bool_zeros_count_tile == ones - index_div_bool_zeros_count_tile = torch.where(cond, zeros, ones) - - index_div_bool_zeros_count_tile_bool = index_div_bool_zeros_count_tile.type( - torch.bool) - index_div_bool_zeros_count_tile = 1 - index_div_bool_zeros_count_tile_bool.type( - int_type) - index_div_bool_zeros_count_tile_out = torch.sum( - index_div_bool_zeros_count_tile, dim=1) - index_div_bool_zeros_count_tile_out = index_div_bool_zeros_count_tile_out.type( - int_type) - predictor_mask = (~make_pad_mask( - memory_sequence_length, - maxlen=memory_sequence_length.max())).type(int_type).to( - memory_sequence_length.device) # noqa: * - index_div_bool_zeros_count_tile_out = index_div_bool_zeros_count_tile_out * predictor_mask - return index_div_bool_zeros_count_tile_out.detach( - ), index_div_bool_zeros_count.detach() - - -def cif(hidden, alphas, threshold): - batch_size, len_time, hidden_size = hidden.size() - - # loop varss - integrate = torch.zeros([batch_size], device=hidden.device) - frame = torch.zeros([batch_size, hidden_size], device=hidden.device) - # intermediate vars along time - list_fires = [] - list_frames = [] - - for t in range(len_time): - alpha = alphas[:, t] - distribution_completion = torch.ones([batch_size], - device=hidden.device) - integrate - - integrate += alpha - list_fires.append(integrate) - - fire_place = integrate >= threshold - integrate = torch.where( - fire_place, - integrate - torch.ones([batch_size], device=hidden.device), - integrate) - cur = torch.where(fire_place, distribution_completion, alpha) - remainds = alpha - cur - - frame += cur[:, None] * hidden[:, t, :] - list_frames.append(frame) - frame = torch.where(fire_place[:, None].repeat(1, hidden_size), - remainds[:, None] * hidden[:, t, :], frame) - - fires = torch.stack(list_fires, 1) - frames = torch.stack(list_frames, 1) - list_ls = [] - len_labels = torch.round(alphas.sum(-1)).int() - max_label_len = len_labels.max() - for b in range(batch_size): - fire = fires[b, :] - ls = torch.index_select(frames[b, :, :], 0, - torch.nonzero(fire >= threshold).squeeze()) - pad_l = torch.zeros([max_label_len - ls.size(0), hidden_size], - device=hidden.device) - list_ls.append(torch.cat([ls, pad_l], 0)) - return torch.stack(list_ls, 0), fires diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/attention.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/attention.py deleted file 
mode 100644 index 53766246..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/attention.py +++ /dev/null @@ -1,680 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. -"""Multi-Head Attention layer definition.""" - -import logging -import math - -import numpy -import torch -from torch import nn - -torch.set_printoptions(profile='full', precision=1) - - -class MultiHeadedAttention(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - - """ - - def __init__(self, n_head, n_feat, dropout_rate): - """Construct an MultiHeadedAttention object.""" - super(MultiHeadedAttention, self).__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.attn = None - self.dropout = nn.Dropout(p=dropout_rate) - - def forward_qkv(self, query, key, value): - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention(self, value, scores, mask): - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). - - """ - n_batch = value.size(0) - if mask is not None: - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - min_value = float( - numpy.finfo(torch.tensor( - 0, dtype=scores.dtype).numpy().dtype).min) - scores = scores.masked_fill(mask, min_value) - self.attn = torch.softmax( - scores, dim=-1).masked_fill(mask, - 0.0) # (batch, head, time1, time2) - else: - self.attn = torch.softmax( - scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(self.attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query, key, value, mask): - """Compute scaled dot product attention. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). 
- mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - - """ - q, k, v = self.forward_qkv(query, key, value) - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - return self.forward_attention(v, scores, mask) - - -class MultiHeadedAttentionSANM(nn.Module): - """Multi-Head Attention layer. - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - - """ - - def __init__(self, - n_head, - n_feat, - dropout_rate, - kernel_size, - sanm_shfit=0): - """Construct an MultiHeadedAttention object.""" - super(MultiHeadedAttentionSANM, self).__init__() - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.attn = None - self.dropout = nn.Dropout(p=dropout_rate) - - self.fsmn_block = nn.Conv1d( - n_feat, - n_feat, - kernel_size, - stride=1, - padding=0, - groups=n_feat, - bias=False) - # padding - left_padding = (kernel_size - 1) // 2 - if sanm_shfit > 0: - left_padding = left_padding + sanm_shfit - right_padding = kernel_size - 1 - left_padding - self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) - - def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None): - ''' - :param x: (#batch, time1, size). - :param mask: Mask tensor (#batch, 1, time) - :return: - ''' - # b, t, d = inputs.size() - mask = mask[:, 0, :, None] - if mask_shfit_chunk is not None: - mask = mask * mask_shfit_chunk - inputs *= mask - x = inputs.transpose(1, 2) - x = self.pad_fn(x) - x = self.fsmn_block(x) - x = x.transpose(1, 2) - x += inputs - x = self.dropout(x) - return x * mask - - def forward_qkv(self, query, key, value): - """Transform query, key and value. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - - Returns: - torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). - torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). - torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). - - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) # (batch, head, time1, d_k) - k = k.transpose(1, 2) # (batch, head, time2, d_k) - v = v.transpose(1, 2) # (batch, head, time2, d_k) - - return q, k, v - - def forward_attention(self, - value, - scores, - mask, - mask_att_chunk_encoder=None): - """Compute attention context vector. - - Args: - value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). - scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). - mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). - - Returns: - torch.Tensor: Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). 
- - """ - n_batch = value.size(0) - if mask is not None: - if mask_att_chunk_encoder is not None: - mask = mask * mask_att_chunk_encoder - - mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) - - min_value = float( - numpy.finfo(torch.tensor( - 0, dtype=scores.dtype).numpy().dtype).min) - scores = scores.masked_fill(mask, min_value) - self.attn = torch.softmax( - scores, dim=-1).masked_fill(mask, - 0.0) # (batch, head, time1, time2) - else: - self.attn = torch.softmax( - scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(self.attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = (x.transpose(1, 2).contiguous().view(n_batch, -1, - self.h * self.d_k) - ) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, - query, - key, - value, - mask, - mask_shfit_chunk=None, - mask_att_chunk_encoder=None): - """Compute scaled dot product attention. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - - """ - fsmn_memory = self.forward_fsmn(value, mask, mask_shfit_chunk) - q, k, v = self.forward_qkv(query, key, value) - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - att_outs = self.forward_attention(v, scores, mask, - mask_att_chunk_encoder) - return att_outs + fsmn_memory - - -class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding (old version). - - Details can be found in https://github.com/espnet/espnet/pull/2816. - - Paper: https://arxiv.org/abs/1901.02860 - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - - """ - - def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - self.zero_triu = zero_triu - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x): - """Compute relative positional encoding. - - Args: - x (torch.Tensor): Input tensor (batch, head, time1, time2). - - Returns: - torch.Tensor: Output tensor. - - """ - zero_pad = torch.zeros((*x.size()[:3], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if self.zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query, key, value, pos_emb, mask): - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). 
- key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - pos_emb (torch.Tensor): Positional embedding tensor (#batch, time1, size). - mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time1) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask) - - -class LegacyRelPositionMultiHeadedAttentionSANM(MultiHeadedAttentionSANM): - """Multi-Head Attention layer with relative position encoding (old version). - - Details can be found in https://github.com/espnet/espnet/pull/2816. - - Paper: https://arxiv.org/abs/1901.02860 - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - - """ - - def __init__(self, - n_head, - n_feat, - dropout_rate, - zero_triu=False, - kernel_size=15, - sanm_shfit=0): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate, kernel_size, sanm_shfit) - self.zero_triu = zero_triu - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x): - """Compute relative positional encoding. - - Args: - x (torch.Tensor): Input tensor (batch, head, time1, time2). - - Returns: - torch.Tensor: Output tensor. - - """ - zero_pad = torch.zeros((*x.size()[:3], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as(x) - - if self.zero_triu: - ones = torch.ones((x.size(2), x.size(3))) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query, key, value, pos_emb, mask): - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - pos_emb (torch.Tensor): Positional embedding tensor (#batch, time1, size). 
- mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - - """ - fsmn_memory = self.forward_fsmn(value, mask) - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time1) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - att_outs = self.forward_attention(v, scores, mask) - return att_outs + fsmn_memory - - -class RelPositionMultiHeadedAttention(MultiHeadedAttention): - """Multi-Head Attention layer with relative position encoding (new implementation). - - Details can be found in https://github.com/espnet/espnet/pull/2816. - - Paper: https://arxiv.org/abs/1901.02860 - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - - """ - - def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate) - self.zero_triu = zero_triu - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x): - """Compute relative positional encoding. - - Args: - x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). - time1 means the length of query vector. - - Returns: - torch.Tensor: Output tensor. - - """ - zero_pad = torch.zeros((*x.size()[:3], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as( - x)[:, :, :, :x.size(-1) // 2 - + 1] # only keep the positions from 0 to time2 - - if self.zero_triu: - ones = torch.ones((x.size(2), x.size(3)), device=x.device) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query, key, value, pos_emb, mask): - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, 2*time1-1, size). 
- mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - - """ - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, 2*time1-1) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - return self.forward_attention(v, scores, mask) - - -class RelPositionMultiHeadedAttentionSANM(MultiHeadedAttentionSANM): - """Multi-Head Attention layer with relative position encoding (new implementation). - - Details can be found in https://github.com/espnet/espnet/pull/2816. - - Paper: https://arxiv.org/abs/1901.02860 - - Args: - n_head (int): The number of heads. - n_feat (int): The number of features. - dropout_rate (float): Dropout rate. - zero_triu (bool): Whether to zero the upper triangular part of attention matrix. - - """ - - def __init__(self, - n_head, - n_feat, - dropout_rate, - zero_triu=False, - kernel_size=15, - sanm_shfit=0): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head, n_feat, dropout_rate, kernel_size, sanm_shfit) - self.zero_triu = zero_triu - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) - torch.nn.init.xavier_uniform_(self.pos_bias_u) - torch.nn.init.xavier_uniform_(self.pos_bias_v) - - def rel_shift(self, x): - """Compute relative positional encoding. - - Args: - x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). - time1 means the length of query vector. - - Returns: - torch.Tensor: Output tensor. - - """ - zero_pad = torch.zeros((*x.size()[:3], 1), - device=x.device, - dtype=x.dtype) - x_padded = torch.cat([zero_pad, x], dim=-1) - - x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) - x = x_padded[:, :, 1:].view_as( - x)[:, :, :, :x.size(-1) // 2 - + 1] # only keep the positions from 0 to time2 - - if self.zero_triu: - ones = torch.ones((x.size(2), x.size(3)), device=x.device) - x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] - - return x - - def forward(self, query, key, value, pos_emb, mask): - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - - Args: - query (torch.Tensor): Query tensor (#batch, time1, size). - key (torch.Tensor): Key tensor (#batch, time2, size). - value (torch.Tensor): Value tensor (#batch, time2, size). - pos_emb (torch.Tensor): Positional embedding tensor - (#batch, 2*time1-1, size). 
- mask (torch.Tensor): Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - - Returns: - torch.Tensor: Output tensor (#batch, time1, d_model). - - """ - fsmn_memory = self.forward_fsmn(value, mask) - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, 2*time1-1) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - matrix_bd = self.rel_shift(matrix_bd) - - scores = (matrix_ac + matrix_bd) / math.sqrt( - self.d_k) # (batch, head, time1, time2) - - att_outs = self.forward_attention(v, scores, mask) - return att_outs + fsmn_memory diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/encoder_layer.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/encoder_layer.py deleted file mode 100644 index 91466b05..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/nets/pytorch_backend/transformer/encoder_layer.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. -"""Encoder self-attention layer definition.""" - -import torch -from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm -from torch import nn - - -class EncoderLayer(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance - can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - stochastic_depth_rate (float): Proability to skip this layer. - During training, the layer may skip residual computation and return input - as-is with given probability. 
- """ - - def __init__( - self, - size, - self_attn, - feed_forward, - dropout_rate, - normalize_before=True, - concat_after=False, - stochastic_depth_rate=0.0, - ): - """Construct an EncoderLayer object.""" - super(EncoderLayer, self).__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = LayerNorm(size) - self.norm2 = LayerNorm(size) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - self.stochastic_depth_rate = stochastic_depth_rate - - def forward(self, x, mask, cache=None): - """Compute encoded features. - - Args: - x_input (torch.Tensor): Input tensor (#batch, time, size). - mask (torch.Tensor): Mask tensor for the input (#batch, time). - cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time). - - """ - skip_layer = False - # with stochastic depth, residual connection `x + f(x)` becomes - # `x <- x + 1 / (1 - p) * f(x)` at training time. - stoch_layer_coeff = 1.0 - if self.training and self.stochastic_depth_rate > 0: - skip_layer = torch.rand(1).item() < self.stochastic_depth_rate - stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) - - if skip_layer: - if cache is not None: - x = torch.cat([cache, x], dim=1) - return x, mask - - residual = x - if self.normalize_before: - x = self.norm1(x) - - if cache is None: - x_q = x - else: - assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) - x_q = x[:, -1:, :] - residual = residual[:, -1:, :] - mask = None if mask is None else mask[:, -1:, :] - - if self.concat_after: - x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1) - x = residual + stoch_layer_coeff * self.concat_linear(x_concat) - else: - x = residual + stoch_layer_coeff * self.dropout( - self.self_attn(x_q, x, x, mask)) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, mask - - -class EncoderLayerChunk(nn.Module): - """Encoder layer module. - - Args: - size (int): Input dimension. - self_attn (torch.nn.Module): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance - can be used as the argument. - feed_forward (torch.nn.Module): Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - dropout_rate (float): Dropout rate. - normalize_before (bool): Whether to use layer_norm before the first block. - concat_after (bool): Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - stochastic_depth_rate (float): Proability to skip this layer. - During training, the layer may skip residual computation and return input - as-is with given probability. 
- """ - - def __init__( - self, - size, - self_attn, - feed_forward, - dropout_rate, - normalize_before=True, - concat_after=False, - stochastic_depth_rate=0.0, - ): - """Construct an EncoderLayer object.""" - super(EncoderLayerChunk, self).__init__() - self.self_attn = self_attn - self.feed_forward = feed_forward - self.norm1 = LayerNorm(size) - self.norm2 = LayerNorm(size) - self.dropout = nn.Dropout(dropout_rate) - self.size = size - self.normalize_before = normalize_before - self.concat_after = concat_after - if self.concat_after: - self.concat_linear = nn.Linear(size + size, size) - self.stochastic_depth_rate = stochastic_depth_rate - - def forward(self, - x, - mask, - cache=None, - mask_shfit_chunk=None, - mask_att_chunk_encoder=None): - """Compute encoded features. - - Args: - x_input (torch.Tensor): Input tensor (#batch, time, size). - mask (torch.Tensor): Mask tensor for the input (#batch, time). - cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). - - Returns: - torch.Tensor: Output tensor (#batch, time, size). - torch.Tensor: Mask tensor (#batch, time). - - """ - skip_layer = False - # with stochastic depth, residual connection `x + f(x)` becomes - # `x <- x + 1 / (1 - p) * f(x)` at training time. - stoch_layer_coeff = 1.0 - if self.training and self.stochastic_depth_rate > 0: - skip_layer = torch.rand(1).item() < self.stochastic_depth_rate - stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) - - if skip_layer: - if cache is not None: - x = torch.cat([cache, x], dim=1) - return x, mask - - residual = x - if self.normalize_before: - x = self.norm1(x) - - if cache is None: - x_q = x - else: - assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) - x_q = x[:, -1:, :] - residual = residual[:, -1:, :] - mask = None if mask is None else mask[:, -1:, :] - - if self.concat_after: - x_concat = torch.cat( - (x, - self.self_attn( - x_q, - x, - x, - mask, - mask_shfit_chunk=mask_shfit_chunk, - mask_att_chunk_encoder=mask_att_chunk_encoder)), - dim=-1) - x = residual + stoch_layer_coeff * self.concat_linear(x_concat) - else: - x = residual + stoch_layer_coeff * self.dropout( - self.self_attn( - x_q, - x, - x, - mask, - mask_shfit_chunk=mask_shfit_chunk, - mask_att_chunk_encoder=mask_att_chunk_encoder)) - if not self.normalize_before: - x = self.norm1(x) - - residual = x - if self.normalize_before: - x = self.norm2(x) - x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) - if not self.normalize_before: - x = self.norm2(x) - - if cache is not None: - x = torch.cat([cache, x], dim=1) - - return x, mask, None, mask_shfit_chunk, mask_att_chunk_encoder diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/tasks/__init__.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/tasks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modelscope/pipelines/audio/asr/asr_engine/espnet/tasks/asr.py b/modelscope/pipelines/audio/asr/asr_engine/espnet/tasks/asr.py deleted file mode 100644 index 7419abd4..00000000 --- a/modelscope/pipelines/audio/asr/asr_engine/espnet/tasks/asr.py +++ /dev/null @@ -1,890 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# Part of the implementation is borrowed from espnet/espnet. 
-import argparse -import logging -import os -from pathlib import Path -from typing import Callable, Collection, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -import yaml -from espnet2.asr.ctc import CTC -from espnet2.asr.decoder.abs_decoder import AbsDecoder -from espnet2.asr.decoder.mlm_decoder import MLMDecoder -from espnet2.asr.decoder.rnn_decoder import RNNDecoder -from espnet2.asr.decoder.transformer_decoder import \ - DynamicConvolution2DTransformerDecoder # noqa: H301 -from espnet2.asr.decoder.transformer_decoder import \ - LightweightConvolution2DTransformerDecoder # noqa: H301 -from espnet2.asr.decoder.transformer_decoder import \ - LightweightConvolutionTransformerDecoder # noqa: H301 -from espnet2.asr.decoder.transformer_decoder import ( - DynamicConvolutionTransformerDecoder, TransformerDecoder) -from espnet2.asr.encoder.abs_encoder import AbsEncoder -from espnet2.asr.encoder.contextual_block_conformer_encoder import \ - ContextualBlockConformerEncoder # noqa: H301 -from espnet2.asr.encoder.contextual_block_transformer_encoder import \ - ContextualBlockTransformerEncoder # noqa: H301 -from espnet2.asr.encoder.hubert_encoder import (FairseqHubertEncoder, - FairseqHubertPretrainEncoder) -from espnet2.asr.encoder.longformer_encoder import LongformerEncoder -from espnet2.asr.encoder.rnn_encoder import RNNEncoder -from espnet2.asr.encoder.transformer_encoder import TransformerEncoder -from espnet2.asr.encoder.vgg_rnn_encoder import VGGRNNEncoder -from espnet2.asr.encoder.wav2vec2_encoder import FairSeqWav2Vec2Encoder -from espnet2.asr.espnet_model import ESPnetASRModel -from espnet2.asr.frontend.abs_frontend import AbsFrontend -from espnet2.asr.frontend.default import DefaultFrontend -from espnet2.asr.frontend.fused import FusedFrontends -from espnet2.asr.frontend.s3prl import S3prlFrontend -from espnet2.asr.frontend.windowing import SlidingWindow -from espnet2.asr.maskctc_model import MaskCTCModel -from espnet2.asr.postencoder.abs_postencoder import AbsPostEncoder -from espnet2.asr.postencoder.hugging_face_transformers_postencoder import \ - HuggingFaceTransformersPostEncoder # noqa: H301 -from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder -from espnet2.asr.preencoder.linear import LinearProjection -from espnet2.asr.preencoder.sinc import LightweightSincConvs -from espnet2.asr.specaug.abs_specaug import AbsSpecAug -from espnet2.asr.specaug.specaug import SpecAug -from espnet2.asr.transducer.joint_network import JointNetwork -from espnet2.asr.transducer.transducer_decoder import TransducerDecoder -from espnet2.layers.abs_normalize import AbsNormalize -from espnet2.layers.global_mvn import GlobalMVN -from espnet2.layers.utterance_mvn import UtteranceMVN -from espnet2.tasks.abs_task import AbsTask -from espnet2.text.phoneme_tokenizer import g2p_choices -from espnet2.torch_utils.initialize import initialize -from espnet2.train.abs_espnet_model import AbsESPnetModel -from espnet2.train.class_choices import ClassChoices -from espnet2.train.collate_fn import CommonCollateFn -from espnet2.train.preprocessor import CommonPreprocessor -from espnet2.train.trainer import Trainer -from espnet2.utils.get_default_kwargs import get_default_kwargs -from espnet2.utils.nested_dict_action import NestedDictAction -from espnet2.utils.types import (float_or_none, int_or_none, str2bool, - str_or_none) -from typeguard import check_argument_types, check_return_type - -from ..asr.decoder.transformer_decoder import (ParaformerDecoder, - ParaformerDecoderBertEmbed) -from 
..asr.encoder.conformer_encoder import ConformerEncoder, SANMEncoder_v2 -from ..asr.encoder.sanm_encoder import SANMEncoder, SANMEncoderChunk -from ..asr.espnet_model import AEDStreaming -from ..asr.espnet_model_paraformer import Paraformer, ParaformerBertEmbed -from ..nets.pytorch_backend.cif_utils.cif import cif_predictor - -# FIXME(wjm): suggested by fairseq, We need to setup root logger before importing any fairseq libraries. -logging.basicConfig( - level='INFO', - format=f"[{os.uname()[1].split('.')[0]}]" - f' %(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', -) -# FIXME(wjm): create logger to set level, unset __name__ for different files to share the same logger -logger = logging.getLogger() - -frontend_choices = ClassChoices( - name='frontend', - classes=dict( - default=DefaultFrontend, - sliding_window=SlidingWindow, - s3prl=S3prlFrontend, - fused=FusedFrontends, - ), - type_check=AbsFrontend, - default='default', -) -specaug_choices = ClassChoices( - name='specaug', - classes=dict(specaug=SpecAug, ), - type_check=AbsSpecAug, - default=None, - optional=True, -) -normalize_choices = ClassChoices( - 'normalize', - classes=dict( - global_mvn=GlobalMVN, - utterance_mvn=UtteranceMVN, - ), - type_check=AbsNormalize, - default='utterance_mvn', - optional=True, -) -model_choices = ClassChoices( - 'model', - classes=dict( - espnet=ESPnetASRModel, - maskctc=MaskCTCModel, - paraformer=Paraformer, - paraformer_bert_embed=ParaformerBertEmbed, - aedstreaming=AEDStreaming, - ), - type_check=AbsESPnetModel, - default='espnet', -) -preencoder_choices = ClassChoices( - name='preencoder', - classes=dict( - sinc=LightweightSincConvs, - linear=LinearProjection, - ), - type_check=AbsPreEncoder, - default=None, - optional=True, -) -encoder_choices = ClassChoices( - 'encoder', - classes=dict( - conformer=ConformerEncoder, - transformer=TransformerEncoder, - contextual_block_transformer=ContextualBlockTransformerEncoder, - contextual_block_conformer=ContextualBlockConformerEncoder, - vgg_rnn=VGGRNNEncoder, - rnn=RNNEncoder, - wav2vec2=FairSeqWav2Vec2Encoder, - hubert=FairseqHubertEncoder, - hubert_pretrain=FairseqHubertPretrainEncoder, - longformer=LongformerEncoder, - sanm=SANMEncoder, - sanm_v2=SANMEncoder_v2, - sanm_chunk=SANMEncoderChunk, - ), - type_check=AbsEncoder, - default='rnn', -) -postencoder_choices = ClassChoices( - name='postencoder', - classes=dict( - hugging_face_transformers=HuggingFaceTransformersPostEncoder, ), - type_check=AbsPostEncoder, - default=None, - optional=True, -) -decoder_choices = ClassChoices( - 'decoder', - classes=dict( - transformer=TransformerDecoder, - lightweight_conv=LightweightConvolutionTransformerDecoder, - lightweight_conv2d=LightweightConvolution2DTransformerDecoder, - dynamic_conv=DynamicConvolutionTransformerDecoder, - dynamic_conv2d=DynamicConvolution2DTransformerDecoder, - rnn=RNNDecoder, - transducer=TransducerDecoder, - mlm=MLMDecoder, - paraformer_decoder=ParaformerDecoder, - paraformer_decoder_bert_embed=ParaformerDecoderBertEmbed, - ), - type_check=AbsDecoder, - default='rnn', -) - -predictor_choices = ClassChoices( - name='predictor', - classes=dict( - cif_predictor=cif_predictor, - ctc_predictor=None, - ), - type_check=None, - default='cif_predictor', - optional=True, -) - - -class ASRTask(AbsTask): - # If you need more than one optimizers, change this value - num_optimizers: int = 1 - - # Add variable objects configurations - class_choices_list = [ - # --frontend and --frontend_conf - frontend_choices, - # --specaug and --specaug_conf 
- specaug_choices, - # --normalize and --normalize_conf - normalize_choices, - # --model and --model_conf - model_choices, - # --preencoder and --preencoder_conf - preencoder_choices, - # --encoder and --encoder_conf - encoder_choices, - # --postencoder and --postencoder_conf - postencoder_choices, - # --decoder and --decoder_conf - decoder_choices, - ] - - # If you need to modify train() or eval() procedures, change Trainer class here - trainer = Trainer - - @classmethod - def add_task_arguments(cls, parser: argparse.ArgumentParser): - group = parser.add_argument_group(description='Task related') - - # NOTE(kamo): add_arguments(..., required=True) can't be used - # to provide --print_config mode. Instead of it, do as - required = parser.get_default('required') - required += ['token_list'] - - group.add_argument( - '--token_list', - type=str_or_none, - default=None, - help='A text mapping int-id to token', - ) - group.add_argument( - '--init', - type=lambda x: str_or_none(x.lower()), - default=None, - help='The initialization method', - choices=[ - 'chainer', - 'xavier_uniform', - 'xavier_normal', - 'kaiming_uniform', - 'kaiming_normal', - None, - ], - ) - - group.add_argument( - '--input_size', - type=int_or_none, - default=None, - help='The number of input dimension of the feature', - ) - - group.add_argument( - '--ctc_conf', - action=NestedDictAction, - default=get_default_kwargs(CTC), - help='The keyword arguments for CTC class.', - ) - group.add_argument( - '--joint_net_conf', - action=NestedDictAction, - default=None, - help='The keyword arguments for joint network class.', - ) - - group = parser.add_argument_group(description='Preprocess related') - group.add_argument( - '--use_preprocessor', - type=str2bool, - default=True, - help='Apply preprocessing to data or not', - ) - group.add_argument( - '--token_type', - type=str, - default='bpe', - choices=['bpe', 'char', 'word', 'phn'], - help='The text will be tokenized ' - 'in the specified level token', - ) - group.add_argument( - '--bpemodel', - type=str_or_none, - default=None, - help='The model file of sentencepiece', - ) - parser.add_argument( - '--non_linguistic_symbols', - type=str_or_none, - help='non_linguistic_symbols file path', - ) - parser.add_argument( - '--cleaner', - type=str_or_none, - choices=[None, 'tacotron', 'jaconv', 'vietnamese'], - default=None, - help='Apply text cleaning', - ) - parser.add_argument( - '--g2p', - type=str_or_none, - choices=g2p_choices, - default=None, - help='Specify g2p method if --token_type=phn', - ) - parser.add_argument( - '--speech_volume_normalize', - type=float_or_none, - default=None, - help='Scale the maximum amplitude to the given value.', - ) - parser.add_argument( - '--rir_scp', - type=str_or_none, - default=None, - help='The file path of rir scp file.', - ) - parser.add_argument( - '--rir_apply_prob', - type=float, - default=1.0, - help='THe probability for applying RIR convolution.', - ) - parser.add_argument( - '--noise_scp', - type=str_or_none, - default=None, - help='The file path of noise scp file.', - ) - parser.add_argument( - '--noise_apply_prob', - type=float, - default=1.0, - help='The probability applying Noise adding.', - ) - parser.add_argument( - '--noise_db_range', - type=str, - default='13_15', - help='The range of noise decibel level.', - ) - - for class_choices in cls.class_choices_list: - # Append -- and --_conf. - # e.g. 
--encoder and --encoder_conf - class_choices.add_arguments(group) - - @classmethod - def build_collate_fn( - cls, args: argparse.Namespace, train: bool - ) -> Callable[[Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[ - List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() - # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol - return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) - - @classmethod - def build_preprocess_fn( - cls, args: argparse.Namespace, train: bool - ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() - if args.use_preprocessor: - retval = CommonPreprocessor( - train=train, - token_type=args.token_type, - token_list=args.token_list, - bpemodel=args.bpemodel, - non_linguistic_symbols=args.non_linguistic_symbols, - text_cleaner=args.cleaner, - g2p_type=args.g2p, - # NOTE(kamo): Check attribute existence for backward compatibility - rir_scp=args.rir_scp if hasattr(args, 'rir_scp') else None, - rir_apply_prob=args.rir_apply_prob if hasattr( - args, 'rir_apply_prob') else 1.0, - noise_scp=args.noise_scp - if hasattr(args, 'noise_scp') else None, - noise_apply_prob=args.noise_apply_prob if hasattr( - args, 'noise_apply_prob') else 1.0, - noise_db_range=args.noise_db_range if hasattr( - args, 'noise_db_range') else '13_15', - speech_volume_normalize=args.speech_volume_normalize - if hasattr(args, 'rir_scp') else None, - ) - else: - retval = None - assert check_return_type(retval) - return retval - - @classmethod - def required_data_names(cls, - train: bool = True, - inference: bool = False) -> Tuple[str, ...]: - if not inference: - retval = ('speech', 'text') - else: - # Recognition mode - retval = ('speech', ) - return retval - - @classmethod - def optional_data_names(cls, - train: bool = True, - inference: bool = False) -> Tuple[str, ...]: - retval = () - assert check_return_type(retval) - return retval - - @classmethod - def build_model(cls, args: argparse.Namespace) -> ESPnetASRModel: - assert check_argument_types() - if isinstance(args.token_list, str): - with open(args.token_list, encoding='utf-8') as f: - token_list = [line.rstrip() for line in f] - - # Overwriting token_list to keep it as "portable". - args.token_list = list(token_list) - elif isinstance(args.token_list, (tuple, list)): - token_list = list(args.token_list) - else: - raise RuntimeError('token_list must be str or list') - vocab_size = len(token_list) - logger.info(f'Vocabulary size: {vocab_size }') - - # 1. frontend - if args.input_size is None: - # Extract features in the model - frontend_class = frontend_choices.get_class(args.frontend) - frontend = frontend_class(**args.frontend_conf) - input_size = frontend.output_size() - else: - # Give features from data-loader - args.frontend = None - args.frontend_conf = {} - frontend = None - input_size = args.input_size - - # 2. Data augmentation for spectrogram - if args.specaug is not None: - specaug_class = specaug_choices.get_class(args.specaug) - specaug = specaug_class(**args.specaug_conf) - else: - specaug = None - - # 3. Normalization layer - if args.normalize is not None: - normalize_class = normalize_choices.get_class(args.normalize) - normalize = normalize_class(**args.normalize_conf) - else: - normalize = None - - # 4. 
Pre-encoder input block - # NOTE(kan-bayashi): Use getattr to keep the compatibility - if getattr(args, 'preencoder', None) is not None: - preencoder_class = preencoder_choices.get_class(args.preencoder) - preencoder = preencoder_class(**args.preencoder_conf) - input_size = preencoder.output_size() - else: - preencoder = None - - # 4. Encoder - encoder_class = encoder_choices.get_class(args.encoder) - encoder = encoder_class(input_size=input_size, **args.encoder_conf) - - # 5. Post-encoder block - # NOTE(kan-bayashi): Use getattr to keep the compatibility - encoder_output_size = encoder.output_size() - if getattr(args, 'postencoder', None) is not None: - postencoder_class = postencoder_choices.get_class(args.postencoder) - postencoder = postencoder_class( - input_size=encoder_output_size, **args.postencoder_conf) - encoder_output_size = postencoder.output_size() - else: - postencoder = None - - # 5. Decoder - decoder_class = decoder_choices.get_class(args.decoder) - - if args.decoder == 'transducer': - decoder = decoder_class( - vocab_size, - embed_pad=0, - **args.decoder_conf, - ) - - joint_network = JointNetwork( - vocab_size, - encoder.output_size(), - decoder.dunits, - **args.joint_net_conf, - ) - else: - decoder = decoder_class( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - **args.decoder_conf, - ) - - joint_network = None - - # 6. CTC - ctc = CTC( - odim=vocab_size, - encoder_output_size=encoder_output_size, - **args.ctc_conf) - - # 7. Build model - try: - model_class = model_choices.get_class(args.model) - except AttributeError: - model_class = model_choices.get_class('espnet') - model = model_class( - vocab_size=vocab_size, - frontend=frontend, - specaug=specaug, - normalize=normalize, - preencoder=preencoder, - encoder=encoder, - postencoder=postencoder, - decoder=decoder, - ctc=ctc, - joint_network=joint_network, - token_list=token_list, - **args.model_conf, - ) - - # FIXME(kamo): Should be done in model? - # 8. Initialize - if args.init is not None: - initialize(model, args.init) - - assert check_return_type(model) - return model - - -class ASRTaskNAR(AbsTask): - # If you need more than one optimizers, change this value - num_optimizers: int = 1 - - # Add variable objects configurations - class_choices_list = [ - # --frontend and --frontend_conf - frontend_choices, - # --specaug and --specaug_conf - specaug_choices, - # --normalize and --normalize_conf - normalize_choices, - # --model and --model_conf - model_choices, - # --preencoder and --preencoder_conf - preencoder_choices, - # --encoder and --encoder_conf - encoder_choices, - # --postencoder and --postencoder_conf - postencoder_choices, - # --decoder and --decoder_conf - decoder_choices, - # --predictor and --predictor_conf - predictor_choices, - ] - - # If you need to modify train() or eval() procedures, change Trainer class here - trainer = Trainer - - @classmethod - def add_task_arguments(cls, parser: argparse.ArgumentParser): - group = parser.add_argument_group(description='Task related') - - # NOTE(kamo): add_arguments(..., required=True) can't be used - # to provide --print_config mode. 
Instead of it, do as - required = parser.get_default('required') - required += ['token_list'] - - group.add_argument( - '--token_list', - type=str_or_none, - default=None, - help='A text mapping int-id to token', - ) - group.add_argument( - '--init', - type=lambda x: str_or_none(x.lower()), - default=None, - help='The initialization method', - choices=[ - 'chainer', - 'xavier_uniform', - 'xavier_normal', - 'kaiming_uniform', - 'kaiming_normal', - None, - ], - ) - - group.add_argument( - '--input_size', - type=int_or_none, - default=None, - help='The number of input dimension of the feature', - ) - - group.add_argument( - '--ctc_conf', - action=NestedDictAction, - default=get_default_kwargs(CTC), - help='The keyword arguments for CTC class.', - ) - group.add_argument( - '--joint_net_conf', - action=NestedDictAction, - default=None, - help='The keyword arguments for joint network class.', - ) - - group = parser.add_argument_group(description='Preprocess related') - group.add_argument( - '--use_preprocessor', - type=str2bool, - default=True, - help='Apply preprocessing to data or not', - ) - group.add_argument( - '--token_type', - type=str, - default='bpe', - choices=['bpe', 'char', 'word', 'phn'], - help='The text will be tokenized ' - 'in the specified level token', - ) - group.add_argument( - '--bpemodel', - type=str_or_none, - default=None, - help='The model file of sentencepiece', - ) - parser.add_argument( - '--non_linguistic_symbols', - type=str_or_none, - help='non_linguistic_symbols file path', - ) - parser.add_argument( - '--cleaner', - type=str_or_none, - choices=[None, 'tacotron', 'jaconv', 'vietnamese'], - default=None, - help='Apply text cleaning', - ) - parser.add_argument( - '--g2p', - type=str_or_none, - choices=g2p_choices, - default=None, - help='Specify g2p method if --token_type=phn', - ) - parser.add_argument( - '--speech_volume_normalize', - type=float_or_none, - default=None, - help='Scale the maximum amplitude to the given value.', - ) - parser.add_argument( - '--rir_scp', - type=str_or_none, - default=None, - help='The file path of rir scp file.', - ) - parser.add_argument( - '--rir_apply_prob', - type=float, - default=1.0, - help='THe probability for applying RIR convolution.', - ) - parser.add_argument( - '--noise_scp', - type=str_or_none, - default=None, - help='The file path of noise scp file.', - ) - parser.add_argument( - '--noise_apply_prob', - type=float, - default=1.0, - help='The probability applying Noise adding.', - ) - parser.add_argument( - '--noise_db_range', - type=str, - default='13_15', - help='The range of noise decibel level.', - ) - - for class_choices in cls.class_choices_list: - # Append -- and --_conf. - # e.g. 
--encoder and --encoder_conf - class_choices.add_arguments(group) - - @classmethod - def build_collate_fn( - cls, args: argparse.Namespace, train: bool - ) -> Callable[[Collection[Tuple[str, Dict[str, np.ndarray]]]], Tuple[ - List[str], Dict[str, torch.Tensor]], ]: - assert check_argument_types() - # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol - return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1) - - @classmethod - def build_preprocess_fn( - cls, args: argparse.Namespace, train: bool - ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: - assert check_argument_types() - if args.use_preprocessor: - retval = CommonPreprocessor( - train=train, - token_type=args.token_type, - token_list=args.token_list, - bpemodel=args.bpemodel, - non_linguistic_symbols=args.non_linguistic_symbols, - text_cleaner=args.cleaner, - g2p_type=args.g2p, - # NOTE(kamo): Check attribute existence for backward compatibility - rir_scp=args.rir_scp if hasattr(args, 'rir_scp') else None, - rir_apply_prob=args.rir_apply_prob if hasattr( - args, 'rir_apply_prob') else 1.0, - noise_scp=args.noise_scp - if hasattr(args, 'noise_scp') else None, - noise_apply_prob=args.noise_apply_prob if hasattr( - args, 'noise_apply_prob') else 1.0, - noise_db_range=args.noise_db_range if hasattr( - args, 'noise_db_range') else '13_15', - speech_volume_normalize=args.speech_volume_normalize - if hasattr(args, 'rir_scp') else None, - ) - else: - retval = None - assert check_return_type(retval) - return retval - - @classmethod - def required_data_names(cls, - train: bool = True, - inference: bool = False) -> Tuple[str, ...]: - if not inference: - retval = ('speech', 'text') - else: - # Recognition mode - retval = ('speech', ) - return retval - - @classmethod - def optional_data_names(cls, - train: bool = True, - inference: bool = False) -> Tuple[str, ...]: - retval = () - assert check_return_type(retval) - return retval - - @classmethod - def build_model(cls, args: argparse.Namespace): - assert check_argument_types() - if isinstance(args.token_list, str): - with open(args.token_list, encoding='utf-8') as f: - token_list = [line.rstrip() for line in f] - - # Overwriting token_list to keep it as "portable". - args.token_list = list(token_list) - elif isinstance(args.token_list, (tuple, list)): - token_list = list(args.token_list) - else: - raise RuntimeError('token_list must be str or list') - vocab_size = len(token_list) - # logger.info(f'Vocabulary size: {vocab_size }') - - # 1. frontend - if args.input_size is None: - # Extract features in the model - frontend_class = frontend_choices.get_class(args.frontend) - frontend = frontend_class(**args.frontend_conf) - input_size = frontend.output_size() - else: - # Give features from data-loader - args.frontend = None - args.frontend_conf = {} - frontend = None - input_size = args.input_size - - # 2. Data augmentation for spectrogram - if args.specaug is not None: - specaug_class = specaug_choices.get_class(args.specaug) - specaug = specaug_class(**args.specaug_conf) - else: - specaug = None - - # 3. Normalization layer - if args.normalize is not None: - normalize_class = normalize_choices.get_class(args.normalize) - normalize = normalize_class(**args.normalize_conf) - else: - normalize = None - - # 4. 
Pre-encoder input block - # NOTE(kan-bayashi): Use getattr to keep the compatibility - if getattr(args, 'preencoder', None) is not None: - preencoder_class = preencoder_choices.get_class(args.preencoder) - preencoder = preencoder_class(**args.preencoder_conf) - input_size = preencoder.output_size() - else: - preencoder = None - - # 4. Encoder - encoder_class = encoder_choices.get_class(args.encoder) - encoder = encoder_class(input_size=input_size, **args.encoder_conf) - - # 5. Post-encoder block - # NOTE(kan-bayashi): Use getattr to keep the compatibility - encoder_output_size = encoder.output_size() - if getattr(args, 'postencoder', None) is not None: - postencoder_class = postencoder_choices.get_class(args.postencoder) - postencoder = postencoder_class( - input_size=encoder_output_size, **args.postencoder_conf) - encoder_output_size = postencoder.output_size() - else: - postencoder = None - - # 5. Decoder - decoder_class = decoder_choices.get_class(args.decoder) - - if args.decoder == 'transducer': - decoder = decoder_class( - vocab_size, - embed_pad=0, - **args.decoder_conf, - ) - - joint_network = JointNetwork( - vocab_size, - encoder.output_size(), - decoder.dunits, - **args.joint_net_conf, - ) - else: - decoder = decoder_class( - vocab_size=vocab_size, - encoder_output_size=encoder_output_size, - **args.decoder_conf, - ) - - joint_network = None - - # 6. CTC - ctc = CTC( - odim=vocab_size, - encoder_output_size=encoder_output_size, - **args.ctc_conf) - - predictor_class = predictor_choices.get_class(args.predictor) - predictor = predictor_class(**args.predictor_conf) - - # 7. Build model - try: - model_class = model_choices.get_class(args.model) - except AttributeError: - model_class = model_choices.get_class('espnet') - model = model_class( - vocab_size=vocab_size, - frontend=frontend, - specaug=specaug, - normalize=normalize, - preencoder=preencoder, - encoder=encoder, - postencoder=postencoder, - decoder=decoder, - ctc=ctc, - joint_network=joint_network, - token_list=token_list, - predictor=predictor, - **args.model_conf, - ) - - # FIXME(kamo): Should be done in model? - # 8. 
Initialize - if args.init is not None: - initialize(model, args.init) - - assert check_return_type(model) - return model diff --git a/modelscope/pipelines/audio/asr/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr/asr_inference_pipeline.py deleted file mode 100644 index 20e7b6bf..00000000 --- a/modelscope/pipelines/audio/asr/asr_inference_pipeline.py +++ /dev/null @@ -1,223 +0,0 @@ -import os -import shutil -import threading -from typing import Any, Dict, List, Sequence, Tuple, Union - -import yaml - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import WavToScp -from modelscope.utils.constant import Tasks -from modelscope.utils.logger import get_logger -from .asr_engine.common import asr_utils - -logger = get_logger() - -__all__ = ['AutomaticSpeechRecognitionPipeline'] - - -@PIPELINES.register_module( - Tasks.auto_speech_recognition, module_name=Pipelines.asr_inference) -class AutomaticSpeechRecognitionPipeline(Pipeline): - """ASR Pipeline - """ - - def __init__(self, - model: Union[List[Model], List[str]] = None, - preprocessor: WavToScp = None, - **kwargs): - """use `model` and `preprocessor` to create an asr pipeline for prediction - """ - from .asr_engine import asr_env_checking - assert model is not None, 'asr model should be provided' - - model_list: List = [] - if isinstance(model[0], Model): - model_list = model - else: - model_list.append(Model.from_pretrained(model[0])) - if len(model) == 2 and model[1] is not None: - model_list.append(Model.from_pretrained(model[1])) - - super().__init__(model=model_list, preprocessor=preprocessor, **kwargs) - - self._preprocessor = preprocessor - self._am_model = model_list[0] - if len(model_list) == 2 and model_list[1] is not None: - self._lm_model = model_list[1] - - def __call__(self, - wav_path: str, - recog_type: str = None, - audio_format: str = None, - workspace: str = None) -> Dict[str, Any]: - assert len(wav_path) > 0, 'wav_path should be provided' - - self._recog_type = recog_type - self._audio_format = audio_format - self._workspace = workspace - self._wav_path = wav_path - - if recog_type is None or audio_format is None or workspace is None: - self._recog_type, self._audio_format, self._workspace, self._wav_path = asr_utils.type_checking( - wav_path, recog_type, audio_format, workspace) - - if self._preprocessor is None: - self._preprocessor = WavToScp(workspace=self._workspace) - - output = self._preprocessor.forward(self._am_model.forward(), - self._recog_type, - self._audio_format, self._wav_path) - output = self.forward(output) - rst = self.postprocess(output) - return rst - - def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - """Decoding - """ - - logger.info(f"Decoding with {inputs['audio_format']} files ...") - - j: int = 0 - process = [] - - while j < inputs['thread_count']: - data_cmd: Sequence[Tuple[str, str, str]] - if inputs['audio_format'] == 'wav': - data_cmd = [(os.path.join(inputs['workspace'], - 'data.' + str(j) + '.scp'), 'speech', - 'sound')] - elif inputs['audio_format'] == 'kaldi_ark': - data_cmd = [(os.path.join(inputs['workspace'], - 'data.' + str(j) + '.scp'), 'speech', - 'kaldi_ark')] - - output_dir: str = os.path.join(inputs['output'], - 'output.' 
+ str(j)) - if not os.path.exists(output_dir): - os.mkdir(output_dir) - - config_file = open(inputs['asr_model_config']) - root = yaml.full_load(config_file) - config_file.close() - frontend_conf = None - if 'frontend_conf' in root: - frontend_conf = root['frontend_conf'] - - cmd = { - 'model_type': inputs['model_type'], - 'beam_size': root['beam_size'], - 'penalty': root['penalty'], - 'maxlenratio': root['maxlenratio'], - 'minlenratio': root['minlenratio'], - 'ctc_weight': root['ctc_weight'], - 'lm_weight': root['lm_weight'], - 'output_dir': output_dir, - 'ngpu': 0, - 'log_level': 'ERROR', - 'data_path_and_name_and_type': data_cmd, - 'asr_train_config': inputs['am_model_config'], - 'asr_model_file': inputs['am_model_path'], - 'batch_size': inputs['model_config']['batch_size'], - 'frontend_conf': frontend_conf - } - - thread = AsrInferenceThread(j, cmd) - thread.start() - j += 1 - process.append(thread) - - for p in process: - p.join() - - return inputs - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - """process the asr results - """ - - logger.info('Computing the result of ASR ...') - - rst = {'rec_result': 'None'} - - # single wav task - if inputs['recog_type'] == 'wav' and inputs['audio_format'] == 'wav': - text_file: str = os.path.join(inputs['output'], 'output.0', - '1best_recog', 'text') - - if os.path.exists(text_file): - f = open(text_file, 'r') - result_str: str = f.readline() - f.close() - if len(result_str) > 0: - result_list = result_str.split() - if len(result_list) >= 2: - rst['rec_result'] = result_list[1] - - # run with datasets, and audio format is waveform or kaldi_ark - elif inputs['recog_type'] != 'wav': - inputs['reference_text'] = self._ref_text_tidy(inputs) - inputs['datasets_result'] = asr_utils.compute_wer( - inputs['hypothesis_text'], inputs['reference_text']) - - else: - raise ValueError('recog_type and audio_format are mismatching') - - if 'datasets_result' in inputs: - rst['datasets_result'] = inputs['datasets_result'] - - # remove workspace dir (.tmp) - if os.path.exists(self._workspace): - shutil.rmtree(self._workspace) - - return rst - - def _ref_text_tidy(self, inputs: Dict[str, Any]) -> str: - ref_text: str = os.path.join(inputs['output'], 'text.ref') - k: int = 0 - - while k < inputs['thread_count']: - output_text = os.path.join(inputs['output'], 'output.' + str(k), - '1best_recog', 'text') - if os.path.exists(output_text): - with open(output_text, 'r', encoding='utf-8') as i: - lines = i.readlines() - - with open(ref_text, 'a', encoding='utf-8') as o: - for line in lines: - o.write(line) - - k += 1 - - return ref_text - - -class AsrInferenceThread(threading.Thread): - - def __init__(self, threadID, cmd): - threading.Thread.__init__(self) - self._threadID = threadID - self._cmd = cmd - - def run(self): - if self._cmd['model_type'] == 'pytorch': - from .asr_engine import asr_inference_paraformer_espnet - asr_inference_paraformer_espnet.asr_inference( - batch_size=self._cmd['batch_size'], - output_dir=self._cmd['output_dir'], - maxlenratio=self._cmd['maxlenratio'], - minlenratio=self._cmd['minlenratio'], - beam_size=self._cmd['beam_size'], - ngpu=self._cmd['ngpu'], - ctc_weight=self._cmd['ctc_weight'], - lm_weight=self._cmd['lm_weight'], - penalty=self._cmd['penalty'], - log_level=self._cmd['log_level'], - data_path_and_name_and_type=self. 
- _cmd['data_path_and_name_and_type'], - asr_train_config=self._cmd['asr_train_config'], - asr_model_file=self._cmd['asr_model_file'], - frontend_conf=self._cmd['frontend_conf']) diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py new file mode 100644 index 00000000..ac53d12d --- /dev/null +++ b/modelscope/pipelines/audio/asr_inference_pipeline.py @@ -0,0 +1,213 @@ +import os +from typing import Any, Dict, List, Sequence, Tuple, Union + +import yaml + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import WavToScp +from modelscope.utils.constant import Frameworks, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['AutomaticSpeechRecognitionPipeline'] + + +@PIPELINES.register_module( + Tasks.auto_speech_recognition, module_name=Pipelines.asr_inference) +class AutomaticSpeechRecognitionPipeline(Pipeline): + """ASR Inference Pipeline + """ + + def __init__(self, + model: Union[Model, str] = None, + preprocessor: WavToScp = None, + **kwargs): + """use `model` and `preprocessor` to create an asr pipeline for prediction + """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + def __call__(self, + audio_in: Union[str, bytes], + recog_type: str = None, + audio_format: str = None) -> Dict[str, Any]: + from easyasr.common import asr_utils + + self.recog_type = recog_type + self.audio_format = audio_format + self.audio_in = audio_in + + if recog_type is None or audio_format is None: + self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( + audio_in, recog_type, audio_format) + + if self.preprocessor is None: + self.preprocessor = WavToScp() + + output = self.preprocessor.forward(self.model.forward(), + self.recog_type, self.audio_format, + self.audio_in) + output = self.forward(output) + rst = self.postprocess(output) + return rst + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """Decoding + """ + + logger.info(f"Decoding with {inputs['audio_format']} files ...") + + data_cmd: Sequence[Tuple[str, str]] + if inputs['audio_format'] == 'wav' or inputs['audio_format'] == 'pcm': + data_cmd = ['speech', 'sound'] + elif inputs['audio_format'] == 'kaldi_ark': + data_cmd = ['speech', 'kaldi_ark'] + elif inputs['audio_format'] == 'tfrecord': + data_cmd = ['speech', 'tfrecord'] + + # generate asr inference command + cmd = { + 'model_type': inputs['model_type'], + 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available + 'log_level': 'ERROR', + 'audio_in': inputs['audio_lists'], + 'name_and_type': data_cmd, + 'asr_model_file': inputs['am_model_path'], + 'idx_text': '' + } + + if self.framework == Frameworks.torch: + config_file = open(inputs['asr_model_config']) + root = yaml.full_load(config_file) + config_file.close() + frontend_conf = None + if 'frontend_conf' in root: + frontend_conf = root['frontend_conf'] + + cmd['beam_size'] = root['beam_size'] + cmd['penalty'] = root['penalty'] + cmd['maxlenratio'] = root['maxlenratio'] + cmd['minlenratio'] = root['minlenratio'] + cmd['ctc_weight'] = root['ctc_weight'] + cmd['lm_weight'] = root['lm_weight'] + cmd['asr_train_config'] = inputs['am_model_config'] + cmd['batch_size'] = inputs['model_config']['batch_size'] + cmd['frontend_conf'] = frontend_conf + + elif self.framework == 
Frameworks.tf: + cmd['fs'] = inputs['model_config']['fs'] + cmd['hop_length'] = inputs['model_config']['hop_length'] + cmd['feature_dims'] = inputs['model_config']['feature_dims'] + cmd['predictions_file'] = 'text' + cmd['mvn_file'] = inputs['am_mvn_file'] + cmd['vocab_file'] = inputs['vocab_file'] + if 'idx_text' in inputs: + cmd['idx_text'] = inputs['idx_text'] + + else: + raise ValueError('model type is mismatching') + + inputs['asr_result'] = self.run_inference(cmd) + + return inputs + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """process the asr results + """ + from easyasr.common import asr_utils + + logger.info('Computing the result of ASR ...') + + rst = {} + + # single wav or pcm task + if inputs['recog_type'] == 'wav': + if 'asr_result' in inputs and len(inputs['asr_result']) > 0: + text = inputs['asr_result'][0]['value'] + if len(text) > 0: + rst[OutputKeys.TEXT] = text + + # run with datasets, and audio format is waveform or kaldi_ark or tfrecord + elif inputs['recog_type'] != 'wav': + inputs['reference_list'] = self.ref_list_tidy(inputs) + inputs['datasets_result'] = asr_utils.compute_wer( + inputs['asr_result'], inputs['reference_list']) + + else: + raise ValueError('recog_type and audio_format are mismatching') + + if 'datasets_result' in inputs: + rst[OutputKeys.TEXT] = inputs['datasets_result'] + + return rst + + def ref_list_tidy(self, inputs: Dict[str, Any]) -> List[Any]: + ref_list = [] + + if inputs['audio_format'] == 'tfrecord': + # should assemble idx + txt + with open(inputs['reference_text'], 'r', encoding='utf-8') as r: + text_lines = r.readlines() + + with open(inputs['idx_text'], 'r', encoding='utf-8') as i: + idx_lines = i.readlines() + + j: int = 0 + while j < min(len(text_lines), len(idx_lines)): + idx_str = idx_lines[j].strip() + text_str = text_lines[j].strip().replace(' ', '') + item = {'key': idx_str, 'value': text_str} + ref_list.append(item) + j += 1 + + else: + # text contain idx + sentence + with open(inputs['reference_text'], 'r', encoding='utf-8') as f: + lines = f.readlines() + + for line in lines: + line_item = line.split() + item = {'key': line_item[0], 'value': line_item[1]} + ref_list.append(item) + + return ref_list + + def run_inference(self, cmd): + asr_result = [] + if self.framework == Frameworks.torch: + from easyasr import asr_inference_paraformer_espnet + asr_result = asr_inference_paraformer_espnet.asr_inference( + batch_size=cmd['batch_size'], + maxlenratio=cmd['maxlenratio'], + minlenratio=cmd['minlenratio'], + beam_size=cmd['beam_size'], + ngpu=cmd['ngpu'], + ctc_weight=cmd['ctc_weight'], + lm_weight=cmd['lm_weight'], + penalty=cmd['penalty'], + log_level=cmd['log_level'], + name_and_type=cmd['name_and_type'], + audio_lists=cmd['audio_in'], + asr_train_config=cmd['asr_train_config'], + asr_model_file=cmd['asr_model_file'], + frontend_conf=cmd['frontend_conf']) + elif self.framework == Frameworks.tf: + from easyasr import asr_inference_paraformer_tf + asr_result = asr_inference_paraformer_tf.asr_inference( + ngpu=cmd['ngpu'], + name_and_type=cmd['name_and_type'], + audio_lists=cmd['audio_in'], + idx_text_file=cmd['idx_text'], + asr_model_file=cmd['asr_model_file'], + vocab_file=cmd['vocab_file'], + am_mvn_file=cmd['mvn_file'], + predictions_file=cmd['predictions_file'], + fs=cmd['fs'], + hop_length=cmd['hop_length'], + feature_dims=cmd['feature_dims']) + + return asr_result diff --git a/modelscope/preprocessors/asr.py b/modelscope/preprocessors/asr.py index f13cc2e7..de0eb634 100644 --- 
a/modelscope/preprocessors/asr.py +++ b/modelscope/preprocessors/asr.py @@ -1,14 +1,9 @@ -import io import os -import shutil -from pathlib import Path -from typing import Any, Dict, List - -import yaml +from typing import Any, Dict, List, Union from modelscope.metainfo import Preprocessors from modelscope.models.base import Model -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Fields, Frameworks from .base import Preprocessor from .builder import PREPROCESSORS @@ -19,44 +14,32 @@ __all__ = ['WavToScp'] Fields.audio, module_name=Preprocessors.wav_to_scp) class WavToScp(Preprocessor): """generate audio scp from wave or ark - - Args: - workspace (str): """ - def __init__(self, workspace: str = None): - # the workspace path - if workspace is None or len(workspace) == 0: - self._workspace = os.path.join(os.getcwd(), '.tmp') - else: - self._workspace = workspace - - if not os.path.exists(self._workspace): - os.mkdir(self._workspace) + def __init__(self): + pass def __call__(self, - model: List[Model] = None, + model: Model = None, recog_type: str = None, audio_format: str = None, - wav_path: str = None) -> Dict[str, Any]: - assert len(model) > 0, 'preprocess model is invalid' - assert len(recog_type) > 0, 'preprocess recog_type is empty' - assert len(audio_format) > 0, 'preprocess audio_format is empty' - assert len(wav_path) > 0, 'preprocess wav_path is empty' - - self._am_model = model[0] - if len(model) == 2 and model[1] is not None: - self._lm_model = model[1] - out = self.forward(self._am_model.forward(), recog_type, audio_format, - wav_path) + audio_in: Union[str, bytes] = None) -> Dict[str, Any]: + assert model is not None, 'preprocess model is empty' + assert recog_type is not None and len( + recog_type) > 0, 'preprocess recog_type is empty' + assert audio_format is not None, 'preprocess audio_format is empty' + assert audio_in is not None, 'preprocess audio_in is empty' + + self.am_model = model + out = self.forward(self.am_model.forward(), recog_type, audio_format, + audio_in) return out def forward(self, model: Dict[str, Any], recog_type: str, - audio_format: str, wav_path: str) -> Dict[str, Any]: + audio_format: str, audio_in: Union[str, + bytes]) -> Dict[str, Any]: assert len(recog_type) > 0, 'preprocess recog_type is empty' assert len(audio_format) > 0, 'preprocess audio_format is empty' - assert len(wav_path) > 0, 'preprocess wav_path is empty' - assert os.path.exists(wav_path), 'preprocess wav_path does not exist' assert len( model['am_model']) > 0, 'preprocess model[am_model] is empty' assert len(model['am_model_path'] @@ -70,90 +53,104 @@ class WavToScp(Preprocessor): assert len(model['model_config'] ) > 0, 'preprocess model[model_config] is empty' - # the am model name - am_model: str = model['am_model'] - # the am model file path - am_model_path: str = model['am_model_path'] - # the recognition model dir path - model_workspace: str = model['model_workspace'] - # the recognition model config dict - global_model_config_dict: str = model['model_config'] - rst = { - 'workspace': os.path.join(self._workspace, recog_type), - 'am_model': am_model, - 'am_model_path': am_model_path, - 'model_workspace': model_workspace, + # the recognition model dir path + 'model_workspace': model['model_workspace'], + # the am model name + 'am_model': model['am_model'], + # the am model file path + 'am_model_path': model['am_model_path'], # the asr type setting, eg: test dev train wav 'recog_type': recog_type, - # the asr audio format setting, eg: wav, kaldi_ark + 
# the asr audio format setting, eg: wav, pcm, kaldi_ark, tfrecord 'audio_format': audio_format, - # the test wav file path or the dataset path - 'wav_path': wav_path, - 'model_config': global_model_config_dict + # the recognition model config dict + 'model_config': model['model_config'] } - out = self._config_checking(rst) - out = self._env_setting(out) + if isinstance(audio_in, str): + # wav file path or the dataset path + rst['wav_path'] = audio_in + + out = self.config_checking(rst) + out = self.env_setting(out) if audio_format == 'wav': - out = self._scp_generation_from_wav(out) + out['audio_lists'] = self.scp_generation_from_wav(out) elif audio_format == 'kaldi_ark': - out = self._scp_generation_from_ark(out) + out['audio_lists'] = self.scp_generation_from_ark(out) + elif audio_format == 'tfrecord': + out['audio_lists'] = os.path.join(out['wav_path'], 'data.records') + elif audio_format == 'pcm': + out['audio_lists'] = audio_in return out - def _config_checking(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + def config_checking(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """config checking """ assert inputs['model_config'].__contains__( 'type'), 'model type does not exist' - assert inputs['model_config'].__contains__( - 'batch_size'), 'batch_size does not exist' - assert inputs['model_config'].__contains__( - 'am_model_config'), 'am_model_config does not exist' - assert inputs['model_config'].__contains__( - 'asr_model_config'), 'asr_model_config does not exist' - assert inputs['model_config'].__contains__( - 'asr_model_wav_config'), 'asr_model_wav_config does not exist' - - am_model_config: str = os.path.join( - inputs['model_workspace'], - inputs['model_config']['am_model_config']) - assert os.path.exists( - am_model_config), 'am_model_config does not exist' - inputs['am_model_config'] = am_model_config - - asr_model_config: str = os.path.join( - inputs['model_workspace'], - inputs['model_config']['asr_model_config']) - assert os.path.exists( - asr_model_config), 'asr_model_config does not exist' - - asr_model_wav_config: str = os.path.join( - inputs['model_workspace'], - inputs['model_config']['asr_model_wav_config']) - assert os.path.exists( - asr_model_wav_config), 'asr_model_wav_config does not exist' - inputs['model_type'] = inputs['model_config']['type'] - if inputs['audio_format'] == 'wav': - inputs['asr_model_config'] = asr_model_wav_config + if inputs['model_type'] == Frameworks.torch: + assert inputs['model_config'].__contains__( + 'batch_size'), 'batch_size does not exist' + assert inputs['model_config'].__contains__( + 'am_model_config'), 'am_model_config does not exist' + assert inputs['model_config'].__contains__( + 'asr_model_config'), 'asr_model_config does not exist' + assert inputs['model_config'].__contains__( + 'asr_model_wav_config'), 'asr_model_wav_config does not exist' + + am_model_config: str = os.path.join( + inputs['model_workspace'], + inputs['model_config']['am_model_config']) + assert os.path.exists( + am_model_config), 'am_model_config does not exist' + inputs['am_model_config'] = am_model_config + + asr_model_config: str = os.path.join( + inputs['model_workspace'], + inputs['model_config']['asr_model_config']) + assert os.path.exists( + asr_model_config), 'asr_model_config does not exist' + + asr_model_wav_config: str = os.path.join( + inputs['model_workspace'], + inputs['model_config']['asr_model_wav_config']) + assert os.path.exists( + asr_model_wav_config), 'asr_model_wav_config does not exist' + + if inputs['audio_format'] == 'wav' or 
inputs[ + 'audio_format'] == 'pcm': + inputs['asr_model_config'] = asr_model_wav_config + else: + inputs['asr_model_config'] = asr_model_config + + elif inputs['model_type'] == Frameworks.tf: + assert inputs['model_config'].__contains__( + 'vocab_file'), 'vocab_file does not exist' + vocab_file: str = os.path.join( + inputs['model_workspace'], + inputs['model_config']['vocab_file']) + assert os.path.exists(vocab_file), 'vocab file does not exist' + inputs['vocab_file'] = vocab_file + + assert inputs['model_config'].__contains__( + 'am_mvn_file'), 'am_mvn_file does not exist' + am_mvn_file: str = os.path.join( + inputs['model_workspace'], + inputs['model_config']['am_mvn_file']) + assert os.path.exists(am_mvn_file), 'am mvn file does not exist' + inputs['am_mvn_file'] = am_mvn_file + else: - inputs['asr_model_config'] = asr_model_config + raise ValueError('model type is mismatched') return inputs - def _env_setting(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - - if not os.path.exists(inputs['workspace']): - os.mkdir(inputs['workspace']) - - inputs['output'] = os.path.join(inputs['workspace'], 'logdir') - if not os.path.exists(inputs['output']): - os.mkdir(inputs['output']) - + def env_setting(self, inputs: Dict[str, Any]) -> Dict[str, Any]: # run with datasets, should set datasets_path and text_path if inputs['recog_type'] != 'wav': inputs['datasets_path'] = inputs['wav_path'] @@ -162,25 +159,39 @@ class WavToScp(Preprocessor): if inputs['audio_format'] == 'wav': inputs['wav_path'] = os.path.join(inputs['datasets_path'], 'wav', inputs['recog_type']) - inputs['hypothesis_text'] = os.path.join( + inputs['reference_text'] = os.path.join( inputs['datasets_path'], 'transcript', 'data.text') - assert os.path.exists(inputs['hypothesis_text'] - ), 'hypothesis text does not exist' + assert os.path.exists( + inputs['reference_text']), 'reference text does not exist' + # run with datasets, and audio format is kaldi_ark elif inputs['audio_format'] == 'kaldi_ark': inputs['wav_path'] = os.path.join(inputs['datasets_path'], inputs['recog_type']) - inputs['hypothesis_text'] = os.path.join( + inputs['reference_text'] = os.path.join( inputs['wav_path'], 'data.text') - assert os.path.exists(inputs['hypothesis_text'] - ), 'hypothesis text does not exist' + assert os.path.exists( + inputs['reference_text']), 'reference text does not exist' + + # run with datasets, and audio format is tfrecord + elif inputs['audio_format'] == 'tfrecord': + inputs['wav_path'] = os.path.join(inputs['datasets_path'], + inputs['recog_type']) + inputs['reference_text'] = os.path.join( + inputs['wav_path'], 'data.txt') + assert os.path.exists( + inputs['reference_text']), 'reference text does not exist' + inputs['idx_text'] = os.path.join(inputs['wav_path'], + 'data.idx') + assert os.path.exists( + inputs['idx_text']), 'idx text does not exist' return inputs - def _scp_generation_from_wav(self, inputs: Dict[str, - Any]) -> Dict[str, Any]: + def scp_generation_from_wav(self, inputs: Dict[str, Any]) -> List[Any]: """scp generation from waveform files """ + from easyasr.common import asr_utils # find all waveform files wav_list = [] @@ -191,64 +202,46 @@ class WavToScp(Preprocessor): wav_list.append(file_path) else: wav_dir: str = inputs['wav_path'] - wav_list = self._recursion_dir_all_wave(wav_list, wav_dir) + wav_list = asr_utils.recursion_dir_all_wav(wav_list, wav_dir) list_count: int = len(wav_list) inputs['wav_count'] = list_count - # store all wav into data.0.scp - inputs['thread_count'] = 1 + # store all wav into audio list + 
audio_lists = [] j: int = 0 - wav_list_path = os.path.join(inputs['workspace'], 'data.0.scp') - with open(wav_list_path, 'a') as f: - while j < list_count: - wav_file = wav_list[j] - wave_scp_content: str = os.path.splitext( - os.path.basename(wav_file))[0] - wave_scp_content += ' ' + wav_file + '\n' - f.write(wave_scp_content) - j += 1 + while j < list_count: + wav_file = wav_list[j] + wave_key: str = os.path.splitext(os.path.basename(wav_file))[0] + item = {'key': wave_key, 'file': wav_file} + audio_lists.append(item) + j += 1 - return inputs + return audio_lists - def _scp_generation_from_ark(self, inputs: Dict[str, - Any]) -> Dict[str, Any]: + def scp_generation_from_ark(self, inputs: Dict[str, Any]) -> List[Any]: """scp generation from kaldi ark file """ - inputs['thread_count'] = 1 ark_scp_path = os.path.join(inputs['wav_path'], 'data.scp') ark_file_path = os.path.join(inputs['wav_path'], 'data.ark') assert os.path.exists(ark_scp_path), 'data.scp does not exist' assert os.path.exists(ark_file_path), 'data.ark does not exist' - new_ark_scp_path = os.path.join(inputs['workspace'], 'data.0.scp') - with open(ark_scp_path, 'r', encoding='utf-8') as f: lines = f.readlines() - with open(new_ark_scp_path, 'w', encoding='utf-8') as n: - for line in lines: - outs = line.strip().split(' ') - if len(outs) == 2: - key = outs[0] - sub = outs[1].split(':') - if len(sub) == 2: - nums = sub[1] - content = key + ' ' + ark_file_path + ':' + nums + '\n' - n.write(content) - - return inputs - - def _recursion_dir_all_wave(self, wav_list, - dir_path: str) -> Dict[str, Any]: - dir_files = os.listdir(dir_path) - for file in dir_files: - file_path = os.path.join(dir_path, file) - if os.path.isfile(file_path): - if file_path.endswith('.wav') or file_path.endswith('.WAV'): - wav_list.append(file_path) - elif os.path.isdir(file_path): - self._recursion_dir_all_wave(wav_list, file_path) - - return wav_list + # store all ark item into audio list + audio_lists = [] + for line in lines: + outs = line.strip().split(' ') + if len(outs) == 2: + key = outs[0] + sub = outs[1].split(':') + if len(sub) == 2: + nums = sub[1] + content = ark_file_path + ':' + nums + item = {'key': key, 'file': content} + audio_lists.append(item) + + return audio_lists diff --git a/requirements/audio.txt b/requirements/audio.txt index f0fdb054..71b29eb2 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -1,3 +1,4 @@ +easyasr>=0.0.2 espnet>=202204 #tts h5py diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py index 22d1d777..0659720a 100644 --- a/tests/pipelines/test_automatic_speech_recognition.py +++ b/tests/pipelines/test_automatic_speech_recognition.py @@ -1,15 +1,20 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
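(Reviewer note, not part of the patch.) The reworked WavToScp no longer writes data.N.scp files into a .tmp workspace; it returns the audio list in memory and the pipeline hands it straight to easyasr. Below is a minimal sketch of that list structure; it uses os.walk instead of easyasr's recursion_dir_all_wav helper, and the directory path and utterance keys are placeholders.

import os
from typing import Any, Dict, List


def build_audio_list(wav_dir: str) -> List[Dict[str, Any]]:
    """Collect *.wav files into the in-memory list consumed by the pipeline.

    Mirrors WavToScp.scp_generation_from_wav: each item carries the utterance
    key (the file stem) and the file path, replacing the old on-disk scp files.
    """
    audio_lists = []
    for root, _, files in os.walk(wav_dir):
        for name in files:
            if name.lower().endswith('.wav'):
                key = os.path.splitext(name)[0]
                audio_lists.append({'key': key, 'file': os.path.join(root, name)})
    return audio_lists


# build_audio_list('/path/to/wav/test')
# -> [{'key': 'utt_0001', 'file': '/path/to/wav/test/utt_0001.wav'}, ...]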
import os import shutil +import sys import tarfile import unittest +from typing import Any, Dict, Union +import numpy as np import requests +import soundfile +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks +from modelscope.utils.constant import ColorCodes, Tasks from modelscope.utils.logger import get_logger -from modelscope.utils.test_utils import test_level +from modelscope.utils.test_utils import download_and_untar, test_level logger = get_logger() @@ -21,6 +26,9 @@ LITTLE_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AS AISHELL1_TESTSETS_FILE = 'aishell1.tar.gz' AISHELL1_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/aishell1.tar.gz' +TFRECORD_TESTSETS_FILE = 'tfrecord.tar.gz' +TFRECORD_TESTSETS_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/datasets/tfrecord.tar.gz' + def un_tar_gz(fname, dirs): t = tarfile.open(fname) @@ -28,45 +36,168 @@ def un_tar_gz(fname, dirs): class AutomaticSpeechRecognitionTest(unittest.TestCase): + action_info = { + 'test_run_with_wav_pytorch': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, + 'test_run_with_pcm_pytorch': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, + 'test_run_with_wav_tf': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, + 'test_run_with_pcm_tf': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, + 'test_run_with_wav_dataset_pytorch': { + 'checking_item': OutputKeys.TEXT, + 'example': 'dataset_example' + }, + 'test_run_with_wav_dataset_tf': { + 'checking_item': OutputKeys.TEXT, + 'example': 'dataset_example' + }, + 'test_run_with_ark_dataset': { + 'checking_item': OutputKeys.TEXT, + 'example': 'dataset_example' + }, + 'test_run_with_tfrecord_dataset': { + 'checking_item': OutputKeys.TEXT, + 'example': 'dataset_example' + }, + 'dataset_example': { + 'Wrd': 49532, # the number of words + 'Snt': 5000, # the number of sentences + 'Corr': 47276, # the number of correct words + 'Ins': 49, # the number of insert words + 'Del': 152, # the number of delete words + 'Sub': 2207, # the number of substitution words + 'wrong_words': 2408, # the number of wrong words + 'wrong_sentences': 1598, # the number of wrong sentences + 'Err': 4.86, # WER/CER + 'S.Err': 31.96 # SER + }, + 'wav_example': { + 'text': '每一天都要快乐喔' + } + } def setUp(self) -> None: - self._am_model_id = 'damo/speech_paraformer_asr_nat-aishell1-pytorch' + self.am_pytorch_model_id = 'damo/speech_paraformer_asr_nat-aishell1-pytorch' + self.am_tf_model_id = 'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1' # this temporary workspace dir will store waveform files - self._workspace = os.path.join(os.getcwd(), '.tmp') - if not os.path.exists(self._workspace): - os.mkdir(self._workspace) + self.workspace = os.path.join(os.getcwd(), '.tmp') + if not os.path.exists(self.workspace): + os.mkdir(self.workspace) + + def tearDown(self) -> None: + # remove workspace dir (.tmp) + shutil.rmtree(self.workspace, ignore_errors=True) + + def run_pipeline(self, model_id: str, + audio_in: Union[str, bytes]) -> Dict[str, Any]: + inference_16k_pipline = pipeline( + task=Tasks.auto_speech_recognition, model=model_id) + + rec_result = inference_16k_pipline(audio_in) + + return rec_result + + def log_error(self, functions: str, result: Dict[str, Any]) -> None: + logger.error(ColorCodes.MAGENTA + functions + ': FAILED.' 
+ + ColorCodes.END) + logger.error( + ColorCodes.MAGENTA + functions + ' correct result example:' + + ColorCodes.YELLOW + + str(self.action_info[self.action_info[functions]['example']]) + + ColorCodes.END) + + raise ValueError('asr result is mismatched') + + def check_result(self, functions: str, result: Dict[str, Any]) -> None: + if result.__contains__(self.action_info[functions]['checking_item']): + logger.info(ColorCodes.MAGENTA + functions + ': SUCCESS.' + + ColorCodes.END) + logger.info( + ColorCodes.YELLOW + + str(result[self.action_info[functions]['checking_item']]) + + ColorCodes.END) + else: + self.log_error(functions, result) + + def wav2bytes(self, wav_file) -> bytes: + audio, fs = soundfile.read(wav_file) + + # float32 -> int16 + audio = np.asarray(audio) + dtype = np.dtype('int16') + i = np.iinfo(dtype) + abs_max = 2**(i.bits - 1) + offset = i.min + abs_max + audio = (audio * abs_max + offset).clip(i.min, i.max).astype(dtype) + + # int16(PCM_16) -> byte + audio = audio.tobytes() + return audio @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_wav(self): + def test_run_with_wav_pytorch(self): '''run with single waveform file ''' - logger.info('Run ASR test with waveform file ...') + logger.info('Run ASR test with waveform file (pytorch)...') wav_file_path = os.path.join(os.getcwd(), WAV_FILE) - inference_16k_pipline = pipeline( - task=Tasks.auto_speech_recognition, model=[self._am_model_id]) - self.assertTrue(inference_16k_pipline is not None) + rec_result = self.run_pipeline( + model_id=self.am_pytorch_model_id, audio_in=wav_file_path) + self.check_result('test_run_with_wav_pytorch', rec_result) - rec_result = inference_16k_pipline(wav_file_path) - self.assertTrue(len(rec_result['rec_result']) > 0) - self.assertTrue(rec_result['rec_result'] != 'None') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_pcm_pytorch(self): + '''run with wav data ''' - result structure: - { - 'rec_result': '每一天都要快乐喔' - } - or - { - 'rec_result': 'None' - } + + logger.info('Run ASR test with wav data (pytorch)...') + + audio = self.wav2bytes(os.path.join(os.getcwd(), WAV_FILE)) + + rec_result = self.run_pipeline( + model_id=self.am_pytorch_model_id, audio_in=audio) + self.check_result('test_run_with_pcm_pytorch', rec_result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_wav_tf(self): + '''run with single waveform file ''' - logger.info('test_run_with_wav rec result: ' - + rec_result['rec_result']) + + logger.info('Run ASR test with waveform file (tensorflow)...') + + wav_file_path = os.path.join(os.getcwd(), WAV_FILE) + + rec_result = self.run_pipeline( + model_id=self.am_tf_model_id, audio_in=wav_file_path) + self.check_result('test_run_with_wav_tf', rec_result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_pcm_tf(self): + '''run with wav data + ''' + + logger.info('Run ASR test with wav data (tensorflow)...') + + audio = self.wav2bytes(os.path.join(os.getcwd(), WAV_FILE)) + + rec_result = self.run_pipeline( + model_id=self.am_tf_model_id, audio_in=audio) + self.check_result('test_run_with_pcm_tf', rec_result) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_run_with_wav_dataset(self): + def test_run_with_wav_dataset_pytorch(self): '''run with datasets, and audio format is waveform datasets directory: @@ -84,57 +215,48 @@ class 
AutomaticSpeechRecognitionTest(unittest.TestCase): data.text # hypothesis text ''' - logger.info('Run ASR test with waveform dataset ...') + logger.info('Run ASR test with waveform dataset (pytorch)...') logger.info('Downloading waveform testsets file ...') - # downloading pos_testsets file - testsets_file_path = os.path.join(self._workspace, - LITTLE_TESTSETS_FILE) - if not os.path.exists(testsets_file_path): - r = requests.get(LITTLE_TESTSETS_URL) - with open(testsets_file_path, 'wb') as f: - f.write(r.content) - - testsets_dir_name = os.path.splitext( - os.path.basename( - os.path.splitext( - os.path.basename(LITTLE_TESTSETS_FILE))[0]))[0] - # dataset_path = /.tmp/data_aishell/wav/test - dataset_path = os.path.join(self._workspace, testsets_dir_name, 'wav', - 'test') - - # untar the dataset_path file - if not os.path.exists(dataset_path): - un_tar_gz(testsets_file_path, self._workspace) + dataset_path = download_and_untar( + os.path.join(self.workspace, LITTLE_TESTSETS_FILE), + LITTLE_TESTSETS_URL, self.workspace) + dataset_path = os.path.join(dataset_path, 'wav', 'test') - inference_16k_pipline = pipeline( - task=Tasks.auto_speech_recognition, model=[self._am_model_id]) - self.assertTrue(inference_16k_pipline is not None) + rec_result = self.run_pipeline( + model_id=self.am_pytorch_model_id, audio_in=dataset_path) + self.check_result('test_run_with_wav_dataset_pytorch', rec_result) - rec_result = inference_16k_pipline(wav_path=dataset_path) - self.assertTrue(len(rec_result['datasets_result']) > 0) - self.assertTrue(rec_result['datasets_result']['Wrd'] > 0) - ''' - result structure: - { - 'rec_result': 'None', - 'datasets_result': - { - 'Wrd': 1654, # the number of words - 'Snt': 128, # the number of sentences - 'Corr': 1573, # the number of correct words - 'Ins': 1, # the number of insert words - 'Del': 1, # the number of delete words - 'Sub': 80, # the number of substitution words - 'wrong_words': 82, # the number of wrong words - 'wrong_sentences': 47, # the number of wrong sentences - 'Err': 4.96, # WER/CER - 'S.Err': 36.72 # SER - } - } + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_wav_dataset_tf(self): + '''run with datasets, and audio format is waveform + datasets directory: + + wav + test # testsets + xx.wav + ... + dev # devsets + yy.wav + ... + train # trainsets + zz.wav + ... 
+ transcript + data.text # hypothesis text ''' - logger.info('test_run_with_wav_dataset datasets result: ') - logger.info(rec_result['datasets_result']) + + logger.info('Run ASR test with waveform dataset (tensorflow)...') + logger.info('Downloading waveform testsets file ...') + + dataset_path = download_and_untar( + os.path.join(self.workspace, LITTLE_TESTSETS_FILE), + LITTLE_TESTSETS_URL, self.workspace) + dataset_path = os.path.join(dataset_path, 'wav', 'test') + + rec_result = self.run_pipeline( + model_id=self.am_tf_model_id, audio_in=dataset_path) + self.check_result('test_run_with_wav_dataset_tf', rec_result) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_ark_dataset(self): @@ -155,56 +277,40 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase): data.text ''' - logger.info('Run ASR test with ark dataset ...') + logger.info('Run ASR test with ark dataset (pytorch)...') logger.info('Downloading ark testsets file ...') - # downloading pos_testsets file - testsets_file_path = os.path.join(self._workspace, - AISHELL1_TESTSETS_FILE) - if not os.path.exists(testsets_file_path): - r = requests.get(AISHELL1_TESTSETS_URL) - with open(testsets_file_path, 'wb') as f: - f.write(r.content) - - testsets_dir_name = os.path.splitext( - os.path.basename( - os.path.splitext( - os.path.basename(AISHELL1_TESTSETS_FILE))[0]))[0] - # dataset_path = /.tmp/aishell1/test - dataset_path = os.path.join(self._workspace, testsets_dir_name, 'test') - - # untar the dataset_path file - if not os.path.exists(dataset_path): - un_tar_gz(testsets_file_path, self._workspace) + dataset_path = download_and_untar( + os.path.join(self.workspace, AISHELL1_TESTSETS_FILE), + AISHELL1_TESTSETS_URL, self.workspace) + dataset_path = os.path.join(dataset_path, 'test') - inference_16k_pipline = pipeline( - task=Tasks.auto_speech_recognition, model=[self._am_model_id]) - self.assertTrue(inference_16k_pipline is not None) + rec_result = self.run_pipeline( + model_id=self.am_pytorch_model_id, audio_in=dataset_path) + self.check_result('test_run_with_ark_dataset', rec_result) - rec_result = inference_16k_pipline(wav_path=dataset_path) - self.assertTrue(len(rec_result['datasets_result']) > 0) - self.assertTrue(rec_result['datasets_result']['Wrd'] > 0) - ''' - result structure: - { - 'rec_result': 'None', - 'datasets_result': - { - 'Wrd': 104816, # the number of words - 'Snt': 7176, # the number of sentences - 'Corr': 99327, # the number of correct words - 'Ins': 104, # the number of insert words - 'Del': 155, # the number of delete words - 'Sub': 5334, # the number of substitution words - 'wrong_words': 5593, # the number of wrong words - 'wrong_sentences': 2898, # the number of wrong sentences - 'Err': 5.34, # WER/CER - 'S.Err': 40.38 # SER - } - } + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_tfrecord_dataset(self): + '''run with datasets, and audio format is tfrecord + datasets directory: + + test # testsets + data.records + data.idx + data.text ''' - logger.info('test_run_with_ark_dataset datasets result: ') - logger.info(rec_result['datasets_result']) + + logger.info('Run ASR test with tfrecord dataset (tensorflow)...') + logger.info('Downloading tfrecord testsets file ...') + + dataset_path = download_and_untar( + os.path.join(self.workspace, TFRECORD_TESTSETS_FILE), + TFRECORD_TESTSETS_URL, self.workspace) + dataset_path = os.path.join(dataset_path, 'test') + + rec_result = self.run_pipeline( + model_id=self.am_tf_model_id, 
audio_in=dataset_path) + self.check_result('test_run_with_tfrecord_dataset', rec_result) if __name__ == '__main__':
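(Reviewer note, not part of the patch.) A minimal usage sketch of the new in-memory interface exercised by the tests above, assuming a ModelScope install with the easyasr extra; the model id is the PyTorch one from setUp, and the wav path is a placeholder. Raw int16 PCM bytes (as produced by wav2bytes) can be passed in place of the path.

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build the ASR pipeline once; the model is fetched on first use.
asr = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-aishell1-pytorch')

# Single 16 kHz waveform file -> {'text': '...'}; PCM bytes work the same way.
result = asr('/path/to/example_16k.wav')
print(result[OutputKeys.TEXT])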