Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9409213master
| @@ -1,8 +1,5 @@ | |||||
| from typing import Dict | from typing import Dict | ||||
| import numpy as np | |||||
| from rouge_score import rouge_scorer | |||||
| from ..metainfo import Metrics | from ..metainfo import Metrics | ||||
| from ..utils.registry import default_group | from ..utils.registry import default_group | ||||
| from .base import Metric | from .base import Metric | ||||
| @@ -18,6 +15,7 @@ class TextGenerationMetric(Metric): | |||||
| def __init__(self): | def __init__(self): | ||||
| self.preds = [] | self.preds = [] | ||||
| self.tgts = [] | self.tgts = [] | ||||
| from rouge_score import rouge_scorer | |||||
| self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) | self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) | ||||
| def add(self, outputs: Dict, inputs: Dict): | def add(self, outputs: Dict, inputs: Dict): | ||||
| @@ -1,7 +1,4 @@ | |||||
| import tensorflow as tf | import tensorflow as tf | ||||
| from tensorflow.contrib.cudnn_rnn import CudnnLSTM | |||||
| from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops | |||||
| from tensorflow.contrib.rnn import LSTMBlockCell | |||||
| def encoder_prenet(inputs, | def encoder_prenet(inputs, | ||||
| @@ -207,6 +204,7 @@ def conv_and_lstm(inputs, | |||||
| embedded_inputs_speaker, | embedded_inputs_speaker, | ||||
| mask=None, | mask=None, | ||||
| scope='conv_and_lstm'): | scope='conv_and_lstm'): | ||||
| from tensorflow.contrib.rnn import LSTMBlockCell | |||||
| x = inputs | x = inputs | ||||
| with tf.variable_scope(scope): | with tf.variable_scope(scope): | ||||
| for i in range(n_conv_layers): | for i in range(n_conv_layers): | ||||
| @@ -244,6 +242,7 @@ def conv_and_lstm_dec(inputs, | |||||
| mask=None, | mask=None, | ||||
| scope='conv_and_lstm'): | scope='conv_and_lstm'): | ||||
| x = inputs | x = inputs | ||||
| from tensorflow.contrib.rnn import LSTMBlockCell | |||||
| with tf.variable_scope(scope): | with tf.variable_scope(scope): | ||||
| for i in range(n_conv_layers): | for i in range(n_conv_layers): | ||||
| x = conv1d( | x = conv1d( | ||||
| @@ -1,9 +1,8 @@ | |||||
| import numpy as np | import numpy as np | ||||
| import tensorflow as tf | import tensorflow as tf | ||||
| from tensorflow.contrib.seq2seq import Helper | |||||
| class VarTestHelper(Helper): | |||||
| class VarTestHelper(tf.contrib.seq2seq.Helper): | |||||
| def __init__(self, batch_size, inputs, dim): | def __init__(self, batch_size, inputs, dim): | ||||
| with tf.name_scope('VarTestHelper'): | with tf.name_scope('VarTestHelper'): | ||||
| @@ -44,7 +43,7 @@ class VarTestHelper(Helper): | |||||
| return (finished, next_inputs, state) | return (finished, next_inputs, state) | ||||
| class VarTrainingHelper(Helper): | |||||
| class VarTrainingHelper(tf.contrib.seq2seq.Helper): | |||||
| def __init__(self, targets, inputs, dim): | def __init__(self, targets, inputs, dim): | ||||
| with tf.name_scope('VarTrainingHelper'): | with tf.name_scope('VarTrainingHelper'): | ||||
| @@ -86,7 +85,7 @@ class VarTrainingHelper(Helper): | |||||
| return (finished, next_inputs, state) | return (finished, next_inputs, state) | ||||
| class VarTrainingSSHelper(Helper): | |||||
| class VarTrainingSSHelper(tf.contrib.seq2seq.Helper): | |||||
| def __init__(self, targets, inputs, dim, global_step, schedule_begin, | def __init__(self, targets, inputs, dim, global_step, schedule_begin, | ||||
| alpha, decay_steps): | alpha, decay_steps): | ||||
| @@ -1,14 +1,11 @@ | |||||
| import numpy as np | |||||
| import tensorflow as tf | import tensorflow as tf | ||||
| from tensorflow.contrib.rnn import RNNCell | |||||
| from tensorflow.contrib.seq2seq import AttentionWrapperState | |||||
| from tensorflow.python.ops import rnn_cell_impl | from tensorflow.python.ops import rnn_cell_impl | ||||
| from .am_models import prenet | from .am_models import prenet | ||||
| class VarPredictorCell(RNNCell): | |||||
| '''Wrapper wrapper knock knock.''' | |||||
| class VarPredictorCell(tf.contrib.rnn.RNNCell): | |||||
| """Wrapper wrapper knock knock.""" | |||||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | ||||
| super(VarPredictorCell, self).__init__() | super(VarPredictorCell, self).__init__() | ||||
| @@ -33,7 +30,7 @@ class VarPredictorCell(RNNCell): | |||||
| ]) | ]) | ||||
| def call(self, inputs, state): | def call(self, inputs, state): | ||||
| '''Run the Tacotron2 super decoder cell.''' | |||||
| """Run the Tacotron2 super decoder cell.""" | |||||
| super_cell_out, decoder_state = state | super_cell_out, decoder_state = state | ||||
| # split | # split | ||||
| @@ -61,8 +58,8 @@ class VarPredictorCell(RNNCell): | |||||
| return new_super_cell_out, new_states | return new_super_cell_out, new_states | ||||
| class DurPredictorCell(RNNCell): | |||||
| '''Wrapper wrapper knock knock.''' | |||||
| class DurPredictorCell(tf.contrib.rnn.RNNCell): | |||||
| """Wrapper wrapper knock knock.""" | |||||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | ||||
| super(DurPredictorCell, self).__init__() | super(DurPredictorCell, self).__init__() | ||||
| @@ -87,7 +84,7 @@ class DurPredictorCell(RNNCell): | |||||
| ]) | ]) | ||||
| def call(self, inputs, state): | def call(self, inputs, state): | ||||
| '''Run the Tacotron2 super decoder cell.''' | |||||
| """Run the Tacotron2 super decoder cell.""" | |||||
| super_cell_out, decoder_state = state | super_cell_out, decoder_state = state | ||||
| # split | # split | ||||
| @@ -117,8 +114,8 @@ class DurPredictorCell(RNNCell): | |||||
| return new_super_cell_out, new_states | return new_super_cell_out, new_states | ||||
| class DurPredictorCECell(RNNCell): | |||||
| '''Wrapper wrapper knock knock.''' | |||||
| class DurPredictorCECell(tf.contrib.rnn.RNNCell): | |||||
| """Wrapper wrapper knock knock.""" | |||||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units, | def __init__(self, var_predictor_cell, is_training, dim, prenet_units, | ||||
| max_dur, dur_embedding_dim): | max_dur, dur_embedding_dim): | ||||
| @@ -146,7 +143,7 @@ class DurPredictorCECell(RNNCell): | |||||
| ]) | ]) | ||||
| def call(self, inputs, state): | def call(self, inputs, state): | ||||
| '''Run the Tacotron2 super decoder cell.''' | |||||
| """Run the Tacotron2 super decoder cell.""" | |||||
| super_cell_out, decoder_state = state | super_cell_out, decoder_state = state | ||||
| # split | # split | ||||
| @@ -181,8 +178,8 @@ class DurPredictorCECell(RNNCell): | |||||
| return new_super_cell_out, new_states | return new_super_cell_out, new_states | ||||
| class VarPredictorCell2(RNNCell): | |||||
| '''Wrapper wrapper knock knock.''' | |||||
| class VarPredictorCell2(tf.contrib.rnn.RNNCell): | |||||
| """Wrapper wrapper knock knock.""" | |||||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | ||||
| super(VarPredictorCell2, self).__init__() | super(VarPredictorCell2, self).__init__() | ||||
| @@ -1,14 +1,8 @@ | |||||
| import tensorflow as tf | import tensorflow as tf | ||||
| from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell | |||||
| from tensorflow.contrib.seq2seq import BasicDecoder | |||||
| from tensorflow.python.ops.ragged.ragged_util import repeat | from tensorflow.python.ops.ragged.ragged_util import repeat | ||||
| from .am_models import conv_prenet, decoder_prenet, encoder_prenet | |||||
| from .fsmn_encoder import FsmnEncoderV2 | from .fsmn_encoder import FsmnEncoderV2 | ||||
| from .helpers import VarTestHelper, VarTrainingHelper | |||||
| from .position import (BatchSinusodalPositionalEncoding, | |||||
| SinusodalPositionalEncoding) | |||||
| from .rnn_wrappers import DurPredictorCell, VarPredictorCell | |||||
| from .position import BatchSinusodalPositionalEncoding | |||||
| from .self_attention_decoder import SelfAttentionDecoder | from .self_attention_decoder import SelfAttentionDecoder | ||||
| from .self_attention_encoder import SelfAttentionEncoder | from .self_attention_encoder import SelfAttentionEncoder | ||||
| @@ -32,7 +26,7 @@ class RobuTrans(): | |||||
| duration_scales=None, | duration_scales=None, | ||||
| energy_contours=None, | energy_contours=None, | ||||
| energy_scales=None): | energy_scales=None): | ||||
| '''Initializes the model for inference. | |||||
| """Initializes the model for inference. | |||||
| Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. | Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. | ||||
| @@ -46,7 +40,10 @@ class RobuTrans(): | |||||
| mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number | mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number | ||||
| of steps in the output time series, M is num_mels, and values are entries in the mel | of steps in the output time series, M is num_mels, and values are entries in the mel | ||||
| spectrogram. Only needed for training. | spectrogram. Only needed for training. | ||||
| ''' | |||||
| """ | |||||
| from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell | |||||
| from tensorflow.contrib.seq2seq import BasicDecoder | |||||
| with tf.variable_scope('inference') as _: | with tf.variable_scope('inference') as _: | ||||
| is_training = mel_targets is not None | is_training = mel_targets is not None | ||||
| batch_size = tf.shape(inputs)[0] | batch_size = tf.shape(inputs)[0] | ||||
| @@ -229,17 +226,20 @@ class RobuTrans(): | |||||
| LSTMBlockCell(hp.predictor_lstm_units), | LSTMBlockCell(hp.predictor_lstm_units), | ||||
| LSTMBlockCell(hp.predictor_lstm_units) | LSTMBlockCell(hp.predictor_lstm_units) | ||||
| ], state_is_tuple=True) # yapf:disable | ], state_is_tuple=True) # yapf:disable | ||||
| from .rnn_wrappers import DurPredictorCell | |||||
| duration_output_cell = DurPredictorCell( | duration_output_cell = DurPredictorCell( | ||||
| duration_predictor_cell, is_training, 1, | duration_predictor_cell, is_training, 1, | ||||
| hp.predictor_prenet_units) | hp.predictor_prenet_units) | ||||
| duration_predictor_init_state = duration_output_cell.zero_state( | duration_predictor_init_state = duration_output_cell.zero_state( | ||||
| batch_size=batch_size, dtype=tf.float32) | batch_size=batch_size, dtype=tf.float32) | ||||
| if is_training: | if is_training: | ||||
| from .helpers import VarTrainingHelper | |||||
| duration_helper = VarTrainingHelper( | duration_helper = VarTrainingHelper( | ||||
| tf.expand_dims( | tf.expand_dims( | ||||
| tf.log(tf.cast(durations, tf.float32) + 1), | tf.log(tf.cast(durations, tf.float32) + 1), | ||||
| axis=2), dur_inputs, 1) | axis=2), dur_inputs, 1) | ||||
| else: | else: | ||||
| from .helpers import VarTestHelper | |||||
| duration_helper = VarTestHelper(batch_size, dur_inputs, 1) | duration_helper = VarTestHelper(batch_size, dur_inputs, 1) | ||||
| ( | ( | ||||
| duration_outputs, _ | duration_outputs, _ | ||||
| @@ -1,14 +1,10 @@ | |||||
| from __future__ import (absolute_import, division, print_function, | from __future__ import (absolute_import, division, print_function, | ||||
| unicode_literals) | unicode_literals) | ||||
| import io | |||||
| import os | import os | ||||
| import time | |||||
| import zipfile | import zipfile | ||||
| from typing import Any, Dict, Optional, Union | |||||
| import json | import json | ||||
| import numpy as np | import numpy as np | ||||
| import torch | |||||
| from modelscope.metainfo import Models | from modelscope.metainfo import Models | ||||
| from modelscope.models.base import Model | from modelscope.models.base import Model | ||||
| @@ -16,8 +12,8 @@ from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.audio.tts_exceptions import ( | from modelscope.utils.audio.tts_exceptions import ( | ||||
| TtsFrontendInitializeFailedException, | TtsFrontendInitializeFailedException, | ||||
| TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion, | TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion, | ||||
| TtsVocoderMelspecShapeMismatchException, TtsVoiceNotExistsException) | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| TtsVoiceNotExistsException) | |||||
| from modelscope.utils.constant import Tasks | |||||
| from .voice import Voice | from .voice import Voice | ||||
| import tensorflow as tf # isort:skip | import tensorflow as tf # isort:skip | ||||
| @@ -23,8 +23,8 @@ logger = get_logger() | |||||
| class Pipeline(ABC): | class Pipeline(ABC): | ||||
| def initiate_single_model(self, model): | def initiate_single_model(self, model): | ||||
| logger.info(f'initiate model from {model}') | |||||
| if isinstance(model, str) and is_official_hub_path(model): | if isinstance(model, str) and is_official_hub_path(model): | ||||
| logger.info(f'initiate model from location {model}.') | |||||
| # expecting model has been prefetched to local cache beforehand | # expecting model has been prefetched to local cache beforehand | ||||
| return Model.from_pretrained( | return Model.from_pretrained( | ||||
| model, model_prefetched=True) if is_model(model) else model | model, model_prefetched=True) if is_model(model) else model | ||||
| @@ -1,11 +1,9 @@ | |||||
| import os.path as osp | import os.path as osp | ||||
| from typing import Any, Dict | from typing import Any, Dict | ||||
| import decord | |||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| import torchvision.transforms.functional as TF | import torchvision.transforms.functional as TF | ||||
| from decord import VideoReader, cpu | |||||
| from PIL import Image | from PIL import Image | ||||
| from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
| @@ -49,6 +47,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline): | |||||
| logger.info('load model done') | logger.info('load model done') | ||||
| def preprocess(self, input: Input) -> Dict[str, Any]: | def preprocess(self, input: Input) -> Dict[str, Any]: | ||||
| import decord | |||||
| decord.bridge.set_bridge('native') | decord.bridge.set_bridge('native') | ||||
| transforms = VCompose([ | transforms = VCompose([ | ||||
| @@ -60,7 +59,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline): | |||||
| clip_len = (self.cfg.DATA.video_frames | clip_len = (self.cfg.DATA.video_frames | ||||
| - 1) * self.cfg.DATA.video_stride + 1 | - 1) * self.cfg.DATA.video_stride + 1 | ||||
| vr = VideoReader(input, ctx=cpu(0)) | |||||
| vr = decord.VideoReader(input, ctx=decord.cpu(0)) | |||||
| if len(vr) <= clip_len: | if len(vr) <= clip_len: | ||||
| init_frames = np.zeros(self.cfg.DATA.multi_crop, dtype=int) | init_frames = np.zeros(self.cfg.DATA.multi_crop, dtype=int) | ||||
| else: | else: | ||||
| @@ -16,11 +16,6 @@ from ..base import Pipeline | |||||
| from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
| from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | ||||
| if tf.__version__ >= '2.0': | |||||
| import tf_slim as slim | |||||
| else: | |||||
| from tensorflow.contrib import slim | |||||
| if tf.__version__ >= '2.0': | if tf.__version__ >= '2.0': | ||||
| tf = tf.compat.v1 | tf = tf.compat.v1 | ||||
| tf.compat.v1.disable_eager_execution() | tf.compat.v1.disable_eager_execution() | ||||
| @@ -1,15 +1,11 @@ | |||||
| import math | import math | ||||
| import os | |||||
| import random | import random | ||||
| import decord | |||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| import torch.nn as nn | |||||
| import torch.utils.data | import torch.utils.data | ||||
| import torch.utils.dlpack as dlpack | import torch.utils.dlpack as dlpack | ||||
| import torchvision.transforms._transforms_video as transforms | import torchvision.transforms._transforms_video as transforms | ||||
| from decord import VideoReader | |||||
| from torchvision.transforms import Compose | from torchvision.transforms import Compose | ||||
| @@ -128,6 +124,7 @@ def _decode_video(cfg, path): | |||||
| Returns: | Returns: | ||||
| frames (Tensor): video tensor data | frames (Tensor): video tensor data | ||||
| """ | """ | ||||
| from decord import VideoReader | |||||
| vr = VideoReader(path) | vr = VideoReader(path) | ||||
| num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS | num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS | ||||