Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9409213 (target branch: master)

The hunks below move heavy third-party imports (rouge_score, tensorflow.contrib, decord, tf_slim) from module scope into the functions and methods that actually use them, switch helper base classes to fully qualified tf.contrib names, normalize docstring quotes, and prune unused imports, so that importing the package no longer requires every optional dependency.
@@ -1,8 +1,5 @@
 from typing import Dict

 import numpy as np
-from rouge_score import rouge_scorer

 from ..metainfo import Metrics
 from ..utils.registry import default_group
 from .base import Metric
@@ -18,6 +15,7 @@ class TextGenerationMetric(Metric):
     def __init__(self):
         self.preds = []
         self.tgts = []
+        from rouge_score import rouge_scorer
         self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

     def add(self, outputs: Dict, inputs: Dict):
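Note: moving the `rouge_score` import into `__init__` means importing the metrics package no longer requires `rouge_score` to be installed; the `ImportError` surfaces only when a `TextGenerationMetric` is actually constructed. A minimal sketch of the pattern (the try/except wrapper and its message are illustrative, not part of this change):

```python
def make_rouge_scorer():
    """Build a ROUGE-L scorer, importing rouge_score only on first use."""
    try:
        # Deferred import: only code that evaluates text generation
        # pays this dependency cost.
        from rouge_score import rouge_scorer
    except ImportError as e:
        raise ImportError('rouge_score is required for text-generation '
                          'metrics: pip install rouge-score') from e
    return rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
```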
@@ -1,7 +1,4 @@
 import tensorflow as tf
-from tensorflow.contrib.cudnn_rnn import CudnnLSTM
-from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
-from tensorflow.contrib.rnn import LSTMBlockCell


 def encoder_prenet(inputs,
@@ -207,6 +204,7 @@ def conv_and_lstm(inputs,
                   embedded_inputs_speaker,
                   mask=None,
                   scope='conv_and_lstm'):
+    from tensorflow.contrib.rnn import LSTMBlockCell
     x = inputs
     with tf.variable_scope(scope):
         for i in range(n_conv_layers):
@@ -244,6 +242,7 @@ def conv_and_lstm_dec(inputs,
                      mask=None,
                      scope='conv_and_lstm'):
     x = inputs
+    from tensorflow.contrib.rnn import LSTMBlockCell
     with tf.variable_scope(scope):
         for i in range(n_conv_layers):
             x = conv1d(
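Note: `tensorflow.contrib` exists only in TF 1.x, so a module-scope `from tensorflow.contrib.rnn import ...` makes this whole file un-importable under TF 2 even for callers that never touch the TTS path; the function-scoped import confines the failure to code that actually needs it. The unused `CudnnLSTM`/`cudnn_rnn_ops` imports are simply dropped. A hedged sketch of how the failure could be made explicit (this guard is an assumption, mirroring the `tf.__version__ >= '2.0'` check used elsewhere in this PR, not code from the diff):

```python
import tensorflow as tf

def _lstm_block_cell(num_units):
    """Resolve LSTMBlockCell lazily, failing loudly outside TF 1.x."""
    if tf.__version__ >= '2.0':
        raise NotImplementedError(
            'conv_and_lstm relies on tensorflow.contrib, which was removed '
            'in TF 2.x; run this model under TF 1.15.')
    # Import at call time so merely importing this module stays cheap.
    from tensorflow.contrib.rnn import LSTMBlockCell
    return LSTMBlockCell(num_units)
```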
@@ -1,9 +1,8 @@
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.seq2seq import Helper


-class VarTestHelper(Helper):
+class VarTestHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, batch_size, inputs, dim):
         with tf.name_scope('VarTestHelper'):
@@ -44,7 +43,7 @@ class VarTestHelper(Helper):
         return (finished, next_inputs, state)


-class VarTrainingHelper(Helper):
+class VarTrainingHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, targets, inputs, dim):
         with tf.name_scope('VarTrainingHelper'):
@@ -86,7 +85,7 @@ class VarTrainingHelper(Helper):
         return (finished, next_inputs, state)


-class VarTrainingSSHelper(Helper):
+class VarTrainingSSHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, targets, inputs, dim, global_step, schedule_begin,
                  alpha, decay_steps):
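Note: unlike the function-scoped imports above, switching the base class to the fully qualified `tf.contrib.seq2seq.Helper` does not defer anything by itself, because base-class expressions are evaluated when the `class` statement executes, i.e. when this module is imported. The gain is that the module now depends only on the top-level `tensorflow` import, so its *callers* can import it lazily. A small, self-contained illustration of the timing (the `SimpleNamespace` stand-in is hypothetical):

```python
import types

fake_tf = types.SimpleNamespace()  # a stand-in `tf` with no contrib attribute

try:
    # The attribute lookup on the base class runs here, at class-definition
    # time, not when the class is first instantiated.
    class Demo(fake_tf.contrib.seq2seq.Helper):
        pass
except AttributeError as err:
    print('base class resolved at definition time:', err)
```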
@@ -1,14 +1,11 @@
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.rnn import RNNCell
-from tensorflow.contrib.seq2seq import AttentionWrapperState
-from tensorflow.python.ops import rnn_cell_impl

 from .am_models import prenet


-class VarPredictorCell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class VarPredictorCell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(VarPredictorCell, self).__init__()
@@ -33,7 +30,7 @@ class VarPredictorCell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -61,8 +58,8 @@ class VarPredictorCell(RNNCell):
         return new_super_cell_out, new_states


-class DurPredictorCell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class DurPredictorCell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(DurPredictorCell, self).__init__()
@@ -87,7 +84,7 @@ class DurPredictorCell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -117,8 +114,8 @@ class DurPredictorCell(RNNCell):
         return new_super_cell_out, new_states


-class DurPredictorCECell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class DurPredictorCECell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
                  max_dur, dur_embedding_dim):
@@ -146,7 +143,7 @@ class DurPredictorCECell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -181,8 +178,8 @@ class DurPredictorCECell(RNNCell):
         return new_super_cell_out, new_states


-class VarPredictorCell2(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class VarPredictorCell2(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(VarPredictorCell2, self).__init__()
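Note: the `'''` to `"""` changes bundled into these hunks are purely stylistic; the docstring text is unchanged. PEP 257 recommends `"""` for docstrings, and normalizing the quoting keeps docstring linters quiet across the file.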
@@ -1,14 +1,8 @@
 import tensorflow as tf
-from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
-from tensorflow.contrib.seq2seq import BasicDecoder
-from tensorflow.python.ops.ragged.ragged_util import repeat

 from .am_models import conv_prenet, decoder_prenet, encoder_prenet
 from .fsmn_encoder import FsmnEncoderV2
-from .helpers import VarTestHelper, VarTrainingHelper
-from .position import (BatchSinusodalPositionalEncoding,
-                       SinusodalPositionalEncoding)
-from .rnn_wrappers import DurPredictorCell, VarPredictorCell
+from .position import BatchSinusodalPositionalEncoding
 from .self_attention_decoder import SelfAttentionDecoder
 from .self_attention_encoder import SelfAttentionEncoder
@@ -32,7 +26,7 @@ class RobuTrans():
                   duration_scales=None,
                   energy_contours=None,
                   energy_scales=None):
-        '''Initializes the model for inference.
+        """Initializes the model for inference.

         Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.
@@ -46,7 +40,10 @@ class RobuTrans():
             mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
                 of steps in the output time series, M is num_mels, and values are entries in the mel
                 spectrogram. Only needed for training.
-        '''
+        """
+        from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
+        from tensorflow.contrib.seq2seq import BasicDecoder

         with tf.variable_scope('inference') as _:
             is_training = mel_targets is not None
             batch_size = tf.shape(inputs)[0]
@@ -229,17 +226,20 @@ class RobuTrans():
                     LSTMBlockCell(hp.predictor_lstm_units),
                     LSTMBlockCell(hp.predictor_lstm_units)
                 ], state_is_tuple=True)  # yapf:disable
+                from .rnn_wrappers import DurPredictorCell
                 duration_output_cell = DurPredictorCell(
                     duration_predictor_cell, is_training, 1,
                     hp.predictor_prenet_units)
                 duration_predictor_init_state = duration_output_cell.zero_state(
                     batch_size=batch_size, dtype=tf.float32)
                 if is_training:
+                    from .helpers import VarTrainingHelper
                     duration_helper = VarTrainingHelper(
                         tf.expand_dims(
                             tf.log(tf.cast(durations, tf.float32) + 1),
                             axis=2), dur_inputs, 1)
                 else:
+                    from .helpers import VarTestHelper
                     duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
                 (
                     duration_outputs, _
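Note: the helpers are imported inside the branches that use them, so only the class needed for the current mode gets bound; since `VarTrainingHelper` and `VarTestHelper` live in the same `.helpers` module, whichever branch runs first loads the module, and the import in the other branch is then a cheap `sys.modules` lookup. The substantive win is at module scope: `.helpers` and `.rnn_wrappers` both pull in `tensorflow.contrib`, and after this change they load only when `initialize()` actually runs.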
@@ -1,14 +1,10 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)

 import io
 import os
 import time
 import zipfile
 from typing import Any, Dict, Optional, Union

 import json
 import numpy as np
 import torch

 from modelscope.metainfo import Models
 from modelscope.models.base import Model
@@ -16,8 +12,8 @@ from modelscope.models.builder import MODELS
 from modelscope.utils.audio.tts_exceptions import (
-    TtsFrontendInitializeFailedException,
-    TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion,
-    TtsVocoderMelspecShapeMismatchException, TtsVoiceNotExistsException)
-from modelscope.utils.constant import ModelFile, Tasks
+    TtsVoiceNotExistsException)
+from modelscope.utils.constant import Tasks
 from .voice import Voice

 import tensorflow as tf  # isort:skip
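Note: the pruned exception and `ModelFile` imports indicate the corresponding error paths were removed or relocated elsewhere in this PR; those sites are not shown in this excerpt. Pruning matters because every module-scope import is a hard edge in the import graph, so even an unused name can break or slow package import; `flake8 --select=F401` finds such dead imports cheaply. The trailing `import tensorflow as tf  # isort:skip` stays pinned last, presumably to keep the heavy TF import after the lighter ones.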
@@ -23,8 +23,8 @@ logger = get_logger()

 class Pipeline(ABC):

     def initiate_single_model(self, model):
-        logger.info(f'initiate model from {model}')
         if isinstance(model, str) and is_official_hub_path(model):
+            logger.info(f'initiate model from location {model}.')
             # expecting model has been prefetched to local cache beforehand
             return Model.from_pretrained(
                 model, model_prefetched=True) if is_model(model) else model
@@ -1,11 +1,9 @@
 import os.path as osp
 from typing import Any, Dict

-import decord
 import numpy as np
 import torch
 import torchvision.transforms.functional as TF
-from decord import VideoReader, cpu
 from PIL import Image

 from modelscope.metainfo import Pipelines
@@ -49,6 +47,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline):
         logger.info('load model done')

     def preprocess(self, input: Input) -> Dict[str, Any]:
+        import decord
         decord.bridge.set_bridge('native')
         transforms = VCompose([
@@ -60,7 +59,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline):
         clip_len = (self.cfg.DATA.video_frames
                     - 1) * self.cfg.DATA.video_stride + 1
-        vr = VideoReader(input, ctx=cpu(0))
+        vr = decord.VideoReader(input, ctx=decord.cpu(0))
         if len(vr) <= clip_len:
             init_frames = np.zeros(self.cfg.DATA.multi_crop, dtype=int)
         else:
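Note: with `decord` now bound only inside `preprocess`, the module-level names `VideoReader` and `cpu` no longer exist, so the call site has to switch to the qualified `decord.VideoReader(...)` / `decord.cpu(0)` form; a stale bare `VideoReader(...)` would fail with a `NameError` at runtime instead of being caught at import. Distilled, the pattern looks like this (a sketch, not the pipeline's actual method):

```python
def read_video(path: str):
    # Function-scoped import: decord is needed only when a video pipeline
    # actually runs, not when the pipelines package is imported.
    import decord
    decord.bridge.set_bridge('native')  # frames come back as decord NDArrays
    return decord.VideoReader(path, ctx=decord.cpu(0))
```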
@@ -16,11 +16,6 @@ from ..base import Pipeline
 from ..builder import PIPELINES
 from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils

-if tf.__version__ >= '2.0':
-    import tf_slim as slim
-else:
-    from tensorflow.contrib import slim
-
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
     tf.compat.v1.disable_eager_execution()
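Note: the `slim` block is deleted outright rather than made lazy, presumably because the OCR code no longer references `slim`; the usage sites are not shown in this excerpt. This also removes the file's only `tensorflow.contrib` fallback, leaving just the TF 1.x compatibility shim below it.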
@@ -1,15 +1,11 @@
 import math
 import os
 import random

-import decord
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.utils.data
 import torch.utils.dlpack as dlpack
 import torchvision.transforms._transforms_video as transforms
-from decord import VideoReader
 from torchvision.transforms import Compose
@@ -128,6 +124,7 @@ def _decode_video(cfg, path):

     Returns:
         frames (Tensor): video tensor data
     """
+    from decord import VideoReader
     vr = VideoReader(path)
     num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
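Note: `from decord import VideoReader` now executes on every `_decode_video` call, but after the first call that is just a `sys.modules` dictionary lookup plus a local name binding, so the per-call overhead is negligible. A quick self-contained check of that claim (using `math` as a stand-in for an already-imported module):

```python
import sys
import timeit

def decode_once():
    from math import sqrt  # re-import of a cached module: a dict lookup
    return sqrt(2.0)

decode_once()
print('math' in sys.modules)  # True: cached after the first import
print(timeit.timeit(decode_once, number=100_000))  # total seconds; small
```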