* [to #41669377] docs and tools refinement and release
  1. add build_doc linter script
  2. add sphinx-docs support
  3. add development doc and api doc
  4. change version to 0.1.0 for the first internal release
  Link: https://code.aone.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8775307 (master)
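As context for item 2, a Sphinx documentation build of this kind is usually driven by a small wrapper. The snippet below is a minimal sketch only; the `docs/source` and `docs/build/html` paths are assumptions for illustration, not the actual layout added in this change:

```python
# Minimal sketch of invoking a Sphinx HTML build from Python; paths are assumed.
import subprocess

subprocess.run(
    ['sphinx-build', '-b', 'html', 'docs/source', 'docs/build/html'],
    check=True)  # fail loudly if the docs do not build cleanly
```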
| @@ -1,9 +0,0 @@ | |||
| from .robutrans import RobuTrans | |||
| from .vocoder_models import Generator | |||
| def create_am_model(name, hparams): | |||
| if name == 'robutrans': | |||
| return RobuTrans(hparams) | |||
| else: | |||
| raise Exception('Unknown model: ' + name) | |||
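For context, the deleted factory above was called with a model name and a hyperparameter object. A minimal usage sketch; `hparams` is a stand-in for the RobuTrans hyperparameter object, which is not shown in this diff:

```python
# Sketch only: 'hparams' stands in for the RobuTrans hyperparameter object.
hparams = ...  # not shown in this diff
model = create_am_model('robutrans', hparams)   # returns a RobuTrans instance
# Any other name raises Exception('Unknown model: <name>').
```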
| @@ -1,460 +0,0 @@ | |||
| import tensorflow as tf | |||
| def encoder_prenet(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| dense_units, | |||
| is_training, | |||
| mask=None, | |||
| scope='encoder_prenet'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.layers.dense( | |||
| x, units=dense_units, activation=None, name='dense') | |||
| return x | |||
| def decoder_prenet(inputs, | |||
| prenet_units, | |||
| dense_units, | |||
| is_training, | |||
| scope='decoder_prenet'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i, units in enumerate(prenet_units): | |||
| x = tf.layers.dense( | |||
| x, | |||
| units=units, | |||
| activation=tf.nn.relu, | |||
| name='dense_{}'.format(i)) | |||
| x = tf.layers.dropout( | |||
| x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) | |||
| x = tf.layers.dense( | |||
| x, units=dense_units, activation=None, name='dense') | |||
| return x | |||
| def encoder(inputs, | |||
| input_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker, | |||
| mask=None, | |||
| scope='encoder'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_lstm( | |||
| inputs, | |||
| input_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker, | |||
| mask=mask) | |||
| return x | |||
| def prenet(inputs, prenet_units, is_training, scope='prenet'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i, units in enumerate(prenet_units): | |||
| x = tf.layers.dense( | |||
| x, | |||
| units=units, | |||
| activation=tf.nn.relu, | |||
| name='dense_{}'.format(i)) | |||
| x = tf.layers.dropout( | |||
| x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) | |||
| return x | |||
| def postnet_residual_ulstm(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| is_training, | |||
| scope='postnet_residual_ulstm'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
| lstm_units, is_training) | |||
| x = conv1d( | |||
| x, | |||
| output_units, | |||
| kernel_size, | |||
| is_training, | |||
| activation=None, | |||
| dropout=False, | |||
| scope='conv1d_{}'.format(n_conv_layers - 1)) | |||
| return x | |||
| def postnet_residual_lstm(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| is_training, | |||
| scope='postnet_residual_lstm'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
| lstm_units, is_training) | |||
| x = conv1d( | |||
| x, | |||
| output_units, | |||
| kernel_size, | |||
| is_training, | |||
| activation=None, | |||
| dropout=False, | |||
| scope='conv1d_{}'.format(n_conv_layers - 1)) | |||
| return x | |||
| def postnet_linear_ulstm(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| is_training, | |||
| scope='postnet_linear'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
| lstm_units, is_training) | |||
| x = tf.layers.dense(x, units=output_units) | |||
| return x | |||
| def postnet_linear_lstm(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| output_lengths, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=None, | |||
| scope='postnet_linear'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_lstm_dec( | |||
| inputs, | |||
| output_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=mask) | |||
| x = tf.layers.dense(x, units=output_units) | |||
| return x | |||
| def postnet_linear(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| output_lengths, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=None, | |||
| scope='postnet_linear'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_dec( | |||
| inputs, | |||
| output_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=mask) | |||
| return x | |||
| def conv_and_lstm(inputs, | |||
| sequence_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker, | |||
| mask=None, | |||
| scope='conv_and_lstm'): | |||
| from tensorflow.contrib.rnn import LSTMBlockCell | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.concat([x, embedded_inputs_speaker], axis=2) | |||
| outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(lstm_units), | |||
| LSTMBlockCell(lstm_units), | |||
| x, | |||
| sequence_length=sequence_lengths, | |||
| dtype=tf.float32) | |||
| x = tf.concat(outputs, axis=-1) | |||
| return x | |||
| def conv_and_lstm_dec(inputs, | |||
| sequence_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=None, | |||
| scope='conv_and_lstm'): | |||
| x = inputs | |||
| from tensorflow.contrib.rnn import LSTMBlockCell | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.concat([x, embedded_inputs_speaker2], axis=2) | |||
| outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(lstm_units), | |||
| LSTMBlockCell(lstm_units), | |||
| x, | |||
| sequence_length=sequence_lengths, | |||
| dtype=tf.float32) | |||
| x = tf.concat(outputs, axis=-1) | |||
| return x | |||
| def conv_dec(inputs, | |||
| sequence_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=None, | |||
| scope='conv_and_lstm'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.concat([x, embedded_inputs_speaker2], axis=2) | |||
| return x | |||
| def conv_and_ulstm(inputs, | |||
| sequence_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| scope='conv_and_ulstm'): | |||
| from tensorflow.contrib.rnn import LSTMBlockCell | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| scope='conv1d_{}'.format(i)) | |||
| outputs, states = tf.nn.dynamic_rnn( | |||
| LSTMBlockCell(lstm_units), | |||
| x, | |||
| sequence_length=sequence_lengths, | |||
| dtype=tf.float32) | |||
| return outputs | |||
| def conv1d(inputs, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=None, | |||
| dropout=False, | |||
| mask=None, | |||
| scope='conv1d'): | |||
| with tf.variable_scope(scope): | |||
| if mask is not None: | |||
| inputs = inputs * tf.expand_dims(mask, -1) | |||
| x = tf.layers.conv1d( | |||
| inputs, filters=filters, kernel_size=kernel_size, padding='same') | |||
| if mask is not None: | |||
| x = x * tf.expand_dims(mask, -1) | |||
| x = tf.layers.batch_normalization(x, training=is_training) | |||
| if activation is not None: | |||
| x = activation(x) | |||
| if dropout: | |||
| x = tf.layers.dropout(x, rate=0.5, training=is_training) | |||
| return x | |||
| def conv1d_dp(inputs, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=None, | |||
| dropout=False, | |||
| dropoutrate=0.5, | |||
| mask=None, | |||
| scope='conv1d'): | |||
| with tf.variable_scope(scope): | |||
| if mask is not None: | |||
| inputs = inputs * tf.expand_dims(mask, -1) | |||
| x = tf.layers.conv1d( | |||
| inputs, filters=filters, kernel_size=kernel_size, padding='same') | |||
| if mask is not None: | |||
| x = x * tf.expand_dims(mask, -1) | |||
| x = tf.contrib.layers.layer_norm(x) | |||
| if activation is not None: | |||
| x = activation(x) | |||
| if dropout: | |||
| x = tf.layers.dropout(x, rate=dropoutrate, training=is_training) | |||
| return x | |||
| def duration_predictor(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| input_lengths, | |||
| is_training, | |||
| embedded_inputs_speaker, | |||
| mask=None, | |||
| scope='duration_predictor'): | |||
| with tf.variable_scope(scope): | |||
| from tensorflow.contrib.rnn import LSTMBlockCell | |||
| x = inputs | |||
| for i in range(n_conv_layers): | |||
| x = conv1d_dp( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| dropoutrate=0.1, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.concat([x, embedded_inputs_speaker], axis=2) | |||
| outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(lstm_units), | |||
| LSTMBlockCell(lstm_units), | |||
| x, | |||
| sequence_length=input_lengths, | |||
| dtype=tf.float32) | |||
| x = tf.concat(outputs, axis=-1) | |||
| x = tf.layers.dense(x, units=1) | |||
| x = tf.nn.relu(x) | |||
| return x | |||
| def duration_predictor2(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| input_lengths, | |||
| is_training, | |||
| mask=None, | |||
| scope='duration_predictor'): | |||
| with tf.variable_scope(scope): | |||
| x = inputs | |||
| for i in range(n_conv_layers): | |||
| x = conv1d_dp( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| dropoutrate=0.1, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.layers.dense(x, units=1) | |||
| x = tf.nn.relu(x) | |||
| return x | |||
| def conv_prenet(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| mask=None, | |||
| scope='conv_prenet'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| return x | |||
| @@ -1,82 +0,0 @@ | |||
| """Functions for compatibility with different TensorFlow versions.""" | |||
| import tensorflow as tf | |||
| def is_tf2(): | |||
| """Returns ``True`` if running TensorFlow 2.0.""" | |||
| return tf.__version__.startswith('2') | |||
| def tf_supports(symbol): | |||
| """Returns ``True`` if TensorFlow defines :obj:`symbol`.""" | |||
| return _string_to_tf_symbol(symbol) is not None | |||
| def tf_any(*symbols): | |||
| """Returns the first supported symbol.""" | |||
| for symbol in symbols: | |||
| module = _string_to_tf_symbol(symbol) | |||
| if module is not None: | |||
| return module | |||
| return None | |||
| def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name | |||
| """Returns the compatible symbol based on the current TensorFlow version. | |||
| Args: | |||
| v2: The candidate v2 symbol name. | |||
| v1: The candidate v1 symbol name. | |||
| Returns: | |||
| A TensorFlow symbol. | |||
| Raises: | |||
| ValueError: if no symbol can be found. | |||
| """ | |||
| candidates = [] | |||
| if v2 is not None: | |||
| candidates.append(v2) | |||
| if v1 is not None: | |||
| candidates.append(v1) | |||
| candidates.append('compat.v1.%s' % v1) | |||
| symbol = tf_any(*candidates) | |||
| if symbol is None: | |||
| raise ValueError('Failed to resolve any of the TensorFlow symbols: %s' % candidates) | |||
| return symbol | |||
| def name_from_variable_scope(name=''): | |||
| """Creates a name prefixed by the current variable scope.""" | |||
| var_scope = tf_compat(v1='get_variable_scope')().name | |||
| compat_name = '' | |||
| if name: | |||
| compat_name = '%s/' % name | |||
| if var_scope: | |||
| compat_name = '%s/%s' % (var_scope, compat_name) | |||
| return compat_name | |||
| def reuse(): | |||
| """Returns ``True`` if the current variable scope is marked for reuse.""" | |||
| return tf_compat(v1='get_variable_scope')().reuse | |||
| def _string_to_tf_symbol(symbol): | |||
| modules = symbol.split('.') | |||
| namespace = tf | |||
| for module in modules: | |||
| namespace = getattr(namespace, module, None) | |||
| if namespace is None: | |||
| return None | |||
| return namespace | |||
| # pylint: disable=invalid-name | |||
| gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy') | |||
| gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists') | |||
| gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile') | |||
| is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor') | |||
| logging = tf_compat(v1='logging') | |||
| nest = tf_compat(v2='nest', v1='contrib.framework.nest') | |||
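The helpers above look symbols up by dotted name on the installed `tensorflow` module, preferring the v2 name and falling back to the v1 (or `compat.v1`) name. A short usage sketch using only functions defined in this module; the file path in the example is illustrative:

```python
# Resolve a version-dependent symbol explicitly.
gfile_exists_fn = tf_compat(v2='io.gfile.exists', v1='gfile.Exists')

# Or probe for support first and pick the first available symbol.
if tf_supports('io.gfile.GFile'):
    open_fn = tf_any('io.gfile.GFile', 'gfile.GFile')
else:
    open_fn = gfile_open  # module-level alias defined above

with open_fn('config.json', 'r') as f:   # 'config.json' is an illustrative path
    print(f.read())
```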
| @@ -0,0 +1,238 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| from torch.utils.data import Dataset | |||
| from tqdm import tqdm | |||
| from modelscope.utils.logger import get_logger | |||
| from .units import KanTtsLinguisticUnit | |||
| logger = get_logger() | |||
| class KanTtsText2MelDataset(Dataset): | |||
| def __init__(self, metadata_filename, config_filename, cache=False): | |||
| super(KanTtsText2MelDataset, self).__init__() | |||
| self.cache = cache | |||
| with open(config_filename) as f: | |||
| self._config = json.loads(f.read()) | |||
| # Load metadata: | |||
| self._datadir = os.path.dirname(metadata_filename) | |||
| with open(metadata_filename, encoding='utf-8') as f: | |||
| self._metadata = [line.strip().split('|') for line in f] | |||
| self._length_lst = [int(x[2]) for x in self._metadata] | |||
| hours = sum( | |||
| self._length_lst) * self._config['audio']['frame_shift_ms'] / ( | |||
| 3600 * 1000) | |||
| logger.info('Loaded metadata for %d examples (%.2f hours)' % | |||
| (len(self._metadata), hours)) | |||
| logger.info('Minimum length: %d, Maximum length: %d' % | |||
| (min(self._length_lst), max(self._length_lst))) | |||
| self.ling_unit = KanTtsLinguisticUnit(config_filename) | |||
| self.pad_executor = KanTtsText2MelPad() | |||
| self.r = self._config['am']['outputs_per_step'] | |||
| self.num_mels = self._config['am']['num_mels'] | |||
| if 'adv' in self._config: | |||
| self.feat_window = self._config['adv']['random_window'] | |||
| else: | |||
| self.feat_window = None | |||
| logger.info('random feature window: %s', self.feat_window) | |||
| self.data_cache = [ | |||
| self.cache_load(i) for i in tqdm(range(self.__len__())) | |||
| ] if self.cache else [] | |||
| def get_frames_lst(self): | |||
| return self._length_lst | |||
| def __getitem__(self, index): | |||
| if self.cache: | |||
| sample = self.data_cache[index] | |||
| return sample | |||
| return self.cache_load(index) | |||
| def cache_load(self, index): | |||
| sample = {} | |||
| meta = self._metadata[index] | |||
| sample['utt_id'] = meta[0] | |||
| sample['mel_target'] = np.load(os.path.join( | |||
| self._datadir, meta[1]))[:, :self.num_mels] | |||
| sample['output_length'] = len(sample['mel_target']) | |||
| lfeat_symbol = meta[3] | |||
| sample['ling'] = self.ling_unit.encode_symbol_sequence(lfeat_symbol) | |||
| sample['duration'] = np.load(os.path.join(self._datadir, meta[4])) | |||
| sample['pitch_contour'] = np.load(os.path.join(self._datadir, meta[5])) | |||
| sample['energy_contour'] = np.load( | |||
| os.path.join(self._datadir, meta[6])) | |||
| return sample | |||
| def __len__(self): | |||
| return len(self._metadata) | |||
| def collate_fn(self, batch): | |||
| data_dict = {} | |||
| max_input_length = max((len(x['ling'][0]) for x in batch)) | |||
| # pure linguistic info: sy|tone|syllable_flag|word_segment | |||
| # sy | |||
| lfeat_type = self.ling_unit._lfeat_type_list[0] | |||
| inputs_sy = self.pad_executor._prepare_scalar_inputs( | |||
| [x['ling'][0] for x in batch], max_input_length, | |||
| self.ling_unit._sub_unit_pad[lfeat_type]).long() | |||
| # tone | |||
| lfeat_type = self.ling_unit._lfeat_type_list[1] | |||
| inputs_tone = self.pad_executor._prepare_scalar_inputs( | |||
| [x['ling'][1] for x in batch], max_input_length, | |||
| self.ling_unit._sub_unit_pad[lfeat_type]).long() | |||
| # syllable_flag | |||
| lfeat_type = self.ling_unit._lfeat_type_list[2] | |||
| inputs_syllable_flag = self.pad_executor._prepare_scalar_inputs( | |||
| [x['ling'][2] for x in batch], max_input_length, | |||
| self.ling_unit._sub_unit_pad[lfeat_type]).long() | |||
| # word_segment | |||
| lfeat_type = self.ling_unit._lfeat_type_list[3] | |||
| inputs_ws = self.pad_executor._prepare_scalar_inputs( | |||
| [x['ling'][3] for x in batch], max_input_length, | |||
| self.ling_unit._sub_unit_pad[lfeat_type]).long() | |||
| # emotion category | |||
| lfeat_type = self.ling_unit._lfeat_type_list[4] | |||
| data_dict['input_emotions'] = self.pad_executor._prepare_scalar_inputs( | |||
| [x['ling'][4] for x in batch], max_input_length, | |||
| self.ling_unit._sub_unit_pad[lfeat_type]).long() | |||
| # speaker category | |||
| lfeat_type = self.ling_unit._lfeat_type_list[5] | |||
| data_dict['input_speakers'] = self.pad_executor._prepare_scalar_inputs( | |||
| [x['ling'][5] for x in batch], max_input_length, | |||
| self.ling_unit._sub_unit_pad[lfeat_type]).long() | |||
| data_dict['input_lings'] = torch.stack( | |||
| [inputs_sy, inputs_tone, inputs_syllable_flag, inputs_ws], dim=2) | |||
| data_dict['valid_input_lengths'] = torch.as_tensor( | |||
| [len(x['ling'][0]) - 1 for x in batch], dtype=torch.long | |||
| ) # The symbol sequence ends with a single '~' (EOS), so length - 1 is used as the valid length. | |||
| data_dict['valid_output_lengths'] = torch.as_tensor( | |||
| [x['output_length'] for x in batch], dtype=torch.long) | |||
| max_output_length = torch.max(data_dict['valid_output_lengths']).item() | |||
| max_output_round_length = self.pad_executor._round_up( | |||
| max_output_length, self.r) | |||
| if self.feat_window is not None: | |||
| active_feat_len = np.minimum(max_output_round_length, | |||
| self.feat_window) | |||
| if active_feat_len < self.feat_window: | |||
| max_output_round_length = self.pad_executor._round_up( | |||
| self.feat_window, self.r) | |||
| active_feat_len = self.feat_window | |||
| max_offsets = [x['output_length'] - active_feat_len for x in batch] | |||
| feat_offsets = [ | |||
| np.random.randint(0, np.maximum(1, offset)) | |||
| for offset in max_offsets | |||
| ] | |||
| feat_offsets = torch.from_numpy( | |||
| np.asarray(feat_offsets, dtype=np.int32)).long() | |||
| data_dict['feat_offsets'] = feat_offsets | |||
| data_dict['mel_targets'] = self.pad_executor._prepare_targets( | |||
| [x['mel_target'] for x in batch], max_output_round_length, 0.0) | |||
| data_dict['durations'] = self.pad_executor._prepare_durations( | |||
| [x['duration'] for x in batch], max_input_length, | |||
| max_output_round_length) | |||
| data_dict['pitch_contours'] = self.pad_executor._prepare_scalar_inputs( | |||
| [x['pitch_contour'] for x in batch], max_input_length, | |||
| 0.0).float() | |||
| data_dict[ | |||
| 'energy_contours'] = self.pad_executor._prepare_scalar_inputs( | |||
| [x['energy_contour'] for x in batch], max_input_length, | |||
| 0.0).float() | |||
| data_dict['utt_ids'] = [x['utt_id'] for x in batch] | |||
| return data_dict | |||
| class KanTtsText2MelPad(object): | |||
| def __init__(self): | |||
| super(KanTtsText2MelPad, self).__init__() | |||
| def _pad1D(self, x, length, pad): | |||
| return np.pad( | |||
| x, (0, length - x.shape[0]), mode='constant', constant_values=pad) | |||
| def _pad2D(self, x, length, pad): | |||
| return np.pad( | |||
| x, [(0, length - x.shape[0]), (0, 0)], | |||
| mode='constant', | |||
| constant_values=pad) | |||
| def _pad_durations(self, duration, max_in_len, max_out_len): | |||
| framenum = np.sum(duration) | |||
| symbolnum = duration.shape[0] | |||
| if framenum < max_out_len: | |||
| padframenum = max_out_len - framenum | |||
| duration = np.insert( | |||
| duration, symbolnum, values=padframenum, axis=0) | |||
| duration = np.insert( | |||
| duration, | |||
| symbolnum + 1, | |||
| values=[0] * (max_in_len - symbolnum - 1), | |||
| axis=0) | |||
| else: | |||
| if symbolnum < max_in_len: | |||
| duration = np.insert( | |||
| duration, | |||
| symbolnum, | |||
| values=[0] * (max_in_len - symbolnum), | |||
| axis=0) | |||
| return duration | |||
| def _round_up(self, x, multiple): | |||
| remainder = x % multiple | |||
| return x if remainder == 0 else x + multiple - remainder | |||
| def _prepare_scalar_inputs(self, inputs, max_len, pad): | |||
| return torch.from_numpy( | |||
| np.stack([self._pad1D(x, max_len, pad) for x in inputs])) | |||
| def _prepare_targets(self, targets, max_len, pad): | |||
| return torch.from_numpy( | |||
| np.stack([self._pad2D(t, max_len, pad) for t in targets])).float() | |||
| def _prepare_durations(self, durations, max_in_len, max_out_len): | |||
| return torch.from_numpy( | |||
| np.stack([ | |||
| self._pad_durations(t, max_in_len, max_out_len) | |||
| for t in durations | |||
| ])).long() | |||
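The dataset above pairs with its own `collate_fn`, which pads the per-utterance features to batch maxima and stacks the four linguistic streams. A hedged sketch of wiring it into a standard PyTorch `DataLoader`; the metadata/config paths and the batch size are placeholders:

```python
from torch.utils.data import DataLoader

# Placeholder paths; real ones come from the training configuration.
dataset = KanTtsText2MelDataset('data/train/metadata.txt', 'config/am.json')

loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    num_workers=4,
    collate_fn=dataset.collate_fn)   # pads mels/durations/pitch/energy per batch

for batch in loader:
    lings = batch['input_lings']     # [B, T_in, 4]: sy | tone | syllable_flag | word_segment
    mels = batch['mel_targets']      # [B, T_out rounded up to outputs_per_step, num_mels]
    break
```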
| @@ -0,0 +1,131 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import math | |||
| import random | |||
| import torch | |||
| from torch import distributed as dist | |||
| from torch.utils.data import Sampler | |||
| class LenSortGroupPoolSampler(Sampler): | |||
| def __init__(self, data_source, length_lst, group_size): | |||
| super(LenSortGroupPoolSampler, self).__init__(data_source) | |||
| self.data_source = data_source | |||
| self.length_lst = length_lst | |||
| self.group_size = group_size | |||
| self.num = len(self.length_lst) | |||
| self.buckets = self.num // group_size | |||
| def __iter__(self): | |||
| def getkey(item): | |||
| return item[1] | |||
| random_lst = torch.randperm(self.num).tolist() | |||
| random_len_lst = [(i, self.length_lst[i]) for i in random_lst] | |||
| # Bucket examples based on similar output sequence length for efficiency: | |||
| # The slicing below already covers every index exactly once, including | |||
| # the final (possibly smaller) group, so no extra remainder handling is needed. | |||
| groups = [ | |||
| random_len_lst[i:i + self.group_size] | |||
| for i in range(0, self.num, self.group_size) | |||
| ] | |||
| indices = [] | |||
| for group in groups: | |||
| group.sort(key=getkey, reverse=True) | |||
| for item in group: | |||
| indices.append(item[0]) | |||
| return iter(indices) | |||
| def __len__(self): | |||
| return len(self.data_source) | |||
| class DistributedLenSortGroupPoolSampler(Sampler): | |||
| def __init__(self, | |||
| dataset, | |||
| length_lst, | |||
| group_size, | |||
| num_replicas=None, | |||
| rank=None, | |||
| shuffle=True): | |||
| super(DistributedLenSortGroupPoolSampler, self).__init__(dataset) | |||
| if num_replicas is None: | |||
| if not dist.is_available(): | |||
| raise RuntimeError( | |||
| 'modelscope error: Requires distributed package to be available' | |||
| ) | |||
| num_replicas = dist.get_world_size() | |||
| if rank is None: | |||
| if not dist.is_available(): | |||
| raise RuntimeError( | |||
| 'modelscope error: Requires distributed package to be available' | |||
| ) | |||
| rank = dist.get_rank() | |||
| self.dataset = dataset | |||
| self.length_lst = length_lst | |||
| self.group_size = group_size | |||
| self.num_replicas = num_replicas | |||
| self.rank = rank | |||
| self.epoch = 0 | |||
| self.num_samples = int( | |||
| math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) | |||
| self.total_size = self.num_samples * self.num_replicas | |||
| self.buckets = self.num_samples // group_size | |||
| self.shuffle = shuffle | |||
| def __iter__(self): | |||
| def getkey(item): | |||
| return item[1] | |||
| # deterministically shuffle based on epoch | |||
| g = torch.Generator() | |||
| g.manual_seed(self.epoch) | |||
| if self.shuffle: | |||
| indices = torch.randperm(len(self.dataset), generator=g).tolist() | |||
| else: | |||
| indices = list(range(len(self.dataset))) | |||
| # add extra samples to make it evenly divisible | |||
| indices += indices[:(self.total_size - len(indices))] | |||
| assert len(indices) == self.total_size | |||
| # subsample | |||
| indices = indices[self.rank:self.total_size:self.num_replicas] | |||
| assert len(indices) == self.num_samples | |||
| random_len_lst = [(i, self.length_lst[i]) for i in indices] | |||
| # Bucket examples based on similar output sequence length for efficiency: | |||
| # The slicing below already covers every index exactly once, including | |||
| # the final (possibly smaller) group, so no extra remainder handling is needed. | |||
| groups = [ | |||
| random_len_lst[i:i + self.group_size] | |||
| for i in range(0, self.num_samples, self.group_size) | |||
| ] | |||
| new_indices = [] | |||
| for group in groups: | |||
| group.sort(key=getkey, reverse=True) | |||
| for item in group: | |||
| new_indices.append(item[0]) | |||
| return iter(new_indices) | |||
| def __len__(self): | |||
| return self.num_samples | |||
| def set_epoch(self, epoch): | |||
| self.epoch = epoch | |||
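Both samplers above shuffle the corpus and then sort within fixed-size groups, so each batch contains utterances of similar length. A sketch of plugging the single-process variant into a `DataLoader`, reusing the `dataset` from the previous example; the group size is illustrative:

```python
from torch.utils.data import DataLoader

lengths = dataset.get_frames_lst()                 # frame counts from the metadata
sampler = LenSortGroupPoolSampler(dataset, lengths, group_size=128)

loader = DataLoader(
    dataset,
    batch_size=16,
    sampler=sampler,                 # a sampler replaces shuffle=True
    collate_fn=dataset.collate_fn)

# In distributed training the other class is used instead, and the epoch is
# advanced so that the per-epoch shuffle differs across epochs:
# sampler = DistributedLenSortGroupPoolSampler(dataset, lengths, group_size=128)
# sampler.set_epoch(epoch)
```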
| @@ -0,0 +1,3 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .ling_unit import * # noqa F403 | |||
| @@ -0,0 +1,88 @@ | |||
| # from https://github.com/keithito/tacotron | |||
| # Cleaners are transformations that run over the input text at both training and eval time. | |||
| # | |||
| # Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" | |||
| # hyperparameter. Some cleaners are English-specific. You'll typically want to use: | |||
| # 1. "english_cleaners" for English text | |||
| # 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using | |||
| # the Unidecode library (https://pypi.python.org/pypi/Unidecode) | |||
| # 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update | |||
| # the symbols in symbols.py to match your data). | |||
| import re | |||
| from unidecode import unidecode | |||
| from .numbers import normalize_numbers | |||
| # Regular expression matching whitespace: | |||
| _whitespace_re = re.compile(r'\s+') | |||
| # List of (regular expression, replacement) pairs for abbreviations: | |||
| _abbreviations = [ | |||
| (re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) | |||
| for x in [('mrs', 'misess'), | |||
| ('mr', 'mister'), | |||
| ('dr', 'doctor'), | |||
| ('st', 'saint'), | |||
| ('co', 'company'), | |||
| ('jr', 'junior'), | |||
| ('maj', 'major'), | |||
| ('gen', 'general'), | |||
| ('drs', 'doctors'), | |||
| ('rev', 'reverend'), | |||
| ('lt', 'lieutenant'), | |||
| ('hon', 'honorable'), | |||
| ('sgt', 'sergeant'), | |||
| ('capt', 'captain'), | |||
| ('esq', 'esquire'), | |||
| ('ltd', 'limited'), | |||
| ('col', 'colonel'), | |||
| ('ft', 'fort'), ]] # yapf:disable | |||
| def expand_abbreviations(text): | |||
| for regex, replacement in _abbreviations: | |||
| text = re.sub(regex, replacement, text) | |||
| return text | |||
| def expand_numbers(text): | |||
| return normalize_numbers(text) | |||
| def lowercase(text): | |||
| return text.lower() | |||
| def collapse_whitespace(text): | |||
| return re.sub(_whitespace_re, ' ', text) | |||
| def convert_to_ascii(text): | |||
| return unidecode(text) | |||
| def basic_cleaners(text): | |||
| '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' | |||
| text = lowercase(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
| def transliteration_cleaners(text): | |||
| '''Pipeline for non-English text that transliterates to ASCII.''' | |||
| text = convert_to_ascii(text) | |||
| text = lowercase(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
| def english_cleaners(text): | |||
| '''Pipeline for English text, including number and abbreviation expansion.''' | |||
| text = convert_to_ascii(text) | |||
| text = lowercase(text) | |||
| text = expand_numbers(text) | |||
| text = expand_abbreviations(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
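Each cleaner above is a plain text-to-text function, so they can be tried directly. The exact output of `english_cleaners` depends on `normalize_numbers` and `unidecode`, so the comments below only indicate the kind of transformation applied:

```python
print(basic_cleaners('  Hello   World  '))       # lowercased, runs of whitespace collapsed
print(transliteration_cleaners('Crème brûlée'))  # ASCII via unidecode, then lowercased
print(english_cleaners('Dr. Smith lives on St. George Rd.'))
# english_cleaners additionally expands numbers and the dotted abbreviations
# listed above, e.g. 'dr.' -> 'doctor', 'st.' -> 'saint'.
```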
| @@ -0,0 +1,395 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import abc | |||
| import codecs | |||
| import os | |||
| import re | |||
| import shutil | |||
| import json | |||
| import numpy as np | |||
| from . import cleaners as cleaners | |||
| # Regular expression matching text enclosed in curly braces: | |||
| _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') | |||
| def _clean_text(text, cleaner_names): | |||
| for name in cleaner_names: | |||
| cleaner = getattr(cleaners, name) | |||
| if not cleaner: | |||
| raise Exception( | |||
| 'modelscope error: configuration cleaner unknown: %s' % name) | |||
| text = cleaner(text) | |||
| return text | |||
| class LinguisticBaseUnit(abc.ABC): | |||
| def set_config_params(self, config_params): | |||
| self.config_params = config_params | |||
| def save(self, config, config_name, path): | |||
| t_path = os.path.join(path, config_name) | |||
| if config != t_path: | |||
| os.makedirs(path, exist_ok=True) | |||
| shutil.copyfile(config, os.path.join(path, config_name)) | |||
| class KanTtsLinguisticUnit(LinguisticBaseUnit): | |||
| def __init__(self, config, path, has_mask=True): | |||
| super(KanTtsLinguisticUnit, self).__init__() | |||
| # special symbol | |||
| self._pad = '_' | |||
| self._eos = '~' | |||
| self._mask = '@[MASK]' | |||
| self._has_mask = has_mask | |||
| self._unit_config = config | |||
| self._path = path | |||
| self._cleaner_names = [ | |||
| x.strip() for x in self._unit_config['cleaners'].split(',') | |||
| ] | |||
| self._lfeat_type_list = self._unit_config['lfeat_type_list'].strip( | |||
| ).split(',') | |||
| self.build() | |||
| def get_unit_size(self): | |||
| ling_unit_size = {} | |||
| ling_unit_size['sy'] = len(self.sy) | |||
| ling_unit_size['tone'] = len(self.tone) | |||
| ling_unit_size['syllable_flag'] = len(self.syllable_flag) | |||
| ling_unit_size['word_segment'] = len(self.word_segment) | |||
| if 'emo_category' in self._lfeat_type_list: | |||
| ling_unit_size['emotion'] = len(self.emo_category) | |||
| if 'speaker_category' in self._lfeat_type_list: | |||
| ling_unit_size['speaker'] = len(self.speaker) | |||
| return ling_unit_size | |||
| def build(self): | |||
| self._sub_unit_dim = {} | |||
| self._sub_unit_pad = {} | |||
| # sy sub-unit | |||
| _characters = '' | |||
| _ch_symbols = [] | |||
| sy_path = os.path.join(self._path, self._unit_config['sy']) | |||
| f = codecs.open(sy_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_symbols.append(line) | |||
| _arpabet = ['@' + s for s in _ch_symbols] | |||
| # Export all symbols: | |||
| self.sy = list(_characters) + _arpabet + [self._pad, self._eos] | |||
| if self._has_mask: | |||
| self.sy.append(self._mask) | |||
| self._sy_to_id = {s: i for i, s in enumerate(self.sy)} | |||
| self._id_to_sy = {i: s for i, s in enumerate(self.sy)} | |||
| self._sub_unit_dim['sy'] = len(self.sy) | |||
| self._sub_unit_pad['sy'] = self._sy_to_id['_'] | |||
| # tone sub-unit | |||
| _characters = '' | |||
| _ch_tones = [] | |||
| tone_path = os.path.join(self._path, self._unit_config['tone']) | |||
| f = codecs.open(tone_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_tones.append(line) | |||
| # Export all tones: | |||
| self.tone = list(_characters) + _ch_tones + [self._pad, self._eos] | |||
| if self._has_mask: | |||
| self.tone.append(self._mask) | |||
| self._tone_to_id = {s: i for i, s in enumerate(self.tone)} | |||
| self._id_to_tone = {i: s for i, s in enumerate(self.tone)} | |||
| self._sub_unit_dim['tone'] = len(self.tone) | |||
| self._sub_unit_pad['tone'] = self._tone_to_id['_'] | |||
| # syllable flag sub-unit | |||
| _characters = '' | |||
| _ch_syllable_flags = [] | |||
| sy_flag_path = os.path.join(self._path, | |||
| self._unit_config['syllable_flag']) | |||
| f = codecs.open(sy_flag_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_syllable_flags.append(line) | |||
| # Export all syllable_flags: | |||
| self.syllable_flag = list(_characters) + _ch_syllable_flags + [ | |||
| self._pad, self._eos | |||
| ] | |||
| if self._has_mask: | |||
| self.syllable_flag.append(self._mask) | |||
| self._syllable_flag_to_id = { | |||
| s: i | |||
| for i, s in enumerate(self.syllable_flag) | |||
| } | |||
| self._id_to_syllable_flag = { | |||
| i: s | |||
| for i, s in enumerate(self.syllable_flag) | |||
| } | |||
| self._sub_unit_dim['syllable_flag'] = len(self.syllable_flag) | |||
| self._sub_unit_pad['syllable_flag'] = self._syllable_flag_to_id['_'] | |||
| # word segment sub-unit | |||
| _characters = '' | |||
| _ch_word_segments = [] | |||
| ws_path = os.path.join(self._path, self._unit_config['word_segment']) | |||
| f = codecs.open(ws_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_word_segments.append(line) | |||
| # Export all word_segments: | |||
| self.word_segment = list(_characters) + _ch_word_segments + [ | |||
| self._pad, self._eos | |||
| ] | |||
| if self._has_mask: | |||
| self.word_segment.append(self._mask) | |||
| self._word_segment_to_id = { | |||
| s: i | |||
| for i, s in enumerate(self.word_segment) | |||
| } | |||
| self._id_to_word_segment = { | |||
| i: s | |||
| for i, s in enumerate(self.word_segment) | |||
| } | |||
| self._sub_unit_dim['word_segment'] = len(self.word_segment) | |||
| self._sub_unit_pad['word_segment'] = self._word_segment_to_id['_'] | |||
| if 'emo_category' in self._lfeat_type_list: | |||
| # emotion category sub-unit | |||
| _characters = '' | |||
| _ch_emo_types = [] | |||
| emo_path = os.path.join(self._path, | |||
| self._unit_config['emo_category']) | |||
| f = codecs.open(emo_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_emo_types.append(line) | |||
| self.emo_category = list(_characters) + _ch_emo_types + [ | |||
| self._pad, self._eos | |||
| ] | |||
| if self._has_mask: | |||
| self.emo_category.append(self._mask) | |||
| self._emo_category_to_id = { | |||
| s: i | |||
| for i, s in enumerate(self.emo_category) | |||
| } | |||
| self._id_to_emo_category = { | |||
| i: s | |||
| for i, s in enumerate(self.emo_category) | |||
| } | |||
| self._sub_unit_dim['emo_category'] = len(self.emo_category) | |||
| self._sub_unit_pad['emo_category'] = self._emo_category_to_id['_'] | |||
| if 'speaker_category' in self._lfeat_type_list: | |||
| # speaker category sub-unit | |||
| _characters = '' | |||
| _ch_speakers = [] | |||
| speaker_path = os.path.join(self._path, | |||
| self._unit_config['speaker_category']) | |||
| f = codecs.open(speaker_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_speakers.append(line) | |||
| # Export all speakers: | |||
| self.speaker = list(_characters) + _ch_speakers + [ | |||
| self._pad, self._eos | |||
| ] | |||
| if self._has_mask: | |||
| self.speaker.append(self._mask) | |||
| self._speaker_to_id = {s: i for i, s in enumerate(self.speaker)} | |||
| self._id_to_speaker = {i: s for i, s in enumerate(self.speaker)} | |||
| self._sub_unit_dim['speaker_category'] = len(self._speaker_to_id) | |||
| self._sub_unit_pad['speaker_category'] = self._speaker_to_id['_'] | |||
| def encode_symbol_sequence(self, lfeat_symbol): | |||
| lfeat_symbol = lfeat_symbol.strip().split(' ') | |||
| lfeat_symbol_separate = [''] * len(self._lfeat_type_list) | |||
| for this_lfeat_symbol in lfeat_symbol: | |||
| this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( | |||
| '$') | |||
| index = 0 | |||
| while index < len(lfeat_symbol_separate): | |||
| lfeat_symbol_separate[index] = lfeat_symbol_separate[ | |||
| index] + this_lfeat_symbol[index] + ' ' | |||
| index = index + 1 | |||
| input_and_label_data = [] | |||
| index = 0 | |||
| while index < len(self._lfeat_type_list): | |||
| sequence = self.encode_sub_unit( | |||
| lfeat_symbol_separate[index].strip(), | |||
| self._lfeat_type_list[index]) | |||
| sequence_array = np.asarray(sequence, dtype=np.int32) | |||
| input_and_label_data.append(sequence_array) | |||
| index = index + 1 | |||
| return input_and_label_data | |||
| def decode_symbol_sequence(self, sequence): | |||
| result = [] | |||
| for i, lfeat_type in enumerate(self._lfeat_type_list): | |||
| s = '' | |||
| sequence_item = sequence[i].tolist() | |||
| if lfeat_type == 'sy': | |||
| s = self.decode_sy(sequence_item) | |||
| elif lfeat_type == 'tone': | |||
| s = self.decode_tone(sequence_item) | |||
| elif lfeat_type == 'syllable_flag': | |||
| s = self.decode_syllable_flag(sequence_item) | |||
| elif lfeat_type == 'word_segment': | |||
| s = self.decode_word_segment(sequence_item) | |||
| elif lfeat_type == 'emo_category': | |||
| s = self.decode_emo_category(sequence_item) | |||
| elif lfeat_type == 'speaker_category': | |||
| s = self.decode_speaker_category(sequence_item) | |||
| else: | |||
| raise Exception( | |||
| 'modelscope error: configuration lfeat type(%s) unknown.' | |||
| % lfeat_type) | |||
| result.append('%s:%s' % (lfeat_type, s)) | |||
| return result | |||
| def encode_sub_unit(self, this_lfeat_symbol, lfeat_type): | |||
| sequence = [] | |||
| if lfeat_type == 'sy': | |||
| this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') | |||
| this_lfeat_symbol_format = '' | |||
| index = 0 | |||
| while index < len(this_lfeat_symbol): | |||
| this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ | |||
| index] + '}' + ' ' | |||
| index = index + 1 | |||
| sequence = self.encode_text(this_lfeat_symbol_format, | |||
| self._cleaner_names) | |||
| elif lfeat_type == 'tone': | |||
| sequence = self.encode_tone(this_lfeat_symbol) | |||
| elif lfeat_type == 'syllable_flag': | |||
| sequence = self.encode_syllable_flag(this_lfeat_symbol) | |||
| elif lfeat_type == 'word_segment': | |||
| sequence = self.encode_word_segment(this_lfeat_symbol) | |||
| elif lfeat_type == 'emo_category': | |||
| sequence = self.encode_emo_category(this_lfeat_symbol) | |||
| elif lfeat_type == 'speaker_category': | |||
| sequence = self.encode_speaker_category(this_lfeat_symbol) | |||
| else: | |||
| raise Exception( | |||
| 'modelscope error: configuration lfeat type(%s) unknown.' | |||
| % lfeat_type) | |||
| return sequence | |||
| def encode_text(self, text, cleaner_names): | |||
| sequence = [] | |||
| # Check for curly braces and treat their contents as ARPAbet: | |||
| while len(text): | |||
| m = _curly_re.match(text) | |||
| if not m: | |||
| sequence += self.encode_sy(_clean_text(text, cleaner_names)) | |||
| break | |||
| sequence += self.encode_sy(_clean_text(m.group(1), cleaner_names)) | |||
| sequence += self.encode_arpanet(m.group(2)) | |||
| text = m.group(3) | |||
| # Append EOS token | |||
| sequence.append(self._sy_to_id['~']) | |||
| return sequence | |||
| def encode_sy(self, sy): | |||
| return [self._sy_to_id[s] for s in sy if self.should_keep_sy(s)] | |||
| def decode_sy(self, id): | |||
| s = self._id_to_sy[id] | |||
| if len(s) > 1 and s[0] == '@': | |||
| s = s[1:] | |||
| return s | |||
| def should_keep_sy(self, s): | |||
| return s in self._sy_to_id and s != '_' and s != '~' | |||
| def encode_arpanet(self, text): | |||
| return self.encode_sy(['@' + s for s in text.split()]) | |||
| def encode_tone(self, tone): | |||
| tones = tone.strip().split(' ') | |||
| sequence = [] | |||
| for this_tone in tones: | |||
| sequence.append(self._tone_to_id[this_tone]) | |||
| sequence.append(self._tone_to_id['~']) | |||
| return sequence | |||
| def decode_tone(self, id): | |||
| return self._id_to_tone[id] | |||
| def encode_syllable_flag(self, syllable_flag): | |||
| syllable_flags = syllable_flag.strip().split(' ') | |||
| sequence = [] | |||
| for this_syllable_flag in syllable_flags: | |||
| sequence.append(self._syllable_flag_to_id[this_syllable_flag]) | |||
| sequence.append(self._syllable_flag_to_id['~']) | |||
| return sequence | |||
| def decode_syllable_flag(self, id): | |||
| return self._id_to_syllable_flag[id] | |||
| def encode_word_segment(self, word_segment): | |||
| word_segments = word_segment.strip().split(' ') | |||
| sequence = [] | |||
| for this_word_segment in word_segments: | |||
| sequence.append(self._word_segment_to_id[this_word_segment]) | |||
| sequence.append(self._word_segment_to_id['~']) | |||
| return sequence | |||
| def decode_word_segment(self, id): | |||
| return self._id_to_word_segment[id] | |||
| def encode_emo_category(self, emo_type): | |||
| emo_categories = emo_type.strip().split(' ') | |||
| sequence = [] | |||
| for this_category in emo_categories: | |||
| sequence.append(self._emo_category_to_id[this_category]) | |||
| sequence.append(self._emo_category_to_id['~']) | |||
| return sequence | |||
| def decode_emo_category(self, id): | |||
| return self._id_to_emo_category[id] | |||
| def encode_speaker_category(self, speaker): | |||
| speakers = speaker.strip().split(' ') | |||
| sequence = [] | |||
| for this_speaker in speakers: | |||
| sequence.append(self._speaker_to_id[this_speaker]) | |||
| sequence.append(self._speaker_to_id['~']) | |||
| return sequence | |||
| def decode_speaker_category(self, id): | |||
| return self._id_to_speaker[id] | |||
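`KanTtsLinguisticUnit` is built from a unit configuration plus a directory of symbol list files, and it encodes each `{sy$tone$syllable_flag$word_segment$...}` token into one integer array per feature stream. A hedged construction sketch; the dictionary keys follow the fields read in `build()`, but every value below (file names, feature list, example token) is a placeholder:

```python
# All values are illustrative placeholders; real resources ship with the model.
unit_config = {
    'cleaners': 'basic_cleaners',
    'lfeat_type_list': 'sy,tone,syllable_flag,word_segment,speaker_category',
    'sy': 'sy.lst',
    'tone': 'tone.lst',
    'syllable_flag': 'syllable_flag.lst',
    'word_segment': 'word_segment.lst',
    'speaker_category': 'speaker.lst',
}
ling_unit = KanTtsLinguisticUnit(unit_config, '/path/to/resources')

print(ling_unit.get_unit_size())   # vocabulary size of each sub-unit

# One '$'-separated field per entry in lfeat_type_list; each symbol must appear
# in the corresponding *.lst file for the lookup to succeed.
streams = ling_unit.encode_symbol_sequence('{ni2$2$s_begin$word_middle$F7}')
```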
| @@ -1,3 +1,6 @@ | |||
| # The implementation is adapted from tacotron, | |||
| # made publicly available under the MIT License at https://github.com/keithito/tacotron | |||
| import re | |||
| import inflect | |||
| @@ -1,273 +0,0 @@ | |||
| import tensorflow as tf | |||
| def build_sequence_mask(sequence_length, | |||
| maximum_length=None, | |||
| dtype=tf.float32): | |||
| """Builds the dot product mask. | |||
| Args: | |||
| sequence_length: The sequence length. | |||
| maximum_length: Optional size of the returned time dimension. Otherwise | |||
| it is the maximum of :obj:`sequence_length`. | |||
| dtype: The type of the mask tensor. | |||
| Returns: | |||
| A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape | |||
| ``[batch_size, max_length]``. | |||
| """ | |||
| mask = tf.sequence_mask( | |||
| sequence_length, maxlen=maximum_length, dtype=dtype) | |||
| return mask | |||
| def norm(inputs): | |||
| """Layer normalizes :obj:`inputs`.""" | |||
| return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) | |||
| def pad_in_time(x, padding_shape): | |||
| """Helper function to pad a tensor in the time dimension and retain the static depth dimension. | |||
| Args: | |||
| x: [Batch, Time, Frequency] | |||
| padding_shape: [before, after] padding sizes (constant value 0) applied to the time dimension | |||
| return: | |||
| padded x | |||
| """ | |||
| depth = x.get_shape().as_list()[-1] | |||
| x = tf.pad(x, [[0, 0], padding_shape, [0, 0]]) | |||
| x.set_shape((None, None, depth)) | |||
| return x | |||
| def pad_in_time_right(x, padding_length): | |||
| """Helper function to pad a tensor in the time dimension and retain the static depth dimension. | |||
| Args: | |||
| x: [Batch, Time, Frequency] | |||
| padding_length: padding size (constant value 0) appended after the time dimension | |||
| return: | |||
| padded x | |||
| """ | |||
| depth = x.get_shape().as_list()[-1] | |||
| x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) | |||
| x.set_shape((None, None, depth)) | |||
| return x | |||
| def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0): | |||
| """Implements the Transformer's "Feed Forward" layer. | |||
| .. math:: | |||
| ffn(x) = max(0, x*W_1 + b_1)*W_2 | |||
| Args: | |||
| x: The input. | |||
| ffn_dim: The number of units of the nonlinear transformation. | |||
| memory_units: the number of units of linear transformation | |||
| mode: A ``tf.estimator.ModeKeys`` mode. | |||
| dropout: The probability to drop units from the inner transformation. | |||
| Returns: | |||
| The transformed input. | |||
| """ | |||
| inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu) | |||
| inner = tf.layers.dropout( | |||
| inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN) | |||
| outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False) | |||
| return outer | |||
| def drop_and_add(inputs, outputs, mode, dropout=0.0): | |||
| """Drops units in the outputs and adds the previous values. | |||
| Args: | |||
| inputs: The input of the previous layer. | |||
| outputs: The output of the previous layer. | |||
| mode: A ``tf.estimator.ModeKeys`` mode. | |||
| dropout: The probability to drop units in :obj:`outputs`. | |||
| Returns: | |||
| The residual and normalized output. | |||
| """ | |||
| outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) | |||
| input_dim = inputs.get_shape().as_list()[-1] | |||
| output_dim = outputs.get_shape().as_list()[-1] | |||
| if input_dim == output_dim: | |||
| outputs += inputs | |||
| return outputs | |||
| def MemoryBlock( | |||
| inputs, | |||
| filter_size, | |||
| mode, | |||
| mask=None, | |||
| dropout=0.0, | |||
| ): | |||
| """ | |||
| Define the bidirectional memory block in FSMN | |||
| Args: | |||
| inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
| filter_size: memory block filter size | |||
| mode: Training or Evaluation | |||
| mask: A ``tf.Tensor`` applied to the memory block output | |||
| return: | |||
| output: 3-D tensor ([Batch, Time, Frequency]) | |||
| """ | |||
| static_shape = inputs.get_shape().as_list() | |||
| depth = static_shape[-1] | |||
| inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency] | |||
| depthwise_filter = tf.get_variable( | |||
| 'depth_conv_w', | |||
| shape=[1, filter_size, depth, 1], | |||
| initializer=tf.glorot_uniform_initializer(), | |||
| dtype=tf.float32) | |||
| memory = tf.nn.depthwise_conv2d( | |||
| input=inputs, | |||
| filter=depthwise_filter, | |||
| strides=[1, 1, 1, 1], | |||
| padding='SAME', | |||
| rate=[1, 1], | |||
| data_format='NHWC') | |||
| memory = memory + inputs | |||
| output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
| output = tf.reshape( | |||
| output, | |||
| [tf.shape(output)[0], tf.shape(output)[2], depth]) | |||
| if mask is not None: | |||
| output = output * tf.expand_dims(mask, -1) | |||
| return output | |||
| def MemoryBlockV2( | |||
| inputs, | |||
| filter_size, | |||
| mode, | |||
| shift=0, | |||
| mask=None, | |||
| dropout=0.0, | |||
| ): | |||
| """ | |||
| Define the bidirectional memory block in FSMN | |||
| Args: | |||
| inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
| filter_size: memory block filter size | |||
| mode: Training or Evaluation | |||
| shift: left padding, to control delay | |||
| mask: A ``tf.Tensor`` applied to the memory block output | |||
| return: | |||
| output: 3-D tensor ([Batch, Time, Frequency]) | |||
| """ | |||
| if mask is not None: | |||
| inputs = inputs * tf.expand_dims(mask, -1) | |||
| static_shape = inputs.get_shape().as_list() | |||
| depth = static_shape[-1] | |||
| # padding | |||
| left_padding = int(round((filter_size - 1) / 2)) | |||
| right_padding = int((filter_size - 1) / 2) | |||
| if shift > 0: | |||
| left_padding = left_padding + shift | |||
| right_padding = right_padding - shift | |||
| pad_inputs = pad_in_time(inputs, [left_padding, right_padding]) | |||
| pad_inputs = tf.expand_dims( | |||
| pad_inputs, axis=1) # [Batch, 1, Time, Frequency] | |||
| depthwise_filter = tf.get_variable( | |||
| 'depth_conv_w', | |||
| shape=[1, filter_size, depth, 1], | |||
| initializer=tf.glorot_uniform_initializer(), | |||
| dtype=tf.float32) | |||
| memory = tf.nn.depthwise_conv2d( | |||
| input=pad_inputs, | |||
| filter=depthwise_filter, | |||
| strides=[1, 1, 1, 1], | |||
| padding='VALID', | |||
| rate=[1, 1], | |||
| data_format='NHWC') | |||
| memory = tf.reshape( | |||
| memory, | |||
| [tf.shape(memory)[0], tf.shape(memory)[2], depth]) | |||
| memory = memory + inputs | |||
| output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
| if mask is not None: | |||
| output = output * tf.expand_dims(mask, -1) | |||
| return output | |||
| def UniMemoryBlock( | |||
| inputs, | |||
| filter_size, | |||
| mode, | |||
| cache=None, | |||
| mask=None, | |||
| dropout=0.0, | |||
| ): | |||
| """ | |||
| Define the unidirectional memory block in FSMN | |||
| Args: | |||
| inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
| filter_size: memory block filter size | |||
| cache: for streaming inference | |||
| mode: Training or Evaluation | |||
| mask: A ``tf.Tensor`` applied to the memory block output | |||
| dropout: dropout factor | |||
| return: | |||
| output: 3-D tensor ([Batch, Time, Frequency]) | |||
| """ | |||
| if cache is not None: | |||
| static_shape = cache['queries'].get_shape().as_list() | |||
| depth = static_shape[-1] | |||
| queries = tf.slice(cache['queries'], [0, 1, 0], [ | |||
| tf.shape(cache['queries'])[0], | |||
| tf.shape(cache['queries'])[1] - 1, depth | |||
| ]) | |||
| queries = tf.concat([queries, inputs], axis=1) | |||
| cache['queries'] = queries | |||
| else: | |||
| padding_length = filter_size - 1 | |||
| queries = pad_in_time(inputs, [padding_length, 0]) | |||
| queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency] | |||
| static_shape = queries.get_shape().as_list() | |||
| depth = static_shape[-1] | |||
| depthwise_filter = tf.get_variable( | |||
| 'depth_conv_w', | |||
| shape=[1, filter_size, depth, 1], | |||
| initializer=tf.glorot_uniform_initializer(), | |||
| dtype=tf.float32) | |||
| memory = tf.nn.depthwise_conv2d( | |||
| input=queries, | |||
| filter=depthwise_filter, | |||
| strides=[1, 1, 1, 1], | |||
| padding='VALID', | |||
| rate=[1, 1], | |||
| data_format='NHWC') | |||
| memory = tf.reshape( | |||
| memory, | |||
| [tf.shape(memory)[0], tf.shape(memory)[2], depth]) | |||
| memory = memory + inputs | |||
| output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
| if mask is not None: | |||
| output = output * tf.expand_dims(mask, -1) | |||
| return output | |||
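Functionally, the memory blocks above are a per-channel (depthwise) convolution along the time axis followed by a residual connection back onto the input, with optional masking and dropout. For intuition only, a rough PyTorch equivalent of the bidirectional block, not part of this change and assuming an odd filter size:

```python
import torch
import torch.nn.functional as F

def memory_block_sketch(x, taps):
    """x: [B, T, D]; taps: [D, filter_size] per-channel filter taps (odd filter_size)."""
    d, k = taps.shape
    y = F.conv1d(
        x.transpose(1, 2),        # [B, D, T]
        taps.unsqueeze(1),        # [D, 1, k] -> one independent filter per channel
        padding=k // 2,
        groups=d)
    return y.transpose(1, 2) + x  # residual connection, as in the TF code above
```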
| @@ -1,178 +0,0 @@ | |||
| import tensorflow as tf | |||
| from . import fsmn | |||
| class FsmnEncoder(): | |||
| """Encoder using Fsmn | |||
| """ | |||
| def __init__(self, | |||
| filter_size, | |||
| fsmn_num_layers, | |||
| dnn_num_layers, | |||
| num_memory_units=512, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.0, | |||
| position_encoder=None): | |||
| """Initializes the parameters of the encoder. | |||
| Args: | |||
| filter_size: the total order of memory block | |||
| fsmn_num_layers: The number of fsmn layers. | |||
| dnn_num_layers: The number of dnn layers | |||
| num_memory_units: The number of memory units. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| """ | |||
| super(FsmnEncoder, self).__init__() | |||
| self.filter_size = filter_size | |||
| self.fsmn_num_layers = fsmn_num_layers | |||
| self.dnn_num_layers = dnn_num_layers | |||
| self.num_memory_units = num_memory_units | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.position_encoder = position_encoder | |||
| def encode(self, inputs, sequence_length=None, mode=True): | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder(inputs) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| mask = fsmn.build_sequence_mask( | |||
| sequence_length, maximum_length=tf.shape(inputs)[1]) | |||
| state = () | |||
| for layer in range(self.fsmn_num_layers): | |||
| with tf.variable_scope('fsmn_layer_{}'.format(layer)): | |||
| with tf.variable_scope('ffn'): | |||
| context = fsmn.feed_forward( | |||
| inputs, | |||
| self.ffn_inner_dim, | |||
| self.num_memory_units, | |||
| mode, | |||
| dropout=self.dropout) | |||
| with tf.variable_scope('memory'): | |||
| memory = fsmn.MemoryBlock( | |||
| context, | |||
| self.filter_size, | |||
| mode, | |||
| mask=mask, | |||
| dropout=self.dropout) | |||
| memory = fsmn.drop_and_add( | |||
| inputs, memory, mode, dropout=self.dropout) | |||
| inputs = memory | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| for layer in range(self.dnn_num_layers): | |||
| with tf.variable_scope('dnn_layer_{}'.format(layer)): | |||
| transformed = fsmn.feed_forward( | |||
| inputs, | |||
| self.ffn_inner_dim, | |||
| self.num_memory_units, | |||
| mode, | |||
| dropout=self.dropout) | |||
| inputs = transformed | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| outputs = inputs | |||
| return (outputs, state, sequence_length) | |||
| class FsmnEncoderV2(): | |||
| """Encoder using Fsmn | |||
| """ | |||
| def __init__(self, | |||
| filter_size, | |||
| fsmn_num_layers, | |||
| dnn_num_layers, | |||
| num_memory_units=512, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.0, | |||
| shift=0, | |||
| position_encoder=None): | |||
| """Initializes the parameters of the encoder. | |||
| Args: | |||
| filter_size: the total order of memory block | |||
| fsmn_num_layers: The number of fsmn layers. | |||
| dnn_num_layers: The number of dnn layers | |||
| num_memory_units: The number of memory units. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| shift: left padding, to control delay | |||
| position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| """ | |||
| super(FsmnEncoderV2, self).__init__() | |||
| self.filter_size = filter_size | |||
| self.fsmn_num_layers = fsmn_num_layers | |||
| self.dnn_num_layers = dnn_num_layers | |||
| self.num_memory_units = num_memory_units | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.shift = shift | |||
| if not isinstance(shift, list): | |||
| self.shift = [shift for _ in range(self.fsmn_num_layers)] | |||
| self.position_encoder = position_encoder | |||
| def encode(self, inputs, sequence_length=None, mode=True): | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder(inputs) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| mask = fsmn.build_sequence_mask( | |||
| sequence_length, maximum_length=tf.shape(inputs)[1]) | |||
| state = () | |||
| for layer in range(self.fsmn_num_layers): | |||
| with tf.variable_scope('fsmn_layer_{}'.format(layer)): | |||
| with tf.variable_scope('ffn'): | |||
| context = fsmn.feed_forward( | |||
| inputs, | |||
| self.ffn_inner_dim, | |||
| self.num_memory_units, | |||
| mode, | |||
| dropout=self.dropout) | |||
| with tf.variable_scope('memory'): | |||
| memory = fsmn.MemoryBlockV2( | |||
| context, | |||
| self.filter_size, | |||
| mode, | |||
| shift=self.shift[layer], | |||
| mask=mask, | |||
| dropout=self.dropout) | |||
| memory = fsmn.drop_and_add( | |||
| inputs, memory, mode, dropout=self.dropout) | |||
| inputs = memory | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| for layer in range(self.dnn_num_layers): | |||
| with tf.variable_scope('dnn_layer_{}'.format(layer)): | |||
| transformed = fsmn.feed_forward( | |||
| inputs, | |||
| self.ffn_inner_dim, | |||
| self.num_memory_units, | |||
| mode, | |||
| dropout=self.dropout) | |||
| inputs = transformed | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| outputs = inputs | |||
| return (outputs, state, sequence_length) | |||
| @@ -1,159 +0,0 @@ | |||
| import numpy as np | |||
| import tensorflow as tf | |||
| class VarTestHelper(tf.contrib.seq2seq.Helper): | |||
| def __init__(self, batch_size, inputs, dim): | |||
| with tf.name_scope('VarTestHelper'): | |||
| self._batch_size = batch_size | |||
| self._inputs = inputs | |||
| self._dim = dim | |||
| num_steps = tf.shape(self._inputs)[1] | |||
| self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
| self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
| self._init_inputs = inputs[:, 0, :] | |||
| @property | |||
| def batch_size(self): | |||
| return self._batch_size | |||
| @property | |||
| def sample_ids_shape(self): | |||
| return tf.TensorShape([]) | |||
| @property | |||
| def sample_ids_dtype(self): | |||
| return np.int32 | |||
| def initialize(self, name=None): | |||
| return (tf.tile([False], [self._batch_size]), | |||
| _go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
| def sample(self, time, outputs, state, name=None): | |||
| return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
| def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
| with tf.name_scope('VarTestHelper'): | |||
| finished = (time + 1 >= self._lengths) | |||
| next_inputs = tf.concat([outputs, self._inputs[:, time, :]], | |||
| axis=-1) | |||
| return (finished, next_inputs, state) | |||
| class VarTrainingHelper(tf.contrib.seq2seq.Helper): | |||
| def __init__(self, targets, inputs, dim): | |||
| with tf.name_scope('VarTrainingHelper'): | |||
| self._targets = targets # [N, T_in, 1] | |||
| self._batch_size = tf.shape(inputs)[0] # N | |||
| self._inputs = inputs | |||
| self._dim = dim | |||
| num_steps = tf.shape(self._targets)[1] | |||
| self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
| self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
| self._init_inputs = inputs[:, 0, :] | |||
| @property | |||
| def batch_size(self): | |||
| return self._batch_size | |||
| @property | |||
| def sample_ids_shape(self): | |||
| return tf.TensorShape([]) | |||
| @property | |||
| def sample_ids_dtype(self): | |||
| return np.int32 | |||
| def initialize(self, name=None): | |||
| return (tf.tile([False], [self._batch_size]), | |||
| _go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
| def sample(self, time, outputs, state, name=None): | |||
| return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
| def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
| with tf.name_scope(name or 'VarTrainingHelper'): | |||
| finished = (time + 1 >= self._lengths) | |||
| next_inputs = tf.concat( | |||
| [self._targets[:, time, :], self._inputs[:, time, :]], axis=-1) | |||
| return (finished, next_inputs, state) | |||
| class VarTrainingSSHelper(tf.contrib.seq2seq.Helper): | |||
| def __init__(self, targets, inputs, dim, global_step, schedule_begin, | |||
| alpha, decay_steps): | |||
| with tf.name_scope('VarTrainingSSHelper'): | |||
| self._targets = targets # [N, T_in, 1] | |||
| self._batch_size = tf.shape(inputs)[0] # N | |||
| self._inputs = inputs | |||
| self._dim = dim | |||
| num_steps = tf.shape(self._targets)[1] | |||
| self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
| self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
| self._init_inputs = inputs[:, 0, :] | |||
| # for schedule sampling | |||
| self._global_step = global_step | |||
| self._schedule_begin = schedule_begin | |||
| self._alpha = alpha | |||
| self._decay_steps = decay_steps | |||
| @property | |||
| def batch_size(self): | |||
| return self._batch_size | |||
| @property | |||
| def sample_ids_shape(self): | |||
| return tf.TensorShape([]) | |||
| @property | |||
| def sample_ids_dtype(self): | |||
| return np.int32 | |||
| def initialize(self, name=None): | |||
| self._ratio = _tf_decay(self._global_step, self._schedule_begin, | |||
| self._alpha, self._decay_steps) | |||
| return (tf.tile([False], [self._batch_size]), | |||
| _go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
| def sample(self, time, outputs, state, name=None): | |||
| return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
| def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
| with tf.name_scope(name or 'VarTrainingHelper'): | |||
| finished = (time + 1 >= self._lengths) | |||
| next_inputs_tmp = tf.cond( | |||
| tf.less( | |||
| tf.random_uniform([], minval=0, maxval=1, | |||
| dtype=tf.float32), self._ratio), | |||
| lambda: self._targets[:, time, :], lambda: outputs) | |||
| next_inputs = tf.concat( | |||
| [next_inputs_tmp, self._inputs[:, time, :]], axis=-1) | |||
| return (finished, next_inputs, state) | |||
| def _go_frames(batch_size, dim, init_inputs): | |||
| '''Returns <GO> frames (zeros of width `dim` concatenated with the initial auxiliary inputs) for a given batch size''' | |||
| return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs], | |||
| axis=-1) | |||
| def _tf_decay(global_step, schedule_begin, alpha, decay_steps): | |||
| tfr = tf.train.exponential_decay( | |||
| 1.0, | |||
| global_step=global_step - schedule_begin, | |||
| decay_steps=decay_steps, | |||
| decay_rate=alpha, | |||
| name='tfr_decay') | |||
| final_tfr = tf.cond( | |||
| tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr) | |||
| return final_tfr | |||
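The `_tf_decay` helper above drives the scheduled sampling used by `VarTrainingSSHelper`: the teacher-forcing ratio stays at 1.0 until `schedule_begin`, then decays exponentially with base `alpha` over `decay_steps`, and `next_inputs` uses that ratio to choose between the ground-truth target frame and the model's own output. A minimal plain-Python sketch of the same schedule (the parameter values are illustrative assumptions, not taken from any training config):

def tf_ratio(global_step, schedule_begin=10000, alpha=0.96, decay_steps=2000):
    """Teacher-forcing ratio: 1.0 before schedule_begin, then exponential decay."""
    if global_step < schedule_begin:
        return 1.0
    # mirrors tf.train.exponential_decay with staircase=False
    return alpha ** ((global_step - schedule_begin) / decay_steps)

for step in (0, 10000, 20000, 50000):
    print(step, round(tf_ratio(step), 3))   # ratio shrinks towards 0 as training proceeds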
| @@ -0,0 +1,3 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .hifigan import * # noqa F403 | |||
| @@ -0,0 +1,238 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| # Part of the implementation is borrowed from https://github.com/jik876/hifi-gan | |||
| from distutils.version import LooseVersion | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d | |||
| from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm | |||
| from modelscope.models.audio.tts.models.utils import get_padding, init_weights | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') | |||
| def stft(x, fft_size, hop_size, win_length, window): | |||
| """Perform STFT and convert to magnitude spectrogram. | |||
| Args: | |||
| x (Tensor): Input signal tensor (B, T). | |||
| fft_size (int): FFT size. | |||
| hop_size (int): Hop size. | |||
| win_length (int): Window length. | |||
| window (Tensor): Window tensor (e.g. ``torch.hann_window(win_length)``). | |||
| Returns: | |||
| Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). | |||
| """ | |||
| if is_pytorch_17plus: | |||
| x_stft = torch.stft( | |||
| x, fft_size, hop_size, win_length, window, return_complex=False) | |||
| else: | |||
| x_stft = torch.stft(x, fft_size, hop_size, win_length, window) | |||
| real = x_stft[..., 0] | |||
| imag = x_stft[..., 1] | |||
| # NOTE(kan-bayashi): clamp is needed to avoid nan or inf | |||
| return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) | |||
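As a usage sketch of the `stft` helper above (the shapes and STFT hyperparameters below are illustrative assumptions): `window` is handed straight to `torch.stft`, so it should be a window tensor, and the result has one row per frame with `fft_size // 2 + 1` frequency bins.

import torch

waveforms = torch.randn(2, 22050)     # (B, T): two dummy ~1 s clips
window = torch.hann_window(1024)      # window tensor expected by torch.stft
mag = stft(waveforms, fft_size=1024, hop_size=256, win_length=1024, window=window)
print(mag.shape)                      # (B, #frames, fft_size // 2 + 1), e.g. (2, 87, 513) with default centering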
| LRELU_SLOPE = 0.1 | |||
| def get_padding_casual(kernel_size, dilation=1): | |||
| return int(kernel_size * dilation - dilation) | |||
| class Conv1dCasual(torch.nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| groups=1, | |||
| bias=True, | |||
| padding_mode='zeros'): | |||
| super(Conv1dCasual, self).__init__() | |||
| self.pad = padding | |||
| self.conv1d = weight_norm( | |||
| Conv1d( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride, | |||
| padding=0, | |||
| dilation=dilation, | |||
| groups=groups, | |||
| bias=bias, | |||
| padding_mode=padding_mode)) | |||
| self.conv1d.apply(init_weights) | |||
| def forward(self, x): # bdt | |||
| # F.pad pads dimensions starting from the last one, so this left-pads only the time axis (causal padding). | |||
| x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') | |||
| x = self.conv1d(x) | |||
| return x | |||
| def remove_weight_norm(self): | |||
| remove_weight_norm(self.conv1d) | |||
| class ConvTranspose1dCausal(torch.nn.Module): | |||
| """CausalConvTranspose1d module with customized initialization.""" | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride, | |||
| padding=0): | |||
| """Initialize CausalConvTranspose1d module.""" | |||
| super(ConvTranspose1dCausal, self).__init__() | |||
| self.deconv = weight_norm( | |||
| ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) | |||
| self.stride = stride | |||
| self.deconv.apply(init_weights) | |||
| self.pad = kernel_size - stride | |||
| def forward(self, x): | |||
| """Calculate forward propagation. | |||
| Args: | |||
| x (Tensor): Input tensor (B, in_channels, T_in). | |||
| Returns: | |||
| Tensor: Output tensor (B, out_channels, T_out). | |||
| """ | |||
| # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") | |||
| return self.deconv(x)[:, :, :-self.pad] | |||
| def remove_weight_norm(self): | |||
| remove_weight_norm(self.deconv) | |||
| class ResBlock1(torch.nn.Module): | |||
| def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): | |||
| super(ResBlock1, self).__init__() | |||
| self.h = h | |||
| self.convs1 = nn.ModuleList([ | |||
| Conv1dCasual( | |||
| channels, | |||
| channels, | |||
| kernel_size, | |||
| 1, | |||
| dilation=dilation[i], | |||
| padding=get_padding_casual(kernel_size, dilation[i])) | |||
| for i in range(len(dilation)) | |||
| ]) | |||
| self.convs2 = nn.ModuleList([ | |||
| Conv1dCasual( | |||
| channels, | |||
| channels, | |||
| kernel_size, | |||
| 1, | |||
| dilation=1, | |||
| padding=get_padding_casual(kernel_size, 1)) | |||
| for i in range(len(dilation)) | |||
| ]) | |||
| def forward(self, x): | |||
| for c1, c2 in zip(self.convs1, self.convs2): | |||
| xt = F.leaky_relu(x, LRELU_SLOPE) | |||
| xt = c1(xt) | |||
| xt = F.leaky_relu(xt, LRELU_SLOPE) | |||
| xt = c2(xt) | |||
| x = xt + x | |||
| return x | |||
| def remove_weight_norm(self): | |||
| for layer in self.convs1: | |||
| layer.remove_weight_norm() | |||
| for layer in self.convs2: | |||
| layer.remove_weight_norm() | |||
| class Generator(torch.nn.Module): | |||
| def __init__(self, h): | |||
| super(Generator, self).__init__() | |||
| self.h = h | |||
| self.num_kernels = len(h.resblock_kernel_sizes) | |||
| self.num_upsamples = len(h.upsample_rates) | |||
| logger.info('num_kernels={}, num_upsamples={}'.format( | |||
| self.num_kernels, self.num_upsamples)) | |||
| self.conv_pre = Conv1dCasual( | |||
| 80, h.upsample_initial_channel, 7, 1, padding=7 - 1) | |||
| resblock = ResBlock1 if h.resblock == '1' else ResBlock2 | |||
| self.ups = nn.ModuleList() | |||
| self.repeat_ups = nn.ModuleList() | |||
| for i, (u, k) in enumerate( | |||
| zip(h.upsample_rates, h.upsample_kernel_sizes)): | |||
| upsample = nn.Sequential( | |||
| nn.Upsample(mode='nearest', scale_factor=u), | |||
| nn.LeakyReLU(LRELU_SLOPE), | |||
| Conv1dCasual( | |||
| h.upsample_initial_channel // (2**i), | |||
| h.upsample_initial_channel // (2**(i + 1)), | |||
| kernel_size=7, | |||
| stride=1, | |||
| padding=7 - 1)) | |||
| self.repeat_ups.append(upsample) | |||
| self.ups.append( | |||
| ConvTranspose1dCausal( | |||
| h.upsample_initial_channel // (2**i), | |||
| h.upsample_initial_channel // (2**(i + 1)), | |||
| k, | |||
| u, | |||
| padding=(k - u) // 2)) | |||
| self.resblocks = nn.ModuleList() | |||
| for i in range(len(self.ups)): | |||
| ch = h.upsample_initial_channel // (2**(i + 1)) | |||
| for j, (k, d) in enumerate( | |||
| zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): | |||
| self.resblocks.append(resblock(h, ch, k, d)) | |||
| self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) | |||
| def forward(self, x): | |||
| x = self.conv_pre(x) | |||
| for i in range(self.num_upsamples): | |||
| x = torch.sin(x) + x | |||
| # transconv | |||
| x1 = F.leaky_relu(x, LRELU_SLOPE) | |||
| x1 = self.ups[i](x1) | |||
| # repeat | |||
| x2 = self.repeat_ups[i](x) | |||
| x = x1 + x2 | |||
| xs = None | |||
| for j in range(self.num_kernels): | |||
| if xs is None: | |||
| xs = self.resblocks[i * self.num_kernels + j](x) | |||
| else: | |||
| xs += self.resblocks[i * self.num_kernels + j](x) | |||
| x = xs / self.num_kernels | |||
| x = F.leaky_relu(x) | |||
| x = self.conv_post(x) | |||
| x = torch.tanh(x) | |||
| return x | |||
| def remove_weight_norm(self): | |||
| logger.info('Removing weight norm...') | |||
| for layer in self.ups: | |||
| layer.remove_weight_norm() | |||
| for layer in self.repeat_ups: | |||
| layer[-1].remove_weight_norm() | |||
| for layer in self.resblocks: | |||
| layer.remove_weight_norm() | |||
| self.conv_pre.remove_weight_norm() | |||
| self.conv_post.remove_weight_norm() | |||
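The causal HiFi-GAN generator above takes an 80-bin mel spectrogram of shape (B, 80, frames) and upsamples it by the product of `h.upsample_rates`, summing a transposed-convolution path and a nearest-neighbour-upsample path at every stage. A hedged instantiation sketch follows; the hyperparameters in `h` are illustrative assumptions, not the shipped model configuration.

import torch
from types import SimpleNamespace

h = SimpleNamespace(
    resblock='1',                        # selects ResBlock1 defined above
    upsample_rates=[8, 8, 2, 2],         # total upsampling factor 256
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=128,
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
)
generator = Generator(h)
mel = torch.randn(1, 80, 100)            # (B, n_mels, frames)
with torch.no_grad():
    wav = generator(mel)
print(wav.shape)                         # torch.Size([1, 1, 25600]) = frames * prod(upsample_rates)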
| @@ -0,0 +1,3 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .kantts_sambert import * # noqa F403 | |||
| @@ -0,0 +1,131 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from .base import Prenet | |||
| from .fsmn import FsmnEncoderV2 | |||
| class LengthRegulator(nn.Module): | |||
| def __init__(self, r=1): | |||
| super(LengthRegulator, self).__init__() | |||
| self.r = r | |||
| def forward(self, inputs, durations, masks=None): | |||
| reps = (durations + 0.5).long() | |||
| output_lens = reps.sum(dim=1) | |||
| max_len = output_lens.max() | |||
| reps_cumsum = torch.cumsum( | |||
| F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :] | |||
| range_ = torch.arange(max_len).to(inputs.device)[None, :, None] | |||
| mult = ((reps_cumsum[:, :, :-1] <= range_) | |||
| & (reps_cumsum[:, :, 1:] > range_)) # yapf:disable | |||
| mult = mult.float() | |||
| out = torch.matmul(mult, inputs) | |||
| if masks is not None: | |||
| out = out.masked_fill(masks.unsqueeze(-1), 0.0) | |||
| seq_len = out.size(1) | |||
| padding = self.r - int(seq_len) % self.r | |||
| if (padding < self.r): | |||
| out = F.pad( | |||
| out.transpose(1, 2), (0, padding, 0, 0, 0, 0), value=0.0) | |||
| out = out.transpose(1, 2) | |||
| return out, output_lens | |||
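The length regulator above repeats each phoneme-level vector according to its rounded duration, so durations [2, 1, 3] turn 3 encoder steps into 6 frame-level steps. A small worked sketch with assumed shapes:

import torch

lr = LengthRegulator(r=1)
enc = torch.arange(3.0).view(1, 3, 1).repeat(1, 1, 4)   # (B=1, T_in=3, C=4)
dur = torch.tensor([[2.0, 1.0, 3.0]])                    # per-phoneme durations in frames
out, out_lens = lr(enc, dur)
print(out.shape, out_lens)                               # torch.Size([1, 6, 4]) tensor([6])
print(out[0, :, 0])                                      # tensor([0., 0., 1., 2., 2., 2.])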
| class VarRnnARPredictor(nn.Module): | |||
| def __init__(self, cond_units, prenet_units, rnn_units): | |||
| super(VarRnnARPredictor, self).__init__() | |||
| self.prenet = Prenet(1, prenet_units) | |||
| self.lstm = nn.LSTM( | |||
| prenet_units[-1] + cond_units, | |||
| rnn_units, | |||
| num_layers=2, | |||
| batch_first=True, | |||
| bidirectional=False) | |||
| self.fc = nn.Linear(rnn_units, 1) | |||
| def forward(self, inputs, cond, h=None, masks=None): | |||
| x = torch.cat([self.prenet(inputs), cond], dim=-1) | |||
| # The input can also be a packed variable length sequence, | |||
| # here we just omit it for simplicity due to the mask and uni-directional lstm. | |||
| x, h_new = self.lstm(x, h) | |||
| x = self.fc(x).squeeze(-1) | |||
| x = F.relu(x) | |||
| if masks is not None: | |||
| x = x.masked_fill(masks, 0.0) | |||
| return x, h_new | |||
| def infer(self, cond, masks=None): | |||
| batch_size, length = cond.size(0), cond.size(1) | |||
| output = [] | |||
| x = torch.zeros((batch_size, 1)).to(cond.device) | |||
| h = None | |||
| for i in range(length): | |||
| x, h = self.forward(x.unsqueeze(1), cond[:, i:i + 1, :], h=h) | |||
| output.append(x) | |||
| output = torch.cat(output, dim=-1) | |||
| if masks is not None: | |||
| output = output.masked_fill(masks, 0.0) | |||
| return output | |||
| class VarFsmnRnnNARPredictor(nn.Module): | |||
| def __init__(self, in_dim, filter_size, fsmn_num_layers, num_memory_units, | |||
| ffn_inner_dim, dropout, shift, lstm_units): | |||
| super(VarFsmnRnnNARPredictor, self).__init__() | |||
| self.fsmn = FsmnEncoderV2(filter_size, fsmn_num_layers, in_dim, | |||
| num_memory_units, ffn_inner_dim, dropout, | |||
| shift) | |||
| self.blstm = nn.LSTM( | |||
| num_memory_units, | |||
| lstm_units, | |||
| num_layers=1, | |||
| batch_first=True, | |||
| bidirectional=True) | |||
| self.fc = nn.Linear(2 * lstm_units, 1) | |||
| def forward(self, inputs, masks=None): | |||
| input_lengths = None | |||
| if masks is not None: | |||
| input_lengths = torch.sum((~masks).float(), dim=1).long() | |||
| x = self.fsmn(inputs, masks) | |||
| if input_lengths is not None: | |||
| x = nn.utils.rnn.pack_padded_sequence( | |||
| x, | |||
| input_lengths.tolist(), | |||
| batch_first=True, | |||
| enforce_sorted=False) | |||
| x, _ = self.blstm(x) | |||
| x, _ = nn.utils.rnn.pad_packed_sequence( | |||
| x, batch_first=True, total_length=inputs.size(1)) | |||
| else: | |||
| x, _ = self.blstm(x) | |||
| x = self.fc(x).squeeze(-1) | |||
| if masks is not None: | |||
| x = x.masked_fill(masks, 0.0) | |||
| return x | |||
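Both variance predictors emit one scalar per input step: `VarFsmnRnnNARPredictor` is non-autoregressive (FSMN followed by a BLSTM), while `VarRnnARPredictor` feeds its previous output back in during inference. A shape sketch of the non-autoregressive predictor under assumed hyperparameters:

import torch

predictor = VarFsmnRnnNARPredictor(
    in_dim=16, filter_size=11, fsmn_num_layers=2, num_memory_units=32,
    ffn_inner_dim=64, dropout=0.1, shift=0, lstm_units=24)
cond = torch.randn(2, 50, 16)   # (B, T, in_dim): concatenated text/speaker/emotion features
pred = predictor(cond)          # no padding mask in this sketch
print(pred.shape)               # torch.Size([2, 50]): one pitch or energy value per step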
| @@ -0,0 +1,369 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class ScaledDotProductAttention(nn.Module): | |||
| """ Scaled Dot-Product Attention """ | |||
| def __init__(self, temperature, dropatt=0.0): | |||
| super().__init__() | |||
| self.temperature = temperature | |||
| self.softmax = nn.Softmax(dim=2) | |||
| self.dropatt = nn.Dropout(dropatt) | |||
| def forward(self, q, k, v, mask=None): | |||
| attn = torch.bmm(q, k.transpose(1, 2)) | |||
| attn = attn / self.temperature | |||
| if mask is not None: | |||
| attn = attn.masked_fill(mask, -np.inf) | |||
| attn = self.softmax(attn) | |||
| attn = self.dropatt(attn) | |||
| output = torch.bmm(attn, v) | |||
| return output, attn | |||
| class Prenet(nn.Module): | |||
| def __init__(self, in_units, prenet_units, out_units=0): | |||
| super(Prenet, self).__init__() | |||
| self.fcs = nn.ModuleList() | |||
| for in_dim, out_dim in zip([in_units] + prenet_units[:-1], | |||
| prenet_units): | |||
| self.fcs.append(nn.Linear(in_dim, out_dim)) | |||
| self.fcs.append(nn.ReLU()) | |||
| self.fcs.append(nn.Dropout(0.5)) | |||
| if (out_units): | |||
| self.fcs.append(nn.Linear(prenet_units[-1], out_units)) | |||
| def forward(self, input): | |||
| output = input | |||
| for layer in self.fcs: | |||
| output = layer(output) | |||
| return output | |||
| class MultiHeadSelfAttention(nn.Module): | |||
| """ Multi-Head SelfAttention module """ | |||
| def __init__(self, n_head, d_in, d_model, d_head, dropout, dropatt=0.0): | |||
| super().__init__() | |||
| self.n_head = n_head | |||
| self.d_head = d_head | |||
| self.d_in = d_in | |||
| self.d_model = d_model | |||
| self.layer_norm = nn.LayerNorm(d_in, eps=1e-6) | |||
| self.w_qkv = nn.Linear(d_in, 3 * n_head * d_head) | |||
| self.attention = ScaledDotProductAttention( | |||
| temperature=np.power(d_head, 0.5), dropatt=dropatt) | |||
| self.fc = nn.Linear(n_head * d_head, d_model) | |||
| self.dropout = nn.Dropout(dropout) | |||
| def forward(self, input, mask=None): | |||
| d_head, n_head = self.d_head, self.n_head | |||
| sz_b, len_in, _ = input.size() | |||
| residual = input | |||
| x = self.layer_norm(input) | |||
| qkv = self.w_qkv(x) | |||
| q, k, v = qkv.chunk(3, -1) | |||
| q = q.view(sz_b, len_in, n_head, d_head) | |||
| k = k.view(sz_b, len_in, n_head, d_head) | |||
| v = v.view(sz_b, len_in, n_head, d_head) | |||
| q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_in, | |||
| d_head) # (n*b) x l x d | |||
| k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_in, | |||
| d_head) # (n*b) x l x d | |||
| v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_in, | |||
| d_head) # (n*b) x l x d | |||
| if mask is not None: | |||
| mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. | |||
| output, attn = self.attention(q, k, v, mask=mask) | |||
| output = output.view(n_head, sz_b, len_in, d_head) | |||
| output = (output.permute(1, 2, 0, | |||
| 3).contiguous().view(sz_b, len_in, | |||
| -1)) # b x l x (n*d) | |||
| output = self.dropout(self.fc(output)) | |||
| if (output.size(-1) == residual.size(-1)): | |||
| output = output + residual | |||
| return output, attn | |||
| class PositionwiseConvFeedForward(nn.Module): | |||
| """ A two-feed-forward-layer module """ | |||
| def __init__(self, | |||
| d_in, | |||
| d_hid, | |||
| kernel_size=(3, 1), | |||
| dropout_inner=0.1, | |||
| dropout=0.1): | |||
| super().__init__() | |||
| # Use Conv1D | |||
| # position-wise | |||
| self.w_1 = nn.Conv1d( | |||
| d_in, | |||
| d_hid, | |||
| kernel_size=kernel_size[0], | |||
| padding=(kernel_size[0] - 1) // 2, | |||
| ) | |||
| # position-wise | |||
| self.w_2 = nn.Conv1d( | |||
| d_hid, | |||
| d_in, | |||
| kernel_size=kernel_size[1], | |||
| padding=(kernel_size[1] - 1) // 2, | |||
| ) | |||
| self.layer_norm = nn.LayerNorm(d_in, eps=1e-6) | |||
| self.dropout_inner = nn.Dropout(dropout_inner) | |||
| self.dropout = nn.Dropout(dropout) | |||
| def forward(self, x, mask=None): | |||
| residual = x | |||
| x = self.layer_norm(x) | |||
| output = x.transpose(1, 2) | |||
| output = F.relu(self.w_1(output)) | |||
| if mask is not None: | |||
| output = output.masked_fill(mask.unsqueeze(1), 0) | |||
| output = self.dropout_inner(output) | |||
| output = self.w_2(output) | |||
| output = output.transpose(1, 2) | |||
| output = self.dropout(output) | |||
| output = output + residual | |||
| return output | |||
| class FFTBlock(nn.Module): | |||
| """FFT Block""" | |||
| def __init__(self, | |||
| d_in, | |||
| d_model, | |||
| n_head, | |||
| d_head, | |||
| d_inner, | |||
| kernel_size, | |||
| dropout, | |||
| dropout_attn=0.0, | |||
| dropout_relu=0.0): | |||
| super(FFTBlock, self).__init__() | |||
| self.slf_attn = MultiHeadSelfAttention( | |||
| n_head, | |||
| d_in, | |||
| d_model, | |||
| d_head, | |||
| dropout=dropout, | |||
| dropatt=dropout_attn) | |||
| self.pos_ffn = PositionwiseConvFeedForward( | |||
| d_model, | |||
| d_inner, | |||
| kernel_size, | |||
| dropout_inner=dropout_relu, | |||
| dropout=dropout) | |||
| def forward(self, input, mask=None, slf_attn_mask=None): | |||
| output, slf_attn = self.slf_attn(input, mask=slf_attn_mask) | |||
| if mask is not None: | |||
| output = output.masked_fill(mask.unsqueeze(-1), 0) | |||
| output = self.pos_ffn(output, mask=mask) | |||
| if mask is not None: | |||
| output = output.masked_fill(mask.unsqueeze(-1), 0) | |||
| return output, slf_attn | |||
| class MultiHeadPNCAAttention(nn.Module): | |||
| """ Multi-Head Attention PNCA module """ | |||
| def __init__(self, n_head, d_model, d_mem, d_head, dropout, dropatt=0.0): | |||
| super().__init__() | |||
| self.n_head = n_head | |||
| self.d_head = d_head | |||
| self.d_model = d_model | |||
| self.d_mem = d_mem | |||
| self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) | |||
| self.w_x_qkv = nn.Linear(d_model, 3 * n_head * d_head) | |||
| self.fc_x = nn.Linear(n_head * d_head, d_model) | |||
| self.w_h_kv = nn.Linear(d_mem, 2 * n_head * d_head) | |||
| self.fc_h = nn.Linear(n_head * d_head, d_model) | |||
| self.attention = ScaledDotProductAttention( | |||
| temperature=np.power(d_head, 0.5), dropatt=dropatt) | |||
| self.dropout = nn.Dropout(dropout) | |||
| def update_x_state(self, x): | |||
| d_head, n_head = self.d_head, self.n_head | |||
| sz_b, len_x, _ = x.size() | |||
| x_qkv = self.w_x_qkv(x) | |||
| x_q, x_k, x_v = x_qkv.chunk(3, -1) | |||
| x_q = x_q.view(sz_b, len_x, n_head, d_head) | |||
| x_k = x_k.view(sz_b, len_x, n_head, d_head) | |||
| x_v = x_v.view(sz_b, len_x, n_head, d_head) | |||
| x_q = x_q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) | |||
| x_k = x_k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) | |||
| x_v = x_v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head) | |||
| if (self.x_state_size): | |||
| self.x_k = torch.cat([self.x_k, x_k], dim=1) | |||
| self.x_v = torch.cat([self.x_v, x_v], dim=1) | |||
| else: | |||
| self.x_k = x_k | |||
| self.x_v = x_v | |||
| self.x_state_size += len_x | |||
| return x_q, x_k, x_v | |||
| def update_h_state(self, h): | |||
| if (self.h_state_size == h.size(1)): | |||
| return None, None | |||
| d_head, n_head = self.d_head, self.n_head | |||
| # H | |||
| sz_b, len_h, _ = h.size() | |||
| h_kv = self.w_h_kv(h) | |||
| h_k, h_v = h_kv.chunk(2, -1) | |||
| h_k = h_k.view(sz_b, len_h, n_head, d_head) | |||
| h_v = h_v.view(sz_b, len_h, n_head, d_head) | |||
| self.h_k = h_k.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head) | |||
| self.h_v = h_v.permute(2, 0, 1, 3).contiguous().view(-1, len_h, d_head) | |||
| self.h_state_size += len_h | |||
| return h_k, h_v | |||
| def reset_state(self): | |||
| self.h_k = None | |||
| self.h_v = None | |||
| self.h_state_size = 0 | |||
| self.x_k = None | |||
| self.x_v = None | |||
| self.x_state_size = 0 | |||
| def forward(self, x, h, mask_x=None, mask_h=None): | |||
| residual = x | |||
| self.update_h_state(h) | |||
| x_q, x_k, x_v = self.update_x_state(self.layer_norm(x)) | |||
| d_head, n_head = self.d_head, self.n_head | |||
| sz_b, len_in, _ = x.size() | |||
| # X | |||
| if mask_x is not None: | |||
| mask_x = mask_x.repeat(n_head, 1, 1) # (n*b) x .. x .. | |||
| output_x, attn_x = self.attention(x_q, self.x_k, self.x_v, mask=mask_x) | |||
| output_x = output_x.view(n_head, sz_b, len_in, d_head) | |||
| output_x = (output_x.permute(1, 2, 0, | |||
| 3).contiguous().view(sz_b, len_in, | |||
| -1)) # b x l x (n*d) | |||
| output_x = self.fc_x(output_x) | |||
| # H | |||
| if mask_h is not None: | |||
| mask_h = mask_h.repeat(n_head, 1, 1) | |||
| output_h, attn_h = self.attention(x_q, self.h_k, self.h_v, mask=mask_h) | |||
| output_h = output_h.view(n_head, sz_b, len_in, d_head) | |||
| output_h = (output_h.permute(1, 2, 0, | |||
| 3).contiguous().view(sz_b, len_in, | |||
| -1)) # b x l x (n*d) | |||
| output_h = self.fc_h(output_h) | |||
| output = output_x + output_h | |||
| output = self.dropout(output) | |||
| output = output + residual | |||
| return output, attn_x, attn_h | |||
| class PNCABlock(nn.Module): | |||
| """PNCA Block""" | |||
| def __init__(self, | |||
| d_model, | |||
| d_mem, | |||
| n_head, | |||
| d_head, | |||
| d_inner, | |||
| kernel_size, | |||
| dropout, | |||
| dropout_attn=0.0, | |||
| dropout_relu=0.0): | |||
| super(PNCABlock, self).__init__() | |||
| self.pnca_attn = MultiHeadPNCAAttention( | |||
| n_head, | |||
| d_model, | |||
| d_mem, | |||
| d_head, | |||
| dropout=dropout, | |||
| dropatt=dropout_attn) | |||
| self.pos_ffn = PositionwiseConvFeedForward( | |||
| d_model, | |||
| d_inner, | |||
| kernel_size, | |||
| dropout_inner=dropout_relu, | |||
| dropout=dropout) | |||
| def forward(self, | |||
| input, | |||
| memory, | |||
| mask=None, | |||
| pnca_x_attn_mask=None, | |||
| pnca_h_attn_mask=None): | |||
| output, pnca_attn_x, pnca_attn_h = self.pnca_attn( | |||
| input, memory, pnca_x_attn_mask, pnca_h_attn_mask) | |||
| if mask is not None: | |||
| output = output.masked_fill(mask.unsqueeze(-1), 0) | |||
| output = self.pos_ffn(output, mask=mask) | |||
| if mask is not None: | |||
| output = output.masked_fill(mask.unsqueeze(-1), 0) | |||
| return output, pnca_attn_x, pnca_attn_h | |||
| def reset_state(self): | |||
| self.pnca_attn.reset_state() | |||
| @@ -0,0 +1,126 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| """ | |||
| FSMN PyTorch version | |||
| """ | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class FeedForwardNet(nn.Module): | |||
| """ A two-feed-forward-layer module """ | |||
| def __init__(self, d_in, d_hid, d_out, kernel_size=[1, 1], dropout=0.1): | |||
| super().__init__() | |||
| # Use Conv1D | |||
| # position-wise | |||
| self.w_1 = nn.Conv1d( | |||
| d_in, | |||
| d_hid, | |||
| kernel_size=kernel_size[0], | |||
| padding=(kernel_size[0] - 1) // 2, | |||
| ) | |||
| # position-wise | |||
| self.w_2 = nn.Conv1d( | |||
| d_hid, | |||
| d_out, | |||
| kernel_size=kernel_size[1], | |||
| padding=(kernel_size[1] - 1) // 2, | |||
| bias=False) | |||
| self.dropout = nn.Dropout(dropout) | |||
| def forward(self, x): | |||
| output = x.transpose(1, 2) | |||
| output = F.relu(self.w_1(output)) | |||
| output = self.dropout(output) | |||
| output = self.w_2(output) | |||
| output = output.transpose(1, 2) | |||
| return output | |||
| class MemoryBlockV2(nn.Module): | |||
| def __init__(self, d, filter_size, shift, dropout=0.0): | |||
| super(MemoryBlockV2, self).__init__() | |||
| left_padding = int(round((filter_size - 1) / 2)) | |||
| right_padding = int((filter_size - 1) / 2) | |||
| if shift > 0: | |||
| left_padding += shift | |||
| right_padding -= shift | |||
| self.lp, self.rp = left_padding, right_padding | |||
| self.conv_dw = nn.Conv1d(d, d, filter_size, 1, 0, groups=d, bias=False) | |||
| self.dropout = nn.Dropout(dropout) | |||
| def forward(self, input, mask=None): | |||
| if mask is not None: | |||
| input = input.masked_fill(mask.unsqueeze(-1), 0) | |||
| x = F.pad( | |||
| input, (0, 0, self.lp, self.rp, 0, 0), mode='constant', value=0.0) | |||
| output = self.conv_dw(x.contiguous().transpose( | |||
| 1, 2)).contiguous().transpose(1, 2) | |||
| output += input | |||
| output = self.dropout(output) | |||
| if mask is not None: | |||
| output = output.masked_fill(mask.unsqueeze(-1), 0) | |||
| return output | |||
| class FsmnEncoderV2(nn.Module): | |||
| def __init__(self, | |||
| filter_size, | |||
| fsmn_num_layers, | |||
| input_dim, | |||
| num_memory_units, | |||
| ffn_inner_dim, | |||
| dropout=0.0, | |||
| shift=0): | |||
| super(FsmnEncoderV2, self).__init__() | |||
| self.filter_size = filter_size | |||
| self.fsmn_num_layers = fsmn_num_layers | |||
| self.num_memory_units = num_memory_units | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.shift = shift | |||
| if not isinstance(shift, list): | |||
| self.shift = [shift for _ in range(self.fsmn_num_layers)] | |||
| self.ffn_lst = nn.ModuleList() | |||
| self.ffn_lst.append( | |||
| FeedForwardNet( | |||
| input_dim, ffn_inner_dim, num_memory_units, dropout=dropout)) | |||
| for i in range(1, fsmn_num_layers): | |||
| self.ffn_lst.append( | |||
| FeedForwardNet( | |||
| num_memory_units, | |||
| ffn_inner_dim, | |||
| num_memory_units, | |||
| dropout=dropout)) | |||
| self.memory_block_lst = nn.ModuleList() | |||
| for i in range(fsmn_num_layers): | |||
| self.memory_block_lst.append( | |||
| MemoryBlockV2(num_memory_units, filter_size, self.shift[i], | |||
| dropout)) | |||
| def forward(self, input, mask=None): | |||
| x = F.dropout(input, self.dropout, self.training) | |||
| for (ffn, memory_block) in zip(self.ffn_lst, self.memory_block_lst): | |||
| context = ffn(x) | |||
| memory = memory_block(context, mask) | |||
| memory = F.dropout(memory, self.dropout, self.training) | |||
| if (memory.size(-1) == x.size(-1)): | |||
| memory += x | |||
| x = memory | |||
| return x | |||
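`MemoryBlockV2` above is a depthwise 1-D convolution over a `filter_size`-frame window; a positive `shift` moves the window to the left (more left padding, less lookahead), and the encoder stacks these blocks with residual connections so the time length is preserved. A minimal shape sketch with assumed sizes:

import torch

encoder = FsmnEncoderV2(
    filter_size=11, fsmn_num_layers=2, input_dim=20,
    num_memory_units=32, ffn_inner_dim=64, dropout=0.1, shift=0)
feats = torch.randn(4, 60, 20)   # (B, T, input_dim)
out = encoder(feats)             # no padding mask in this sketch
print(out.shape)                 # torch.Size([4, 60, 32]): time dimension preserved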
| @@ -0,0 +1,718 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from modelscope.models.audio.tts.models.utils import get_mask_from_lengths | |||
| from .adaptors import (LengthRegulator, VarFsmnRnnNARPredictor, | |||
| VarRnnARPredictor) | |||
| from .base import FFTBlock, PNCABlock, Prenet | |||
| from .fsmn import FsmnEncoderV2 | |||
| from .positions import DurSinusoidalPositionEncoder, SinusoidalPositionEncoder | |||
| class SelfAttentionEncoder(nn.Module): | |||
| def __init__(self, n_layer, d_in, d_model, n_head, d_head, d_inner, | |||
| dropout, dropout_att, dropout_relu, position_encoder): | |||
| super(SelfAttentionEncoder, self).__init__() | |||
| self.d_in = d_in | |||
| self.d_model = d_model | |||
| self.dropout = dropout | |||
| d_in_lst = [d_in] + [d_model] * (n_layer - 1) | |||
| self.fft = nn.ModuleList([ | |||
| FFTBlock(d, d_model, n_head, d_head, d_inner, (3, 1), dropout, | |||
| dropout_att, dropout_relu) for d in d_in_lst | |||
| ]) | |||
| self.ln = nn.LayerNorm(d_model, eps=1e-6) | |||
| self.position_enc = position_encoder | |||
| def forward(self, input, mask=None, return_attns=False): | |||
| input *= self.d_model**0.5 | |||
| if (isinstance(self.position_enc, SinusoidalPositionEncoder)): | |||
| input = self.position_enc(input) | |||
| else: | |||
| raise NotImplementedError('modelscope error: position_enc invalid') | |||
| input = F.dropout(input, p=self.dropout, training=self.training) | |||
| enc_slf_attn_list = [] | |||
| max_len = input.size(1) | |||
| if mask is not None: | |||
| slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) | |||
| else: | |||
| slf_attn_mask = None | |||
| enc_output = input | |||
| for id, layer in enumerate(self.fft): | |||
| enc_output, enc_slf_attn = layer( | |||
| enc_output, mask=mask, slf_attn_mask=slf_attn_mask) | |||
| if return_attns: | |||
| enc_slf_attn_list += [enc_slf_attn] | |||
| enc_output = self.ln(enc_output) | |||
| return enc_output, enc_slf_attn_list | |||
| class HybridAttentionDecoder(nn.Module): | |||
| def __init__(self, d_in, prenet_units, n_layer, d_model, d_mem, n_head, | |||
| d_head, d_inner, dropout, dropout_att, dropout_relu, d_out): | |||
| super(HybridAttentionDecoder, self).__init__() | |||
| self.d_model = d_model | |||
| self.dropout = dropout | |||
| self.prenet = Prenet(d_in, prenet_units, d_model) | |||
| self.dec_in_proj = nn.Linear(d_model + d_mem, d_model) | |||
| self.pnca = nn.ModuleList([ | |||
| PNCABlock(d_model, d_mem, n_head, d_head, d_inner, (1, 1), dropout, | |||
| dropout_att, dropout_relu) for _ in range(n_layer) | |||
| ]) | |||
| self.ln = nn.LayerNorm(d_model, eps=1e-6) | |||
| self.dec_out_proj = nn.Linear(d_model, d_out) | |||
| def reset_state(self): | |||
| for layer in self.pnca: | |||
| layer.reset_state() | |||
| def get_pnca_attn_mask(self, | |||
| device, | |||
| max_len, | |||
| x_band_width, | |||
| h_band_width, | |||
| mask=None): | |||
| if mask is not None: | |||
| pnca_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) | |||
| else: | |||
| pnca_attn_mask = None | |||
| range_ = torch.arange(max_len).to(device) | |||
| x_start = torch.clamp_min(range_ - x_band_width, 0)[None, None, :] | |||
| x_end = (range_ + 1)[None, None, :] | |||
| h_start = range_[None, None, :] | |||
| h_end = torch.clamp_max(range_ + h_band_width + 1, | |||
| max_len + 1)[None, None, :] | |||
| pnca_x_attn_mask = ~((x_start <= range_[None, :, None]) | |||
| & (x_end > range_[None, :, None])).transpose(1, 2) # yapf:disable | |||
| pnca_h_attn_mask = ~((h_start <= range_[None, :, None]) | |||
| & (h_end > range_[None, :, None])).transpose(1, 2) # yapf:disable | |||
| if pnca_attn_mask is not None: | |||
| pnca_x_attn_mask = (pnca_x_attn_mask | pnca_attn_mask) | |||
| pnca_h_attn_mask = (pnca_h_attn_mask | pnca_attn_mask) | |||
| pnca_x_attn_mask = pnca_x_attn_mask.masked_fill( | |||
| pnca_attn_mask.transpose(1, 2), False) | |||
| pnca_h_attn_mask = pnca_h_attn_mask.masked_fill( | |||
| pnca_attn_mask.transpose(1, 2), False) | |||
| return pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask | |||
| # reset_state() must be called before this forward pass | |||
| def forward(self, | |||
| input, | |||
| memory, | |||
| x_band_width, | |||
| h_band_width, | |||
| mask=None, | |||
| return_attns=False): | |||
| input = self.prenet(input) | |||
| input = torch.cat([memory, input], dim=-1) | |||
| input = self.dec_in_proj(input) | |||
| if mask is not None: | |||
| input = input.masked_fill(mask.unsqueeze(-1), 0) | |||
| input *= self.d_model**0.5 | |||
| input = F.dropout(input, p=self.dropout, training=self.training) | |||
| max_len = input.size(1) | |||
| pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask( | |||
| input.device, max_len, x_band_width, h_band_width, mask) | |||
| dec_pnca_attn_x_list = [] | |||
| dec_pnca_attn_h_list = [] | |||
| dec_output = input | |||
| for id, layer in enumerate(self.pnca): | |||
| dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer( | |||
| dec_output, | |||
| memory, | |||
| mask=mask, | |||
| pnca_x_attn_mask=pnca_x_attn_mask, | |||
| pnca_h_attn_mask=pnca_h_attn_mask) | |||
| if return_attns: | |||
| dec_pnca_attn_x_list += [dec_pnca_attn_x] | |||
| dec_pnca_attn_h_list += [dec_pnca_attn_h] | |||
| dec_output = self.ln(dec_output) | |||
| dec_output = self.dec_out_proj(dec_output) | |||
| return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list | |||
| # reset_state() must be called before calling infer with step == 0 | |||
| def infer(self, | |||
| step, | |||
| input, | |||
| memory, | |||
| x_band_width, | |||
| h_band_width, | |||
| mask=None, | |||
| return_attns=False): | |||
| max_len = memory.size(1) | |||
| input = self.prenet(input) | |||
| input = torch.cat([memory[:, step:step + 1, :], input], dim=-1) | |||
| input = self.dec_in_proj(input) | |||
| input *= self.d_model**0.5 | |||
| input = F.dropout(input, p=self.dropout, training=self.training) | |||
| pnca_attn_mask, pnca_x_attn_mask, pnca_h_attn_mask = self.get_pnca_attn_mask( | |||
| input.device, max_len, x_band_width, h_band_width, mask) | |||
| dec_pnca_attn_x_list = [] | |||
| dec_pnca_attn_h_list = [] | |||
| dec_output = input | |||
| for id, layer in enumerate(self.pnca): | |||
| if mask is not None: | |||
| mask_step = mask[:, step:step + 1] | |||
| else: | |||
| mask_step = None | |||
| dec_output, dec_pnca_attn_x, dec_pnca_attn_h = layer( | |||
| dec_output, | |||
| memory, | |||
| mask=mask_step, | |||
| pnca_x_attn_mask=pnca_x_attn_mask[:, | |||
| step:step + 1, :(step + 1)], | |||
| pnca_h_attn_mask=pnca_h_attn_mask[:, step:step + 1, :]) | |||
| if return_attns: | |||
| dec_pnca_attn_x_list += [dec_pnca_attn_x] | |||
| dec_pnca_attn_h_list += [dec_pnca_attn_h] | |||
| dec_output = self.ln(dec_output) | |||
| dec_output = self.dec_out_proj(dec_output) | |||
| return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list | |||
| class TextFftEncoder(nn.Module): | |||
| def __init__(self, config, ling_unit_size): | |||
| super(TextFftEncoder, self).__init__() | |||
| # linguistic unit lookup table | |||
| nb_ling_sy = ling_unit_size['sy'] | |||
| nb_ling_tone = ling_unit_size['tone'] | |||
| nb_ling_syllable_flag = ling_unit_size['syllable_flag'] | |||
| nb_ling_ws = ling_unit_size['word_segment'] | |||
| max_len = config['am']['max_len'] | |||
| d_emb = config['am']['embedding_dim'] | |||
| nb_layers = config['am']['encoder_num_layers'] | |||
| nb_heads = config['am']['encoder_num_heads'] | |||
| d_model = config['am']['encoder_num_units'] | |||
| d_head = d_model // nb_heads | |||
| d_inner = config['am']['encoder_ffn_inner_dim'] | |||
| dropout = config['am']['encoder_dropout'] | |||
| dropout_attn = config['am']['encoder_attention_dropout'] | |||
| dropout_relu = config['am']['encoder_relu_dropout'] | |||
| d_proj = config['am']['encoder_projection_units'] | |||
| self.d_model = d_model | |||
| self.sy_emb = nn.Embedding(nb_ling_sy, d_emb) | |||
| self.tone_emb = nn.Embedding(nb_ling_tone, d_emb) | |||
| self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb) | |||
| self.ws_emb = nn.Embedding(nb_ling_ws, d_emb) | |||
| position_enc = SinusoidalPositionEncoder(max_len, d_emb) | |||
| self.ling_enc = SelfAttentionEncoder(nb_layers, d_emb, d_model, | |||
| nb_heads, d_head, d_inner, | |||
| dropout, dropout_attn, | |||
| dropout_relu, position_enc) | |||
| self.ling_proj = nn.Linear(d_model, d_proj, bias=False) | |||
| def forward(self, inputs_ling, masks=None, return_attns=False): | |||
| # Parse inputs_ling_seq | |||
| inputs_sy = inputs_ling[:, :, 0] | |||
| inputs_tone = inputs_ling[:, :, 1] | |||
| inputs_syllable_flag = inputs_ling[:, :, 2] | |||
| inputs_ws = inputs_ling[:, :, 3] | |||
| # Lookup table | |||
| sy_embedding = self.sy_emb(inputs_sy) | |||
| tone_embedding = self.tone_emb(inputs_tone) | |||
| syllable_flag_embedding = self.syllable_flag_emb(inputs_syllable_flag) | |||
| ws_embedding = self.ws_emb(inputs_ws) | |||
| ling_embedding = sy_embedding + tone_embedding + syllable_flag_embedding + ws_embedding | |||
| enc_output, enc_slf_attn_list = self.ling_enc(ling_embedding, masks, | |||
| return_attns) | |||
| enc_output = self.ling_proj(enc_output) | |||
| return enc_output, enc_slf_attn_list | |||
| class VarianceAdaptor(nn.Module): | |||
| def __init__(self, config): | |||
| super(VarianceAdaptor, self).__init__() | |||
| input_dim = config['am']['encoder_projection_units'] + config['am'][ | |||
| 'emotion_units'] + config['am']['speaker_units'] | |||
| filter_size = config['am']['predictor_filter_size'] | |||
| fsmn_num_layers = config['am']['predictor_fsmn_num_layers'] | |||
| num_memory_units = config['am']['predictor_num_memory_units'] | |||
| ffn_inner_dim = config['am']['predictor_ffn_inner_dim'] | |||
| dropout = config['am']['predictor_dropout'] | |||
| shift = config['am']['predictor_shift'] | |||
| lstm_units = config['am']['predictor_lstm_units'] | |||
| dur_pred_prenet_units = config['am']['dur_pred_prenet_units'] | |||
| dur_pred_lstm_units = config['am']['dur_pred_lstm_units'] | |||
| self.pitch_predictor = VarFsmnRnnNARPredictor(input_dim, filter_size, | |||
| fsmn_num_layers, | |||
| num_memory_units, | |||
| ffn_inner_dim, dropout, | |||
| shift, lstm_units) | |||
| self.energy_predictor = VarFsmnRnnNARPredictor(input_dim, filter_size, | |||
| fsmn_num_layers, | |||
| num_memory_units, | |||
| ffn_inner_dim, dropout, | |||
| shift, lstm_units) | |||
| self.duration_predictor = VarRnnARPredictor(input_dim, | |||
| dur_pred_prenet_units, | |||
| dur_pred_lstm_units) | |||
| self.length_regulator = LengthRegulator( | |||
| config['am']['outputs_per_step']) | |||
| self.dur_position_encoder = DurSinusoidalPositionEncoder( | |||
| config['am']['encoder_projection_units'], | |||
| config['am']['outputs_per_step']) | |||
| self.pitch_emb = nn.Conv1d( | |||
| 1, | |||
| config['am']['encoder_projection_units'], | |||
| kernel_size=9, | |||
| padding=4) | |||
| self.energy_emb = nn.Conv1d( | |||
| 1, | |||
| config['am']['encoder_projection_units'], | |||
| kernel_size=9, | |||
| padding=4) | |||
| def forward(self, | |||
| inputs_text_embedding, | |||
| inputs_emo_embedding, | |||
| inputs_spk_embedding, | |||
| masks=None, | |||
| output_masks=None, | |||
| duration_targets=None, | |||
| pitch_targets=None, | |||
| energy_targets=None): | |||
| batch_size = inputs_text_embedding.size(0) | |||
| variance_predictor_inputs = torch.cat([ | |||
| inputs_text_embedding, inputs_spk_embedding, inputs_emo_embedding | |||
| ], dim=-1) # yapf:disable | |||
| pitch_predictions = self.pitch_predictor(variance_predictor_inputs, | |||
| masks) | |||
| energy_predictions = self.energy_predictor(variance_predictor_inputs, | |||
| masks) | |||
| if pitch_targets is not None: | |||
| pitch_embeddings = self.pitch_emb( | |||
| pitch_targets.unsqueeze(1)).transpose(1, 2) | |||
| else: | |||
| pitch_embeddings = self.pitch_emb( | |||
| pitch_predictions.unsqueeze(1)).transpose(1, 2) | |||
| if energy_targets is not None: | |||
| energy_embeddings = self.energy_emb( | |||
| energy_targets.unsqueeze(1)).transpose(1, 2) | |||
| else: | |||
| energy_embeddings = self.energy_emb( | |||
| energy_predictions.unsqueeze(1)).transpose(1, 2) | |||
| inputs_text_embedding_aug = inputs_text_embedding + pitch_embeddings + energy_embeddings | |||
| duration_predictor_cond = torch.cat([ | |||
| inputs_text_embedding_aug, inputs_spk_embedding, | |||
| inputs_emo_embedding | |||
| ], dim=-1) # yapf:disable | |||
| if duration_targets is not None: | |||
| duration_predictor_go_frame = torch.zeros(batch_size, 1).to( | |||
| inputs_text_embedding.device) | |||
| duration_predictor_input = torch.cat([ | |||
| duration_predictor_go_frame, duration_targets[:, :-1].float() | |||
| ], dim=-1) # yapf:disable | |||
| duration_predictor_input = torch.log(duration_predictor_input + 1) | |||
| log_duration_predictions, _ = self.duration_predictor( | |||
| duration_predictor_input.unsqueeze(-1), | |||
| duration_predictor_cond, | |||
| masks=masks) | |||
| duration_predictions = torch.exp(log_duration_predictions) - 1 | |||
| else: | |||
| log_duration_predictions = self.duration_predictor.infer( | |||
| duration_predictor_cond, masks=masks) | |||
| duration_predictions = torch.exp(log_duration_predictions) - 1 | |||
| if duration_targets is not None: | |||
| LR_text_outputs, LR_length_rounded = self.length_regulator( | |||
| inputs_text_embedding_aug, | |||
| duration_targets, | |||
| masks=output_masks) | |||
| LR_position_embeddings = self.dur_position_encoder( | |||
| duration_targets, masks=output_masks) | |||
| LR_emo_outputs, _ = self.length_regulator( | |||
| inputs_emo_embedding, duration_targets, masks=output_masks) | |||
| LR_spk_outputs, _ = self.length_regulator( | |||
| inputs_spk_embedding, duration_targets, masks=output_masks) | |||
| else: | |||
| LR_text_outputs, LR_length_rounded = self.length_regulator( | |||
| inputs_text_embedding_aug, | |||
| duration_predictions, | |||
| masks=output_masks) | |||
| LR_position_embeddings = self.dur_position_encoder( | |||
| duration_predictions, masks=output_masks) | |||
| LR_emo_outputs, _ = self.length_regulator( | |||
| inputs_emo_embedding, duration_predictions, masks=output_masks) | |||
| LR_spk_outputs, _ = self.length_regulator( | |||
| inputs_spk_embedding, duration_predictions, masks=output_masks) | |||
| LR_text_outputs = LR_text_outputs + LR_position_embeddings | |||
| return (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, | |||
| LR_length_rounded, log_duration_predictions, pitch_predictions, | |||
| energy_predictions) | |||
| class MelPNCADecoder(nn.Module): | |||
| def __init__(self, config): | |||
| super(MelPNCADecoder, self).__init__() | |||
| prenet_units = config['am']['decoder_prenet_units'] | |||
| nb_layers = config['am']['decoder_num_layers'] | |||
| nb_heads = config['am']['decoder_num_heads'] | |||
| d_model = config['am']['decoder_num_units'] | |||
| d_head = d_model // nb_heads | |||
| d_inner = config['am']['decoder_ffn_inner_dim'] | |||
| dropout = config['am']['decoder_dropout'] | |||
| dropout_attn = config['am']['decoder_attention_dropout'] | |||
| dropout_relu = config['am']['decoder_relu_dropout'] | |||
| outputs_per_step = config['am']['outputs_per_step'] | |||
| d_mem = config['am'][ | |||
| 'encoder_projection_units'] * outputs_per_step + config['am'][ | |||
| 'emotion_units'] + config['am']['speaker_units'] | |||
| d_mel = config['am']['num_mels'] | |||
| self.d_mel = d_mel | |||
| self.r = outputs_per_step | |||
| self.nb_layers = nb_layers | |||
| self.mel_dec = HybridAttentionDecoder(d_mel, prenet_units, nb_layers, | |||
| d_model, d_mem, nb_heads, d_head, | |||
| d_inner, dropout, dropout_attn, | |||
| dropout_relu, | |||
| d_mel * outputs_per_step) | |||
| def forward(self, | |||
| memory, | |||
| x_band_width, | |||
| h_band_width, | |||
| target=None, | |||
| mask=None, | |||
| return_attns=False): | |||
| batch_size = memory.size(0) | |||
| go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device) | |||
| if target is not None: | |||
| self.mel_dec.reset_state() | |||
| input = target[:, self.r - 1::self.r, :] | |||
| input = torch.cat([go_frame, input], dim=1)[:, :-1, :] | |||
| dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list = self.mel_dec( | |||
| input, | |||
| memory, | |||
| x_band_width, | |||
| h_band_width, | |||
| mask=mask, | |||
| return_attns=return_attns) | |||
| else: | |||
| dec_output = [] | |||
| dec_pnca_attn_x_list = [[] for _ in range(self.nb_layers)] | |||
| dec_pnca_attn_h_list = [[] for _ in range(self.nb_layers)] | |||
| self.mel_dec.reset_state() | |||
| input = go_frame | |||
| for step in range(memory.size(1)): | |||
| dec_output_step, dec_pnca_attn_x_step, dec_pnca_attn_h_step = self.mel_dec.infer( | |||
| step, | |||
| input, | |||
| memory, | |||
| x_band_width, | |||
| h_band_width, | |||
| mask=mask, | |||
| return_attns=return_attns) | |||
| input = dec_output_step[:, :, -self.d_mel:] | |||
| dec_output.append(dec_output_step) | |||
| for layer_id, (pnca_x_attn, pnca_h_attn) in enumerate( | |||
| zip(dec_pnca_attn_x_step, dec_pnca_attn_h_step)): | |||
| left = memory.size(1) - pnca_x_attn.size(-1) | |||
| if (left > 0): | |||
| padding = torch.zeros( | |||
| (pnca_x_attn.size(0), 1, left)).to(pnca_x_attn) | |||
| pnca_x_attn = torch.cat([pnca_x_attn, padding], dim=-1) | |||
| dec_pnca_attn_x_list[layer_id].append(pnca_x_attn) | |||
| dec_pnca_attn_h_list[layer_id].append(pnca_h_attn) | |||
| dec_output = torch.cat(dec_output, dim=1) | |||
| for layer_id in range(self.nb_layers): | |||
| dec_pnca_attn_x_list[layer_id] = torch.cat( | |||
| dec_pnca_attn_x_list[layer_id], dim=1) | |||
| dec_pnca_attn_h_list[layer_id] = torch.cat( | |||
| dec_pnca_attn_h_list[layer_id], dim=1) | |||
| return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list | |||
| class PostNet(nn.Module): | |||
| def __init__(self, config): | |||
| super(PostNet, self).__init__() | |||
| self.filter_size = config['am']['postnet_filter_size'] | |||
| self.fsmn_num_layers = config['am']['postnet_fsmn_num_layers'] | |||
| self.num_memory_units = config['am']['postnet_num_memory_units'] | |||
| self.ffn_inner_dim = config['am']['postnet_ffn_inner_dim'] | |||
| self.dropout = config['am']['postnet_dropout'] | |||
| self.shift = config['am']['postnet_shift'] | |||
| self.lstm_units = config['am']['postnet_lstm_units'] | |||
| self.num_mels = config['am']['num_mels'] | |||
| self.fsmn = FsmnEncoderV2(self.filter_size, self.fsmn_num_layers, | |||
| self.num_mels, self.num_memory_units, | |||
| self.ffn_inner_dim, self.dropout, self.shift) | |||
| self.lstm = nn.LSTM( | |||
| self.num_memory_units, | |||
| self.lstm_units, | |||
| num_layers=1, | |||
| batch_first=True) | |||
| self.fc = nn.Linear(self.lstm_units, self.num_mels) | |||
| def forward(self, x, mask=None): | |||
| postnet_fsmn_output = self.fsmn(x, mask) | |||
| # The input can also be a packed variable length sequence, | |||
| # here we just omit it for simplicity due to the mask and the uni-directional LSTM. | |||
| postnet_lstm_output, _ = self.lstm(postnet_fsmn_output) | |||
| mel_residual_output = self.fc(postnet_lstm_output) | |||
| return mel_residual_output | |||
| def mel_recon_loss_fn(output_lengths, | |||
| mel_targets, | |||
| dec_outputs, | |||
| postnet_outputs=None): | |||
| mae_loss = nn.L1Loss(reduction='none') | |||
| output_masks = get_mask_from_lengths( | |||
| output_lengths, max_len=mel_targets.size(1)) | |||
| output_masks = ~output_masks | |||
| valid_outputs = output_masks.sum() | |||
| mel_loss_ = torch.sum( | |||
| mae_loss(mel_targets, dec_outputs) * output_masks.unsqueeze(-1)) / ( | |||
| valid_outputs * mel_targets.size(-1)) | |||
| if postnet_outputs is not None: | |||
| mel_loss = torch.sum( | |||
| mae_loss(mel_targets, postnet_outputs) | |||
| * output_masks.unsqueeze(-1)) / ( | |||
| valid_outputs * mel_targets.size(-1)) | |||
| else: | |||
| mel_loss = 0.0 | |||
| return mel_loss_, mel_loss | |||
| def prosody_recon_loss_fn(input_lengths, duration_targets, pitch_targets, | |||
| energy_targets, log_duration_predictions, | |||
| pitch_predictions, energy_predictions): | |||
| mae_loss = nn.L1Loss(reduction='none') | |||
| input_masks = get_mask_from_lengths( | |||
| input_lengths, max_len=duration_targets.size(1)) | |||
| input_masks = ~input_masks | |||
| valid_inputs = input_masks.sum() | |||
| dur_loss = torch.sum( | |||
| mae_loss( | |||
| torch.log(duration_targets.float() + 1), log_duration_predictions) | |||
| * input_masks) / valid_inputs | |||
| pitch_loss = torch.sum( | |||
| mae_loss(pitch_targets, pitch_predictions) | |||
| * input_masks) / valid_inputs | |||
| energy_loss = torch.sum( | |||
| mae_loss(energy_targets, energy_predictions) | |||
| * input_masks) / valid_inputs | |||
| return dur_loss, pitch_loss, energy_loss | |||
| class KanTtsSAMBERT(nn.Module): | |||
| def __init__(self, config, ling_unit_size): | |||
| super(KanTtsSAMBERT, self).__init__() | |||
| self.text_encoder = TextFftEncoder(config, ling_unit_size) | |||
| self.spk_tokenizer = nn.Embedding(ling_unit_size['speaker'], | |||
| config['am']['speaker_units']) | |||
| self.emo_tokenizer = nn.Embedding(ling_unit_size['emotion'], | |||
| config['am']['emotion_units']) | |||
| self.variance_adaptor = VarianceAdaptor(config) | |||
| self.mel_decoder = MelPNCADecoder(config) | |||
| self.mel_postnet = PostNet(config) | |||
| def get_lfr_mask_from_lengths(self, lengths, max_len): | |||
| batch_size = lengths.size(0) | |||
| # padding according to the outputs_per_step | |||
| padded_lr_lengths = torch.zeros_like(lengths) | |||
| for i in range(batch_size): | |||
| len_item = int(lengths[i].item()) | |||
| padding = self.mel_decoder.r - len_item % self.mel_decoder.r | |||
| if (padding < self.mel_decoder.r): | |||
| padded_lr_lengths[i] = (len_item | |||
| + padding) // self.mel_decoder.r | |||
| else: | |||
| padded_lr_lengths[i] = len_item // self.mel_decoder.r | |||
| return get_mask_from_lengths( | |||
| padded_lr_lengths, max_len=max_len // self.mel_decoder.r) | |||
| def forward(self, | |||
| inputs_ling, | |||
| inputs_emotion, | |||
| inputs_speaker, | |||
| input_lengths, | |||
| output_lengths=None, | |||
| mel_targets=None, | |||
| duration_targets=None, | |||
| pitch_targets=None, | |||
| energy_targets=None): | |||
| batch_size = inputs_ling.size(0) | |||
| input_masks = get_mask_from_lengths( | |||
| input_lengths, max_len=inputs_ling.size(1)) | |||
| text_hid, enc_sla_attn_lst = self.text_encoder( | |||
| inputs_ling, input_masks, return_attns=True) | |||
| emo_hid = self.emo_tokenizer(inputs_emotion) | |||
| spk_hid = self.spk_tokenizer(inputs_speaker) | |||
| if output_lengths is not None: | |||
| output_masks = get_mask_from_lengths( | |||
| output_lengths, max_len=mel_targets.size(1)) | |||
| else: | |||
| output_masks = None | |||
| (LR_text_outputs, LR_emo_outputs, LR_spk_outputs, LR_length_rounded, | |||
| log_duration_predictions, pitch_predictions, | |||
| energy_predictions) = self.variance_adaptor( | |||
| text_hid, | |||
| emo_hid, | |||
| spk_hid, | |||
| masks=input_masks, | |||
| output_masks=output_masks, | |||
| duration_targets=duration_targets, | |||
| pitch_targets=pitch_targets, | |||
| energy_targets=energy_targets) | |||
| if output_lengths is not None: | |||
| lfr_masks = self.get_lfr_mask_from_lengths( | |||
| output_lengths, max_len=LR_text_outputs.size(1)) | |||
| else: | |||
| output_masks = get_mask_from_lengths( | |||
| LR_length_rounded, max_len=LR_text_outputs.size(1)) | |||
| lfr_masks = None | |||
| # LFR with the factor of outputs_per_step | |||
| LFR_text_inputs = LR_text_outputs.contiguous().view( | |||
| batch_size, -1, self.mel_decoder.r * text_hid.shape[-1]) | |||
| LFR_emo_inputs = LR_emo_outputs.contiguous().view( | |||
| batch_size, -1, | |||
| self.mel_decoder.r * emo_hid.shape[-1])[:, :, :emo_hid.shape[-1]] | |||
| LFR_spk_inputs = LR_spk_outputs.contiguous().view( | |||
| batch_size, -1, | |||
| self.mel_decoder.r * spk_hid.shape[-1])[:, :, :spk_hid.shape[-1]] | |||
| memory = torch.cat([LFR_text_inputs, LFR_spk_inputs, LFR_emo_inputs], | |||
| dim=-1) | |||
| if duration_targets is not None: | |||
| x_band_width = int( | |||
| duration_targets.float().masked_fill(input_masks, 0).max() | |||
| / self.mel_decoder.r + 0.5) | |||
| h_band_width = x_band_width | |||
| else: | |||
| x_band_width = int((torch.exp(log_duration_predictions) - 1).max() | |||
| / self.mel_decoder.r + 0.5) | |||
| h_band_width = x_band_width | |||
| dec_outputs, pnca_x_attn_lst, pnca_h_attn_lst = self.mel_decoder( | |||
| memory, | |||
| x_band_width, | |||
| h_band_width, | |||
| target=mel_targets, | |||
| mask=lfr_masks, | |||
| return_attns=True) | |||
| # De-LFR with the factor of outputs_per_step | |||
| dec_outputs = dec_outputs.contiguous().view(batch_size, -1, | |||
| self.mel_decoder.d_mel) | |||
| if output_masks is not None: | |||
| dec_outputs = dec_outputs.masked_fill( | |||
| output_masks.unsqueeze(-1), 0) | |||
| postnet_outputs = self.mel_postnet(dec_outputs, | |||
| output_masks) + dec_outputs | |||
| if output_masks is not None: | |||
| postnet_outputs = postnet_outputs.masked_fill( | |||
| output_masks.unsqueeze(-1), 0) | |||
| res = { | |||
| 'x_band_width': x_band_width, | |||
| 'h_band_width': h_band_width, | |||
| 'enc_slf_attn_lst': enc_slf_attn_lst, | |||
| 'pnca_x_attn_lst': pnca_x_attn_lst, | |||
| 'pnca_h_attn_lst': pnca_h_attn_lst, | |||
| 'dec_outputs': dec_outputs, | |||
| 'postnet_outputs': postnet_outputs, | |||
| 'LR_length_rounded': LR_length_rounded, | |||
| 'log_duration_predictions': log_duration_predictions, | |||
| 'pitch_predictions': pitch_predictions, | |||
| 'energy_predictions': energy_predictions | |||
| } | |||
| res['LR_text_outputs'] = LR_text_outputs | |||
| res['LR_emo_outputs'] = LR_emo_outputs | |||
| res['LR_spk_outputs'] = LR_spk_outputs | |||
| return res | |||
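A minimal, hypothetical sketch of consuming the dictionary returned by forward() during training; `model`, `batch` and the loss weighting are assumptions for illustration, not the library's training code.

import torch
import torch.nn.functional as F

outputs = model(batch['ling'], batch['emotion'], batch['speaker'],
                batch['input_lengths'], output_lengths=batch['output_lengths'],
                mel_targets=batch['mel'], duration_targets=batch['duration'],
                pitch_targets=batch['pitch'], energy_targets=batch['energy'])
# L1 on both the pre- and post-postnet mels, plus a log-duration regression term.
mel_loss = F.l1_loss(outputs['dec_outputs'], batch['mel']) \
    + F.l1_loss(outputs['postnet_outputs'], batch['mel'])
dur_loss = F.mse_loss(outputs['log_duration_predictions'],
                      torch.log(batch['duration'].float() + 1))
loss = mel_loss + dur_loss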
| @@ -0,0 +1,101 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class SinusoidalPositionEncoder(nn.Module): | |||
| def __init__(self, max_len, depth): | |||
| super(SinusoidalPositionEncoder, self).__init__() | |||
| self.max_len = max_len | |||
| self.depth = depth | |||
| self.position_enc = nn.Parameter( | |||
| self.get_sinusoid_encoding_table(max_len, depth).unsqueeze(0), | |||
| requires_grad=False) | |||
| def forward(self, input): | |||
| bz_in, len_in, _ = input.size() | |||
| if len_in > self.max_len: | |||
| self.max_len = len_in | |||
| self.position_enc.data = self.get_sinusoid_encoding_table( | |||
| self.max_len, self.depth).unsqueeze(0).to(input.device) | |||
| output = input + self.position_enc[:, :len_in, :].expand(bz_in, -1, -1) | |||
| return output | |||
| @staticmethod | |||
| def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): | |||
| """ Sinusoid position encoding table """ | |||
| def cal_angle(position, hid_idx): | |||
| return position / np.power(10000, hid_idx / float(d_hid / 2 - 1)) | |||
| def get_posi_angle_vec(position): | |||
| return [cal_angle(position, hid_j) for hid_j in range(d_hid // 2)] | |||
| scaled_time_table = np.array( | |||
| [get_posi_angle_vec(pos_i + 1) for pos_i in range(n_position)]) | |||
| sinusoid_table = np.zeros((n_position, d_hid)) | |||
| sinusoid_table[:, :d_hid // 2] = np.sin(scaled_time_table) | |||
| sinusoid_table[:, d_hid // 2:] = np.cos(scaled_time_table) | |||
| if padding_idx is not None: | |||
| # zero vector for padding dimension | |||
| sinusoid_table[padding_idx] = 0.0 | |||
| return torch.FloatTensor(sinusoid_table) | |||
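Minimal usage sketch for the module above (shapes are assumptions): the fixed sinusoid table is added to the input, and the table is rebuilt on the fly if a longer sequence arrives.

import torch

pos_enc = SinusoidalPositionEncoder(max_len=100, depth=256)
x = torch.randn(2, 50, 256)   # [batch, time, depth]
y = pos_enc(x)                # same shape; encodings for positions 1..50 are added
assert y.shape == x.shape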
| class DurSinusoidalPositionEncoder(nn.Module): | |||
| def __init__(self, depth, outputs_per_step): | |||
| super(DurSinusoidalPositionEncoder, self).__init__() | |||
| self.depth = depth | |||
| self.outputs_per_step = outputs_per_step | |||
| inv_timescales = [ | |||
| np.power(10000, 2 * (hid_idx // 2) / depth) | |||
| for hid_idx in range(depth) | |||
| ] | |||
| self.inv_timescales = nn.Parameter( | |||
| torch.FloatTensor(inv_timescales), requires_grad=False) | |||
| def forward(self, durations, masks=None): | |||
| reps = (durations + 0.5).long() | |||
| output_lens = reps.sum(dim=1) | |||
| max_len = output_lens.max() | |||
| reps_cumsum = torch.cumsum( | |||
| F.pad(reps.float(), (1, 0, 0, 0), value=0.0), dim=1)[:, None, :] | |||
| range_ = torch.arange(max_len).to(durations.device)[None, :, None] | |||
| mult = ((reps_cumsum[:, :, :-1] <= range_) | |||
| & (reps_cumsum[:, :, 1:] > range_)) # yapf:disable | |||
| mult = mult.float() | |||
| offsets = torch.matmul(mult, | |||
| reps_cumsum[:, | |||
| 0, :-1].unsqueeze(-1)).squeeze(-1) | |||
| dur_pos = range_[:, :, 0] - offsets + 1 | |||
| if masks is not None: | |||
| assert masks.size(1) == dur_pos.size(1) | |||
| dur_pos = dur_pos.masked_fill(masks, 0.0) | |||
| seq_len = dur_pos.size(1) | |||
| padding = self.outputs_per_step - int(seq_len) % self.outputs_per_step | |||
| if (padding < self.outputs_per_step): | |||
| dur_pos = F.pad(dur_pos, (0, padding, 0, 0), value=0.0) | |||
| position_embedding = dur_pos[:, :, None] / self.inv_timescales[None, | |||
| None, :] | |||
| position_embedding[:, :, 0::2] = torch.sin(position_embedding[:, :, | |||
| 0::2]) | |||
| position_embedding[:, :, 1::2] = torch.cos(position_embedding[:, :, | |||
| 1::2]) | |||
| return position_embedding | |||
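Minimal usage sketch with hypothetical durations: each token's duration is expanded to frame-level position indices that restart at 1 inside every token, and the result is turned into sin/cos features.

import torch

dur_pe = DurSinusoidalPositionEncoder(depth=256, outputs_per_step=3)
durations = torch.tensor([[2.0, 3.0, 1.0]])  # one utterance, three tokens
pe = dur_pe(durations)                       # frame positions 1,2, 1,2,3, 1
# 2 + 3 + 1 = 6 frames is already a multiple of outputs_per_step, so pe is [1, 6, 256].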
| @@ -1,174 +0,0 @@ | |||
| """Define position encoder classes.""" | |||
| import abc | |||
| import math | |||
| import tensorflow as tf | |||
| from .reducer import SumReducer | |||
| class PositionEncoder(tf.keras.layers.Layer): | |||
| """Base class for position encoders.""" | |||
| def __init__(self, reducer=None, **kwargs): | |||
| """Initializes the position encoder. | |||
| Args: | |||
| reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position | |||
| encodings. Defaults to :class:`opennmt.layers.SumReducer`. | |||
| **kwargs: Additional layer keyword arguments. | |||
| """ | |||
| super(PositionEncoder, self).__init__(**kwargs) | |||
| if reducer is None: | |||
| reducer = SumReducer(dtype=kwargs.get('dtype')) | |||
| self.reducer = reducer | |||
| def call(self, inputs, position=None): # pylint: disable=arguments-differ | |||
| """Add position encodings to :obj:`inputs`. | |||
| Args: | |||
| inputs: The inputs to encode. | |||
| position: The single position to encode, to use when this layer is called | |||
| step by step. | |||
| Returns: | |||
| A ``tf.Tensor`` whose shape depends on the configured ``reducer``. | |||
| """ | |||
| batch_size = tf.shape(inputs)[0] | |||
| timesteps = tf.shape(inputs)[1] | |||
| input_dim = inputs.shape[-1].value | |||
| positions = tf.range(timesteps) + 1 if position is None else [position] | |||
| position_encoding = self._encode([positions], input_dim) | |||
| position_encoding = tf.tile(position_encoding, [batch_size, 1, 1]) | |||
| return self.reducer([inputs, position_encoding]) | |||
| @abc.abstractmethod | |||
| def _encode(self, positions, depth): | |||
| """Creates position encodings. | |||
| Args: | |||
| positions: The positions to encode of shape :math:`[B, ...]`. | |||
| depth: The encoding depth :math:`D`. | |||
| Returns: | |||
| A ``tf.Tensor`` of shape :math:`[B, ..., D]`. | |||
| """ | |||
| raise NotImplementedError() | |||
| class PositionEmbedder(PositionEncoder): | |||
| """Encodes position with a lookup table.""" | |||
| def __init__(self, maximum_position=128, reducer=None, **kwargs): | |||
| """Initializes the position encoder. | |||
| Args: | |||
| maximum_position: The maximum position to embed. Positions greater | |||
| than this value will be set to :obj:`maximum_position`. | |||
| reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position | |||
| encodings. Defaults to :class:`opennmt.layers.SumReducer`. | |||
| **kwargs: Additional layer keyword arguments. | |||
| """ | |||
| super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs) | |||
| self.maximum_position = maximum_position | |||
| self.embedding = None | |||
| def build(self, input_shape): | |||
| shape = [self.maximum_position + 1, input_shape[-1]] | |||
| self.embedding = self.add_weight('position_embedding', shape) | |||
| super(PositionEmbedder, self).build(input_shape) | |||
| def _encode(self, positions, depth): | |||
| positions = tf.minimum(positions, self.maximum_position) | |||
| return tf.nn.embedding_lookup(self.embedding, positions) | |||
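Minimal usage sketch for PositionEmbedder (TF1 graph mode, shapes are assumptions): positions 1..T are clipped to maximum_position, looked up in the learned table, and merged into the inputs by the default SumReducer.

embedder = PositionEmbedder(maximum_position=128)
inputs = tf.zeros([2, 10, 64])
outputs = embedder(inputs)  # [2, 10, 64]: inputs plus learned embeddings for positions 1..10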
| class SinusoidalPositionEncoder(PositionEncoder): | |||
| """Encodes positions with sine waves as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def _encode(self, positions, depth): | |||
| if depth % 2 != 0: | |||
| raise ValueError( | |||
| 'SinusoidalPositionEncoder expects the depth to be divisible ' | |||
| 'by 2 but got %d' % depth) | |||
| batch_size = tf.shape(positions)[0] | |||
| positions = tf.cast(positions, tf.float32) | |||
| log_timescale_increment = math.log(10000) / (depth / 2 - 1) | |||
| inv_timescales = tf.exp( | |||
| tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment) | |||
| inv_timescales = tf.reshape( | |||
| tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2]) | |||
| scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims( | |||
| inv_timescales, 1) | |||
| encoding = tf.concat( | |||
| [tf.sin(scaled_time), tf.cos(scaled_time)], axis=2) | |||
| return tf.cast(encoding, self.dtype) | |||
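In symbols, the encoder above concatenates a sine half and a cosine half along the depth axis (a restatement of the code, not a new definition): for depth $D$ and position $p$,

$$ \mathrm{PE}(p, i) = \sin\bigl(p \cdot 10000^{-i/(D/2-1)}\bigr), \qquad \mathrm{PE}(p, i + D/2) = \cos\bigl(p \cdot 10000^{-i/(D/2-1)}\bigr), \qquad 0 \le i < D/2. $$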
| class SinusodalPositionalEncoding(tf.keras.layers.Layer): | |||
| def __init__(self, name='SinusodalPositionalEncoding'): | |||
| super(SinusodalPositionalEncoding, self).__init__(name=name) | |||
| @staticmethod | |||
| def positional_encoding(len, dim, step=1.): | |||
| """ | |||
| :param len: int scalar | |||
| :param dim: int scalar | |||
| :param step: | |||
| :return: position embedding | |||
| """ | |||
| pos_mat = tf.tile( | |||
| tf.expand_dims( | |||
| tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32) | |||
| * step, | |||
| axis=-1), [1, dim]) | |||
| dim_mat = tf.tile( | |||
| tf.expand_dims( | |||
| tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), | |||
| axis=0), [len, 1]) | |||
| dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) | |||
| pos_encoding = tf.where( # [time, dims] | |||
| tf.math.equal(tf.math.mod(dim_mat_int, 2), 0), | |||
| x=tf.math.sin( | |||
| pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), | |||
| y=tf.math.cos(pos_mat | |||
| / tf.pow(10000., | |||
| (dim_mat - 1) / tf.cast(dim, tf.float32)))) | |||
| return pos_encoding | |||
| class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer): | |||
| def __init__(self, name='BatchSinusodalPositionalEncoding'): | |||
| super(BatchSinusodalPositionalEncoding, self).__init__(name=name) | |||
| @staticmethod | |||
| def positional_encoding(batch_size, len, dim, pos_mat, step=1.): | |||
| """ | |||
| :param len: int scalar | |||
| :param dim: int scalar | |||
| :param step: | |||
| :param pos_mat: [B, len] per-frame position indices | |||
| :return: position embedding | |||
| """ | |||
| pos_mat = tf.tile( | |||
| tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1), | |||
| [1, 1, dim]) # [B, len, dim] | |||
| dim_mat = tf.tile( | |||
| tf.expand_dims( | |||
| tf.expand_dims( | |||
| tf.range( | |||
| 0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), | |||
| axis=0), | |||
| axis=0), [batch_size, len, 1]) # [B, len, dim] | |||
| dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) | |||
| pos_encoding = tf.where( # [B, time, dims] | |||
| tf.math.equal(tf.mod(dim_mat_int, 2), 0), | |||
| x=tf.math.sin( | |||
| pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), | |||
| y=tf.math.cos(pos_mat | |||
| / tf.pow(10000., | |||
| (dim_mat - 1) / tf.cast(dim, tf.float32)))) | |||
| return pos_encoding | |||
| @@ -1,155 +0,0 @@ | |||
| """Define reducers: objects that merge inputs.""" | |||
| import abc | |||
| import functools | |||
| import tensorflow as tf | |||
| def pad_in_time(x, padding_length): | |||
| """Helper function to pad a tensor in the time dimension and retain the static depth dimension.""" | |||
| return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) | |||
| def align_in_time(x, length): | |||
| """Aligns the time dimension of :obj:`x` with :obj:`length`.""" | |||
| time_dim = tf.shape(x)[1] | |||
| return tf.cond( | |||
| tf.less(time_dim, length), | |||
| true_fn=lambda: pad_in_time(x, length - time_dim), | |||
| false_fn=lambda: x[:, :length]) | |||
| def pad_with_identity(x, | |||
| sequence_length, | |||
| max_sequence_length, | |||
| identity_values=0, | |||
| maxlen=None): | |||
| """Pads a tensor with identity values up to :obj:`max_sequence_length`. | |||
| Args: | |||
| x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``. | |||
| sequence_length: The true sequence length of :obj:`x`. | |||
| max_sequence_length: The sequence length up to which the tensor must contain | |||
| :obj:`identity_values`. | |||
| identity_values: The identity value. | |||
| maxlen: Size of the output time dimension. Default is the maximum value in | |||
| :obj:`max_sequence_length`. | |||
| Returns: | |||
| A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``. | |||
| """ | |||
| if maxlen is None: | |||
| maxlen = tf.reduce_max(max_sequence_length) | |||
| mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype) | |||
| mask = tf.expand_dims(mask, axis=-1) | |||
| mask_combined = tf.sequence_mask( | |||
| max_sequence_length, maxlen=maxlen, dtype=x.dtype) | |||
| mask_combined = tf.expand_dims(mask_combined, axis=-1) | |||
| identity_mask = mask_combined * (1.0 - mask) | |||
| x = pad_in_time(x, maxlen - tf.shape(x)[1]) | |||
| x = x * mask + (identity_mask * identity_values) | |||
| return x | |||
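Minimal usage sketch for pad_with_identity (TF1 graph mode, hypothetical shapes): positions between each true length and the combined length receive the identity value, and anything beyond the combined length stays zero.

x = tf.ones([2, 3, 4])
padded = pad_with_identity(
    x, sequence_length=[3, 2], max_sequence_length=[5, 5],
    identity_values=1.0, maxlen=6)
# padded is [2, 6, 4]; frames 3..4 (resp. 2..4) hold 1.0, frame 5 holds 0.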
| def pad_n_with_identity(inputs, sequence_lengths, identity_values=0): | |||
| """Pads each input tensors with identity values up to | |||
| ``max(sequence_lengths)`` for each batch. | |||
| Args: | |||
| inputs: A list of ``tf.Tensor``. | |||
| sequence_lengths: A list of sequence length. | |||
| identity_values: The identity value. | |||
| Returns: | |||
| A tuple ``(padded, max_sequence_length)`` which are respectively a list of | |||
| ``tf.Tensor`` where each tensor are padded with identity and the combined | |||
| sequence length. | |||
| """ | |||
| max_sequence_length = tf.reduce_max(sequence_lengths, axis=0) | |||
| maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs]) | |||
| padded = [ | |||
| pad_with_identity( | |||
| x, | |||
| length, | |||
| max_sequence_length, | |||
| identity_values=identity_values, | |||
| maxlen=maxlen) for x, length in zip(inputs, sequence_lengths) | |||
| ] | |||
| return padded, max_sequence_length | |||
| class Reducer(tf.keras.layers.Layer): | |||
| """Base class for reducers.""" | |||
| def zip_and_reduce(self, x, y): | |||
| """Zips the :obj:`x` with :obj:`y` structures together and reduces all | |||
| elements. If the structures are nested, they will be flattened first. | |||
| Args: | |||
| x: The first structure. | |||
| y: The second structure. | |||
| Returns: | |||
| The same structure as :obj:`x` and :obj:`y` where each element from | |||
| :obj:`x` is reduced with the correspond element from :obj:`y`. | |||
| Raises: | |||
| ValueError: if the two structures are not the same. | |||
| """ | |||
| tf.nest.assert_same_structure(x, y) | |||
| x_flat = tf.nest.flatten(x) | |||
| y_flat = tf.nest.flatten(y) | |||
| reduced = list(map(self, zip(x_flat, y_flat))) | |||
| return tf.nest.pack_sequence_as(x, reduced) | |||
| def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ | |||
| """Reduces all input elements. | |||
| Args: | |||
| inputs: A list of ``tf.Tensor``. | |||
| sequence_length: The length of each input, if reducing sequences. | |||
| Returns: | |||
| If :obj:`sequence_length` is set, a tuple | |||
| ``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor`` | |||
| only. | |||
| """ | |||
| if sequence_length is None: | |||
| return self.reduce(inputs) | |||
| else: | |||
| return self.reduce_sequence( | |||
| inputs, sequence_lengths=sequence_length) | |||
| @abc.abstractmethod | |||
| def reduce(self, inputs): | |||
| """See :meth:`opennmt.layers.Reducer.__call__`.""" | |||
| raise NotImplementedError() | |||
| @abc.abstractmethod | |||
| def reduce_sequence(self, inputs, sequence_lengths): | |||
| """See :meth:`opennmt.layers.Reducer.__call__`.""" | |||
| raise NotImplementedError() | |||
| class SumReducer(Reducer): | |||
| """A reducer that sums the inputs.""" | |||
| def reduce(self, inputs): | |||
| if len(inputs) == 1: | |||
| return inputs[0] | |||
| if len(inputs) == 2: | |||
| return inputs[0] + inputs[1] | |||
| return tf.add_n(inputs) | |||
| def reduce_sequence(self, inputs, sequence_lengths): | |||
| padded, combined_length = pad_n_with_identity( | |||
| inputs, sequence_lengths, identity_values=0) | |||
| return self.reduce(padded), combined_length | |||
| class MultiplyReducer(Reducer): | |||
| """A reducer that multiplies the inputs.""" | |||
| def reduce(self, inputs): | |||
| return functools.reduce(lambda a, x: a * x, inputs) | |||
| def reduce_sequence(self, inputs, sequence_lengths): | |||
| padded, combined_length = pad_n_with_identity( | |||
| inputs, sequence_lengths, identity_values=1) | |||
| return self.reduce(padded), combined_length | |||
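Minimal usage sketch for SumReducer (hypothetical shapes): with sequence lengths, each input is first padded to the combined length with the identity value 0 and then summed.

reducer = SumReducer()
a = tf.ones([1, 2, 4])
b = tf.ones([1, 3, 4])
summed, combined_length = reducer([a, b], sequence_length=[[2], [3]])
# summed is [1, 3, 4] and combined_length is [3]; the missing frame of `a`
# contributes 0 to the sum.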
| @@ -1,237 +0,0 @@ | |||
| import tensorflow as tf | |||
| from tensorflow.python.ops import rnn_cell_impl | |||
| from .am_models import prenet | |||
| class VarPredictorCell(tf.contrib.rnn.RNNCell): | |||
| """Wrapper wrapper knock knock.""" | |||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
| super(VarPredictorCell, self).__init__() | |||
| self._var_predictor_cell = var_predictor_cell | |||
| self._is_training = is_training | |||
| self._dim = dim | |||
| self._prenet_units = prenet_units | |||
| @property | |||
| def state_size(self): | |||
| return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
| @property | |||
| def output_size(self): | |||
| return self._dim | |||
| def zero_state(self, batch_size, dtype): | |||
| return tuple([ | |||
| rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
| dtype), | |||
| self._var_predictor_cell.zero_state(batch_size, dtype) | |||
| ]) | |||
| def call(self, inputs, state): | |||
| """Run the Tacotron2 super decoder cell.""" | |||
| super_cell_out, decoder_state = state | |||
| # split | |||
| prenet_input = inputs[:, 0:self._dim] | |||
| encoder_output = inputs[:, self._dim:] | |||
| # prenet and concat | |||
| prenet_output = prenet( | |||
| prenet_input, | |||
| self._prenet_units, | |||
| self._is_training, | |||
| scope='var_prenet') | |||
| decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
| # decoder LSTM/GRU | |||
| new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
| decoder_input, decoder_state) | |||
| # projection | |||
| new_super_cell_out = tf.layers.dense( | |||
| new_super_cell_out, units=self._dim) | |||
| new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
| return new_super_cell_out, new_states | |||
| class DurPredictorCell(tf.contrib.rnn.RNNCell): | |||
| """Wrapper wrapper knock knock.""" | |||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
| super(DurPredictorCell, self).__init__() | |||
| self._var_predictor_cell = var_predictor_cell | |||
| self._is_training = is_training | |||
| self._dim = dim | |||
| self._prenet_units = prenet_units | |||
| @property | |||
| def state_size(self): | |||
| return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
| @property | |||
| def output_size(self): | |||
| return self._dim | |||
| def zero_state(self, batch_size, dtype): | |||
| return tuple([ | |||
| rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
| dtype), | |||
| self._var_predictor_cell.zero_state(batch_size, dtype) | |||
| ]) | |||
| def call(self, inputs, state): | |||
| """Run the Tacotron2 super decoder cell.""" | |||
| super_cell_out, decoder_state = state | |||
| # split | |||
| prenet_input = inputs[:, 0:self._dim] | |||
| encoder_output = inputs[:, self._dim:] | |||
| # prenet and concat | |||
| prenet_output = prenet( | |||
| prenet_input, | |||
| self._prenet_units, | |||
| self._is_training, | |||
| scope='dur_prenet') | |||
| decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
| # decoder LSTM/GRU | |||
| new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
| decoder_input, decoder_state) | |||
| # projection | |||
| new_super_cell_out = tf.layers.dense( | |||
| new_super_cell_out, units=self._dim) | |||
| new_super_cell_out = tf.nn.relu(new_super_cell_out) | |||
| # new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1) | |||
| new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
| return new_super_cell_out, new_states | |||
| class DurPredictorCECell(tf.contrib.rnn.RNNCell): | |||
| """Wrapper wrapper knock knock.""" | |||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units, | |||
| max_dur, dur_embedding_dim): | |||
| super(DurPredictorCECell, self).__init__() | |||
| self._var_predictor_cell = var_predictor_cell | |||
| self._is_training = is_training | |||
| self._dim = dim | |||
| self._prenet_units = prenet_units | |||
| self._max_dur = max_dur | |||
| self._dur_embedding_dim = dur_embedding_dim | |||
| @property | |||
| def state_size(self): | |||
| return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
| @property | |||
| def output_size(self): | |||
| return self._max_dur | |||
| def zero_state(self, batch_size, dtype): | |||
| return tuple([ | |||
| rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
| dtype), | |||
| self._var_predictor_cell.zero_state(batch_size, dtype) | |||
| ]) | |||
| def call(self, inputs, state): | |||
| """Run the Tacotron2 super decoder cell.""" | |||
| super_cell_out, decoder_state = state | |||
| # split | |||
| prenet_input = tf.squeeze( | |||
| tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N] | |||
| prenet_input = tf.one_hot( | |||
| prenet_input, self._max_dur, on_value=1.0, off_value=0.0, | |||
| axis=-1) # [N, 120] | |||
| prenet_input = tf.layers.dense( | |||
| prenet_input, units=self._dur_embedding_dim) | |||
| encoder_output = inputs[:, self._dim:] | |||
| # prenet and concat | |||
| prenet_output = prenet( | |||
| prenet_input, | |||
| self._prenet_units, | |||
| self._is_training, | |||
| scope='dur_prenet') | |||
| decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
| # decoder LSTM/GRU | |||
| new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
| decoder_input, decoder_state) | |||
| # projection | |||
| new_super_cell_out = tf.layers.dense( | |||
| new_super_cell_out, units=self._max_dur) # [N, 120] | |||
| new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120] | |||
| new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
| return new_super_cell_out, new_states | |||
| class VarPredictorCell2(tf.contrib.rnn.RNNCell): | |||
| """Wrapper wrapper knock knock.""" | |||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
| super(VarPredictorCell2, self).__init__() | |||
| self._var_predictor_cell = var_predictor_cell | |||
| self._is_training = is_training | |||
| self._dim = dim | |||
| self._prenet_units = prenet_units | |||
| @property | |||
| def state_size(self): | |||
| return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
| @property | |||
| def output_size(self): | |||
| return self._dim | |||
| def zero_state(self, batch_size, dtype): | |||
| return tuple([ | |||
| rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
| dtype), | |||
| self._var_predictor_cell.zero_state(batch_size, dtype) | |||
| ]) | |||
| def call(self, inputs, state): | |||
| '''Run one step of the variance predictor cell.''' | |||
| super_cell_out, decoder_state = state | |||
| # split | |||
| prenet_input = inputs[:, 0:self._dim] | |||
| encoder_output = inputs[:, self._dim:] | |||
| # prenet and concat | |||
| prenet_output = prenet( | |||
| prenet_input, | |||
| self._prenet_units, | |||
| self._is_training, | |||
| scope='var_prenet') | |||
| decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
| # decoder LSTM/GRU | |||
| new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
| decoder_input, decoder_state) | |||
| # projection | |||
| new_super_cell_out = tf.layers.dense( | |||
| new_super_cell_out, units=self._dim) | |||
| # split and relu | |||
| new_super_cell_out = tf.concat([ | |||
| tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:] | |||
| ], axis=-1) # yapf:disable | |||
| new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
| return new_super_cell_out, new_states | |||
| @@ -1,760 +0,0 @@ | |||
| import tensorflow as tf | |||
| from tensorflow.python.ops.ragged.ragged_util import repeat | |||
| from .fsmn_encoder import FsmnEncoderV2 | |||
| from .position import BatchSinusodalPositionalEncoding | |||
| from .self_attention_decoder import SelfAttentionDecoder | |||
| from .self_attention_encoder import SelfAttentionEncoder | |||
| class RobuTrans(): | |||
| def __init__(self, hparams): | |||
| self._hparams = hparams | |||
| def initialize(self, | |||
| inputs, | |||
| inputs_emotion, | |||
| inputs_speaker, | |||
| input_lengths, | |||
| output_lengths=None, | |||
| mel_targets=None, | |||
| durations=None, | |||
| pitch_contours=None, | |||
| uv_masks=None, | |||
| pitch_scales=None, | |||
| duration_scales=None, | |||
| energy_contours=None, | |||
| energy_scales=None): | |||
| """Initializes the model for inference. | |||
| Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. | |||
| Args: | |||
| inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of | |||
| steps in the input time series, and values are character IDs | |||
| input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths | |||
| of each sequence in inputs. | |||
| output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths | |||
| of each sequence in outputs. | |||
| mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number | |||
| of steps in the output time series, M is num_mels, and values are entries in the mel | |||
| spectrogram. Only needed for training. | |||
| """ | |||
| from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell | |||
| from tensorflow.contrib.seq2seq import BasicDecoder | |||
| with tf.variable_scope('inference') as _: | |||
| is_training = mel_targets is not None | |||
| batch_size = tf.shape(inputs)[0] | |||
| hp = self._hparams | |||
| input_mask = None | |||
| if input_lengths is not None and is_training: | |||
| input_mask = tf.sequence_mask( | |||
| input_lengths, tf.shape(inputs)[1], dtype=tf.float32) | |||
| if input_mask is not None: | |||
| inputs = inputs * tf.expand_dims(input_mask, -1) | |||
| # speaker embedding | |||
| embedded_inputs_speaker = tf.layers.dense( | |||
| inputs_speaker, | |||
| 32, | |||
| activation=None, | |||
| use_bias=False, | |||
| kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) | |||
| # emotion embedding | |||
| embedded_inputs_emotion = tf.layers.dense( | |||
| inputs_emotion, | |||
| 32, | |||
| activation=None, | |||
| use_bias=False, | |||
| kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) | |||
| # symbol embedding | |||
| with tf.variable_scope('Embedding'): | |||
| embedded_inputs = tf.layers.dense( | |||
| inputs, | |||
| hp.embedding_dim, | |||
| activation=None, | |||
| use_bias=False, | |||
| kernel_initializer=tf.truncated_normal_initializer( | |||
| stddev=0.5)) | |||
| # Encoder | |||
| with tf.variable_scope('Encoder'): | |||
| Encoder = SelfAttentionEncoder( | |||
| num_layers=hp.encoder_num_layers, | |||
| num_units=hp.encoder_num_units, | |||
| num_heads=hp.encoder_num_heads, | |||
| ffn_inner_dim=hp.encoder_ffn_inner_dim, | |||
| dropout=hp.encoder_dropout, | |||
| attention_dropout=hp.encoder_attention_dropout, | |||
| relu_dropout=hp.encoder_relu_dropout) | |||
| encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode( | |||
| embedded_inputs, | |||
| sequence_length=input_lengths, | |||
| mode=is_training) | |||
| encoder_outputs = tf.layers.dense( | |||
| encoder_outputs, | |||
| hp.encoder_projection_units, | |||
| activation=None, | |||
| use_bias=False, | |||
| kernel_initializer=tf.truncated_normal_initializer( | |||
| stddev=0.5)) | |||
| # pitch and energy | |||
| var_inputs = tf.concat([ | |||
| encoder_outputs, embedded_inputs_speaker, | |||
| embedded_inputs_emotion | |||
| ], 2) | |||
| if input_mask is not None: | |||
| var_inputs = var_inputs * tf.expand_dims(input_mask, -1) | |||
| with tf.variable_scope('Pitch_Predictor'): | |||
| Pitch_Predictor_FSMN = FsmnEncoderV2( | |||
| filter_size=hp.predictor_filter_size, | |||
| fsmn_num_layers=hp.predictor_fsmn_num_layers, | |||
| dnn_num_layers=hp.predictor_dnn_num_layers, | |||
| num_memory_units=hp.predictor_num_memory_units, | |||
| ffn_inner_dim=hp.predictor_ffn_inner_dim, | |||
| dropout=hp.predictor_dropout, | |||
| shift=hp.predictor_shift, | |||
| position_encoder=None) | |||
| pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode( | |||
| tf.concat([ | |||
| encoder_outputs, embedded_inputs_speaker, | |||
| embedded_inputs_emotion | |||
| ], 2), | |||
| sequence_length=input_lengths, | |||
| mode=is_training) | |||
| pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| pitch_contour_outputs, | |||
| sequence_length=input_lengths, | |||
| dtype=tf.float32) | |||
| pitch_contour_outputs = tf.concat( | |||
| pitch_contour_outputs, axis=-1) | |||
| pitch_contour_outputs = tf.layers.dense( | |||
| pitch_contour_outputs, units=1) # [N, T_in, 1] | |||
| pitch_contour_outputs = tf.squeeze( | |||
| pitch_contour_outputs, axis=2) # [N, T_in] | |||
| with tf.variable_scope('Energy_Predictor'): | |||
| Energy_Predictor_FSMN = FsmnEncoderV2( | |||
| filter_size=hp.predictor_filter_size, | |||
| fsmn_num_layers=hp.predictor_fsmn_num_layers, | |||
| dnn_num_layers=hp.predictor_dnn_num_layers, | |||
| num_memory_units=hp.predictor_num_memory_units, | |||
| ffn_inner_dim=hp.predictor_ffn_inner_dim, | |||
| dropout=hp.predictor_dropout, | |||
| shift=hp.predictor_shift, | |||
| position_encoder=None) | |||
| energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode( | |||
| tf.concat([ | |||
| encoder_outputs, embedded_inputs_speaker, | |||
| embedded_inputs_emotion | |||
| ], 2), | |||
| sequence_length=input_lengths, | |||
| mode=is_training) | |||
| energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| energy_contour_outputs, | |||
| sequence_length=input_lengths, | |||
| dtype=tf.float32) | |||
| energy_contour_outputs = tf.concat( | |||
| energy_contour_outputs, axis=-1) | |||
| energy_contour_outputs = tf.layers.dense( | |||
| energy_contour_outputs, units=1) # [N, T_in, 1] | |||
| energy_contour_outputs = tf.squeeze( | |||
| energy_contour_outputs, axis=2) # [N, T_in] | |||
| if is_training: | |||
| pitch_embeddings = tf.expand_dims( | |||
| pitch_contours, axis=2) # [N, T_in, 1] | |||
| pitch_embeddings = tf.layers.conv1d( | |||
| pitch_embeddings, | |||
| filters=hp.encoder_projection_units, | |||
| kernel_size=9, | |||
| padding='same', | |||
| name='pitch_embeddings') # [N, T_in, 32] | |||
| energy_embeddings = tf.expand_dims( | |||
| energy_contours, axis=2) # [N, T_in, 1] | |||
| energy_embeddings = tf.layers.conv1d( | |||
| energy_embeddings, | |||
| filters=hp.encoder_projection_units, | |||
| kernel_size=9, | |||
| padding='same', | |||
| name='energy_embeddings') # [N, T_in, 32] | |||
| else: | |||
| pitch_contour_outputs *= pitch_scales | |||
| pitch_embeddings = tf.expand_dims( | |||
| pitch_contour_outputs, axis=2) # [N, T_in, 1] | |||
| pitch_embeddings = tf.layers.conv1d( | |||
| pitch_embeddings, | |||
| filters=hp.encoder_projection_units, | |||
| kernel_size=9, | |||
| padding='same', | |||
| name='pitch_embeddings') # [N, T_in, 32] | |||
| energy_contour_outputs *= energy_scales | |||
| energy_embeddings = tf.expand_dims( | |||
| energy_contour_outputs, axis=2) # [N, T_in, 1] | |||
| energy_embeddings = tf.layers.conv1d( | |||
| energy_embeddings, | |||
| filters=hp.encoder_projection_units, | |||
| kernel_size=9, | |||
| padding='same', | |||
| name='energy_embeddings') # [N, T_in, 32] | |||
| encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings | |||
| # duration | |||
| dur_inputs = tf.concat([ | |||
| encoder_outputs_, embedded_inputs_speaker, | |||
| embedded_inputs_emotion | |||
| ], 2) | |||
| if input_mask is not None: | |||
| dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1) | |||
| with tf.variable_scope('Duration_Predictor'): | |||
| duration_predictor_cell = MultiRNNCell([ | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| LSTMBlockCell(hp.predictor_lstm_units) | |||
| ], state_is_tuple=True) # yapf:disable | |||
| from .rnn_wrappers import DurPredictorCell | |||
| duration_output_cell = DurPredictorCell( | |||
| duration_predictor_cell, is_training, 1, | |||
| hp.predictor_prenet_units) | |||
| duration_predictor_init_state = duration_output_cell.zero_state( | |||
| batch_size=batch_size, dtype=tf.float32) | |||
| if is_training: | |||
| from .helpers import VarTrainingHelper | |||
| duration_helper = VarTrainingHelper( | |||
| tf.expand_dims( | |||
| tf.log(tf.cast(durations, tf.float32) + 1), | |||
| axis=2), dur_inputs, 1) | |||
| else: | |||
| from .helpers import VarTestHelper | |||
| duration_helper = VarTestHelper(batch_size, dur_inputs, 1) | |||
| ( | |||
| duration_outputs, _ | |||
| ), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode( | |||
| BasicDecoder(duration_output_cell, duration_helper, | |||
| duration_predictor_init_state), | |||
| maximum_iterations=1000) | |||
| duration_outputs = tf.squeeze( | |||
| duration_outputs, axis=2) # [N, T_in] | |||
| if input_mask is not None: | |||
| duration_outputs = duration_outputs * input_mask | |||
| duration_outputs_ = tf.exp(duration_outputs) - 1 | |||
| # Length Regulator | |||
| with tf.variable_scope('Length_Regulator'): | |||
| if is_training: | |||
| i = tf.constant(1) | |||
| # position embedding | |||
| j = tf.constant(1) | |||
| dur_len = tf.shape(durations)[-1] | |||
| embedded_position_i = tf.range(1, durations[0, 0] + 1) | |||
| def condition_pos(j, e): | |||
| return tf.less(j, dur_len) | |||
| def loop_body_pos(j, embedded_position_i): | |||
| embedded_position_i = tf.concat([ | |||
| embedded_position_i, | |||
| tf.range(1, durations[0, j] + 1) | |||
| ], axis=0) # yapf:disable | |||
| return [j + 1, embedded_position_i] | |||
| j, embedded_position_i = tf.while_loop( | |||
| condition_pos, | |||
| loop_body_pos, [j, embedded_position_i], | |||
| shape_invariants=[ | |||
| j.get_shape(), | |||
| tf.TensorShape([None]) | |||
| ]) | |||
| embedded_position = tf.reshape(embedded_position_i, | |||
| (1, -1)) | |||
| # others | |||
| LR_outputs = repeat( | |||
| encoder_outputs_[0:1, :, :], durations[0, :], axis=1) | |||
| embedded_outputs_speaker = repeat( | |||
| embedded_inputs_speaker[0:1, :, :], | |||
| durations[0, :], | |||
| axis=1) | |||
| embedded_outputs_emotion = repeat( | |||
| embedded_inputs_emotion[0:1, :, :], | |||
| durations[0, :], | |||
| axis=1) | |||
| def condition(i, pos, layer, s, e): | |||
| return tf.less(i, tf.shape(mel_targets)[0]) | |||
| def loop_body(i, embedded_position, LR_outputs, | |||
| embedded_outputs_speaker, | |||
| embedded_outputs_emotion): | |||
| # position embedding | |||
| jj = tf.constant(1) | |||
| embedded_position_i = tf.range(1, durations[i, 0] + 1) | |||
| def condition_pos_i(j, e): | |||
| return tf.less(j, dur_len) | |||
| def loop_body_pos_i(j, embedded_position_i): | |||
| embedded_position_i = tf.concat([ | |||
| embedded_position_i, | |||
| tf.range(1, durations[i, j] + 1) | |||
| ], axis=0) # yapf:disable | |||
| return [j + 1, embedded_position_i] | |||
| jj, embedded_position_i = tf.while_loop( | |||
| condition_pos_i, | |||
| loop_body_pos_i, [jj, embedded_position_i], | |||
| shape_invariants=[ | |||
| jj.get_shape(), | |||
| tf.TensorShape([None]) | |||
| ]) | |||
| embedded_position = tf.concat([ | |||
| embedded_position, | |||
| tf.reshape(embedded_position_i, (1, -1)) | |||
| ], 0) | |||
| # others | |||
| LR_outputs = tf.concat([ | |||
| LR_outputs, | |||
| repeat( | |||
| encoder_outputs_[i:i + 1, :, :], | |||
| durations[i, :], | |||
| axis=1) | |||
| ], 0) | |||
| embedded_outputs_speaker = tf.concat([ | |||
| embedded_outputs_speaker, | |||
| repeat( | |||
| embedded_inputs_speaker[i:i + 1, :, :], | |||
| durations[i, :], | |||
| axis=1) | |||
| ], 0) | |||
| embedded_outputs_emotion = tf.concat([ | |||
| embedded_outputs_emotion, | |||
| repeat( | |||
| embedded_inputs_emotion[i:i + 1, :, :], | |||
| durations[i, :], | |||
| axis=1) | |||
| ], 0) | |||
| return [ | |||
| i + 1, embedded_position, LR_outputs, | |||
| embedded_outputs_speaker, embedded_outputs_emotion | |||
| ] | |||
| (i, embedded_position, LR_outputs, | |||
| embedded_outputs_speaker, | |||
| embedded_outputs_emotion) = tf.while_loop( | |||
| condition, | |||
| loop_body, [ | |||
| i, embedded_position, LR_outputs, | |||
| embedded_outputs_speaker, embedded_outputs_emotion | |||
| ], | |||
| shape_invariants=[ | |||
| i.get_shape(), | |||
| tf.TensorShape([None, None]), | |||
| tf.TensorShape([None, None, None]), | |||
| tf.TensorShape([None, None, None]), | |||
| tf.TensorShape([None, None, None]) | |||
| ], | |||
| parallel_iterations=hp.batch_size) | |||
| ori_framenum = tf.shape(mel_targets)[1] | |||
| else: | |||
| # position | |||
| j = tf.constant(1) | |||
| dur_len = tf.shape(duration_outputs_)[-1] | |||
| embedded_position_i = tf.range( | |||
| 1, | |||
| tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32) | |||
| + 1) | |||
| def condition_pos(j, e): | |||
| return tf.less(j, dur_len) | |||
| def loop_body_pos(j, embedded_position_i): | |||
| embedded_position_i = tf.concat([ | |||
| embedded_position_i, | |||
| tf.range( | |||
| 1, | |||
| tf.cast( | |||
| tf.round(duration_outputs_)[0, j], | |||
| tf.int32) + 1) | |||
| ], axis=0) # yapf:disable | |||
| return [j + 1, embedded_position_i] | |||
| j, embedded_position_i = tf.while_loop( | |||
| condition_pos, | |||
| loop_body_pos, [j, embedded_position_i], | |||
| shape_invariants=[ | |||
| j.get_shape(), | |||
| tf.TensorShape([None]) | |||
| ]) | |||
| embedded_position = tf.reshape(embedded_position_i, | |||
| (1, -1)) | |||
| # others | |||
| duration_outputs_ *= duration_scales | |||
| LR_outputs = repeat( | |||
| encoder_outputs_[0:1, :, :], | |||
| tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
| axis=1) | |||
| embedded_outputs_speaker = repeat( | |||
| embedded_inputs_speaker[0:1, :, :], | |||
| tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
| axis=1) | |||
| embedded_outputs_emotion = repeat( | |||
| embedded_inputs_emotion[0:1, :, :], | |||
| tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
| axis=1) | |||
| ori_framenum = tf.shape(LR_outputs)[1] | |||
| left = hp.outputs_per_step - tf.mod( | |||
| ori_framenum, hp.outputs_per_step) | |||
| LR_outputs = tf.cond( | |||
| tf.equal(left, | |||
| hp.outputs_per_step), lambda: LR_outputs, | |||
| lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]], | |||
| 'CONSTANT')) | |||
| embedded_outputs_speaker = tf.cond( | |||
| tf.equal(left, hp.outputs_per_step), | |||
| lambda: embedded_outputs_speaker, lambda: tf.pad( | |||
| embedded_outputs_speaker, [[0, 0], [0, left], | |||
| [0, 0]], 'CONSTANT')) | |||
| embedded_outputs_emotion = tf.cond( | |||
| tf.equal(left, hp.outputs_per_step), | |||
| lambda: embedded_outputs_emotion, lambda: tf.pad( | |||
| embedded_outputs_emotion, [[0, 0], [0, left], | |||
| [0, 0]], 'CONSTANT')) | |||
| embedded_position = tf.cond( | |||
| tf.equal(left, hp.outputs_per_step), | |||
| lambda: embedded_position, | |||
| lambda: tf.pad(embedded_position, [[0, 0], [0, left]], | |||
| 'CONSTANT')) | |||
| # Pos_Embedding | |||
| with tf.variable_scope('Position_Embedding'): | |||
| Pos_Embedding = BatchSinusodalPositionalEncoding() | |||
| position_embeddings = Pos_Embedding.positional_encoding( | |||
| batch_size, | |||
| tf.shape(LR_outputs)[1], hp.encoder_projection_units, | |||
| embedded_position) | |||
| LR_outputs += position_embeddings | |||
| # multi-frame | |||
| LR_outputs = tf.reshape(LR_outputs, [ | |||
| batch_size, -1, | |||
| hp.outputs_per_step * hp.encoder_projection_units | |||
| ]) | |||
| embedded_outputs_speaker = tf.reshape( | |||
| embedded_outputs_speaker, | |||
| [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] | |||
| embedded_outputs_emotion = tf.reshape( | |||
| embedded_outputs_emotion, | |||
| [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] | |||
| # [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64) | |||
| LR_outputs = tf.concat([ | |||
| LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion | |||
| ], -1) | |||
| # auto bandwidth | |||
| if is_training: | |||
| durations_mask = tf.cast(durations, | |||
| tf.float32) * input_mask # [N, T_in] | |||
| else: | |||
| durations_mask = duration_outputs_ | |||
| X_band_width = tf.cast( | |||
| tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step), | |||
| tf.int32) | |||
| H_band_width = X_band_width | |||
| with tf.variable_scope('Decoder'): | |||
| Decoder = SelfAttentionDecoder( | |||
| num_layers=hp.decoder_num_layers, | |||
| num_units=hp.decoder_num_units, | |||
| num_heads=hp.decoder_num_heads, | |||
| ffn_inner_dim=hp.decoder_ffn_inner_dim, | |||
| dropout=hp.decoder_dropout, | |||
| attention_dropout=hp.decoder_attention_dropout, | |||
| relu_dropout=hp.decoder_relu_dropout, | |||
| prenet_units=hp.prenet_units, | |||
| dense_units=hp.prenet_proj_units, | |||
| num_mels=hp.num_mels, | |||
| outputs_per_step=hp.outputs_per_step, | |||
| X_band_width=X_band_width, | |||
| H_band_width=H_band_width, | |||
| position_encoder=None) | |||
| if is_training: | |||
| if hp.free_run: | |||
| r = hp.outputs_per_step | |||
| init_decoder_input = tf.expand_dims( | |||
| tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
| axis=1) # [N, 1, hp.num_mels] | |||
| decoder_input_lengths = tf.cast( | |||
| output_lengths / r, tf.int32) | |||
| decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( | |||
| init_decoder_input, | |||
| maximum_iterations=tf.shape(LR_outputs)[1], | |||
| mode=is_training, | |||
| memory=LR_outputs, | |||
| memory_sequence_length=decoder_input_lengths) | |||
| else: | |||
| r = hp.outputs_per_step | |||
| decoder_input = mel_targets[:, r - 1:: | |||
| r, :] # [N, T_out / r, hp.num_mels] | |||
| init_decoder_input = tf.expand_dims( | |||
| tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
| axis=1) # [N, 1, hp.num_mels] | |||
| decoder_input = tf.concat( | |||
| [init_decoder_input, decoder_input], | |||
| axis=1) # [N, T_out / r + 1, hp.num_mels] | |||
| decoder_input = decoder_input[:, : | |||
| -1, :] # [N, T_out / r, hp.num_mels] | |||
| decoder_input_lengths = tf.cast( | |||
| output_lengths / r, tf.int32) | |||
| decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs( | |||
| decoder_input, | |||
| decoder_input_lengths, | |||
| mode=is_training, | |||
| memory=LR_outputs, | |||
| memory_sequence_length=decoder_input_lengths) | |||
| else: | |||
| init_decoder_input = tf.expand_dims( | |||
| tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
| axis=1) # [N, 1, hp.num_mels] | |||
| decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( | |||
| init_decoder_input, | |||
| maximum_iterations=tf.shape(LR_outputs)[1], | |||
| mode=is_training, | |||
| memory=LR_outputs, | |||
| memory_sequence_length=tf.expand_dims( | |||
| tf.shape(LR_outputs)[1], axis=0)) | |||
| if is_training: | |||
| mel_outputs_ = tf.reshape(decoder_outputs, | |||
| [batch_size, -1, hp.num_mels]) | |||
| else: | |||
| mel_outputs_ = tf.reshape( | |||
| decoder_outputs, | |||
| [batch_size, -1, hp.num_mels])[:, :ori_framenum, :] | |||
| mel_outputs = mel_outputs_ | |||
| with tf.variable_scope('Postnet'): | |||
| Postnet_FSMN = FsmnEncoderV2( | |||
| filter_size=hp.postnet_filter_size, | |||
| fsmn_num_layers=hp.postnet_fsmn_num_layers, | |||
| dnn_num_layers=hp.postnet_dnn_num_layers, | |||
| num_memory_units=hp.postnet_num_memory_units, | |||
| ffn_inner_dim=hp.postnet_ffn_inner_dim, | |||
| dropout=hp.postnet_dropout, | |||
| shift=hp.postnet_shift, | |||
| position_encoder=None) | |||
| if is_training: | |||
| postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( | |||
| mel_outputs, | |||
| sequence_length=output_lengths, | |||
| mode=is_training) | |||
| hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( | |||
| LSTMBlockCell(hp.postnet_lstm_units), | |||
| postnet_fsmn_outputs, | |||
| sequence_length=output_lengths, | |||
| dtype=tf.float32) | |||
| else: | |||
| postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( | |||
| mel_outputs, | |||
| sequence_length=[tf.shape(mel_outputs_)[1]], | |||
| mode=is_training) | |||
| hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( | |||
| LSTMBlockCell(hp.postnet_lstm_units), | |||
| postnet_fsmn_outputs, | |||
| sequence_length=[tf.shape(mel_outputs_)[1]], | |||
| dtype=tf.float32) | |||
| mel_residual_outputs = tf.layers.dense( | |||
| hidden_lstm_outputs, units=hp.num_mels) | |||
| mel_outputs += mel_residual_outputs | |||
| self.inputs = inputs | |||
| self.inputs_speaker = inputs_speaker | |||
| self.inputs_emotion = inputs_emotion | |||
| self.input_lengths = input_lengths | |||
| self.durations = durations | |||
| self.output_lengths = output_lengths | |||
| self.mel_outputs_ = mel_outputs_ | |||
| self.mel_outputs = mel_outputs | |||
| self.mel_targets = mel_targets | |||
| self.duration_outputs = duration_outputs | |||
| self.duration_outputs_ = duration_outputs_ | |||
| self.duration_scales = duration_scales | |||
| self.pitch_contour_outputs = pitch_contour_outputs | |||
| self.pitch_contours = pitch_contours | |||
| self.pitch_scales = pitch_scales | |||
| self.energy_contour_outputs = energy_contour_outputs | |||
| self.energy_contours = energy_contours | |||
| self.energy_scales = energy_scales | |||
| self.uv_masks_ = uv_masks | |||
| self.embedded_inputs_emotion = embedded_inputs_emotion | |||
| self.embedding_fsmn_outputs = embedded_inputs | |||
| self.encoder_outputs = encoder_outputs | |||
| self.encoder_outputs_ = encoder_outputs_ | |||
| self.LR_outputs = LR_outputs | |||
| self.postnet_fsmn_outputs = postnet_fsmn_outputs | |||
| self.pitch_embeddings = pitch_embeddings | |||
| self.energy_embeddings = energy_embeddings | |||
| self.attns = attns | |||
| self.attention_x = attention_x | |||
| self.attention_h = attention_h | |||
| self.X_band_width = X_band_width | |||
| self.H_band_width = H_band_width | |||
| def add_loss(self): | |||
| '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' | |||
| with tf.variable_scope('loss') as _: | |||
| hp = self._hparams | |||
| mask = tf.sequence_mask( | |||
| self.output_lengths, | |||
| tf.shape(self.mel_targets)[1], | |||
| dtype=tf.float32) | |||
| valid_outputs = tf.reduce_sum(mask) | |||
| mask_input = tf.sequence_mask( | |||
| self.input_lengths, | |||
| tf.shape(self.durations)[1], | |||
| dtype=tf.float32) | |||
| valid_inputs = tf.reduce_sum(mask_input) | |||
| # mel loss | |||
| if self.uv_masks_ is not None: | |||
| valid_outputs_mask = tf.reduce_sum( | |||
| tf.expand_dims(mask, -1) * self.uv_masks_) | |||
| self.mel_loss_ = tf.reduce_sum( | |||
| tf.abs(self.mel_targets - self.mel_outputs_) | |||
| * tf.expand_dims(mask, -1) * self.uv_masks_) / ( | |||
| valid_outputs_mask * hp.num_mels) | |||
| self.mel_loss = tf.reduce_sum( | |||
| tf.abs(self.mel_targets - self.mel_outputs) | |||
| * tf.expand_dims(mask, -1) * self.uv_masks_) / ( | |||
| valid_outputs_mask * hp.num_mels) | |||
| else: | |||
| self.mel_loss_ = tf.reduce_sum( | |||
| tf.abs(self.mel_targets - self.mel_outputs_) | |||
| * tf.expand_dims(mask, -1)) / ( | |||
| valid_outputs * hp.num_mels) | |||
| self.mel_loss = tf.reduce_sum( | |||
| tf.abs(self.mel_targets - self.mel_outputs) | |||
| * tf.expand_dims(mask, -1)) / ( | |||
| valid_outputs * hp.num_mels) | |||
| # duration loss | |||
| self.duration_loss = tf.reduce_sum( | |||
| tf.abs( | |||
| tf.log(tf.cast(self.durations, tf.float32) + 1) | |||
| - self.duration_outputs) * mask_input) / valid_inputs | |||
| # pitch contour loss | |||
| self.pitch_contour_loss = tf.reduce_sum( | |||
| tf.abs(self.pitch_contours - self.pitch_contour_outputs) | |||
| * mask_input) / valid_inputs | |||
| # energy contour loss | |||
| self.energy_contour_loss = tf.reduce_sum( | |||
| tf.abs(self.energy_contours - self.energy_contour_outputs) | |||
| * mask_input) / valid_inputs | |||
| # final loss | |||
| self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \ | |||
| + self.pitch_contour_loss + self.energy_contour_loss | |||
| # guided attention loss | |||
| self.guided_attention_loss = tf.constant(0.0) | |||
| if hp.guided_attention: | |||
| i0 = tf.constant(0) | |||
| loss0 = tf.constant(0.0) | |||
| def c(i, _): | |||
| return tf.less(i, tf.shape(self.mel_targets)[0]) | |||
| def loop_body(i, loss): | |||
| decoder_input_lengths = tf.cast( | |||
| self.output_lengths / hp.outputs_per_step, tf.int32) | |||
| input_len = decoder_input_lengths[i] | |||
| output_len = decoder_input_lengths[i] | |||
| input_w = tf.expand_dims( | |||
| tf.range(tf.cast(input_len, dtype=tf.float32)), | |||
| axis=1) / tf.cast( | |||
| input_len, dtype=tf.float32) # [T_in, 1] | |||
| output_w = tf.expand_dims( | |||
| tf.range(tf.cast(output_len, dtype=tf.float32)), | |||
| axis=0) / tf.cast( | |||
| output_len, dtype=tf.float32) # [1, T_out] | |||
| guided_attention_w = 1.0 - tf.exp( | |||
| -(1 / hp.guided_attention_2g_squared) | |||
| * tf.square(input_w - output_w)) # [T_in, T_out] | |||
| guided_attention_w = tf.expand_dims( | |||
| guided_attention_w, axis=0) # [1, T_in, T_out] | |||
| # [hp.decoder_num_heads, T_in, T_out] | |||
| guided_attention_w = tf.tile(guided_attention_w, | |||
| [hp.decoder_num_heads, 1, 1]) | |||
| loss_i = tf.constant(0.0) | |||
| for j in range(hp.decoder_num_layers): | |||
| loss_i += tf.reduce_mean( | |||
| self.attention_h[j][i, :, :input_len, :output_len] | |||
| * guided_attention_w) | |||
| return [tf.add(i, 1), tf.add(loss, loss_i)] | |||
| _, loss = tf.while_loop( | |||
| c, | |||
| loop_body, | |||
| loop_vars=[i0, loss0], | |||
| parallel_iterations=hp.batch_size) | |||
| self.guided_attention_loss = loss / hp.batch_size | |||
| self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss | |||
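For reference, the guided-attention weight assembled in the loop above can be written as follows (a restatement of the code, where N is the per-utterance decoder length output_lengths[i] / outputs_per_step, used for both axes):

$$ W_{n,t} = 1 - \exp\!\left(-\frac{(n/N - t/N)^{2}}{\texttt{guided\_attention\_2g\_squared}}\right), \qquad 0 \le n, t < N, $$

so attention mass far from the diagonal is penalized. The penalty is the mean of $W \odot A$ over heads and positions, summed over decoder layers, averaged over the batch, and scaled by hp.guided_attention_loss_weight.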
| def add_optimizer(self, global_step): | |||
| '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. | |||
| Args: | |||
| global_step: int32 scalar Tensor representing current global step in training | |||
| ''' | |||
| with tf.variable_scope('optimizer') as _: | |||
| hp = self._hparams | |||
| if hp.decay_learning_rate: | |||
| self.learning_rate = _learning_rate_decay( | |||
| hp.initial_learning_rate, global_step) | |||
| else: | |||
| self.learning_rate = tf.convert_to_tensor( | |||
| hp.initial_learning_rate) | |||
| optimizer = tf.train.AdamOptimizer(self.learning_rate, | |||
| hp.adam_beta1, hp.adam_beta2) | |||
| gradients, variables = zip(*optimizer.compute_gradients(self.loss)) | |||
| self.gradients = gradients | |||
| clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) | |||
| # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: | |||
| # https://github.com/tensorflow/tensorflow/issues/1122 | |||
| with tf.control_dependencies( | |||
| tf.get_collection(tf.GraphKeys.UPDATE_OPS)): | |||
| self.optimize = optimizer.apply_gradients( | |||
| zip(clipped_gradients, variables), global_step=global_step) | |||
| def _learning_rate_decay(init_lr, global_step): | |||
| # Noam scheme from tensor2tensor: | |||
| warmup_steps = 4000.0 | |||
| step = tf.cast(global_step + 1, dtype=tf.float32) | |||
| return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, | |||
| step**-0.5) | |||
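A quick numeric check of the Noam schedule above (init_lr is an assumption): the rate grows linearly to its peak at warmup_steps and decays as the inverse square root afterwards.

def noam(init_lr, step, warmup=4000.0):
    # Mirrors _learning_rate_decay above, with `step` standing in for global_step + 1.
    return init_lr * warmup ** 0.5 * min(step * warmup ** -1.5, step ** -0.5)

print(noam(1e-3, 1))       # ~2.5e-07, deep in the warm-up region
print(noam(1e-3, 4000))    # 1.0e-03, the peak at warmup_steps
print(noam(1e-3, 16000))   # 5.0e-04, inverse-sqrt decay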
| @@ -1,817 +0,0 @@ | |||
| """Define self-attention decoder.""" | |||
| import sys | |||
| import tensorflow as tf | |||
| from . import compat, transformer | |||
| from .am_models import decoder_prenet | |||
| from .position import SinusoidalPositionEncoder | |||
| class SelfAttentionDecoder(): | |||
| """Decoder using self-attention as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def __init__(self, | |||
| num_layers, | |||
| num_units=512, | |||
| num_heads=8, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.1, | |||
| attention_dropout=0.1, | |||
| relu_dropout=0.1, | |||
| prenet_units=256, | |||
| dense_units=128, | |||
| num_mels=80, | |||
| outputs_per_step=3, | |||
| X_band_width=None, | |||
| H_band_width=None, | |||
| position_encoder=SinusoidalPositionEncoder(), | |||
| self_attention_type='scaled_dot'): | |||
| """Initializes the parameters of the decoder. | |||
| Args: | |||
| num_layers: The number of layers. | |||
| num_units: The number of hidden units. | |||
| num_heads: The number of heads in the multi-head attention. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| attention_dropout: The probability to drop units from the attention. | |||
| relu_dropout: The probability to drop units from the ReLU activation in | |||
| the feed forward layer. | |||
| position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| self_attention_type: Type of self attention, "scaled_dot" or "average" (case | |||
| insensitive). | |||
| Raises: | |||
| ValueError: if :obj:`self_attention_type` is invalid. | |||
| """ | |||
| super(SelfAttentionDecoder, self).__init__() | |||
| self.num_layers = num_layers | |||
| self.num_units = num_units | |||
| self.num_heads = num_heads | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.relu_dropout = relu_dropout | |||
| self.position_encoder = position_encoder | |||
| self.self_attention_type = self_attention_type.lower() | |||
| if self.self_attention_type not in ('scaled_dot', 'average'): | |||
| raise ValueError('invalid attention type %s' | |||
| % self.self_attention_type) | |||
| if self.self_attention_type == 'average': | |||
| tf.logging.warning( | |||
| 'Support for average attention network is experimental ' | |||
| 'and may change in future versions.') | |||
| self.prenet_units = prenet_units | |||
| self.dense_units = dense_units | |||
| self.num_mels = num_mels | |||
| self.outputs_per_step = outputs_per_step | |||
| self.X_band_width = X_band_width | |||
| self.H_band_width = H_band_width | |||
| @property | |||
| def output_size(self): | |||
| """Returns the decoder output size.""" | |||
| return self.num_units | |||
| @property | |||
| def support_alignment_history(self): | |||
| return True | |||
| @property | |||
| def support_multi_source(self): | |||
| return True | |||
| def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): | |||
| cache = {} | |||
| for layer in range(self.num_layers): | |||
| proj_cache_shape = [ | |||
| batch_size, self.num_heads, 0, self.num_units // self.num_heads | |||
| ] | |||
| layer_cache = {} | |||
| layer_cache['memory'] = [{ | |||
| 'memory_keys': | |||
| tf.zeros(proj_cache_shape, dtype=dtype), | |||
| 'memory_values': | |||
| tf.zeros(proj_cache_shape, dtype=dtype) | |||
| } for _ in range(num_sources)] | |||
| if self.self_attention_type == 'scaled_dot': | |||
| layer_cache['self_keys'] = tf.zeros( | |||
| proj_cache_shape, dtype=dtype) | |||
| layer_cache['self_values'] = tf.zeros( | |||
| proj_cache_shape, dtype=dtype) | |||
| elif self.self_attention_type == 'average': | |||
| layer_cache['prev_g'] = tf.zeros( | |||
| [batch_size, 1, self.num_units], dtype=dtype) | |||
| cache['layer_{}'.format(layer)] = layer_cache | |||
| return cache | |||
| def _init_attn(self, dtype=tf.float32): | |||
| attn = [] | |||
| for layer in range(self.num_layers): | |||
| attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True)) | |||
| return attn | |||
| def _self_attention_stack(self, | |||
| inputs, | |||
| sequence_length=None, | |||
| mode=True, | |||
| cache=None, | |||
| memory=None, | |||
| memory_sequence_length=None, | |||
| step=None): | |||
| # [N, T_out, self.dense_units] or [N, 1, self.dense_units] | |||
| prenet_outputs = decoder_prenet(inputs, self.prenet_units, | |||
| self.dense_units, mode) | |||
| if step is None: | |||
| decoder_inputs = tf.concat( | |||
| [memory, prenet_outputs], | |||
| axis=-1) # [N, T_out, memory_size + self.dense_units] | |||
| else: | |||
| decoder_inputs = tf.concat( | |||
| [memory[:, step:step + 1, :], prenet_outputs], | |||
| axis=-1) # [N, 1, memory_size + self.dense_units] | |||
| decoder_inputs = tf.layers.dense( | |||
| decoder_inputs, units=self.dense_units) | |||
| inputs = decoder_inputs | |||
| inputs *= self.num_units**0.5 | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder( | |||
| inputs, position=step + 1 if step is not None else None) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| decoder_mask = None | |||
| memory_mask = None | |||
| # last_attention = None | |||
| X_band_width_tmp = -1 | |||
| H_band_width_tmp = -1 | |||
| if self.X_band_width is not None: | |||
| X_band_width_tmp = tf.cast( | |||
| tf.cond( | |||
| tf.less(tf.shape(memory)[1], self.X_band_width), | |||
| lambda: -1, lambda: self.X_band_width), | |||
| dtype=tf.int64) | |||
| if self.H_band_width is not None: | |||
| H_band_width_tmp = tf.cast( | |||
| tf.cond( | |||
| tf.less(tf.shape(memory)[1], self.H_band_width), | |||
| lambda: -1, lambda: self.H_band_width), | |||
| dtype=tf.int64) | |||
| if self.self_attention_type == 'scaled_dot': | |||
| if sequence_length is not None: | |||
| decoder_mask = transformer.build_future_mask( | |||
| sequence_length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(inputs)[1], | |||
| band=X_band_width_tmp) # [N, 1, T_out, T_out] | |||
| elif self.self_attention_type == 'average': | |||
| if cache is None: | |||
| if sequence_length is None: | |||
| sequence_length = tf.fill([tf.shape(inputs)[0]], | |||
| tf.shape(inputs)[1]) | |||
| decoder_mask = transformer.cumulative_average_mask( | |||
| sequence_length, | |||
| maximum_length=tf.shape(inputs)[1], | |||
| dtype=inputs.dtype) | |||
| if memory is not None and not tf.contrib.framework.nest.is_sequence( | |||
| memory): | |||
| memory = (memory, ) | |||
| if memory_sequence_length is not None: | |||
| if not tf.contrib.framework.nest.is_sequence( | |||
| memory_sequence_length): | |||
| memory_sequence_length = (memory_sequence_length, ) | |||
| if step is None: | |||
| memory_mask = [ | |||
| transformer.build_history_mask( | |||
| length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(m)[1], | |||
| band=H_band_width_tmp) | |||
| for m, length in zip(memory, memory_sequence_length) | |||
| ] | |||
| else: | |||
| memory_mask = [ | |||
| transformer.build_history_mask( | |||
| length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(m)[1], | |||
| band=H_band_width_tmp)[:, :, step:step + 1, :] | |||
| for m, length in zip(memory, memory_sequence_length) | |||
| ] | |||
| # last_attention = None | |||
| attns_x = [] | |||
| attns_h = [] | |||
| for layer in range(self.num_layers): | |||
| layer_name = 'layer_{}'.format(layer) | |||
| layer_cache = cache[layer_name] if cache is not None else None | |||
| with tf.variable_scope(layer_name): | |||
| if memory is not None: | |||
| for i, (mem, mask) in enumerate(zip(memory, memory_mask)): | |||
| memory_cache = None | |||
| if layer_cache is not None: | |||
| memory_cache = layer_cache['memory'][i] | |||
| scope_name = 'multi_head_{}'.format(i) | |||
| if i == 0: | |||
| scope_name = 'multi_head' | |||
| with tf.variable_scope(scope_name): | |||
| encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA( | |||
| self.num_heads, | |||
| transformer.norm(inputs), | |||
| mem, | |||
| mode, | |||
| num_units=self.num_units, | |||
| mask=decoder_mask, | |||
| mask_h=mask, | |||
| cache=layer_cache, | |||
| cache_h=memory_cache, | |||
| dropout=self.attention_dropout, | |||
| return_attention=True, | |||
| layer_name=layer_name, | |||
| X_band_width=self.X_band_width) | |||
| attns_x.append(attn_x) | |||
| attns_h.append(attn_h) | |||
| context = transformer.drop_and_add( | |||
| inputs, encoded, mode, dropout=self.dropout) | |||
| with tf.variable_scope('ffn'): | |||
| transformed = transformer.feed_forward_ori( | |||
| transformer.norm(context), | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout) | |||
| transformed = transformer.drop_and_add( | |||
| context, transformed, mode, dropout=self.dropout) | |||
| inputs = transformed | |||
| outputs = transformer.norm(inputs) | |||
| outputs = tf.layers.dense( | |||
| outputs, units=self.num_mels * self.outputs_per_step) | |||
| return outputs, attns_x, attns_h | |||
| def decode_from_inputs(self, | |||
| inputs, | |||
| sequence_length, | |||
| initial_state=None, | |||
| mode=True, | |||
| memory=None, | |||
| memory_sequence_length=None): | |||
| outputs, attention_x, attention_h = self._self_attention_stack( | |||
| inputs, | |||
| sequence_length=sequence_length, | |||
| mode=mode, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length) | |||
| return outputs, attention_x, attention_h | |||
| def step_fn(self, | |||
| mode, | |||
| batch_size, | |||
| initial_state=None, | |||
| memory=None, | |||
| memory_sequence_length=None, | |||
| dtype=tf.float32): | |||
| if memory is None: | |||
| num_sources = 0 | |||
| elif tf.contrib.framework.nest.is_sequence(memory): | |||
| num_sources = len(memory) | |||
| else: | |||
| num_sources = 1 | |||
| cache = self._init_cache( | |||
| batch_size, dtype=dtype, num_sources=num_sources) | |||
| attention_x = self._init_attn(dtype=dtype) | |||
| attention_h = self._init_attn(dtype=dtype) | |||
| def _fn(step, inputs, cache): | |||
| outputs, attention_x, attention_h = self._self_attention_stack( | |||
| inputs, | |||
| mode=mode, | |||
| cache=cache, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length, | |||
| step=step) | |||
| attention_x_tmp = [] | |||
| for layer in range(len(attention_h)): | |||
| attention_x_tmp_l = tf.zeros_like(attention_h[layer]) | |||
| if self.X_band_width is not None: | |||
| pred = tf.less(step, self.X_band_width + 1) | |||
| attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable | |||
| lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer], | |||
| lambda: tf.concat([ | |||
| attention_x_tmp_l[:, :, :, | |||
| :step - self.X_band_width], | |||
| attention_x_tmp_l[:, :, :, | |||
| step - self.X_band_width:step + 1] | |||
| + attention_x[layer]], | |||
| axis=-1)) # yapf:disable | |||
| attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] | |||
| attention_x_tmp.append( | |||
| tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2], | |||
| axis=-1)) | |||
| else: | |||
| attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1] | |||
| attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] | |||
| attention_x_tmp.append( | |||
| tf.concat([ | |||
| attention_x_tmp_l_1 + attention_x[layer], | |||
| attention_x_tmp_l_2 | |||
| ], axis=-1)) # yapf:disable | |||
| attention_x = attention_x_tmp | |||
| return outputs, cache, attention_x, attention_h | |||
| return _fn, cache, attention_x, attention_h | |||
| def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations, | |||
| mode, memory, memory_sequence_length): | |||
| batch_size = tf.shape(init_decoder_input)[0] | |||
| step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( | |||
| mode, | |||
| batch_size, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length) | |||
| outputs, attention_x, attention_h, cache = self.dynamic_decode( | |||
| step_fn, | |||
| init_decoder_input, | |||
| init_cache=init_cache, | |||
| init_attn_x=init_attn_x, | |||
| init_attn_h=init_attn_h, | |||
| maximum_iterations=maximum_iterations, | |||
| batch_size=batch_size) | |||
| return outputs, attention_x, attention_h | |||
| def dynamic_decode_and_search_teacher_forcing(self, decoder_input, | |||
| maximum_iterations, mode, | |||
| memory, | |||
| memory_sequence_length): | |||
| batch_size = tf.shape(decoder_input)[0] | |||
| step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( | |||
| mode, | |||
| batch_size, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length) | |||
| outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing( | |||
| step_fn, | |||
| decoder_input, | |||
| init_cache=init_cache, | |||
| init_attn_x=init_attn_x, | |||
| init_attn_h=init_attn_h, | |||
| maximum_iterations=maximum_iterations, | |||
| batch_size=batch_size) | |||
| return outputs, attention_x, attention_h | |||
| def dynamic_decode(self, | |||
| step_fn, | |||
| init_decoder_input, | |||
| init_cache=None, | |||
| init_attn_x=None, | |||
| init_attn_h=None, | |||
| maximum_iterations=None, | |||
| batch_size=None): | |||
| def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument | |||
| return tf.less(step, maximum_iterations) | |||
| def _body(step, cache, inputs, outputs, attention_x, attention_h): | |||
| # output: [1, 1, num_mels * r] | |||
| # attn: [1, 1, T_out] | |||
| output, cache, attn_x, attn_h = step_fn( | |||
| step, inputs, cache) # outputs, cache, attention, attns | |||
| for layer in range(len(attention_x)): | |||
| attention_x[layer] = attention_x[layer].write( | |||
| step, tf.cast(attn_x[layer], tf.float32)) | |||
| for layer in range(len(attention_h)): | |||
| attention_h[layer] = attention_h[layer].write( | |||
| step, tf.cast(attn_h[layer], tf.float32)) | |||
| outputs = outputs.write(step, tf.cast(output, tf.float32)) | |||
| return (step + 1, cache, output[:, :, -self.num_mels:], outputs, | |||
| attention_x, attention_h) | |||
| step = tf.constant(0, dtype=tf.int32) | |||
| outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | |||
| _, cache, _, outputs, attention_x, attention_h = tf.while_loop( | |||
| _cond, | |||
| _body, | |||
| loop_vars=(step, init_cache, init_decoder_input, outputs, | |||
| init_attn_x, init_attn_h), | |||
| shape_invariants=(step.shape, | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_cache), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, | |||
| init_decoder_input), tf.TensorShape(None), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_attn_x), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_attn_h)), | |||
| parallel_iterations=1, | |||
| back_prop=False, | |||
| maximum_iterations=maximum_iterations) | |||
| # element of outputs: [N, 1, num_mels * r] | |||
| outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] | |||
| outputs_stack = tf.transpose( | |||
| outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] | |||
| outputs_stack = tf.squeeze( | |||
| outputs_stack, axis=0) # [N, T_out, num_mels * r] | |||
| attention_x_stack = [] | |||
| for layer in range(len(attention_x)): | |||
| attention_x_stack_tmp = attention_x[layer].stack( | |||
| ) # [T_out, N, H, 1, T_out] | |||
| attention_x_stack_tmp = tf.transpose( | |||
| attention_x_stack_tmp, perm=[3, 1, 2, 0, | |||
| 4]) # [1, N, H, T_out, T_out] | |||
| attention_x_stack_tmp = tf.squeeze( | |||
| attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
| attention_x_stack.append(attention_x_stack_tmp) | |||
| attention_h_stack = [] | |||
| for layer in range(len(attention_h)): | |||
| attention_h_stack_tmp = attention_h[layer].stack( | |||
| ) # [T_out, N, H, 1, T_out] | |||
| attention_h_stack_tmp = tf.transpose( | |||
| attention_h_stack_tmp, perm=[3, 1, 2, 0, | |||
| 4]) # [1, N, H, T_out, T_out] | |||
| attention_h_stack_tmp = tf.squeeze( | |||
| attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
| attention_h_stack.append(attention_h_stack_tmp) | |||
| return outputs_stack, attention_x_stack, attention_h_stack, cache | |||
| def dynamic_decode_teacher_forcing(self, | |||
| step_fn, | |||
| decoder_input, | |||
| init_cache=None, | |||
| init_attn_x=None, | |||
| init_attn_h=None, | |||
| maximum_iterations=None, | |||
| batch_size=None): | |||
| def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument | |||
| return tf.less(step, maximum_iterations) | |||
| def _body(step, cache, inputs, outputs, attention_x, attention_h): | |||
| # output: [1, 1, num_mels * r] | |||
| # attn: [1, 1, T_out] | |||
| output, cache, attn_x, attn_h = step_fn( | |||
| step, inputs[:, step:step + 1, :], | |||
| cache) # outputs, cache, attention, attns | |||
| for layer in range(len(attention_x)): | |||
| attention_x[layer] = attention_x[layer].write( | |||
| step, tf.cast(attn_x[layer], tf.float32)) | |||
| for layer in range(len(attention_h)): | |||
| attention_h[layer] = attention_h[layer].write( | |||
| step, tf.cast(attn_h[layer], tf.float32)) | |||
| outputs = outputs.write(step, tf.cast(output, tf.float32)) | |||
| return step + 1, cache, inputs, outputs, attention_x, attention_h | |||
| step = tf.constant(0, dtype=tf.int32) | |||
| outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | |||
| _, cache, _, outputs, attention_x, attention_h = tf.while_loop( | |||
| _cond, | |||
| _body, | |||
| loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x, | |||
| init_attn_h), | |||
| shape_invariants=(step.shape, | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, | |||
| init_cache), decoder_input.shape, | |||
| tf.TensorShape(None), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_attn_x), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_attn_h)), | |||
| parallel_iterations=1, | |||
| back_prop=False, | |||
| maximum_iterations=maximum_iterations) | |||
| # element of outputs: [N, 1, num_mels * r] | |||
| outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] | |||
| outputs_stack = tf.transpose( | |||
| outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] | |||
| outputs_stack = tf.squeeze( | |||
| outputs_stack, axis=0) # [N, T_out, num_mels * r] | |||
| attention_x_stack = [] | |||
| for layer in range(len(attention_x)): | |||
| attention_x_stack_tmp = attention_x[layer].stack( | |||
| ) # [T_out, N, H, 1, T_out] | |||
| attention_x_stack_tmp = tf.transpose( | |||
| attention_x_stack_tmp, perm=[3, 1, 2, 0, | |||
| 4]) # [1, N, H, T_out, T_out] | |||
| attention_x_stack_tmp = tf.squeeze( | |||
| attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
| attention_x_stack.append(attention_x_stack_tmp) | |||
| attention_h_stack = [] | |||
| for layer in range(len(attention_h)): | |||
| attention_h_stack_tmp = attention_h[layer].stack( | |||
| ) # [T_out, N, H, 1, T_out] | |||
| attention_h_stack_tmp = tf.transpose( | |||
| attention_h_stack_tmp, perm=[3, 1, 2, 0, | |||
| 4]) # [1, N, H, T_out, T_out] | |||
| attention_h_stack_tmp = tf.squeeze( | |||
| attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
| attention_h_stack.append(attention_h_stack_tmp) | |||
| return outputs_stack, attention_x_stack, attention_h_stack, cache | |||
| def _get_shape_invariants(self, tensor): | |||
| """Returns the shape of the tensor but sets middle dims to None.""" | |||
| if isinstance(tensor, tf.TensorArray): | |||
| shape = None | |||
| else: | |||
| shape = tensor.shape.as_list() | |||
| for i in range(1, len(shape) - 1): | |||
| shape[i] = None | |||
| return tf.TensorShape(shape) | |||
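| # --- Hedged usage sketch (illustration only, not part of the original file) --- | |||
| # Assuming a SelfAttentionDecoder instance `decoder` built elsewhere with | |||
| # num_mels=80 and outputs_per_step=1, greedy inference could look like: | |||
| #   go_frame = tf.zeros([batch_size, 1, 80])  # initial "GO" mel frame | |||
| #   mel, attn_x, attn_h = decoder.dynamic_decode_and_search( | |||
| #       go_frame, maximum_iterations=1000, mode=False, | |||
| #       memory=encoder_outputs, memory_sequence_length=input_lengths) | |||
| # Training with ground-truth mels would instead call | |||
| # dynamic_decode_and_search_teacher_forcing with the full target sequence. | |||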
| class SelfAttentionDecoderOri(): | |||
| """Decoder using self-attention as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def __init__(self, | |||
| num_layers, | |||
| num_units=512, | |||
| num_heads=8, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.1, | |||
| attention_dropout=0.1, | |||
| relu_dropout=0.1, | |||
| position_encoder=SinusoidalPositionEncoder(), | |||
| self_attention_type='scaled_dot'): | |||
| """Initializes the parameters of the decoder. | |||
| Args: | |||
| num_layers: The number of layers. | |||
| num_units: The number of hidden units. | |||
| num_heads: The number of heads in the multi-head attention. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| attention_dropout: The probability to drop units from the attention. | |||
| relu_dropout: The probability to drop units from the ReLU activation in | |||
| the feed forward layer. | |||
| position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| self_attention_type: Type of self attention, "scaled_dot" or "average" (case | |||
| insensitive). | |||
| Raises: | |||
| ValueError: if :obj:`self_attention_type` is invalid. | |||
| """ | |||
| super(SelfAttentionDecoderOri, self).__init__() | |||
| self.num_layers = num_layers | |||
| self.num_units = num_units | |||
| self.num_heads = num_heads | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.relu_dropout = relu_dropout | |||
| self.position_encoder = position_encoder | |||
| self.self_attention_type = self_attention_type.lower() | |||
| if self.self_attention_type not in ('scaled_dot', 'average'): | |||
| raise ValueError('invalid attention type %s' | |||
| % self.self_attention_type) | |||
| if self.self_attention_type == 'average': | |||
| tf.logging.warning( | |||
| 'Support for average attention network is experimental ' | |||
| 'and may change in future versions.') | |||
| @property | |||
| def output_size(self): | |||
| """Returns the decoder output size.""" | |||
| return self.num_units | |||
| @property | |||
| def support_alignment_history(self): | |||
| return True | |||
| @property | |||
| def support_multi_source(self): | |||
| return True | |||
| def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): | |||
| cache = {} | |||
| for layer in range(self.num_layers): | |||
| proj_cache_shape = [ | |||
| batch_size, self.num_heads, 0, self.num_units // self.num_heads | |||
| ] | |||
| layer_cache = {} | |||
| layer_cache['memory'] = [{ | |||
| 'memory_keys': | |||
| tf.zeros(proj_cache_shape, dtype=dtype), | |||
| 'memory_values': | |||
| tf.zeros(proj_cache_shape, dtype=dtype) | |||
| } for _ in range(num_sources)] | |||
| if self.self_attention_type == 'scaled_dot': | |||
| layer_cache['self_keys'] = tf.zeros( | |||
| proj_cache_shape, dtype=dtype) | |||
| layer_cache['self_values'] = tf.zeros( | |||
| proj_cache_shape, dtype=dtype) | |||
| elif self.self_attention_type == 'average': | |||
| layer_cache['prev_g'] = tf.zeros( | |||
| [batch_size, 1, self.num_units], dtype=dtype) | |||
| cache['layer_{}'.format(layer)] = layer_cache | |||
| return cache | |||
| def _self_attention_stack(self, | |||
| inputs, | |||
| sequence_length=None, | |||
| mode=True, | |||
| cache=None, | |||
| memory=None, | |||
| memory_sequence_length=None, | |||
| step=None): | |||
| inputs *= self.num_units**0.5 | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder( | |||
| inputs, position=step + 1 if step is not None else None) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| decoder_mask = None | |||
| memory_mask = None | |||
| last_attention = None | |||
| if self.self_attention_type == 'scaled_dot': | |||
| if sequence_length is not None: | |||
| decoder_mask = transformer.build_future_mask( | |||
| sequence_length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(inputs)[1]) | |||
| elif self.self_attention_type == 'average': | |||
| if cache is None: | |||
| if sequence_length is None: | |||
| sequence_length = tf.fill([tf.shape(inputs)[0]], | |||
| tf.shape(inputs)[1]) | |||
| decoder_mask = transformer.cumulative_average_mask( | |||
| sequence_length, | |||
| maximum_length=tf.shape(inputs)[1], | |||
| dtype=inputs.dtype) | |||
| if memory is not None and not tf.contrib.framework.nest.is_sequence( | |||
| memory): | |||
| memory = (memory, ) | |||
| if memory_sequence_length is not None: | |||
| if not tf.contrib.framework.nest.is_sequence( | |||
| memory_sequence_length): | |||
| memory_sequence_length = (memory_sequence_length, ) | |||
| memory_mask = [ | |||
| transformer.build_sequence_mask( | |||
| length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(m)[1]) | |||
| for m, length in zip(memory, memory_sequence_length) | |||
| ] | |||
| for layer in range(self.num_layers): | |||
| layer_name = 'layer_{}'.format(layer) | |||
| layer_cache = cache[layer_name] if cache is not None else None | |||
| with tf.variable_scope(layer_name): | |||
| if self.self_attention_type == 'scaled_dot': | |||
| with tf.variable_scope('masked_multi_head'): | |||
| encoded = transformer.multi_head_attention( | |||
| self.num_heads, | |||
| transformer.norm(inputs), | |||
| None, | |||
| mode, | |||
| num_units=self.num_units, | |||
| mask=decoder_mask, | |||
| cache=layer_cache, | |||
| dropout=self.attention_dropout) | |||
| last_context = transformer.drop_and_add( | |||
| inputs, encoded, mode, dropout=self.dropout) | |||
| elif self.self_attention_type == 'average': | |||
| with tf.variable_scope('average_attention'): | |||
| # Cumulative average. | |||
| x = transformer.norm(inputs) | |||
| y = transformer.cumulative_average( | |||
| x, | |||
| decoder_mask if cache is None else step, | |||
| cache=layer_cache) | |||
| # FFN. | |||
| y = transformer.feed_forward( | |||
| y, | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout) | |||
| # Gating layer. | |||
| z = tf.layers.dense( | |||
| tf.concat([x, y], -1), self.num_units * 2) | |||
| i, f = tf.split(z, 2, axis=-1) | |||
| y = tf.sigmoid(i) * x + tf.sigmoid(f) * y | |||
| last_context = transformer.drop_and_add( | |||
| inputs, y, mode, dropout=self.dropout) | |||
| if memory is not None: | |||
| for i, (mem, mask) in enumerate(zip(memory, memory_mask)): | |||
| memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable | |||
| with tf.variable_scope( | |||
| 'multi_head' if i == 0 else 'multi_head_%d' % i): | |||
| context, last_attention = transformer.multi_head_attention( | |||
| self.num_heads, | |||
| transformer.norm(last_context), | |||
| mem, | |||
| mode, | |||
| mask=mask, | |||
| cache=memory_cache, | |||
| dropout=self.attention_dropout, | |||
| return_attention=True) | |||
| last_context = transformer.drop_and_add( | |||
| last_context, | |||
| context, | |||
| mode, | |||
| dropout=self.dropout) | |||
| if i > 0: # Do not return attention in case of multi source. | |||
| last_attention = None | |||
| with tf.variable_scope('ffn'): | |||
| transformed = transformer.feed_forward_ori( | |||
| transformer.norm(last_context), | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout) | |||
| transformed = transformer.drop_and_add( | |||
| last_context, transformed, mode, dropout=self.dropout) | |||
| inputs = transformed | |||
| if last_attention is not None: | |||
| # The first head of the last layer is returned. | |||
| first_head_attention = last_attention[:, 0] | |||
| else: | |||
| first_head_attention = None | |||
| outputs = transformer.norm(inputs) | |||
| return outputs, first_head_attention | |||
| def decode_from_inputs(self, | |||
| inputs, | |||
| sequence_length, | |||
| initial_state=None, | |||
| mode=True, | |||
| memory=None, | |||
| memory_sequence_length=None): | |||
| outputs, attention = self._self_attention_stack( | |||
| inputs, | |||
| sequence_length=sequence_length, | |||
| mode=mode, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length) | |||
| return outputs, None, attention | |||
| def step_fn(self, | |||
| mode, | |||
| batch_size, | |||
| initial_state=None, | |||
| memory=None, | |||
| memory_sequence_length=None, | |||
| dtype=tf.float32): | |||
| if memory is None: | |||
| num_sources = 0 | |||
| elif tf.contrib.framework.nest.is_sequence(memory): | |||
| num_sources = len(memory) | |||
| else: | |||
| num_sources = 1 | |||
| cache = self._init_cache( | |||
| batch_size, dtype=dtype, num_sources=num_sources) | |||
| def _fn(step, inputs, cache, mode): | |||
| inputs = tf.expand_dims(inputs, 1) | |||
| outputs, attention = self._self_attention_stack( | |||
| inputs, | |||
| mode=mode, | |||
| cache=cache, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length, | |||
| step=step) | |||
| outputs = tf.squeeze(outputs, axis=1) | |||
| if attention is not None: | |||
| attention = tf.squeeze(attention, axis=1) | |||
| return outputs, cache, attention | |||
| return _fn, cache | |||
| @@ -1,182 +0,0 @@ | |||
| """Define the self-attention encoder.""" | |||
| import tensorflow as tf | |||
| from . import transformer | |||
| from .position import SinusoidalPositionEncoder | |||
| class SelfAttentionEncoder(): | |||
| """Encoder using self-attention as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def __init__(self, | |||
| num_layers, | |||
| num_units=512, | |||
| num_heads=8, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.1, | |||
| attention_dropout=0.1, | |||
| relu_dropout=0.1, | |||
| position_encoder=SinusoidalPositionEncoder()): | |||
| """Initializes the parameters of the encoder. | |||
| Args: | |||
| num_layers: The number of layers. | |||
| num_units: The number of hidden units. | |||
| num_heads: The number of heads in the multi-head attention. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| attention_dropout: The probability to drop units from the attention. | |||
| relu_dropout: The probability to drop units from the ReLU activation in | |||
| the feed forward layer. | |||
| position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| """ | |||
| super(SelfAttentionEncoder, self).__init__() | |||
| self.num_layers = num_layers | |||
| self.num_units = num_units | |||
| self.num_heads = num_heads | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.relu_dropout = relu_dropout | |||
| self.position_encoder = position_encoder | |||
| def encode(self, inputs, sequence_length=None, mode=True): | |||
| inputs *= self.num_units**0.5 | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder(inputs) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| mask = transformer.build_sequence_mask( | |||
| sequence_length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(inputs)[1]) | |||
| mask_FF = tf.squeeze( | |||
| transformer.build_sequence_mask( | |||
| sequence_length, maximum_length=tf.shape(inputs)[1]), | |||
| axis=1) | |||
| state = () | |||
| attns = [] | |||
| for layer in range(self.num_layers): | |||
| with tf.variable_scope('layer_{}'.format(layer)): | |||
| with tf.variable_scope('multi_head'): | |||
| context, attn = transformer.multi_head_attention( | |||
| self.num_heads, | |||
| transformer.norm(inputs), | |||
| None, | |||
| mode, | |||
| num_units=self.num_units, | |||
| mask=mask, | |||
| dropout=self.attention_dropout, | |||
| return_attention=True) | |||
| attns.append(attn) | |||
| context = transformer.drop_and_add( | |||
| inputs, context, mode, dropout=self.dropout) | |||
| with tf.variable_scope('ffn'): | |||
| transformed = transformer.feed_forward( | |||
| transformer.norm(context), | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout, | |||
| mask=mask_FF) | |||
| transformed = transformer.drop_and_add( | |||
| context, transformed, mode, dropout=self.dropout) | |||
| inputs = transformed | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| outputs = transformer.norm(inputs) | |||
| return (outputs, state, sequence_length, attns) | |||
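| # --- Hedged usage sketch (illustration only, not part of the original file) --- | |||
| # `text_embeddings` ([N, T_in, num_units]) and `input_lengths` are assumed inputs: | |||
| #   encoder = SelfAttentionEncoder(num_layers=6, num_units=512, num_heads=8) | |||
| #   outputs, state, lengths, attns = encoder.encode( | |||
| #       text_embeddings, sequence_length=input_lengths, mode=is_training) | |||
| #   # outputs: [N, T_in, num_units]; attns holds one attention tensor per layer. | |||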
| class SelfAttentionEncoderOri(): | |||
| """Encoder using self-attention as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def __init__(self, | |||
| num_layers, | |||
| num_units=512, | |||
| num_heads=8, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.1, | |||
| attention_dropout=0.1, | |||
| relu_dropout=0.1, | |||
| position_encoder=SinusoidalPositionEncoder()): | |||
| """Initializes the parameters of the encoder. | |||
| Args: | |||
| num_layers: The number of layers. | |||
| num_units: The number of hidden units. | |||
| num_heads: The number of heads in the multi-head attention. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| attention_dropout: The probability to drop units from the attention. | |||
| relu_dropout: The probability to drop units from the ReLU activation in | |||
| the feed forward layer. | |||
| position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| """ | |||
| super(SelfAttentionEncoderOri, self).__init__() | |||
| self.num_layers = num_layers | |||
| self.num_units = num_units | |||
| self.num_heads = num_heads | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.relu_dropout = relu_dropout | |||
| self.position_encoder = position_encoder | |||
| def encode(self, inputs, sequence_length=None, mode=True): | |||
| inputs *= self.num_units**0.5 | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder(inputs) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| mask = transformer.build_sequence_mask( | |||
| sequence_length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out] | |||
| state = () | |||
| attns = [] | |||
| for layer in range(self.num_layers): | |||
| with tf.variable_scope('layer_{}'.format(layer)): | |||
| with tf.variable_scope('multi_head'): | |||
| context, attn = transformer.multi_head_attention( | |||
| self.num_heads, | |||
| transformer.norm(inputs), | |||
| None, | |||
| mode, | |||
| num_units=self.num_units, | |||
| mask=mask, | |||
| dropout=self.attention_dropout, | |||
| return_attention=True) | |||
| attns.append(attn) | |||
| context = transformer.drop_and_add( | |||
| inputs, context, mode, dropout=self.dropout) | |||
| with tf.variable_scope('ffn'): | |||
| transformed = transformer.feed_forward_ori( | |||
| transformer.norm(context), | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout) | |||
| transformed = transformer.drop_and_add( | |||
| context, transformed, mode, dropout=self.dropout) | |||
| inputs = transformed | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| outputs = transformer.norm(inputs) | |||
| return (outputs, state, sequence_length, attns) | |||
| @@ -1,59 +0,0 @@ | |||
| import glob | |||
| import os | |||
| import matplotlib | |||
| import matplotlib.pylab as plt | |||
| import torch | |||
| from torch.nn.utils import weight_norm | |||
| matplotlib.use('Agg') | |||
| def plot_spectrogram(spectrogram): | |||
| fig, ax = plt.subplots(figsize=(10, 2)) | |||
| im = ax.imshow( | |||
| spectrogram, aspect='auto', origin='lower', interpolation='none') | |||
| plt.colorbar(im, ax=ax) | |||
| fig.canvas.draw() | |||
| plt.close() | |||
| return fig | |||
| def init_weights(m, mean=0.0, std=0.01): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| m.weight.data.normal_(mean, std) | |||
| def apply_weight_norm(m): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| weight_norm(m) | |||
| def get_padding(kernel_size, dilation=1): | |||
| return int((kernel_size * dilation - dilation) / 2) | |||
| def load_checkpoint(filepath, device): | |||
| assert os.path.isfile(filepath) | |||
| print("Loading '{}'".format(filepath)) | |||
| checkpoint_dict = torch.load(filepath, map_location=device) | |||
| print('Complete.') | |||
| return checkpoint_dict | |||
| def save_checkpoint(filepath, obj): | |||
| print('Saving checkpoint to {}'.format(filepath)) | |||
| torch.save(obj, filepath) | |||
| print('Complete.') | |||
| def scan_checkpoint(cp_dir, prefix): | |||
| pattern = os.path.join(cp_dir, prefix + '????????') | |||
| cp_list = glob.glob(pattern) | |||
| if len(cp_list) == 0: | |||
| return None | |||
| return sorted(cp_list)[-1] | |||
| @@ -0,0 +1,3 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .utils import * # noqa F403 | |||
| @@ -0,0 +1,136 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import glob | |||
| import os | |||
| import shutil | |||
| import matplotlib | |||
| import matplotlib.pylab as plt | |||
| import torch | |||
| matplotlib.use('Agg') | |||
| class AttrDict(dict): | |||
| def __init__(self, *args, **kwargs): | |||
| super(AttrDict, self).__init__(*args, **kwargs) | |||
| self.__dict__ = self | |||
| def build_env(config, config_name, path): | |||
| t_path = os.path.join(path, config_name) | |||
| if config != t_path: | |||
| os.makedirs(path, exist_ok=True) | |||
| shutil.copyfile(config, os.path.join(path, config_name)) | |||
| def plot_spectrogram(spectrogram): | |||
| fig, ax = plt.subplots(figsize=(10, 2)) | |||
| im = ax.imshow( | |||
| spectrogram, aspect='auto', origin='lower', interpolation='none') | |||
| plt.colorbar(im, ax=ax) | |||
| fig.canvas.draw() | |||
| plt.close() | |||
| return fig | |||
| def plot_alignment(alignment, info=None): | |||
| fig, ax = plt.subplots() | |||
| im = ax.imshow( | |||
| alignment, aspect='auto', origin='lower', interpolation='none') | |||
| fig.colorbar(im, ax=ax) | |||
| xlabel = 'Input timestep' | |||
| if info is not None: | |||
| xlabel += '\t' + info | |||
| plt.xlabel(xlabel) | |||
| plt.ylabel('Output timestep') | |||
| fig.canvas.draw() | |||
| plt.close() | |||
| return fig | |||
| def load_checkpoint(filepath, device): | |||
| assert os.path.isfile(filepath) | |||
| checkpoint_dict = torch.load(filepath, map_location=device) | |||
| return checkpoint_dict | |||
| def save_checkpoint(filepath, obj): | |||
| torch.save(obj, filepath) | |||
| def scan_checkpoint(cp_dir, prefix): | |||
| pattern = os.path.join(cp_dir, prefix + '????????.pkl') | |||
| cp_list = glob.glob(pattern) | |||
| if len(cp_list) == 0: | |||
| return None | |||
| return sorted(cp_list)[-1] | |||
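| # --- Hedged usage sketch (illustration only, not part of the original file) --- | |||
| #   latest = scan_checkpoint('/tmp/ckpt', 'g_')  # newest file matching g_????????.pkl | |||
| #   state = load_checkpoint(latest, torch.device('cpu')) if latest else None | |||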
| def get_padding(kernel_size, dilation=1): | |||
| return int((kernel_size * dilation - dilation) / 2) | |||
| class ValueWindow(): | |||
| def __init__(self, window_size=100): | |||
| self._window_size = window_size | |||
| self._values = [] | |||
| def append(self, x): | |||
| self._values = self._values[-(self._window_size - 1):] + [x] | |||
| @property | |||
| def sum(self): | |||
| return sum(self._values) | |||
| @property | |||
| def count(self): | |||
| return len(self._values) | |||
| @property | |||
| def average(self): | |||
| return self.sum / max(1, self.count) | |||
| def reset(self): | |||
| self._values = [] | |||
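| # --- Hedged usage sketch (illustration only, not part of the original file) --- | |||
| # ValueWindow keeps the last `window_size` values, handy for smoothing logged losses: | |||
| #   loss_window = ValueWindow(window_size=100) | |||
| #   loss_window.append(0.93) | |||
| #   loss_window.append(0.87) | |||
| #   loss_window.average  # mean of the values currently in the window -> 0.9 | |||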
| def get_model_size(model): | |||
| param_num = sum([p.numel() for p in model.parameters() if p.requires_grad]) | |||
| param_size = param_num * 4 / 1024 / 1024 | |||
| return param_size | |||
| def get_grad_norm(model): | |||
| total_norm = 0 | |||
| params = [ | |||
| p for p in model.parameters() if p.grad is not None and p.requires_grad | |||
| ] | |||
| for p in params: | |||
| param_norm = p.grad.detach().data.norm(2) | |||
| total_norm += param_norm.item()**2 | |||
| total_norm = total_norm**0.5 | |||
| return total_norm | |||
| def init_weights(m, mean=0.0, std=0.01): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| m.weight.data.normal_(mean, std) | |||
| def get_mask_from_lengths(lengths, max_len=None): | |||
| batch_size = lengths.shape[0] | |||
| if max_len is None: | |||
| max_len = torch.max(lengths).item() | |||
| ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, | |||
| -1).to(lengths.device) | |||
| mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) | |||
| return mask | |||
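| # --- Hedged usage sketch (illustration only, not part of the original file) --- | |||
| # get_mask_from_lengths marks padded positions with True: | |||
| #   get_mask_from_lengths(torch.tensor([3, 1])) | |||
| #   # tensor([[False, False, False], | |||
| #   #         [False,  True,  True]]) | |||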
| @@ -1,516 +0,0 @@ | |||
| from distutils.version import LooseVersion | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d | |||
| from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm | |||
| from .utils import get_padding, init_weights | |||
| is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') | |||
| def stft(x, fft_size, hop_size, win_length, window): | |||
| """Perform STFT and convert to magnitude spectrogram. | |||
| Args: | |||
| x (Tensor): Input signal tensor (B, T). | |||
| fft_size (int): FFT size. | |||
| hop_size (int): Hop size. | |||
| win_length (int): Window length. | |||
| window (str): Window function type. | |||
| Returns: | |||
| Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). | |||
| """ | |||
| if is_pytorch_17plus: | |||
| x_stft = torch.stft( | |||
| x, fft_size, hop_size, win_length, window, return_complex=False) | |||
| else: | |||
| x_stft = torch.stft(x, fft_size, hop_size, win_length, window) | |||
| real = x_stft[..., 0] | |||
| imag = x_stft[..., 1] | |||
| # NOTE(kan-bayashi): clamp is needed to avoid nan or inf | |||
| return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) | |||
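| # --- Hedged usage sketch (illustration only, not part of the original file) --- | |||
| # Magnitude spectrogram of a batch of 22.05 kHz waveforms (values are assumptions): | |||
| #   wav = torch.randn(4, 22050)  # (B, T) | |||
| #   window = torch.hann_window(600) | |||
| #   mag = stft(wav, fft_size=1024, hop_size=120, win_length=600, window=window) | |||
| #   # mag: (4, #frames, 513), as described in the docstring above | |||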
| LRELU_SLOPE = 0.1 | |||
| def get_padding_casual(kernel_size, dilation=1): | |||
| return int(kernel_size * dilation - dilation) | |||
| class Conv1dCasual(torch.nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| groups=1, | |||
| bias=True, | |||
| padding_mode='zeros'): | |||
| super(Conv1dCasual, self).__init__() | |||
| self.pad = padding | |||
| self.conv1d = weight_norm( | |||
| Conv1d( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride, | |||
| padding=0, | |||
| dilation=dilation, | |||
| groups=groups, | |||
| bias=bias, | |||
| padding_mode=padding_mode)) | |||
| self.conv1d.apply(init_weights) | |||
| def forward(self, x): # bdt | |||
| # F.pad sizes are given starting from the last dimension, so this left-pads | |||
| # the time axis by self.pad to keep the convolution causal. | |||
| x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') | |||
| x = self.conv1d(x) | |||
| return x | |||
| def remove_weight_norm(self): | |||
| remove_weight_norm(self.conv1d) | |||
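| # --- Hedged sanity check (illustration only, not part of the original file) --- | |||
| # Left-padding by kernel_size*dilation - dilation keeps the time length unchanged | |||
| # and makes the convolution causal: | |||
| #   conv = Conv1dCasual(1, 1, kernel_size=3, dilation=2, | |||
| #                       padding=get_padding_casual(3, 2)) | |||
| #   conv(torch.randn(2, 1, 100)).shape  # -> torch.Size([2, 1, 100]) | |||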
| class ConvTranspose1dCausal(torch.nn.Module): | |||
| """CausalConvTranspose1d module with customized initialization.""" | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride, | |||
| padding=0): | |||
| """Initialize CausalConvTranspose1d module.""" | |||
| super(ConvTranspose1dCausal, self).__init__() | |||
| self.deconv = weight_norm( | |||
| ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) | |||
| self.stride = stride | |||
| self.deconv.apply(init_weights) | |||
| self.pad = kernel_size - stride | |||
| def forward(self, x): | |||
| """Calculate forward propagation. | |||
| Args: | |||
| x (Tensor): Input tensor (B, in_channels, T_in). | |||
| Returns: | |||
| Tensor: Output tensor (B, out_channels, T_out). | |||
| """ | |||
| # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") | |||
| return self.deconv(x)[:, :, :-self.pad] | |||
| def remove_weight_norm(self): | |||
| remove_weight_norm(self.deconv) | |||
| class ResBlock1(torch.nn.Module): | |||
| def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): | |||
| super(ResBlock1, self).__init__() | |||
| self.h = h | |||
| self.convs1 = nn.ModuleList([ | |||
| Conv1dCasual( | |||
| channels, | |||
| channels, | |||
| kernel_size, | |||
| 1, | |||
| dilation=dilation[i], | |||
| padding=get_padding_casual(kernel_size, dilation[i])) | |||
| for i in range(len(dilation)) | |||
| ]) | |||
| self.convs2 = nn.ModuleList([ | |||
| Conv1dCasual( | |||
| channels, | |||
| channels, | |||
| kernel_size, | |||
| 1, | |||
| dilation=1, | |||
| padding=get_padding_casual(kernel_size, 1)) | |||
| for i in range(len(dilation)) | |||
| ]) | |||
| def forward(self, x): | |||
| for c1, c2 in zip(self.convs1, self.convs2): | |||
| xt = F.leaky_relu(x, LRELU_SLOPE) | |||
| xt = c1(xt) | |||
| xt = F.leaky_relu(xt, LRELU_SLOPE) | |||
| xt = c2(xt) | |||
| x = xt + x | |||
| return x | |||
| def remove_weight_norm(self): | |||
| for layer in self.convs1: | |||
| layer.remove_weight_norm() | |||
| for layer in self.convs2: | |||
| layer.remove_weight_norm() | |||
| class Generator(torch.nn.Module): | |||
| def __init__(self, h): | |||
| super(Generator, self).__init__() | |||
| self.h = h | |||
| self.num_kernels = len(h.resblock_kernel_sizes) | |||
| self.num_upsamples = len(h.upsample_rates) | |||
| print('num_kernels={}, num_upsamples={}'.format( | |||
| self.num_kernels, self.num_upsamples)) | |||
| self.conv_pre = Conv1dCasual( | |||
| 80, h.upsample_initial_channel, 7, 1, padding=7 - 1) | |||
| resblock = ResBlock1 if h.resblock == '1' else ResBlock2 | |||
| self.ups = nn.ModuleList() | |||
| self.repeat_ups = nn.ModuleList() | |||
| for i, (u, k) in enumerate( | |||
| zip(h.upsample_rates, h.upsample_kernel_sizes)): | |||
| upsample = nn.Sequential( | |||
| nn.Upsample(mode='nearest', scale_factor=u), | |||
| nn.LeakyReLU(LRELU_SLOPE), | |||
| Conv1dCasual( | |||
| h.upsample_initial_channel // (2**i), | |||
| h.upsample_initial_channel // (2**(i + 1)), | |||
| kernel_size=7, | |||
| stride=1, | |||
| padding=7 - 1)) | |||
| self.repeat_ups.append(upsample) | |||
| self.ups.append( | |||
| ConvTranspose1dCausal( | |||
| h.upsample_initial_channel // (2**i), | |||
| h.upsample_initial_channel // (2**(i + 1)), | |||
| k, | |||
| u, | |||
| padding=(k - u) // 2)) | |||
| self.resblocks = nn.ModuleList() | |||
| for i in range(len(self.ups)): | |||
| ch = h.upsample_initial_channel // (2**(i + 1)) | |||
| for j, (k, d) in enumerate( | |||
| zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): | |||
| self.resblocks.append(resblock(h, ch, k, d)) | |||
| self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) | |||
| def forward(self, x): | |||
| x = self.conv_pre(x) | |||
| for i in range(self.num_upsamples): | |||
| x = torch.sin(x) + x  # residual sine activation: x + sin(x) | |||
| # transconv | |||
| x1 = F.leaky_relu(x, LRELU_SLOPE) | |||
| x1 = self.ups[i](x1) | |||
| # repeat | |||
| x2 = self.repeat_ups[i](x) | |||
| x = x1 + x2 | |||
| xs = None | |||
| for j in range(self.num_kernels): | |||
| if xs is None: | |||
| xs = self.resblocks[i * self.num_kernels + j](x) | |||
| else: | |||
| xs += self.resblocks[i * self.num_kernels + j](x) | |||
| x = xs / self.num_kernels | |||
| x = F.leaky_relu(x) | |||
| x = self.conv_post(x) | |||
| x = torch.tanh(x) | |||
| return x | |||
| def remove_weight_norm(self): | |||
| print('Removing weight norm...') | |||
| for layer in self.ups: | |||
| layer.remove_weight_norm() | |||
| for layer in self.repeat_ups: | |||
| layer[-1].remove_weight_norm() | |||
| for layer in self.resblocks: | |||
| layer.remove_weight_norm() | |||
| self.conv_pre.remove_weight_norm() | |||
| self.conv_post.remove_weight_norm() | |||
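| # --- Hedged usage sketch (illustration only, not part of the original file) --- | |||
| # The config values below are assumptions for illustration; real values come from | |||
| # the vocoder config loaded into an attribute-style dict (e.g. an AttrDict): | |||
| #   h = AttrDict(resblock='1', upsample_rates=[8, 8, 2, 2], | |||
| #                upsample_kernel_sizes=[16, 16, 4, 4], upsample_initial_channel=512, | |||
| #                resblock_kernel_sizes=[3, 7, 11], | |||
| #                resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]]) | |||
| #   generator = Generator(h) | |||
| #   wav = generator(torch.randn(1, 80, 100))  # (1, 1, 100 * 8*8*2*2) = (1, 1, 25600) | |||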
| class DiscriminatorP(torch.nn.Module): | |||
| def __init__(self, | |||
| period, | |||
| kernel_size=5, | |||
| stride=3, | |||
| use_spectral_norm=False): | |||
| super(DiscriminatorP, self).__init__() | |||
| self.period = period | |||
| norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
| self.convs = nn.ModuleList([ | |||
| norm_f( | |||
| Conv2d( | |||
| 1, | |||
| 32, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(5, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 32, | |||
| 128, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(5, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 128, | |||
| 512, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(5, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 512, | |||
| 1024, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(5, 1), 0))), | |||
| norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), | |||
| ]) | |||
| self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) | |||
| def forward(self, x): | |||
| fmap = [] | |||
| # 1d to 2d | |||
| b, c, t = x.shape | |||
| if t % self.period != 0: # pad first | |||
| n_pad = self.period - (t % self.period) | |||
| x = F.pad(x, (0, n_pad), 'reflect') | |||
| t = t + n_pad | |||
| x = x.view(b, c, t // self.period, self.period) | |||
| for layer in self.convs: | |||
| x = layer(x) | |||
| x = F.leaky_relu(x, LRELU_SLOPE) | |||
| fmap.append(x) | |||
| x = self.conv_post(x) | |||
| fmap.append(x) | |||
| x = torch.flatten(x, 1, -1) | |||
| return x, fmap | |||
| class MultiPeriodDiscriminator(torch.nn.Module): | |||
| def __init__(self): | |||
| super(MultiPeriodDiscriminator, self).__init__() | |||
| self.discriminators = nn.ModuleList([ | |||
| DiscriminatorP(2), | |||
| DiscriminatorP(3), | |||
| DiscriminatorP(5), | |||
| DiscriminatorP(7), | |||
| DiscriminatorP(11), | |||
| ]) | |||
| def forward(self, y, y_hat): | |||
| y_d_rs = [] | |||
| y_d_gs = [] | |||
| fmap_rs = [] | |||
| fmap_gs = [] | |||
| for i, d in enumerate(self.discriminators): | |||
| y_d_r, fmap_r = d(y) | |||
| y_d_g, fmap_g = d(y_hat) | |||
| y_d_rs.append(y_d_r) | |||
| fmap_rs.append(fmap_r) | |||
| y_d_gs.append(y_d_g) | |||
| fmap_gs.append(fmap_g) | |||
| return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
| class DiscriminatorS(torch.nn.Module): | |||
| def __init__(self, use_spectral_norm=False): | |||
| super(DiscriminatorS, self).__init__() | |||
| norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
| self.convs = nn.ModuleList([ | |||
| norm_f(Conv1d(1, 128, 15, 1, padding=7)), | |||
| norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), | |||
| norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), | |||
| norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), | |||
| norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), | |||
| norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), | |||
| norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), | |||
| ]) | |||
| self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) | |||
| def forward(self, x): | |||
| fmap = [] | |||
| for layer in self.convs: | |||
| x = layer(x) | |||
| x = F.leaky_relu(x, LRELU_SLOPE) | |||
| fmap.append(x) | |||
| x = self.conv_post(x) | |||
| fmap.append(x) | |||
| x = torch.flatten(x, 1, -1) | |||
| return x, fmap | |||
| class MultiScaleDiscriminator(torch.nn.Module): | |||
| def __init__(self): | |||
| super(MultiScaleDiscriminator, self).__init__() | |||
| self.discriminators = nn.ModuleList([ | |||
| DiscriminatorS(use_spectral_norm=True), | |||
| DiscriminatorS(), | |||
| DiscriminatorS(), | |||
| ]) | |||
| from pytorch_wavelets import DWT1DForward | |||
| self.meanpools = nn.ModuleList( | |||
| [DWT1DForward(wave='db3', J=1), | |||
| DWT1DForward(wave='db3', J=1)]) | |||
| self.convs = nn.ModuleList([ | |||
| weight_norm(Conv1d(2, 1, 15, 1, padding=7)), | |||
| weight_norm(Conv1d(2, 1, 15, 1, padding=7)) | |||
| ]) | |||
| def forward(self, y, y_hat): | |||
| y_d_rs = [] | |||
| y_d_gs = [] | |||
| fmap_rs = [] | |||
| fmap_gs = [] | |||
| for i, d in enumerate(self.discriminators): | |||
| if i != 0: | |||
| yl, yh = self.meanpools[i - 1](y) | |||
| y = torch.cat([yl, yh[0]], dim=1) | |||
| y = self.convs[i - 1](y) | |||
| y = F.leaky_relu(y, LRELU_SLOPE) | |||
| yl_hat, yh_hat = self.meanpools[i - 1](y_hat) | |||
| y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1) | |||
| y_hat = self.convs[i - 1](y_hat) | |||
| y_hat = F.leaky_relu(y_hat, LRELU_SLOPE) | |||
| y_d_r, fmap_r = d(y) | |||
| y_d_g, fmap_g = d(y_hat) | |||
| y_d_rs.append(y_d_r) | |||
| fmap_rs.append(fmap_r) | |||
| y_d_gs.append(y_d_g) | |||
| fmap_gs.append(fmap_g) | |||
| return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
| class DiscriminatorSTFT(torch.nn.Module): | |||
| def __init__(self, | |||
| kernel_size=11, | |||
| stride=2, | |||
| use_spectral_norm=False, | |||
| fft_size=1024, | |||
| shift_size=120, | |||
| win_length=600, | |||
| window='hann_window'): | |||
| super(DiscriminatorSTFT, self).__init__() | |||
| self.fft_size = fft_size | |||
| self.shift_size = shift_size | |||
| self.win_length = win_length | |||
| norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
| self.convs = nn.ModuleList([ | |||
| norm_f( | |||
| Conv2d( | |||
| fft_size // 2 + 1, | |||
| 32, (15, 1), (1, 1), | |||
| padding=(get_padding(15, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 32, | |||
| 32, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(9, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 32, | |||
| 32, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(9, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 32, | |||
| 32, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(9, 1), 0))), | |||
| norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))), | |||
| ]) | |||
| self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0))) | |||
| self.register_buffer('window', getattr(torch, window)(win_length)) | |||
| def forward(self, wav): | |||
| wav = torch.squeeze(wav, 1) | |||
| x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length, | |||
| self.window) | |||
| x = torch.transpose(x_mag, 2, 1).unsqueeze(-1) | |||
| fmap = [] | |||
| for layer in self.convs: | |||
| x = layer(x) | |||
| x = F.leaky_relu(x, LRELU_SLOPE) | |||
| fmap.append(x) | |||
| x = self.conv_post(x) | |||
| fmap.append(x) | |||
| x = x.squeeze(-1) | |||
| return x, fmap | |||
| class MultiSTFTDiscriminator(torch.nn.Module): | |||
| def __init__( | |||
| self, | |||
| fft_sizes=[1024, 2048, 512], | |||
| hop_sizes=[120, 240, 50], | |||
| win_lengths=[600, 1200, 240], | |||
| window='hann_window', | |||
| ): | |||
| super(MultiSTFTDiscriminator, self).__init__() | |||
| self.discriminators = nn.ModuleList() | |||
| for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): | |||
| self.discriminators += [ | |||
| DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl) | |||
| ] | |||
| def forward(self, y, y_hat): | |||
| y_d_rs = [] | |||
| y_d_gs = [] | |||
| fmap_rs = [] | |||
| fmap_gs = [] | |||
| for i, d in enumerate(self.discriminators): | |||
| y_d_r, fmap_r = d(y) | |||
| y_d_g, fmap_g = d(y_hat) | |||
| y_d_rs.append(y_d_r) | |||
| fmap_rs.append(fmap_r) | |||
| y_d_gs.append(y_d_g) | |||
| fmap_gs.append(fmap_g) | |||
| return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
| def feature_loss(fmap_r, fmap_g): | |||
| loss = 0 | |||
| for dr, dg in zip(fmap_r, fmap_g): | |||
| for rl, gl in zip(dr, dg): | |||
| loss += torch.mean(torch.abs(rl - gl)) | |||
| return loss * 2 | |||
| def discriminator_loss(disc_real_outputs, disc_generated_outputs): | |||
| loss = 0 | |||
| r_losses = [] | |||
| g_losses = [] | |||
| for dr, dg in zip(disc_real_outputs, disc_generated_outputs): | |||
| r_loss = torch.mean((1 - dr)**2) | |||
| g_loss = torch.mean(dg**2) | |||
| loss += (r_loss + g_loss) | |||
| r_losses.append(r_loss.item()) | |||
| g_losses.append(g_loss.item()) | |||
| return loss, r_losses, g_losses | |||
| def generator_loss(disc_outputs): | |||
| loss = 0 | |||
| gen_losses = [] | |||
| for dg in disc_outputs: | |||
| temp_loss = torch.mean((1 - dg)**2) | |||
| gen_losses.append(temp_loss) | |||
| loss += temp_loss | |||
| return loss, gen_losses | |||
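| # --- Hedged usage sketch (illustration only, not part of the original file) --- | |||
| # The losses follow the least-squares GAN formulation; for one discriminator output: | |||
| #   d_real = [torch.tensor([0.9])]  # D(y) on real audio | |||
| #   d_fake = [torch.tensor([0.2])]  # D(G(mel)) on generated audio | |||
| #   discriminator_loss(d_real, d_fake)[0]  # (1 - 0.9)**2 + 0.2**2 = 0.05 | |||
| #   generator_loss(d_fake)[0]              # (1 - 0.2)**2 = 0.64 | |||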
| @@ -1,3 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from __future__ import (absolute_import, division, print_function, | |||
| unicode_literals) | |||
| import os | |||
| @@ -11,13 +13,11 @@ from modelscope.models.base import Model | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.audio.tts_exceptions import ( | |||
| TtsFrontendInitializeFailedException, | |||
| TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion, | |||
| TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationException, | |||
| TtsVoiceNotExistsException) | |||
| from modelscope.utils.constant import Tasks | |||
| from .voice import Voice | |||
| import tensorflow as tf # isort:skip | |||
| __all__ = ['SambertHifigan'] | |||
| @@ -28,14 +28,15 @@ class SambertHifigan(Model): | |||
| def __init__(self, model_dir, *args, **kwargs): | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| if 'am' not in kwargs: | |||
| raise TtsModelConfigurationExcetion( | |||
| 'configuration model field missing am!') | |||
| raise TtsModelConfigurationException( | |||
| 'modelscope error: configuration model field missing am!') | |||
| if 'vocoder' not in kwargs: | |||
| raise TtsModelConfigurationExcetion( | |||
| 'configuration model field missing vocoder!') | |||
| raise TtsModelConfigurationException( | |||
| 'modelscope error: configuration model field missing vocoder!') | |||
| if 'lang_type' not in kwargs: | |||
| raise TtsModelConfigurationExcetion( | |||
| 'configuration model field missing lang_type!') | |||
| raise TtsModelConfigurationException( | |||
| 'modelscope error: configuration model field missing lang_type!' | |||
| ) | |||
| am_cfg = kwargs['am'] | |||
| voc_cfg = kwargs['vocoder'] | |||
| # initialize frontend | |||
| @@ -47,10 +48,12 @@ class SambertHifigan(Model): | |||
| zip_ref.extractall(model_dir) | |||
| if not frontend.initialize(self.__res_path): | |||
| raise TtsFrontendInitializeFailedException( | |||
| 'resource invalid: {}'.format(self.__res_path)) | |||
| 'modelscope error: resource invalid: {}'.format( | |||
| self.__res_path)) | |||
| if not frontend.set_lang_type(kwargs['lang_type']): | |||
| raise TtsFrontendLanguageTypeInvalidException( | |||
| 'language type invalid: {}'.format(kwargs['lang_type'])) | |||
| 'modelscope error: language type invalid: {}'.format( | |||
| kwargs['lang_type'])) | |||
| self.__frontend = frontend | |||
| zip_file = os.path.join(model_dir, 'voices.zip') | |||
| self.__voice_path = os.path.join(model_dir, 'voices') | |||
| @@ -60,7 +63,8 @@ class SambertHifigan(Model): | |||
| with open(voice_cfg_path, 'r') as f: | |||
| voice_cfg = json.load(f) | |||
| if 'voices' not in voice_cfg: | |||
| raise TtsModelConfigurationExcetion('voices invalid') | |||
| raise TtsModelConfigurationException( | |||
| 'modelscope error: voices invalid') | |||
| self.__voice = {} | |||
| for name in voice_cfg['voices']: | |||
| voice_path = os.path.join(self.__voice_path, name) | |||
| @@ -70,11 +74,13 @@ class SambertHifigan(Model): | |||
| if voice_cfg['voices']: | |||
| self.__default_voice_name = voice_cfg['voices'][0] | |||
| else: | |||
| raise TtsVoiceNotExistsException('voices is empty in voices.json') | |||
| raise TtsVoiceNotExistsException( | |||
| 'modelscope error: voices is empty in voices.json') | |||
| def __synthesis_one_sentences(self, voice_name, text): | |||
| if voice_name not in self.__voice: | |||
| raise TtsVoiceNotExistsException(f'Voice {voice_name} not exists') | |||
| raise TtsVoiceNotExistsException( | |||
| f'modelscope error: Voice {voice_name} not exists') | |||
| return self.__voice[voice_name].forward(text) | |||
| def forward(self, text: str, voice_name: str = None): | |||
| @@ -1,89 +0,0 @@ | |||
| ''' | |||
| Cleaners are transformations that run over the input text at both training and eval time. | |||
| Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" | |||
| hyperparameter. Some cleaners are English-specific. You'll typically want to use: | |||
| 1. "english_cleaners" for English text | |||
| 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using | |||
| the Unidecode library (https://pypi.python.org/pypi/Unidecode) | |||
| 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update | |||
| the symbols in symbols.py to match your data). | |||
| ''' | |||
| import re | |||
| from unidecode import unidecode | |||
| from .numbers import normalize_numbers | |||
| # Regular expression matching whitespace: | |||
| _whitespace_re = re.compile(r'\s+') | |||
| # List of (regular expression, replacement) pairs for abbreviations: | |||
| _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) | |||
| for x in [ | |||
| ('mrs', 'misess'), | |||
| ('mr', 'mister'), | |||
| ('dr', 'doctor'), | |||
| ('st', 'saint'), | |||
| ('co', 'company'), | |||
| ('jr', 'junior'), | |||
| ('maj', 'major'), | |||
| ('gen', 'general'), | |||
| ('drs', 'doctors'), | |||
| ('rev', 'reverend'), | |||
| ('lt', 'lieutenant'), | |||
| ('hon', 'honorable'), | |||
| ('sgt', 'sergeant'), | |||
| ('capt', 'captain'), | |||
| ('esq', 'esquire'), | |||
| ('ltd', 'limited'), | |||
| ('col', 'colonel'), | |||
| ('ft', 'fort'), ]] # yapf:disable | |||
| def expand_abbreviations(text): | |||
| for regex, replacement in _abbreviations: | |||
| text = re.sub(regex, replacement, text) | |||
| return text | |||
| def expand_numbers(text): | |||
| return normalize_numbers(text) | |||
| def lowercase(text): | |||
| return text.lower() | |||
| def collapse_whitespace(text): | |||
| return re.sub(_whitespace_re, ' ', text) | |||
| def convert_to_ascii(text): | |||
| return unidecode(text) | |||
| def basic_cleaners(text): | |||
| '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' | |||
| text = lowercase(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
| def transliteration_cleaners(text): | |||
| '''Pipeline for non-English text that transliterates to ASCII.''' | |||
| text = convert_to_ascii(text) | |||
| text = lowercase(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
| def english_cleaners(text): | |||
| '''Pipeline for English text, including number and abbreviation expansion.''' | |||
| text = convert_to_ascii(text) | |||
| text = lowercase(text) | |||
| text = expand_numbers(text) | |||
| text = expand_abbreviations(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
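As a quick reference for the deleted cleaner pipeline above, a minimal sketch of the expected behaviour (assuming the module were importable as `cleaners`; the exact number expansion depends on `normalize_numbers`, which is not shown here):

    # Illustrative only: 'Dr.' and 'St.' expand via the abbreviation table,
    # case is folded and whitespace is collapsed.
    sample = 'Dr.  Smith lives on Main St.'
    print(cleaners.english_cleaners(sample))
    # -> 'doctor smith lives on main saint'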
| @@ -1,64 +0,0 @@ | |||
| import re | |||
| valid_symbols = [ | |||
| 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', | |||
| 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', | |||
| 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', | |||
| 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', | |||
| 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', | |||
| 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', | |||
| 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', | |||
| 'Y', 'Z', 'ZH' | |||
| ] | |||
| _valid_symbol_set = set(valid_symbols) | |||
| class CMUDict: | |||
| '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' | |||
| def __init__(self, file_or_path, keep_ambiguous=True): | |||
| if isinstance(file_or_path, str): | |||
| with open(file_or_path, encoding='latin-1') as f: | |||
| entries = _parse_cmudict(f) | |||
| else: | |||
| entries = _parse_cmudict(file_or_path) | |||
| if not keep_ambiguous: | |||
| entries = { | |||
| word: pron | |||
| for word, pron in entries.items() if len(pron) == 1 | |||
| } | |||
| self._entries = entries | |||
| def __len__(self): | |||
| return len(self._entries) | |||
| def lookup(self, word): | |||
| '''Returns list of ARPAbet pronunciations of the given word.''' | |||
| return self._entries.get(word.upper()) | |||
| _alt_re = re.compile(r'\([0-9]+\)') | |||
| def _parse_cmudict(file): | |||
| cmudict = {} | |||
| for line in file: | |||
| if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): | |||
| parts = line.split('  ')  # word and pronunciation are separated by two spaces in cmudict | |||
| word = re.sub(_alt_re, '', parts[0]) | |||
| pronunciation = _get_pronunciation(parts[1]) | |||
| if pronunciation: | |||
| if word in cmudict: | |||
| cmudict[word].append(pronunciation) | |||
| else: | |||
| cmudict[word] = [pronunciation] | |||
| return cmudict | |||
| def _get_pronunciation(s): | |||
| parts = s.strip().split(' ') | |||
| for part in parts: | |||
| if part not in _valid_symbol_set: | |||
| return None | |||
| return ' '.join(parts) | |||
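A short usage sketch for the deleted CMUDict wrapper (the dictionary path is a placeholder; any file in the standard cmudict format works):

    cmu = CMUDict('/path/to/cmudict-0.7b', keep_ambiguous=False)
    print(len(cmu))                # number of unambiguous entries
    print(cmu.lookup('HELLO'))     # e.g. ['HH AH0 L OW1'], a list of ARPAbet strings
    print(cmu.lookup('notaword'))  # None when the word is missing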
| @@ -1,105 +0,0 @@ | |||
| ''' | |||
| Defines the set of symbols used in text input to the model. | |||
| The default is a set of ASCII characters that works well for English or text that has been run | |||
| through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. | |||
| ''' | |||
| import codecs | |||
| import os | |||
| _pad = '_' | |||
| _eos = '~' | |||
| _mask = '@[MASK]' | |||
| def load_symbols(dict_path, has_mask=True): | |||
| _characters = '' | |||
| _ch_symbols = [] | |||
| sy_dict_name = 'sy_dict.txt' | |||
| sy_dict_path = os.path.join(dict_path, sy_dict_name) | |||
| f = codecs.open(sy_dict_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_symbols.append(line) | |||
| _arpabet = ['@' + s for s in _ch_symbols] | |||
| # Export all symbols: | |||
| sy = list(_characters) + _arpabet + [_pad, _eos] | |||
| if has_mask: | |||
| sy.append(_mask) | |||
| _characters = '' | |||
| _ch_tones = [] | |||
| tone_dict_name = 'tone_dict.txt' | |||
| tone_dict_path = os.path.join(dict_path, tone_dict_name) | |||
| f = codecs.open(tone_dict_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_tones.append(line) | |||
| # Export all tones: | |||
| tone = list(_characters) + _ch_tones + [_pad, _eos] | |||
| if has_mask: | |||
| tone.append(_mask) | |||
| _characters = '' | |||
| _ch_syllable_flags = [] | |||
| syllable_flag_name = 'syllable_flag_dict.txt' | |||
| syllable_flag_path = os.path.join(dict_path, syllable_flag_name) | |||
| f = codecs.open(syllable_flag_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_syllable_flags.append(line) | |||
| # Export all syllable_flags: | |||
| syllable_flag = list(_characters) + _ch_syllable_flags + [_pad, _eos] | |||
| if has_mask: | |||
| syllable_flag.append(_mask) | |||
| _characters = '' | |||
| _ch_word_segments = [] | |||
| word_segment_name = 'word_segment_dict.txt' | |||
| word_segment_path = os.path.join(dict_path, word_segment_name) | |||
| f = codecs.open(word_segment_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_word_segments.append(line) | |||
| # Export all word_segments: | |||
| word_segment = list(_characters) + _ch_word_segments + [_pad, _eos] | |||
| if has_mask: | |||
| word_segment.append(_mask) | |||
| _characters = '' | |||
| _ch_emo_types = [] | |||
| emo_category_name = 'emo_category_dict.txt' | |||
| emo_category_path = os.path.join(dict_path, emo_category_name) | |||
| f = codecs.open(emo_category_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_emo_types.append(line) | |||
| emo_category = list(_characters) + _ch_emo_types + [_pad, _eos] | |||
| if has_mask: | |||
| emo_category.append(_mask) | |||
| _characters = '' | |||
| _ch_speakers = [] | |||
| speaker_name = 'speaker_dict.txt' | |||
| speaker_path = os.path.join(dict_path, speaker_name) | |||
| f = codecs.open(speaker_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_speakers.append(line) | |||
| # Export all speakers: | |||
| speaker = list(_characters) + _ch_speakers + [_pad, _eos] | |||
| if has_mask: | |||
| speaker.append(_mask) | |||
| return sy, tone, syllable_flag, word_segment, emo_category, speaker | |||
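The six nearly identical blocks in load_symbols all follow the same read-extend-append pattern; a compact helper equivalent, shown only as a sketch (the helper name is illustrative and not part of the original module):

    def _load_dict(dict_path, filename, prefix='', has_mask=True):
        # Read one symbol per line and append the shared pad/eos (and optional
        # mask) tokens, mirroring each block of load_symbols above.
        symbols = []
        with codecs.open(os.path.join(dict_path, filename), 'r') as f:
            for line in f:
                symbols.append(prefix + line.strip('\r\n'))
        symbols += [_pad, _eos]
        if has_mask:
            symbols.append(_mask)
        return symbols

    # e.g. the 'sy' list would correspond to _load_dict(dict_path, 'sy_dict.txt', prefix='@')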
| @@ -1,200 +0,0 @@ | |||
| import re | |||
| import sys | |||
| from .cleaners import (basic_cleaners, english_cleaners, | |||
| transliteration_cleaners) | |||
| class SymbolsDict: | |||
| def __init__(self, sy, tone, syllable_flag, word_segment, emo_category, | |||
| speaker, inputs_dim, lfeat_type_list): | |||
| self._inputs_dim = inputs_dim | |||
| self._lfeat_type_list = lfeat_type_list | |||
| self._sy_to_id = {s: i for i, s in enumerate(sy)} | |||
| self._id_to_sy = {i: s for i, s in enumerate(sy)} | |||
| self._tone_to_id = {s: i for i, s in enumerate(tone)} | |||
| self._id_to_tone = {i: s for i, s in enumerate(tone)} | |||
| self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)} | |||
| self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)} | |||
| self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)} | |||
| self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)} | |||
| self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)} | |||
| self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)} | |||
| self._speaker_to_id = {s: i for i, s in enumerate(speaker)} | |||
| self._id_to_speaker = {i: s for i, s in enumerate(speaker)} | |||
| print('_sy_to_id: ') | |||
| print(self._sy_to_id) | |||
| print('_tone_to_id: ') | |||
| print(self._tone_to_id) | |||
| print('_syllable_flag_to_id: ') | |||
| print(self._syllable_flag_to_id) | |||
| print('_word_segment_to_id: ') | |||
| print(self._word_segment_to_id) | |||
| print('_emo_category_to_id: ') | |||
| print(self._emo_category_to_id) | |||
| print('_speaker_to_id: ') | |||
| print(self._speaker_to_id) | |||
| self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') | |||
| self._cleaners = { | |||
| basic_cleaners.__name__: basic_cleaners, | |||
| transliteration_cleaners.__name__: transliteration_cleaners, | |||
| english_cleaners.__name__: english_cleaners | |||
| } | |||
| def _clean_text(self, text, cleaner_names): | |||
| for name in cleaner_names: | |||
| cleaner = self._cleaners.get(name) | |||
| if not cleaner: | |||
| raise Exception('Unknown cleaner: %s' % name) | |||
| text = cleaner(text) | |||
| return text | |||
| def _sy_to_sequence(self, sy): | |||
| return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)] | |||
| def _arpabet_to_sequence(self, text): | |||
| return self._sy_to_sequence(['@' + s for s in text.split()]) | |||
| def _should_keep_sy(self, s): | |||
| return s in self._sy_to_id and s != '_' and s != '~' | |||
| def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names): | |||
| sequence = [] | |||
| if lfeat_type == 'sy': | |||
| this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') | |||
| this_lfeat_symbol_format = '' | |||
| index = 0 | |||
| while index < len(this_lfeat_symbol): | |||
| this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ | |||
| index] + '}' + ' ' | |||
| index = index + 1 | |||
| sequence = self.text_to_sequence(this_lfeat_symbol_format, | |||
| cleaner_names) | |||
| elif lfeat_type == 'tone': | |||
| sequence = self.tone_to_sequence(this_lfeat_symbol) | |||
| elif lfeat_type == 'syllable_flag': | |||
| sequence = self.syllable_flag_to_sequence(this_lfeat_symbol) | |||
| elif lfeat_type == 'word_segment': | |||
| sequence = self.word_segment_to_sequence(this_lfeat_symbol) | |||
| elif lfeat_type == 'emo_category': | |||
| sequence = self.emo_category_to_sequence(this_lfeat_symbol) | |||
| elif lfeat_type == 'speaker': | |||
| sequence = self.speaker_to_sequence(this_lfeat_symbol) | |||
| else: | |||
| raise Exception('Unknown lfeat type: %s' % lfeat_type) | |||
| return sequence | |||
| def text_to_sequence(self, text, cleaner_names): | |||
| '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. | |||
| The text can optionally have ARPAbet sequences enclosed in curly braces embedded | |||
| in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." | |||
| Args: | |||
| text: string to convert to a sequence | |||
| cleaner_names: names of the cleaner functions to run the text through | |||
| Returns: | |||
| List of integers corresponding to the symbols in the text | |||
| ''' | |||
| sequence = [] | |||
| # Check for curly braces and treat their contents as ARPAbet: | |||
| while len(text): | |||
| m = self._curly_re.match(text) | |||
| if not m: | |||
| sequence += self._sy_to_sequence( | |||
| self._clean_text(text, cleaner_names)) | |||
| break | |||
| sequence += self._sy_to_sequence( | |||
| self._clean_text(m.group(1), cleaner_names)) | |||
| sequence += self._arpabet_to_sequence(m.group(2)) | |||
| text = m.group(3) | |||
| # Append EOS token | |||
| sequence.append(self._sy_to_id['~']) | |||
| return sequence | |||
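To make the curly-brace handling concrete, a tiny illustration (symbol names and the resulting IDs are placeholders; real values come from the dictionaries loaded by load_symbols):

    # Content inside {...} bypasses the cleaners and is looked up with an '@'
    # prefix via _arpabet_to_sequence; text outside the braces is cleaned and
    # mapped symbol by symbol, and the EOS id for '~' is appended at the end.
    ids = symbols_dict.text_to_sequence('{ni} {hao}', ['basic_cleaners'])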
| def tone_to_sequence(self, tone): | |||
| tones = tone.strip().split(' ') | |||
| sequence = [] | |||
| for this_tone in tones: | |||
| sequence.append(self._tone_to_id[this_tone]) | |||
| sequence.append(self._tone_to_id['~']) | |||
| return sequence | |||
| def syllable_flag_to_sequence(self, syllable_flag): | |||
| syllable_flags = syllable_flag.strip().split(' ') | |||
| sequence = [] | |||
| for this_syllable_flag in syllable_flags: | |||
| sequence.append(self._syllable_flag_to_id[this_syllable_flag]) | |||
| sequence.append(self._syllable_flag_to_id['~']) | |||
| return sequence | |||
| def word_segment_to_sequence(self, word_segment): | |||
| word_segments = word_segment.strip().split(' ') | |||
| sequence = [] | |||
| for this_word_segment in word_segments: | |||
| sequence.append(self._word_segment_to_id[this_word_segment]) | |||
| sequence.append(self._word_segment_to_id['~']) | |||
| return sequence | |||
| def emo_category_to_sequence(self, emo_type): | |||
| emo_categories = emo_type.strip().split(' ') | |||
| sequence = [] | |||
| for this_category in emo_categories: | |||
| sequence.append(self._emo_category_to_id[this_category]) | |||
| sequence.append(self._emo_category_to_id['~']) | |||
| return sequence | |||
| def speaker_to_sequence(self, speaker): | |||
| speakers = speaker.strip().split(' ') | |||
| sequence = [] | |||
| for this_speaker in speakers: | |||
| sequence.append(self._speaker_to_id[this_speaker]) | |||
| sequence.append(self._speaker_to_id['~']) | |||
| return sequence | |||
| def sequence_to_symbol(self, sequence): | |||
| result = '' | |||
| pre_lfeat_dim = 0 | |||
| for lfeat_type in self._lfeat_type_list: | |||
| current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim | |||
| + self._inputs_dim[lfeat_type]] | |||
| current_sequence = current_one_hot_sequence.argmax(1) | |||
| length = current_sequence.shape[0] | |||
| index = 0 | |||
| while index < length: | |||
| this_sequence = current_sequence[index] | |||
| s = '' | |||
| if lfeat_type == 'sy': | |||
| s = self._id_to_sy[this_sequence] | |||
| if len(s) > 1 and s[0] == '@': | |||
| s = s[1:] | |||
| elif lfeat_type == 'tone': | |||
| s = self._id_to_tone[this_sequence] | |||
| elif lfeat_type == 'syllable_flag': | |||
| s = self._id_to_syllable_flag[this_sequence] | |||
| elif lfeat_type == 'word_segment': | |||
| s = self._id_to_word_segment[this_sequence] | |||
| elif lfeat_type == 'emo_category': | |||
| s = self._id_to_emo_category[this_sequence] | |||
| elif lfeat_type == 'speaker': | |||
| s = self._id_to_speaker[this_sequence] | |||
| else: | |||
| raise Exception('Unknown lfeat type: %s' % lfeat_type) | |||
| if index == 0: | |||
| result = result + lfeat_type + ': ' | |||
| result = result + '{' + s + '}' | |||
| if index == length - 1: | |||
| result = result + '; ' | |||
| index = index + 1 | |||
| pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type] | |||
| return result | |||
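sequence_to_symbol above renders the concatenated per-feature one-hot matrix back into a readable string, one block per entry in lfeat_type_list; a sketch of the output it produces (the symbols shown are placeholders):

    # For a two-frame input the result looks like:
    # 'sy: {ni}{hao}; tone: {2}{3}; syllable_flag: {..}{..}; ...'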
| @@ -1,286 +1,111 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import pickle as pkl | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| from sklearn.preprocessing import MultiLabelBinarizer | |||
| from modelscope.utils.audio.tts_exceptions import \ | |||
| TtsModelConfigurationException | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from .models import Generator, create_am_model | |||
| from .text.symbols import load_symbols | |||
| from .text.symbols_dict import SymbolsDict | |||
| import tensorflow as tf # isort:skip | |||
| from .models.datasets.units import KanTtsLinguisticUnit | |||
| from .models.models.hifigan import Generator | |||
| from .models.models.sambert import KanTtsSAMBERT | |||
| from .models.utils import (AttrDict, build_env, init_weights, load_checkpoint, | |||
| plot_spectrogram, save_checkpoint, scan_checkpoint) | |||
| MAX_WAV_VALUE = 32768.0 | |||
| def multi_label_symbol_to_sequence(my_classes, my_symbol): | |||
| one_hot = MultiLabelBinarizer(classes=my_classes) | |||
| tokens = my_symbol.strip().split(' ') | |||
| sequences = [] | |||
| for token in tokens: | |||
| sequences.append(tuple(token.split('&'))) | |||
| return one_hot.fit_transform(sequences) | |||
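multi_label_symbol_to_sequence turns '&'-joined labels into one multi-hot row per whitespace-separated token; a small illustration with made-up classes:

    classes = ['neutral', 'happy', 'sad']   # hypothetical emotion categories
    print(multi_label_symbol_to_sequence(classes, 'neutral happy&sad'))
    # -> [[1 0 0]
    #     [0 1 1]]   one row per token, one column per class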
| def load_checkpoint(filepath, device): | |||
| assert os.path.isfile(filepath) | |||
| checkpoint_dict = torch.load(filepath, map_location=device) | |||
| return checkpoint_dict | |||
| class AttrDict(dict): | |||
| def __init__(self, *args, **kwargs): | |||
| super(AttrDict, self).__init__(*args, **kwargs) | |||
| self.__dict__ = self | |||
| class Voice: | |||
| def __init__(self, voice_name, voice_path, am_hparams, voc_config): | |||
| def __init__(self, voice_name, voice_path, am_config, voc_config): | |||
| self.__voice_name = voice_name | |||
| self.__voice_path = voice_path | |||
| self.__am_hparams = tf.contrib.training.HParams(**am_hparams) | |||
| self.__am_config = AttrDict(**am_config) | |||
| self.__voc_config = AttrDict(**voc_config) | |||
| self.__model_loaded = False | |||
| if 'am' not in self.__am_config: | |||
| raise TtsModelConfigurationException( | |||
| 'modelscope error: am configuration invalid') | |||
| if 'linguistic_unit' not in self.__am_config: | |||
| raise TtsModelConfigurationException( | |||
| 'modelscope error: am configuration invalid') | |||
| self.__am_lingustic_unit_config = self.__am_config['linguistic_unit'] | |||
| def __load_am(self): | |||
| local_am_ckpt_path = os.path.join(self.__voice_path, | |||
| ModelFile.TF_CHECKPOINT_FOLDER) | |||
| self.__am_ckpt_path = os.path.join(local_am_ckpt_path, 'ckpt') | |||
| self.__dict_path = os.path.join(self.__voice_path, 'dicts') | |||
| local_am_ckpt_path = os.path.join(self.__voice_path, 'am') | |||
| self.__am_ckpt_path = os.path.join(local_am_ckpt_path, | |||
| ModelFile.TORCH_MODEL_BIN_FILE) | |||
| has_mask = True | |||
| if self.__am_hparams.get('has_mask') is not None: | |||
| has_mask = self.__am_hparams.has_mask | |||
| model_name = 'robutrans' | |||
| self.__lfeat_type_list = self.__am_hparams.lfeat_type_list.strip( | |||
| ).split(',') | |||
| sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols( | |||
| self.__dict_path, has_mask) | |||
| self.__sy = sy | |||
| self.__tone = tone | |||
| self.__syllable_flag = syllable_flag | |||
| self.__word_segment = word_segment | |||
| self.__emo_category = emo_category | |||
| self.__speaker = speaker | |||
| self.__inputs_dim = dict() | |||
| for lfeat_type in self.__lfeat_type_list: | |||
| if lfeat_type == 'sy': | |||
| self.__inputs_dim[lfeat_type] = len(sy) | |||
| elif lfeat_type == 'tone': | |||
| self.__inputs_dim[lfeat_type] = len(tone) | |||
| elif lfeat_type == 'syllable_flag': | |||
| self.__inputs_dim[lfeat_type] = len(syllable_flag) | |||
| elif lfeat_type == 'word_segment': | |||
| self.__inputs_dim[lfeat_type] = len(word_segment) | |||
| elif lfeat_type == 'emo_category': | |||
| self.__inputs_dim[lfeat_type] = len(emo_category) | |||
| elif lfeat_type == 'speaker': | |||
| self.__inputs_dim[lfeat_type] = len(speaker) | |||
| self.__symbols_dict = SymbolsDict(sy, tone, syllable_flag, | |||
| word_segment, emo_category, speaker, | |||
| self.__inputs_dim, | |||
| self.__lfeat_type_list) | |||
| dim_inputs = sum(self.__inputs_dim.values( | |||
| )) - self.__inputs_dim['speaker'] - self.__inputs_dim['emo_category'] | |||
| self.__graph = tf.Graph() | |||
| with self.__graph.as_default(): | |||
| inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], | |||
| 'inputs') | |||
| inputs_emotion = tf.placeholder( | |||
| tf.float32, [1, None, self.__inputs_dim['emo_category']], | |||
| 'inputs_emotion') | |||
| inputs_speaker = tf.placeholder( | |||
| tf.float32, [1, None, self.__inputs_dim['speaker']], | |||
| 'inputs_speaker') | |||
| input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') | |||
| pitch_contours_scale = tf.placeholder(tf.float32, [1, None], | |||
| 'pitch_contours_scale') | |||
| energy_contours_scale = tf.placeholder(tf.float32, [1, None], | |||
| 'energy_contours_scale') | |||
| duration_scale = tf.placeholder(tf.float32, [1, None], | |||
| 'duration_scale') | |||
| with tf.variable_scope('model') as _: | |||
| self.__model = create_am_model(model_name, self.__am_hparams) | |||
| self.__model.initialize( | |||
| inputs, | |||
| inputs_emotion, | |||
| inputs_speaker, | |||
| input_lengths, | |||
| duration_scales=duration_scale, | |||
| pitch_scales=pitch_contours_scale, | |||
| energy_scales=energy_contours_scale) | |||
| self.__mel_spec = self.__model.mel_outputs[0] | |||
| self.__duration_outputs = self.__model.duration_outputs[0] | |||
| self.__duration_outputs_ = self.__model.duration_outputs_[0] | |||
| self.__pitch_contour_outputs = self.__model.pitch_contour_outputs[ | |||
| 0] | |||
| self.__energy_contour_outputs = self.__model.energy_contour_outputs[ | |||
| 0] | |||
| self.__embedded_inputs_emotion = self.__model.embedded_inputs_emotion[ | |||
| 0] | |||
| self.__embedding_fsmn_outputs = self.__model.embedding_fsmn_outputs[ | |||
| 0] | |||
| self.__encoder_outputs = self.__model.encoder_outputs[0] | |||
| self.__pitch_embeddings = self.__model.pitch_embeddings[0] | |||
| self.__energy_embeddings = self.__model.energy_embeddings[0] | |||
| self.__LR_outputs = self.__model.LR_outputs[0] | |||
| self.__postnet_fsmn_outputs = self.__model.postnet_fsmn_outputs[ | |||
| 0] | |||
| self.__attention_h = self.__model.attention_h | |||
| self.__attention_x = self.__model.attention_x | |||
| config = tf.ConfigProto() | |||
| config.gpu_options.allow_growth = True | |||
| self.__session = tf.Session(config=config) | |||
| self.__session.run(tf.global_variables_initializer()) | |||
| saver = tf.train.Saver() | |||
| saver.restore(self.__session, self.__am_ckpt_path) | |||
| if 'has_mask' in self.__am_lingustic_unit_config: | |||
| has_mask = self.__am_lingustic_unit_config.has_mask | |||
| self.__ling_unit = KanTtsLinguisticUnit( | |||
| self.__am_lingustic_unit_config, self.__voice_path, has_mask) | |||
| self.__am_net = KanTtsSAMBERT(self.__am_config, | |||
| self.__ling_unit.get_unit_size()).to( | |||
| self.__device) | |||
| state_dict_g = {} | |||
| try: | |||
| state_dict_g = load_checkpoint(self.__am_ckpt_path, self.__device) | |||
| except RuntimeError: | |||
| with open(self.__am_ckpt_path, 'rb') as f: | |||
| pth_var_dict = pkl.load(f) | |||
| state_dict_g['fsnet'] = { | |||
| k: torch.FloatTensor(v) | |||
| for k, v in pth_var_dict['fsnet'].items() | |||
| } | |||
| self.__am_net.load_state_dict(state_dict_g['fsnet'], strict=False) | |||
| self.__am_net.eval() | |||
| def __load_vocoder(self): | |||
| self.__voc_ckpt_path = os.path.join(self.__voice_path, | |||
| local_voc_ckpt_path = os.path.join(self.__voice_path, 'vocoder') | |||
| self.__voc_ckpt_path = os.path.join(local_voc_ckpt_path, | |||
| ModelFile.TORCH_MODEL_BIN_FILE) | |||
| if torch.cuda.is_available(): | |||
| torch.manual_seed(self.__voc_config.seed) | |||
| self.__device = torch.device('cuda') | |||
| else: | |||
| self.__device = torch.device('cpu') | |||
| self.__generator = Generator(self.__voc_config).to(self.__device) | |||
| state_dict_g = load_checkpoint(self.__voc_ckpt_path, self.__device) | |||
| self.__generator.load_state_dict(state_dict_g['generator']) | |||
| self.__generator.eval() | |||
| self.__generator.remove_weight_norm() | |||
| def __am_forward(self, | |||
| text, | |||
| pitch_control_str='', | |||
| duration_control_str='', | |||
| energy_control_str=''): | |||
| duration_cfg_lst = [] | |||
| if len(duration_control_str) != 0: | |||
| for item in duration_control_str.strip().split('|'): | |||
| percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
| duration_cfg_lst.append((float(percent), float(scale))) | |||
| pitch_contours_cfg_lst = [] | |||
| if len(pitch_control_str) != 0: | |||
| for item in pitch_control_str.strip().split('|'): | |||
| percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
| pitch_contours_cfg_lst.append((float(percent), float(scale))) | |||
| energy_contours_cfg_lst = [] | |||
| if len(energy_control_str) != 0: | |||
| for item in energy_control_str.strip().split('|'): | |||
| percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
| energy_contours_cfg_lst.append((float(percent), float(scale))) | |||
| cleaner_names = [ | |||
| x.strip() for x in self.__am_hparams.cleaners.split(',') | |||
| ] | |||
| lfeat_symbol = text.strip().split(' ') | |||
| lfeat_symbol_separate = [''] * int(len(self.__lfeat_type_list)) | |||
| for this_lfeat_symbol in lfeat_symbol: | |||
| this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( | |||
| '$') | |||
| if len(this_lfeat_symbol) != len(self.__lfeat_type_list): | |||
| raise Exception( | |||
| 'Length of this_lfeat_symbol in training data' | |||
| + ' is not equal to the length of lfeat_type_list, ' | |||
| + str(len(this_lfeat_symbol)) + ' VS. ' | |||
| + str(len(self.__lfeat_type_list))) | |||
| index = 0 | |||
| while index < len(lfeat_symbol_separate): | |||
| lfeat_symbol_separate[index] = lfeat_symbol_separate[ | |||
| index] + this_lfeat_symbol[index] + ' ' | |||
| index = index + 1 | |||
| index = 0 | |||
| lfeat_type = self.__lfeat_type_list[index] | |||
| sequence = self.__symbols_dict.symbol_to_sequence( | |||
| lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names) | |||
| sequence_array = np.asarray( | |||
| sequence[:-1], | |||
| dtype=np.int32) # sequence length minus 1 to ignore EOS ~ | |||
| inputs = np.eye( | |||
| self.__inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] | |||
| index = index + 1 | |||
| while index < len(self.__lfeat_type_list) - 2: | |||
| lfeat_type = self.__lfeat_type_list[index] | |||
| sequence = self.__symbols_dict.symbol_to_sequence( | |||
| lfeat_symbol_separate[index].strip(), lfeat_type, | |||
| cleaner_names) | |||
| sequence_array = np.asarray( | |||
| sequence[:-1], | |||
| dtype=np.int32) # sequence length minus 1 to ignore EOS ~ | |||
| inputs_temp = np.eye( | |||
| self.__inputs_dim[lfeat_type], | |||
| dtype=np.float32)[sequence_array] | |||
| inputs = np.concatenate((inputs, inputs_temp), axis=1) | |||
| index = index + 1 | |||
| seq = inputs | |||
| lfeat_type = 'emo_category' | |||
| inputs_emotion = multi_label_symbol_to_sequence( | |||
| self.__emo_category, lfeat_symbol_separate[index].strip()) | |||
| # inputs_emotion = inputs_emotion * 1.5 | |||
| index = index + 1 | |||
| lfeat_type = 'speaker' | |||
| inputs_speaker = multi_label_symbol_to_sequence( | |||
| self.__speaker, lfeat_symbol_separate[index].strip()) | |||
| duration_scale = np.ones((len(seq), ), dtype=np.float32) | |||
| start_idx = 0 | |||
| for (percent, scale) in duration_cfg_lst: | |||
| duration_scale[start_idx:start_idx | |||
| + int(percent * len(seq))] = scale | |||
| start_idx += int(percent * len(seq)) | |||
| pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32) | |||
| start_idx = 0 | |||
| for (percent, scale) in pitch_contours_cfg_lst: | |||
| pitch_contours_scale[start_idx:start_idx | |||
| + int(percent * len(seq))] = scale | |||
| start_idx += int(percent * len(seq)) | |||
| energy_contours_scale = np.ones((len(seq), ), dtype=np.float32) | |||
| start_idx = 0 | |||
| for (percent, scale) in energy_contours_cfg_lst: | |||
| energy_contours_scale[start_idx:start_idx | |||
| + int(percent * len(seq))] = scale | |||
| start_idx += int(percent * len(seq)) | |||
| feed_dict = { | |||
| self.__model.inputs: [np.asarray(seq, dtype=np.float32)], | |||
| self.__model.inputs_emotion: | |||
| [np.asarray(inputs_emotion, dtype=np.float32)], | |||
| self.__model.inputs_speaker: | |||
| [np.asarray(inputs_speaker, dtype=np.float32)], | |||
| self.__model.input_lengths: | |||
| np.asarray([len(seq)], dtype=np.int32), | |||
| self.__model.duration_scales: [duration_scale], | |||
| self.__model.pitch_scales: [pitch_contours_scale], | |||
| self.__model.energy_scales: [energy_contours_scale] | |||
| } | |||
| result = self.__session.run([ | |||
| self.__mel_spec, self.__duration_outputs, self.__duration_outputs_, | |||
| self.__pitch_contour_outputs, self.__embedded_inputs_emotion, | |||
| self.__embedding_fsmn_outputs, self.__encoder_outputs, | |||
| self.__pitch_embeddings, self.__LR_outputs, | |||
| self.__postnet_fsmn_outputs, self.__energy_contour_outputs, | |||
| self.__energy_embeddings, self.__attention_x, self.__attention_h | |||
| ], feed_dict=feed_dict) # yapf:disable | |||
| return result[0] | |||
| def __am_forward(self, symbol_seq): | |||
| with torch.no_grad(): | |||
| inputs_feat_lst = self.__ling_unit.encode_symbol_sequence( | |||
| symbol_seq) | |||
| inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to( | |||
| self.__device) | |||
| inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to( | |||
| self.__device) | |||
| inputs_syllable = torch.from_numpy(inputs_feat_lst[2]).long().to( | |||
| self.__device) | |||
| inputs_ws = torch.from_numpy(inputs_feat_lst[3]).long().to( | |||
| self.__device) | |||
| inputs_ling = torch.stack( | |||
| [inputs_sy, inputs_tone, inputs_syllable, inputs_ws], | |||
| dim=-1).unsqueeze(0) | |||
| inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to( | |||
| self.__device).unsqueeze(0) | |||
| inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to( | |||
| self.__device).unsqueeze(0) | |||
| inputs_len = torch.zeros(1).to(self.__device).long( | |||
| ) + inputs_emo.size(1) - 1 # minus 1 for "~" | |||
| res = self.__am_net(inputs_ling[:, :-1, :], inputs_emo[:, :-1], | |||
| inputs_spk[:, :-1], inputs_len) | |||
| postnet_outputs = res['postnet_outputs'] | |||
| LR_length_rounded = res['LR_length_rounded'] | |||
| valid_length = int(LR_length_rounded[0].item()) | |||
| postnet_outputs = postnet_outputs[ | |||
| 0, :valid_length, :].cpu().numpy() | |||
| return postnet_outputs | |||
| def __vocoder_forward(self, melspec): | |||
| dim0 = list(melspec.shape)[-1] | |||
| if dim0 != self.__voc_config.num_mels: | |||
| raise TtsVocoderMelspecShapeMismatchException( | |||
| 'input melspec mismatch require {} but {}'.format( | |||
| self.__voc_config.num_mels, dim0)) | |||
| 'modelscope error: input melspec mismatch require {} but {}'. | |||
| format(self.__voc_config.num_mels, dim0)) | |||
| with torch.no_grad(): | |||
| x = melspec.T | |||
| x = torch.FloatTensor(x).to(self.__device) | |||
| @@ -292,9 +117,15 @@ class Voice: | |||
| audio = audio.cpu().numpy().astype('int16') | |||
| return audio | |||
| def forward(self, text): | |||
| def forward(self, symbol_seq): | |||
| if not self.__model_loaded: | |||
| torch.manual_seed(self.__am_config.seed) | |||
| if torch.cuda.is_available(): | |||
| torch.manual_seed(self.__am_config.seed) | |||
| self.__device = torch.device('cuda') | |||
| else: | |||
| self.__device = torch.device('cpu') | |||
| self.__load_am() | |||
| self.__load_vocoder() | |||
| self.__model_loaded = True | |||
| return self.__vocoder_forward(self.__am_forward(text)) | |||
| return self.__vocoder_forward(self.__am_forward(symbol_seq)) | |||
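With the lazy loading in forward above, a Voice is cheap to construct and only builds its acoustic model and vocoder on the first synthesis call; a hedged usage sketch (paths, configs and the symbol sequence are placeholders):

    voice = Voice('zhitian_emo', '/path/to/voices/zhitian_emo', am_config, voc_config)
    pcm = voice.forward(symbol_seq)  # first call loads AM + vocoder; later calls reuse them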
| @@ -1,3 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, List | |||
| import numpy as np | |||
| @@ -42,3 +44,6 @@ class TextToSpeechSambertHifiganPipeline(Pipeline): | |||
| def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: | |||
| return inputs | |||
| def _sanitize_parameters(self, **pipeline_parameters): | |||
| return {}, pipeline_parameters, {} | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| """ | |||
| Define TTS exceptions | |||
| """ | |||
| @@ -10,7 +11,7 @@ class TtsException(Exception): | |||
| pass | |||
| class TtsModelConfigurationExcetion(TtsException): | |||
| class TtsModelConfigurationException(TtsException): | |||
| """ | |||
| TTS model configuration exceptions. | |||
| """ | |||
| @@ -1,6 +1,5 @@ | |||
| easyasr>=0.0.2 | |||
| espnet>=202204 | |||
| #tts | |||
| h5py | |||
| inflect | |||
| keras | |||
| @@ -15,11 +14,7 @@ nltk | |||
| numpy<=1.18 | |||
| # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. | |||
| protobuf>3,<3.21.0 | |||
| ptflops | |||
| py_sound_connect | |||
| pytorch_wavelets | |||
| PyWavelets>=1.0.0 | |||
| scikit-learn | |||
| SoundFile>0.10 | |||
| sox | |||
| torchaudio | |||
| @@ -9,6 +9,7 @@ import unittest | |||
| import torch | |||
| from scipy.io.wavfile import write | |||
| from modelscope.models import Model | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.utils.constant import Tasks | |||
| @@ -33,7 +34,9 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, | |||
| text = '今天北京天气怎么样?' | |||
| voice = 'zhitian_emo' | |||
| sambert_hifigan_tts = pipeline(task=self.task, model=self.model_id) | |||
| model = Model.from_pretrained( | |||
| model_name_or_path=self.model_id, revision='pytorch_am') | |||
| sambert_hifigan_tts = pipeline(task=self.task, model=model) | |||
| self.assertTrue(sambert_hifigan_tts is not None) | |||
| output = sambert_hifigan_tts(input=text, voice=voice) | |||
| self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) | |||
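For completeness, the PCM returned by the pipeline can be written out with the scipy import already used by this test; this assumes OUTPUT_PCM holds the int16 samples produced by the vocoder and that the 16 kHz variant of the model is used:

    write('output.wav', 16000, output[OutputKeys.OUTPUT_PCM])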