| @@ -24,6 +24,7 @@ wheels/ | |||
| .installed.cfg | |||
| *.egg | |||
| /package | |||
| /temp | |||
| MANIFEST | |||
| # PyInstaller | |||
| @@ -123,3 +124,7 @@ replace.sh | |||
| # Pytorch | |||
| *.pth | |||
| # audio | |||
| *.wav | |||
| @@ -29,3 +29,15 @@ reference: [https://huggingface.co/docs/tokenizers/installation#installation-fro | |||
| > ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. | |||
| Version conflicts can occur because some dependency packages require incompatible versions; in most cases this does not affect normal operation. | |||
| ### 3. Version error when installing PyTorch | |||
| > ERROR: Ignored the following versions that require a different python version: 1.1.0 Requires-Python >=3.8; 1.1.0rc1 Requires-Python >=3.8; 1.1.1 Requires-Python >=3.8 | |||
| > ERROR: Could not find a version that satisfies the requirement torch==1.8.1+cu111 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0) | |||
| > ERROR: No matching distribution found for torch==1.8.1+cu111 | |||
| Use the following command when installing: | |||
| ```shell | |||
| pip install -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt | |||
| ``` | |||
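| If only the PyTorch wheel fails to resolve, a minimal alternative sketch (assuming the CUDA 11.1 build from the same official wheel index is wanted) is to install it on its own before the remaining requirements: | |||
| ```shell | |||
| pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html | |||
| ``` | |||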
| @@ -25,6 +25,10 @@ ModelScope Library目前支持tensorflow,pytorch两大深度学习框架进行 | |||
| * [Pytorch安装指导](https://pytorch.org/get-started/locally/) | |||
| * [Tensorflow安装指导](https://www.tensorflow.org/install/pip) | |||
| Some third-party dependencies require numpy to be installed in advance: | |||
| ``` | |||
| pip install numpy | |||
| ``` | |||
| ## ModelScope library 安装 | |||
| @@ -1,5 +1,7 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .audio.tts.am import SambertNetHifi16k | |||
| from .audio.tts.vocoder import Hifigan16k | |||
| from .base import Model | |||
| from .builder import MODELS, build_model | |||
| from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity | |||
| @@ -0,0 +1 @@ | |||
| from .sambert_hifi_16k import * # noqa F403 | |||
| @@ -0,0 +1,8 @@ | |||
| from .robutrans import RobuTrans | |||
| def create_model(name, hparams): | |||
| if name == 'robutrans': | |||
| return RobuTrans(hparams) | |||
| else: | |||
| raise Exception('Unknown model: ' + name) | |||
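| # Usage sketch (illustrative; `hparams` is assumed to carry the RobuTrans hyperparameters): | |||
| #   model = create_model('robutrans', hparams) | |||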
| @@ -0,0 +1,82 @@ | |||
| """Functions for compatibility with different TensorFlow versions.""" | |||
| import tensorflow as tf | |||
| def is_tf2(): | |||
| """Returns ``True`` if running TensorFlow 2.0.""" | |||
| return tf.__version__.startswith('2') | |||
| def tf_supports(symbol): | |||
| """Returns ``True`` if TensorFlow defines :obj:`symbol`.""" | |||
| return _string_to_tf_symbol(symbol) is not None | |||
| def tf_any(*symbols): | |||
| """Returns the first supported symbol.""" | |||
| for symbol in symbols: | |||
| module = _string_to_tf_symbol(symbol) | |||
| if module is not None: | |||
| return module | |||
| return None | |||
| def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name | |||
| """Returns the compatible symbol based on the current TensorFlow version. | |||
| Args: | |||
| v2: The candidate v2 symbol name. | |||
| v1: The candidate v1 symbol name. | |||
| Returns: | |||
| A TensorFlow symbol. | |||
| Raises: | |||
| ValueError: if no symbol can be found. | |||
| """ | |||
| candidates = [] | |||
| if v2 is not None: | |||
| candidates.append(v2) | |||
| if v1 is not None: | |||
| candidates.append(v1) | |||
| candidates.append('compat.v1.%s' % v1) | |||
| symbol = tf_any(*candidates) | |||
| if symbol is None: | |||
| raise ValueError('Failure to resolve the TensorFlow symbol') | |||
| return symbol | |||
| def name_from_variable_scope(name=''): | |||
| """Creates a name prefixed by the current variable scope.""" | |||
| var_scope = tf_compat(v1='get_variable_scope')().name | |||
| compat_name = '' | |||
| if name: | |||
| compat_name = '%s/' % name | |||
| if var_scope: | |||
| compat_name = '%s/%s' % (var_scope, compat_name) | |||
| return compat_name | |||
| def reuse(): | |||
| """Returns ``True`` if the current variable scope is marked for reuse.""" | |||
| return tf_compat(v1='get_variable_scope')().reuse | |||
| def _string_to_tf_symbol(symbol): | |||
| modules = symbol.split('.') | |||
| namespace = tf | |||
| for module in modules: | |||
| namespace = getattr(namespace, module, None) | |||
| if namespace is None: | |||
| return None | |||
| return namespace | |||
| # pylint: disable=invalid-name | |||
| gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy') | |||
| gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists') | |||
| gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile') | |||
| is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor') | |||
| logging = tf_compat(v1='logging') | |||
| nest = tf_compat(v2='nest', v1='contrib.framework.nest') | |||
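| # Illustrative usage: the resolved symbols behave like their TF1/TF2 counterparts, | |||
| # e.g. `with gfile_open(path) as f: data = f.read()` works under both versions. | |||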
| @@ -0,0 +1,273 @@ | |||
| import tensorflow as tf | |||
| def build_sequence_mask(sequence_length, | |||
| maximum_length=None, | |||
| dtype=tf.float32): | |||
| """Builds the dot product mask. | |||
| Args: | |||
| sequence_length: The sequence length. | |||
| maximum_length: Optional size of the returned time dimension. Otherwise | |||
| it is the maximum of :obj:`sequence_length`. | |||
| dtype: The type of the mask tensor. | |||
| Returns: | |||
| A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape | |||
| ``[batch_size, max_length]``. | |||
| """ | |||
| mask = tf.sequence_mask( | |||
| sequence_length, maxlen=maximum_length, dtype=dtype) | |||
| return mask | |||
| def norm(inputs): | |||
| """Layer normalizes :obj:`inputs`.""" | |||
| return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) | |||
| def pad_in_time(x, padding_shape): | |||
| """Helper function to pad a tensor in the time dimension and retain the static depth dimension. | |||
| Args: | |||
| x: [Batch, Time, Frequency] | |||
| padding_shape: [pad_before, pad_after] zero padding applied to the time dimension | |||
| Returns: | |||
| padded x | |||
| """ | |||
| depth = x.get_shape().as_list()[-1] | |||
| x = tf.pad(x, [[0, 0], padding_shape, [0, 0]]) | |||
| x.set_shape((None, None, depth)) | |||
| return x | |||
| def pad_in_time_right(x, padding_length): | |||
| """Helper function to pad a tensor in the time dimension and retain the static depth dimension. | |||
| Args: | |||
| x: [Batch, Time, Frequency] | |||
| padding_length: amount of zero padding appended after the time dimension | |||
| Returns: | |||
| padded x | |||
| """ | |||
| depth = x.get_shape().as_list()[-1] | |||
| x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) | |||
| x.set_shape((None, None, depth)) | |||
| return x | |||
| def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0): | |||
| """Implements the Transformer's "Feed Forward" layer. | |||
| .. math:: | |||
| ffn(x) = max(0, x*W_1 + b_1)*W_2 | |||
| Args: | |||
| x: The input. | |||
| ffn_dim: The number of units of the nonlinear transformation. | |||
| memory_units: The number of units of the output linear transformation. | |||
| mode: A ``tf.estimator.ModeKeys`` mode. | |||
| dropout: The probability to drop units from the inner transformation. | |||
| Returns: | |||
| The transformed input. | |||
| """ | |||
| inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu) | |||
| inner = tf.layers.dropout( | |||
| inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN) | |||
| outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False) | |||
| return outer | |||
| def drop_and_add(inputs, outputs, mode, dropout=0.0): | |||
| """Drops units in the outputs and adds the previous values. | |||
| Args: | |||
| inputs: The input of the previous layer. | |||
| outputs: The output of the previous layer. | |||
| mode: A ``tf.estimator.ModeKeys`` mode. | |||
| dropout: The probability to drop units in :obj:`outputs`. | |||
| Returns: | |||
| The residual and normalized output. | |||
| """ | |||
| outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) | |||
| input_dim = inputs.get_shape().as_list()[-1] | |||
| output_dim = outputs.get_shape().as_list()[-1] | |||
| if input_dim == output_dim: | |||
| outputs += inputs | |||
| return outputs | |||
| def MemoryBlock( | |||
| inputs, | |||
| filter_size, | |||
| mode, | |||
| mask=None, | |||
| dropout=0.0, | |||
| ): | |||
| """ | |||
| Define the bidirectional memory block in FSMN | |||
| Args: | |||
| inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
| filter_size: memory block filter size | |||
| mode: Training or Evaluation | |||
| mask: A ``tf.Tensor`` applied to the memory block output | |||
| Returns: | |||
| output: 3-D tensor ([Batch, Time, Frequency]) | |||
| """ | |||
| static_shape = inputs.get_shape().as_list() | |||
| depth = static_shape[-1] | |||
| inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency] | |||
| depthwise_filter = tf.get_variable( | |||
| 'depth_conv_w', | |||
| shape=[1, filter_size, depth, 1], | |||
| initializer=tf.glorot_uniform_initializer(), | |||
| dtype=tf.float32) | |||
| memory = tf.nn.depthwise_conv2d( | |||
| input=inputs, | |||
| filter=depthwise_filter, | |||
| strides=[1, 1, 1, 1], | |||
| padding='SAME', | |||
| rate=[1, 1], | |||
| data_format='NHWC') | |||
| memory = memory + inputs | |||
| output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
| output = tf.reshape( | |||
| output, | |||
| [tf.shape(output)[0], tf.shape(output)[2], depth]) | |||
| if mask is not None: | |||
| output = output * tf.expand_dims(mask, -1) | |||
| return output | |||
| def MemoryBlockV2( | |||
| inputs, | |||
| filter_size, | |||
| mode, | |||
| shift=0, | |||
| mask=None, | |||
| dropout=0.0, | |||
| ): | |||
| """ | |||
| Define the bidirectional memory block in FSMN | |||
| Args: | |||
| inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
| filter_size: memory block filter size | |||
| mode: Training or Evaluation | |||
| shift: left padding, to control delay | |||
| mask: A ``tf.Tensor`` applied to the memory block output | |||
| Returns: | |||
| output: 3-D tensor ([Batch, Time, Frequency]) | |||
| """ | |||
| if mask is not None: | |||
| inputs = inputs * tf.expand_dims(mask, -1) | |||
| static_shape = inputs.get_shape().as_list() | |||
| depth = static_shape[-1] | |||
| # padding | |||
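| # split (filter_size - 1) between look-back and look-ahead; a positive shift | |||
| # moves the window left, trading future context for lower latency | |||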
| left_padding = int(round((filter_size - 1) / 2)) | |||
| right_padding = int((filter_size - 1) / 2) | |||
| if shift > 0: | |||
| left_padding = left_padding + shift | |||
| right_padding = right_padding - shift | |||
| pad_inputs = pad_in_time(inputs, [left_padding, right_padding]) | |||
| pad_inputs = tf.expand_dims( | |||
| pad_inputs, axis=1) # [Batch, 1, Time, Frequency] | |||
| depthwise_filter = tf.get_variable( | |||
| 'depth_conv_w', | |||
| shape=[1, filter_size, depth, 1], | |||
| initializer=tf.glorot_uniform_initializer(), | |||
| dtype=tf.float32) | |||
| memory = tf.nn.depthwise_conv2d( | |||
| input=pad_inputs, | |||
| filter=depthwise_filter, | |||
| strides=[1, 1, 1, 1], | |||
| padding='VALID', | |||
| rate=[1, 1], | |||
| data_format='NHWC') | |||
| memory = tf.reshape( | |||
| memory, | |||
| [tf.shape(memory)[0], tf.shape(memory)[2], depth]) | |||
| memory = memory + inputs | |||
| output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
| if mask is not None: | |||
| output = output * tf.expand_dims(mask, -1) | |||
| return output | |||
| def UniMemoryBlock( | |||
| inputs, | |||
| filter_size, | |||
| mode, | |||
| cache=None, | |||
| mask=None, | |||
| dropout=0.0, | |||
| ): | |||
| """ | |||
| Define the unidirectional memory block in FSMN | |||
| Args: | |||
| inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
| filter_size: memory block filter size | |||
| cache: for streaming inference | |||
| mode: Training or Evaluation | |||
| mask: A ``tf.Tensor`` applied to the memory block output | |||
| dropout: dropout factor | |||
| Returns: | |||
| output: 3-D tensor ([Batch, Time, Frequency]) | |||
| """ | |||
| if cache is not None: | |||
| static_shape = cache['queries'].get_shape().as_list() | |||
| depth = static_shape[-1] | |||
| queries = tf.slice(cache['queries'], [0, 1, 0], [ | |||
| tf.shape(cache['queries'])[0], | |||
| tf.shape(cache['queries'])[1] - 1, depth | |||
| ]) | |||
| queries = tf.concat([queries, inputs], axis=1) | |||
| cache['queries'] = queries | |||
| else: | |||
| padding_length = filter_size - 1 | |||
| queries = pad_in_time(inputs, [padding_length, 0]) | |||
| queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency] | |||
| static_shape = queries.get_shape().as_list() | |||
| depth = static_shape[-1] | |||
| depthwise_filter = tf.get_variable( | |||
| 'depth_conv_w', | |||
| shape=[1, filter_size, depth, 1], | |||
| initializer=tf.glorot_uniform_initializer(), | |||
| dtype=tf.float32) | |||
| memory = tf.nn.depthwise_conv2d( | |||
| input=queries, | |||
| filter=depthwise_filter, | |||
| strides=[1, 1, 1, 1], | |||
| padding='VALID', | |||
| rate=[1, 1], | |||
| data_format='NHWC') | |||
| memory = tf.reshape( | |||
| memory, | |||
| [tf.shape(memory)[0], tf.shape(memory)[2], depth]) | |||
| memory = memory + inputs | |||
| output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
| if mask is not None: | |||
| output = output * tf.expand_dims(mask, -1) | |||
| return output | |||
| @@ -0,0 +1,178 @@ | |||
| import tensorflow as tf | |||
| from . import fsmn | |||
| class FsmnEncoder(): | |||
| """Encoder using Fsmn | |||
| """ | |||
| def __init__(self, | |||
| filter_size, | |||
| fsmn_num_layers, | |||
| dnn_num_layers, | |||
| num_memory_units=512, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.0, | |||
| position_encoder=None): | |||
| """Initializes the parameters of the encoder. | |||
| Args: | |||
| filter_size: the total order of memory block | |||
| fsmn_num_layers: The number of fsmn layers. | |||
| dnn_num_layers: The number of dnn layers | |||
| num_memory_units: The number of memory units. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| """ | |||
| super(FsmnEncoder, self).__init__() | |||
| self.filter_size = filter_size | |||
| self.fsmn_num_layers = fsmn_num_layers | |||
| self.dnn_num_layers = dnn_num_layers | |||
| self.num_memory_units = num_memory_units | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.position_encoder = position_encoder | |||
| def encode(self, inputs, sequence_length=None, mode=True): | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder(inputs) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| mask = fsmn.build_sequence_mask( | |||
| sequence_length, maximum_length=tf.shape(inputs)[1]) | |||
| state = () | |||
| for layer in range(self.fsmn_num_layers): | |||
| with tf.variable_scope('fsmn_layer_{}'.format(layer)): | |||
| with tf.variable_scope('ffn'): | |||
| context = fsmn.feed_forward( | |||
| inputs, | |||
| self.ffn_inner_dim, | |||
| self.num_memory_units, | |||
| mode, | |||
| dropout=self.dropout) | |||
| with tf.variable_scope('memory'): | |||
| memory = fsmn.MemoryBlock( | |||
| context, | |||
| self.filter_size, | |||
| mode, | |||
| mask=mask, | |||
| dropout=self.dropout) | |||
| memory = fsmn.drop_and_add( | |||
| inputs, memory, mode, dropout=self.dropout) | |||
| inputs = memory | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| for layer in range(self.dnn_num_layers): | |||
| with tf.variable_scope('dnn_layer_{}'.format(layer)): | |||
| transformed = fsmn.feed_forward( | |||
| inputs, | |||
| self.ffn_inner_dim, | |||
| self.num_memory_units, | |||
| mode, | |||
| dropout=self.dropout) | |||
| inputs = transformed | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| outputs = inputs | |||
| return (outputs, state, sequence_length) | |||
| class FsmnEncoderV2(): | |||
| """Encoder using Fsmn | |||
| """ | |||
| def __init__(self, | |||
| filter_size, | |||
| fsmn_num_layers, | |||
| dnn_num_layers, | |||
| num_memory_units=512, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.0, | |||
| shift=0, | |||
| position_encoder=None): | |||
| """Initializes the parameters of the encoder. | |||
| Args: | |||
| filter_size: the total order of memory block | |||
| fsmn_num_layers: The number of fsmn layers. | |||
| dnn_num_layers: The number of dnn layers | |||
| num_memory_units: The number of memory units. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| shift: left padding, to control delay | |||
| position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| """ | |||
| super(FsmnEncoderV2, self).__init__() | |||
| self.filter_size = filter_size | |||
| self.fsmn_num_layers = fsmn_num_layers | |||
| self.dnn_num_layers = dnn_num_layers | |||
| self.num_memory_units = num_memory_units | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.shift = shift | |||
| if not isinstance(shift, list): | |||
| self.shift = [shift for _ in range(self.fsmn_num_layers)] | |||
| self.position_encoder = position_encoder | |||
| def encode(self, inputs, sequence_length=None, mode=True): | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder(inputs) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| mask = fsmn.build_sequence_mask( | |||
| sequence_length, maximum_length=tf.shape(inputs)[1]) | |||
| state = () | |||
| for layer in range(self.fsmn_num_layers): | |||
| with tf.variable_scope('fsmn_layer_{}'.format(layer)): | |||
| with tf.variable_scope('ffn'): | |||
| context = fsmn.feed_forward( | |||
| inputs, | |||
| self.ffn_inner_dim, | |||
| self.num_memory_units, | |||
| mode, | |||
| dropout=self.dropout) | |||
| with tf.variable_scope('memory'): | |||
| memory = fsmn.MemoryBlockV2( | |||
| context, | |||
| self.filter_size, | |||
| mode, | |||
| shift=self.shift[layer], | |||
| mask=mask, | |||
| dropout=self.dropout) | |||
| memory = fsmn.drop_and_add( | |||
| inputs, memory, mode, dropout=self.dropout) | |||
| inputs = memory | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| for layer in range(self.dnn_num_layers): | |||
| with tf.variable_scope('dnn_layer_{}'.format(layer)): | |||
| transformed = fsmn.feed_forward( | |||
| inputs, | |||
| self.ffn_inner_dim, | |||
| self.num_memory_units, | |||
| mode, | |||
| dropout=self.dropout) | |||
| inputs = transformed | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| outputs = inputs | |||
| return (outputs, state, sequence_length) | |||
| @@ -0,0 +1,160 @@ | |||
| import numpy as np | |||
| import tensorflow as tf | |||
| from tensorflow.contrib.seq2seq import Helper | |||
| class VarTestHelper(Helper): | |||
| def __init__(self, batch_size, inputs, dim): | |||
| with tf.name_scope('VarTestHelper'): | |||
| self._batch_size = batch_size | |||
| self._inputs = inputs | |||
| self._dim = dim | |||
| num_steps = tf.shape(self._inputs)[1] | |||
| self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
| self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
| self._init_inputs = inputs[:, 0, :] | |||
| @property | |||
| def batch_size(self): | |||
| return self._batch_size | |||
| @property | |||
| def sample_ids_shape(self): | |||
| return tf.TensorShape([]) | |||
| @property | |||
| def sample_ids_dtype(self): | |||
| return np.int32 | |||
| def initialize(self, name=None): | |||
| return (tf.tile([False], [self._batch_size]), | |||
| _go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
| def sample(self, time, outputs, state, name=None): | |||
| return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
| def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
| with tf.name_scope('VarTestHelper'): | |||
| finished = (time + 1 >= self._lengths) | |||
| next_inputs = tf.concat([outputs, self._inputs[:, time, :]], | |||
| axis=-1) | |||
| return (finished, next_inputs, state) | |||
| class VarTrainingHelper(Helper): | |||
| def __init__(self, targets, inputs, dim): | |||
| with tf.name_scope('VarTrainingHelper'): | |||
| self._targets = targets # [N, T_in, 1] | |||
| self._batch_size = tf.shape(inputs)[0] # N | |||
| self._inputs = inputs | |||
| self._dim = dim | |||
| num_steps = tf.shape(self._targets)[1] | |||
| self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
| self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
| self._init_inputs = inputs[:, 0, :] | |||
| @property | |||
| def batch_size(self): | |||
| return self._batch_size | |||
| @property | |||
| def sample_ids_shape(self): | |||
| return tf.TensorShape([]) | |||
| @property | |||
| def sample_ids_dtype(self): | |||
| return np.int32 | |||
| def initialize(self, name=None): | |||
| return (tf.tile([False], [self._batch_size]), | |||
| _go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
| def sample(self, time, outputs, state, name=None): | |||
| return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
| def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
| with tf.name_scope(name or 'VarTrainingHelper'): | |||
| finished = (time + 1 >= self._lengths) | |||
| next_inputs = tf.concat( | |||
| [self._targets[:, time, :], self._inputs[:, time, :]], axis=-1) | |||
| return (finished, next_inputs, state) | |||
| class VarTrainingSSHelper(Helper): | |||
| def __init__(self, targets, inputs, dim, global_step, schedule_begin, | |||
| alpha, decay_steps): | |||
| with tf.name_scope('VarTrainingSSHelper'): | |||
| self._targets = targets # [N, T_in, 1] | |||
| self._batch_size = tf.shape(inputs)[0] # N | |||
| self._inputs = inputs | |||
| self._dim = dim | |||
| num_steps = tf.shape(self._targets)[1] | |||
| self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
| self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
| self._init_inputs = inputs[:, 0, :] | |||
| # for schedule sampling | |||
| self._global_step = global_step | |||
| self._schedule_begin = schedule_begin | |||
| self._alpha = alpha | |||
| self._decay_steps = decay_steps | |||
| @property | |||
| def batch_size(self): | |||
| return self._batch_size | |||
| @property | |||
| def sample_ids_shape(self): | |||
| return tf.TensorShape([]) | |||
| @property | |||
| def sample_ids_dtype(self): | |||
| return np.int32 | |||
| def initialize(self, name=None): | |||
| self._ratio = _tf_decay(self._global_step, self._schedule_begin, | |||
| self._alpha, self._decay_steps) | |||
| return (tf.tile([False], [self._batch_size]), | |||
| _go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
| def sample(self, time, outputs, state, name=None): | |||
| return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
| def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
| with tf.name_scope(name or 'VarTrainingHelper'): | |||
| finished = (time + 1 >= self._lengths) | |||
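| # scheduled sampling: with probability `self._ratio` feed the ground-truth target, | |||
| # otherwise feed back the model's own previous output | |||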
| next_inputs_tmp = tf.cond( | |||
| tf.less( | |||
| tf.random_uniform([], minval=0, maxval=1, | |||
| dtype=tf.float32), self._ratio), | |||
| lambda: self._targets[:, time, :], lambda: outputs) | |||
| next_inputs = tf.concat( | |||
| [next_inputs_tmp, self._inputs[:, time, :]], axis=-1) | |||
| return (finished, next_inputs, state) | |||
| def _go_frames(batch_size, dim, init_inputs): | |||
| '''Returns <GO> frames: zeros of width `dim` concatenated with the initial inputs.''' | |||
| return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs], | |||
| axis=-1) | |||
| def _tf_decay(global_step, schedule_begin, alpha, decay_steps): | |||
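| # Teacher-forcing ratio schedule: stays at 1.0 until `schedule_begin`, then decays | |||
| # exponentially with rate `alpha` every `decay_steps` steps. | |||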
| tfr = tf.train.exponential_decay( | |||
| 1.0, | |||
| global_step=global_step - schedule_begin, | |||
| decay_steps=decay_steps, | |||
| decay_rate=alpha, | |||
| name='tfr_decay') | |||
| final_tfr = tf.cond( | |||
| tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr) | |||
| return final_tfr | |||
| @@ -0,0 +1,461 @@ | |||
| import tensorflow as tf | |||
| from tensorflow.contrib.cudnn_rnn import CudnnLSTM | |||
| from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops | |||
| from tensorflow.contrib.rnn import LSTMBlockCell | |||
| def encoder_prenet(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| dense_units, | |||
| is_training, | |||
| mask=None, | |||
| scope='encoder_prenet'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.layers.dense( | |||
| x, units=dense_units, activation=None, name='dense') | |||
| return x | |||
| def decoder_prenet(inputs, | |||
| prenet_units, | |||
| dense_units, | |||
| is_training, | |||
| scope='decoder_prenet'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i, units in enumerate(prenet_units): | |||
| x = tf.layers.dense( | |||
| x, | |||
| units=units, | |||
| activation=tf.nn.relu, | |||
| name='dense_{}'.format(i)) | |||
| x = tf.layers.dropout( | |||
| x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) | |||
| x = tf.layers.dense( | |||
| x, units=dense_units, activation=None, name='dense') | |||
| return x | |||
| def encoder(inputs, | |||
| input_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker, | |||
| mask=None, | |||
| scope='encoder'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_lstm( | |||
| inputs, | |||
| input_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker, | |||
| mask=mask) | |||
| return x | |||
| def prenet(inputs, prenet_units, is_training, scope='prenet'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i, units in enumerate(prenet_units): | |||
| x = tf.layers.dense( | |||
| x, | |||
| units=units, | |||
| activation=tf.nn.relu, | |||
| name='dense_{}'.format(i)) | |||
| x = tf.layers.dropout( | |||
| x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) | |||
| return x | |||
| def postnet_residual_ulstm(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| is_training, | |||
| scope='postnet_residual_ulstm'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
| lstm_units, is_training) | |||
| x = conv1d( | |||
| x, | |||
| output_units, | |||
| kernel_size, | |||
| is_training, | |||
| activation=None, | |||
| dropout=False, | |||
| scope='conv1d_{}'.format(n_conv_layers - 1)) | |||
| return x | |||
| def postnet_residual_lstm(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| is_training, | |||
| scope='postnet_residual_lstm'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
| lstm_units, is_training) | |||
| x = conv1d( | |||
| x, | |||
| output_units, | |||
| kernel_size, | |||
| is_training, | |||
| activation=None, | |||
| dropout=False, | |||
| scope='conv1d_{}'.format(n_conv_layers - 1)) | |||
| return x | |||
| def postnet_linear_ulstm(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| is_training, | |||
| scope='postnet_linear'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
| lstm_units, is_training) | |||
| x = tf.layers.dense(x, units=output_units) | |||
| return x | |||
| def postnet_linear_lstm(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| output_lengths, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=None, | |||
| scope='postnet_linear'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_and_lstm_dec( | |||
| inputs, | |||
| output_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=mask) | |||
| x = tf.layers.dense(x, units=output_units) | |||
| return x | |||
| def postnet_linear(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| output_units, | |||
| output_lengths, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=None, | |||
| scope='postnet_linear'): | |||
| with tf.variable_scope(scope): | |||
| x = conv_dec( | |||
| inputs, | |||
| output_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=mask) | |||
| return x | |||
| def conv_and_lstm(inputs, | |||
| sequence_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker, | |||
| mask=None, | |||
| scope='conv_and_lstm'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.concat([x, embedded_inputs_speaker], axis=2) | |||
| outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(lstm_units), | |||
| LSTMBlockCell(lstm_units), | |||
| x, | |||
| sequence_length=sequence_lengths, | |||
| dtype=tf.float32) | |||
| x = tf.concat(outputs, axis=-1) | |||
| return x | |||
| def conv_and_lstm_dec(inputs, | |||
| sequence_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=None, | |||
| scope='conv_and_lstm'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.concat([x, embedded_inputs_speaker2], axis=2) | |||
| outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(lstm_units), | |||
| LSTMBlockCell(lstm_units), | |||
| x, | |||
| sequence_length=sequence_lengths, | |||
| dtype=tf.float32) | |||
| x = tf.concat(outputs, axis=-1) | |||
| return x | |||
| def conv_dec(inputs, | |||
| sequence_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| embedded_inputs_speaker2, | |||
| mask=None, | |||
| scope='conv_and_lstm'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.concat([x, embedded_inputs_speaker2], axis=2) | |||
| return x | |||
| def conv_and_ulstm(inputs, | |||
| sequence_lengths, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| is_training, | |||
| scope='conv_and_ulstm'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| scope='conv1d_{}'.format(i)) | |||
| outputs, states = tf.nn.dynamic_rnn( | |||
| LSTMBlockCell(lstm_units), | |||
| x, | |||
| sequence_length=sequence_lengths, | |||
| dtype=tf.float32) | |||
| return outputs | |||
| def conv1d(inputs, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=None, | |||
| dropout=False, | |||
| mask=None, | |||
| scope='conv1d'): | |||
| with tf.variable_scope(scope): | |||
| if mask is not None: | |||
| inputs = inputs * tf.expand_dims(mask, -1) | |||
| x = tf.layers.conv1d( | |||
| inputs, filters=filters, kernel_size=kernel_size, padding='same') | |||
| if mask is not None: | |||
| x = x * tf.expand_dims(mask, -1) | |||
| x = tf.layers.batch_normalization(x, training=is_training) | |||
| if activation is not None: | |||
| x = activation(x) | |||
| if dropout: | |||
| x = tf.layers.dropout(x, rate=0.5, training=is_training) | |||
| return x | |||
| def conv1d_dp(inputs, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=None, | |||
| dropout=False, | |||
| dropoutrate=0.5, | |||
| mask=None, | |||
| scope='conv1d'): | |||
| with tf.variable_scope(scope): | |||
| if mask is not None: | |||
| inputs = inputs * tf.expand_dims(mask, -1) | |||
| x = tf.layers.conv1d( | |||
| inputs, filters=filters, kernel_size=kernel_size, padding='same') | |||
| if mask is not None: | |||
| x = x * tf.expand_dims(mask, -1) | |||
| x = tf.contrib.layers.layer_norm(x) | |||
| if activation is not None: | |||
| x = activation(x) | |||
| if dropout: | |||
| x = tf.layers.dropout(x, rate=dropoutrate, training=is_training) | |||
| return x | |||
| def duration_predictor(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| lstm_units, | |||
| input_lengths, | |||
| is_training, | |||
| embedded_inputs_speaker, | |||
| mask=None, | |||
| scope='duration_predictor'): | |||
| with tf.variable_scope(scope): | |||
| x = inputs | |||
| for i in range(n_conv_layers): | |||
| x = conv1d_dp( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| dropoutrate=0.1, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.concat([x, embedded_inputs_speaker], axis=2) | |||
| outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(lstm_units), | |||
| LSTMBlockCell(lstm_units), | |||
| x, | |||
| sequence_length=input_lengths, | |||
| dtype=tf.float32) | |||
| x = tf.concat(outputs, axis=-1) | |||
| x = tf.layers.dense(x, units=1) | |||
| x = tf.nn.relu(x) | |||
| return x | |||
| def duration_predictor2(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| input_lengths, | |||
| is_training, | |||
| mask=None, | |||
| scope='duration_predictor'): | |||
| with tf.variable_scope(scope): | |||
| x = inputs | |||
| for i in range(n_conv_layers): | |||
| x = conv1d_dp( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| dropoutrate=0.1, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| x = tf.layers.dense(x, units=1) | |||
| x = tf.nn.relu(x) | |||
| return x | |||
| def conv_prenet(inputs, | |||
| n_conv_layers, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| mask=None, | |||
| scope='conv_prenet'): | |||
| x = inputs | |||
| with tf.variable_scope(scope): | |||
| for i in range(n_conv_layers): | |||
| x = conv1d( | |||
| x, | |||
| filters, | |||
| kernel_size, | |||
| is_training, | |||
| activation=tf.nn.relu, | |||
| dropout=True, | |||
| mask=mask, | |||
| scope='conv1d_{}'.format(i)) | |||
| return x | |||
| @@ -0,0 +1,174 @@ | |||
| """Define position encoder classes.""" | |||
| import abc | |||
| import math | |||
| import tensorflow as tf | |||
| from .reducer import SumReducer | |||
| class PositionEncoder(tf.keras.layers.Layer): | |||
| """Base class for position encoders.""" | |||
| def __init__(self, reducer=None, **kwargs): | |||
| """Initializes the position encoder. | |||
| Args: | |||
| reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position | |||
| encodings. Defaults to :class:`opennmt.layers.SumReducer`. | |||
| **kwargs: Additional layer keyword arguments. | |||
| """ | |||
| super(PositionEncoder, self).__init__(**kwargs) | |||
| if reducer is None: | |||
| reducer = SumReducer(dtype=kwargs.get('dtype')) | |||
| self.reducer = reducer | |||
| def call(self, inputs, position=None): # pylint: disable=arguments-differ | |||
| """Add position encodings to :obj:`inputs`. | |||
| Args: | |||
| inputs: The inputs to encode. | |||
| position: The single position to encode, to use when this layer is called | |||
| step by step. | |||
| Returns: | |||
| A ``tf.Tensor`` whose shape depends on the configured ``reducer``. | |||
| """ | |||
| batch_size = tf.shape(inputs)[0] | |||
| timesteps = tf.shape(inputs)[1] | |||
| input_dim = inputs.shape[-1].value | |||
| positions = tf.range(timesteps) + 1 if position is None else [position] | |||
| position_encoding = self._encode([positions], input_dim) | |||
| position_encoding = tf.tile(position_encoding, [batch_size, 1, 1]) | |||
| return self.reducer([inputs, position_encoding]) | |||
| @abc.abstractmethod | |||
| def _encode(self, positions, depth): | |||
| """Creates position encodings. | |||
| Args: | |||
| positions: The positions to encode of shape :math:`[B, ...]`. | |||
| depth: The encoding depth :math:`D`. | |||
| Returns: | |||
| A ``tf.Tensor`` of shape :math:`[B, ..., D]`. | |||
| """ | |||
| raise NotImplementedError() | |||
| class PositionEmbedder(PositionEncoder): | |||
| """Encodes position with a lookup table.""" | |||
| def __init__(self, maximum_position=128, reducer=None, **kwargs): | |||
| """Initializes the position encoder. | |||
| Args: | |||
| maximum_position: The maximum position to embed. Positions greater | |||
| than this value will be set to :obj:`maximum_position`. | |||
| reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position | |||
| encodings. Defaults to :class:`opennmt.layers.SumReducer`. | |||
| **kwargs: Additional layer keyword arguments. | |||
| """ | |||
| super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs) | |||
| self.maximum_position = maximum_position | |||
| self.embedding = None | |||
| def build(self, input_shape): | |||
| shape = [self.maximum_position + 1, input_shape[-1]] | |||
| self.embedding = self.add_weight('position_embedding', shape) | |||
| super(PositionEmbedder, self).build(input_shape) | |||
| def _encode(self, positions, depth): | |||
| positions = tf.minimum(positions, self.maximum_position) | |||
| return tf.nn.embedding_lookup(self.embedding, positions) | |||
| class SinusoidalPositionEncoder(PositionEncoder): | |||
| """Encodes positions with sine waves as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def _encode(self, positions, depth): | |||
| if depth % 2 != 0: | |||
| raise ValueError( | |||
| 'SinusoidalPositionEncoder expects the depth to be divisible ' | |||
| 'by 2 but got %d' % depth) | |||
| batch_size = tf.shape(positions)[0] | |||
| positions = tf.cast(positions, tf.float32) | |||
| log_timescale_increment = math.log(10000) / (depth / 2 - 1) | |||
| inv_timescales = tf.exp( | |||
| tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment) | |||
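| # inv_timescales is the geometric progression of frequencies used by the | |||
| # Transformer position encoding (https://arxiv.org/abs/1706.03762) | |||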
| inv_timescales = tf.reshape( | |||
| tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2]) | |||
| scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims( | |||
| inv_timescales, 1) | |||
| encoding = tf.concat( | |||
| [tf.sin(scaled_time), tf.cos(scaled_time)], axis=2) | |||
| return tf.cast(encoding, self.dtype) | |||
| class SinusodalPositionalEncoding(tf.keras.layers.Layer): | |||
| def __init__(self, name='SinusodalPositionalEncoding'): | |||
| super(SinusodalPositionalEncoding, self).__init__(name=name) | |||
| @staticmethod | |||
| def positional_encoding(len, dim, step=1.): | |||
| """ | |||
| :param len: int scalar | |||
| :param dim: int scalar | |||
| :param step: | |||
| :return: position embedding | |||
| """ | |||
| pos_mat = tf.tile( | |||
| tf.expand_dims( | |||
| tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32) | |||
| * step, | |||
| axis=-1), [1, dim]) | |||
| dim_mat = tf.tile( | |||
| tf.expand_dims( | |||
| tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), | |||
| axis=0), [len, 1]) | |||
| dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) | |||
| pos_encoding = tf.where( # [time, dims] | |||
| tf.math.equal(tf.math.mod(dim_mat_int, 2), 0), | |||
| x=tf.math.sin( | |||
| pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), | |||
| y=tf.math.cos(pos_mat | |||
| / tf.pow(10000., | |||
| (dim_mat - 1) / tf.cast(dim, tf.float32)))) | |||
| return pos_encoding | |||
| class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer): | |||
| def __init__(self, name='BatchSinusodalPositionalEncoding'): | |||
| super(BatchSinusodalPositionalEncoding, self).__init__(name=name) | |||
| @staticmethod | |||
| def positional_encoding(batch_size, len, dim, pos_mat, step=1.): | |||
| """ | |||
| :param len: int scalar | |||
| :param dim: int scalar | |||
| :param step: | |||
| :param pos_mat: [B, len] = [len, 1] * dim | |||
| :return: position embedding | |||
| """ | |||
| pos_mat = tf.tile( | |||
| tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1), | |||
| [1, 1, dim]) # [B, len, dim] | |||
| dim_mat = tf.tile( | |||
| tf.expand_dims( | |||
| tf.expand_dims( | |||
| tf.range( | |||
| 0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), | |||
| axis=0), | |||
| axis=0), [batch_size, len, 1]) # [B, len, dim] | |||
| dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) | |||
| pos_encoding = tf.where( # [B, time, dims] | |||
| tf.math.equal(tf.mod(dim_mat_int, 2), 0), | |||
| x=tf.math.sin( | |||
| pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), | |||
| y=tf.math.cos(pos_mat | |||
| / tf.pow(10000., | |||
| (dim_mat - 1) / tf.cast(dim, tf.float32)))) | |||
| return pos_encoding | |||
| @@ -0,0 +1,155 @@ | |||
| """Define reducers: objects that merge inputs.""" | |||
| import abc | |||
| import functools | |||
| import tensorflow as tf | |||
| def pad_in_time(x, padding_length): | |||
| """Helper function to pad a tensor in the time dimension and retain the static depth dimension.""" | |||
| return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) | |||
| def align_in_time(x, length): | |||
| """Aligns the time dimension of :obj:`x` with :obj:`length`.""" | |||
| time_dim = tf.shape(x)[1] | |||
| return tf.cond( | |||
| tf.less(time_dim, length), | |||
| true_fn=lambda: pad_in_time(x, length - time_dim), | |||
| false_fn=lambda: x[:, :length]) | |||
| def pad_with_identity(x, | |||
| sequence_length, | |||
| max_sequence_length, | |||
| identity_values=0, | |||
| maxlen=None): | |||
| """Pads a tensor with identity values up to :obj:`max_sequence_length`. | |||
| Args: | |||
| x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``. | |||
| sequence_length: The true sequence length of :obj:`x`. | |||
| max_sequence_length: The sequence length up to which the tensor must contain | |||
| :obj:`identity_values`. | |||
| identity_values: The identity value. | |||
| maxlen: Size of the output time dimension. Default is the maximum value in | |||
| :obj:`max_sequence_length`. | |||
| Returns: | |||
| A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``. | |||
| """ | |||
| if maxlen is None: | |||
| maxlen = tf.reduce_max(max_sequence_length) | |||
| mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype) | |||
| mask = tf.expand_dims(mask, axis=-1) | |||
| mask_combined = tf.sequence_mask( | |||
| max_sequence_length, maxlen=maxlen, dtype=x.dtype) | |||
| mask_combined = tf.expand_dims(mask_combined, axis=-1) | |||
| identity_mask = mask_combined * (1.0 - mask) | |||
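| # positions beyond `sequence_length` but within `max_sequence_length` receive the | |||
| # identity value; positions within `sequence_length` keep the original values | |||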
| x = pad_in_time(x, maxlen - tf.shape(x)[1]) | |||
| x = x * mask + (identity_mask * identity_values) | |||
| return x | |||
| def pad_n_with_identity(inputs, sequence_lengths, identity_values=0): | |||
| """Pads each input tensors with identity values up to | |||
| ``max(sequence_lengths)`` for each batch. | |||
| Args: | |||
| inputs: A list of ``tf.Tensor``. | |||
| sequence_lengths: A list of sequence length. | |||
| identity_values: The identity value. | |||
| Returns: | |||
| A tuple ``(padded, max_sequence_length)`` which are respectively a list of | |||
| ``tf.Tensor`` where each tensor is padded with identity values and the combined | |||
| sequence length. | |||
| """ | |||
| max_sequence_length = tf.reduce_max(sequence_lengths, axis=0) | |||
| maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs]) | |||
| padded = [ | |||
| pad_with_identity( | |||
| x, | |||
| length, | |||
| max_sequence_length, | |||
| identity_values=identity_values, | |||
| maxlen=maxlen) for x, length in zip(inputs, sequence_lengths) | |||
| ] | |||
| return padded, max_sequence_length | |||
| class Reducer(tf.keras.layers.Layer): | |||
| """Base class for reducers.""" | |||
| def zip_and_reduce(self, x, y): | |||
| """Zips the :obj:`x` with :obj:`y` structures together and reduces all | |||
| elements. If the structures are nested, they will be flattened first. | |||
| Args: | |||
| x: The first structure. | |||
| y: The second structure. | |||
| Returns: | |||
| The same structure as :obj:`x` and :obj:`y` where each element from | |||
| :obj:`x` is reduced with the corresponding element from :obj:`y`. | |||
| Raises: | |||
| ValueError: if the two structures are not the same. | |||
| """ | |||
| tf.nest.assert_same_structure(x, y) | |||
| x_flat = tf.nest.flatten(x) | |||
| y_flat = tf.nest.flatten(y) | |||
| reduced = list(map(self, zip(x_flat, y_flat))) | |||
| return tf.nest.pack_sequence_as(x, reduced) | |||
| def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ | |||
| """Reduces all input elements. | |||
| Args: | |||
| inputs: A list of ``tf.Tensor``. | |||
| sequence_length: The length of each input, if reducing sequences. | |||
| Returns: | |||
| If :obj:`sequence_length` is set, a tuple | |||
| ``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor`` | |||
| only. | |||
| """ | |||
| if sequence_length is None: | |||
| return self.reduce(inputs) | |||
| else: | |||
| return self.reduce_sequence( | |||
| inputs, sequence_lengths=sequence_length) | |||
| @abc.abstractmethod | |||
| def reduce(self, inputs): | |||
| """See :meth:`opennmt.layers.Reducer.__call__`.""" | |||
| raise NotImplementedError() | |||
| @abc.abstractmethod | |||
| def reduce_sequence(self, inputs, sequence_lengths): | |||
| """See :meth:`opennmt.layers.Reducer.__call__`.""" | |||
| raise NotImplementedError() | |||
| class SumReducer(Reducer): | |||
| """A reducer that sums the inputs.""" | |||
| def reduce(self, inputs): | |||
| if len(inputs) == 1: | |||
| return inputs[0] | |||
| if len(inputs) == 2: | |||
| return inputs[0] + inputs[1] | |||
| return tf.add_n(inputs) | |||
| def reduce_sequence(self, inputs, sequence_lengths): | |||
| padded, combined_length = pad_n_with_identity( | |||
| inputs, sequence_lengths, identity_values=0) | |||
| return self.reduce(padded), combined_length | |||
| class MultiplyReducer(Reducer): | |||
| """A reducer that multiplies the inputs.""" | |||
| def reduce(self, inputs): | |||
| return functools.reduce(lambda a, x: a * x, inputs) | |||
| def reduce_sequence(self, inputs, sequence_lengths): | |||
| padded, combined_length = pad_n_with_identity( | |||
| inputs, sequence_lengths, identity_values=1) | |||
| return self.reduce(padded), combined_length | |||
| @@ -0,0 +1,240 @@ | |||
| import numpy as np | |||
| import tensorflow as tf | |||
| from tensorflow.contrib.rnn import RNNCell | |||
| from tensorflow.contrib.seq2seq import AttentionWrapperState | |||
| from tensorflow.python.ops import rnn_cell_impl | |||
| from .modules import prenet | |||
| class VarPredictorCell(RNNCell): | |||
| '''RNN cell wrapper around the variance predictor: prenet, recurrent cell, and output projection.''' | |||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
| super(VarPredictorCell, self).__init__() | |||
| self._var_predictor_cell = var_predictor_cell | |||
| self._is_training = is_training | |||
| self._dim = dim | |||
| self._prenet_units = prenet_units | |||
| @property | |||
| def state_size(self): | |||
| return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
| @property | |||
| def output_size(self): | |||
| return self._dim | |||
| def zero_state(self, batch_size, dtype): | |||
| return tuple([ | |||
| rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
| dtype), | |||
| self._var_predictor_cell.zero_state(batch_size, dtype) | |||
| ]) | |||
| def call(self, inputs, state): | |||
| '''Run one step of the variance predictor cell.''' | |||
| super_cell_out, decoder_state = state | |||
| # split | |||
| prenet_input = inputs[:, 0:self._dim] | |||
| encoder_output = inputs[:, self._dim:] | |||
| # prenet and concat | |||
| prenet_output = prenet( | |||
| prenet_input, | |||
| self._prenet_units, | |||
| self._is_training, | |||
| scope='var_prenet') | |||
| decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
| # decoder LSTM/GRU | |||
| new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
| decoder_input, decoder_state) | |||
| # projection | |||
| new_super_cell_out = tf.layers.dense( | |||
| new_super_cell_out, units=self._dim) | |||
| new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
| return new_super_cell_out, new_states | |||
| class DurPredictorCell(RNNCell): | |||
| '''RNN cell wrapper around the duration predictor: prenet, recurrent cell, and output projection.''' | |||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
| super(DurPredictorCell, self).__init__() | |||
| self._var_predictor_cell = var_predictor_cell | |||
| self._is_training = is_training | |||
| self._dim = dim | |||
| self._prenet_units = prenet_units | |||
| @property | |||
| def state_size(self): | |||
| return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
| @property | |||
| def output_size(self): | |||
| return self._dim | |||
| def zero_state(self, batch_size, dtype): | |||
| return tuple([ | |||
| rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
| dtype), | |||
| self._var_predictor_cell.zero_state(batch_size, dtype) | |||
| ]) | |||
| def call(self, inputs, state): | |||
| '''Run one step of the duration predictor cell.''' | |||
| super_cell_out, decoder_state = state | |||
| # split | |||
| prenet_input = inputs[:, 0:self._dim] | |||
| encoder_output = inputs[:, self._dim:] | |||
| # prenet and concat | |||
| prenet_output = prenet( | |||
| prenet_input, | |||
| self._prenet_units, | |||
| self._is_training, | |||
| scope='dur_prenet') | |||
| decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
| # decoder LSTM/GRU | |||
| new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
| decoder_input, decoder_state) | |||
| # projection | |||
| new_super_cell_out = tf.layers.dense( | |||
| new_super_cell_out, units=self._dim) | |||
| new_super_cell_out = tf.nn.relu(new_super_cell_out) | |||
| # new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1) | |||
| new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
| return new_super_cell_out, new_states | |||
| class DurPredictorCECell(RNNCell): | |||
| '''RNN cell wrapper around the duration predictor with one-hot duration embedding and softmax output.''' | |||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units, | |||
| max_dur, dur_embedding_dim): | |||
| super(DurPredictorCECell, self).__init__() | |||
| self._var_predictor_cell = var_predictor_cell | |||
| self._is_training = is_training | |||
| self._dim = dim | |||
| self._prenet_units = prenet_units | |||
| self._max_dur = max_dur | |||
| self._dur_embedding_dim = dur_embedding_dim | |||
| @property | |||
| def state_size(self): | |||
| return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
| @property | |||
| def output_size(self): | |||
| return self._max_dur | |||
| def zero_state(self, batch_size, dtype): | |||
| return tuple([ | |||
| rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
| dtype), | |||
| self._var_predictor_cell.zero_state(batch_size, dtype) | |||
| ]) | |||
| def call(self, inputs, state): | |||
| '''Run one step of the duration predictor (cross-entropy) cell.''' | |||
| super_cell_out, decoder_state = state | |||
| # split | |||
| prenet_input = tf.squeeze( | |||
| tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N] | |||
| prenet_input = tf.one_hot( | |||
| prenet_input, self._max_dur, on_value=1.0, off_value=0.0, | |||
| axis=-1) # [N, 120] | |||
| prenet_input = tf.layers.dense( | |||
| prenet_input, units=self._dur_embedding_dim) | |||
| encoder_output = inputs[:, self._dim:] | |||
| # prenet and concat | |||
| prenet_output = prenet( | |||
| prenet_input, | |||
| self._prenet_units, | |||
| self._is_training, | |||
| scope='dur_prenet') | |||
| decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
| # decoder LSTM/GRU | |||
| new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
| decoder_input, decoder_state) | |||
| # projection | |||
| new_super_cell_out = tf.layers.dense( | |||
| new_super_cell_out, units=self._max_dur) # [N, 120] | |||
| new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120] | |||
| new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
| return new_super_cell_out, new_states | |||
| class VarPredictorCell2(RNNCell): | |||
| '''Variant of VarPredictorCell that applies ReLU only to the first output dimension.''' | |||
| def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
| super(VarPredictorCell2, self).__init__() | |||
| self._var_predictor_cell = var_predictor_cell | |||
| self._is_training = is_training | |||
| self._dim = dim | |||
| self._prenet_units = prenet_units | |||
| @property | |||
| def state_size(self): | |||
| return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
| @property | |||
| def output_size(self): | |||
| return self._dim | |||
| def zero_state(self, batch_size, dtype): | |||
| return tuple([ | |||
| rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
| dtype), | |||
| self._var_predictor_cell.zero_state(batch_size, dtype) | |||
| ]) | |||
| def call(self, inputs, state): | |||
| '''Run one step of the variance predictor cell.''' | |||
| super_cell_out, decoder_state = state | |||
| # split | |||
| prenet_input = inputs[:, 0:self._dim] | |||
| encoder_output = inputs[:, self._dim:] | |||
| # prenet and concat | |||
| prenet_output = prenet( | |||
| prenet_input, | |||
| self._prenet_units, | |||
| self._is_training, | |||
| scope='var_prenet') | |||
| decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
| # decoder LSTM/GRU | |||
| new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
| decoder_input, decoder_state) | |||
| # projection | |||
| new_super_cell_out = tf.layers.dense( | |||
| new_super_cell_out, units=self._dim) | |||
| # split and relu | |||
| new_super_cell_out = tf.concat([ | |||
| tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:] | |||
| ], axis=-1) # yapf:disable | |||
| new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
| return new_super_cell_out, new_states | |||
| @@ -0,0 +1,760 @@ | |||
| import tensorflow as tf | |||
| from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell | |||
| from tensorflow.contrib.seq2seq import BasicDecoder | |||
| from tensorflow.python.ops.ragged.ragged_util import repeat | |||
| from .fsmn_encoder import FsmnEncoderV2 | |||
| from .helpers import VarTestHelper, VarTrainingHelper | |||
| from .modules import conv_prenet, decoder_prenet, encoder_prenet | |||
| from .position import (BatchSinusodalPositionalEncoding, | |||
| SinusodalPositionalEncoding) | |||
| from .rnn_wrappers import DurPredictorCell, VarPredictorCell | |||
| from .self_attention_decoder import SelfAttentionDecoder | |||
| from .self_attention_encoder import SelfAttentionEncoder | |||
| class RobuTrans(): | |||
| def __init__(self, hparams): | |||
| self._hparams = hparams | |||
| def initialize(self, | |||
| inputs, | |||
| inputs_emotion, | |||
| inputs_speaker, | |||
| input_lengths, | |||
| output_lengths=None, | |||
| mel_targets=None, | |||
| durations=None, | |||
| pitch_contours=None, | |||
| uv_masks=None, | |||
| pitch_scales=None, | |||
| duration_scales=None, | |||
| energy_contours=None, | |||
| energy_scales=None): | |||
| '''Initializes the model for inference. | |||
| Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. | |||
| Args: | |||
| inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of | |||
| steps in the input time series, and values are character IDs | |||
| input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths | |||
| of each sequence in inputs. | |||
| output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths | |||
| of each sequence in outputs. | |||
| mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number | |||
| of steps in the output time series, M is num_mels, and values are entries in the mel | |||
| spectrogram. Only needed for training. | |||
| ''' | |||
| with tf.variable_scope('inference') as _: | |||
| is_training = mel_targets is not None | |||
| batch_size = tf.shape(inputs)[0] | |||
| hp = self._hparams | |||
| input_mask = None | |||
| if input_lengths is not None and is_training: | |||
| input_mask = tf.sequence_mask( | |||
| input_lengths, tf.shape(inputs)[1], dtype=tf.float32) | |||
| if input_mask is not None: | |||
| inputs = inputs * tf.expand_dims(input_mask, -1) | |||
| # speaker embedding | |||
| embedded_inputs_speaker = tf.layers.dense( | |||
| inputs_speaker, | |||
| 32, | |||
| activation=None, | |||
| use_bias=False, | |||
| kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) | |||
| # emotion embedding | |||
| embedded_inputs_emotion = tf.layers.dense( | |||
| inputs_emotion, | |||
| 32, | |||
| activation=None, | |||
| use_bias=False, | |||
| kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) | |||
| # symbol embedding | |||
| with tf.variable_scope('Embedding'): | |||
| embedded_inputs = tf.layers.dense( | |||
| inputs, | |||
| hp.embedding_dim, | |||
| activation=None, | |||
| use_bias=False, | |||
| kernel_initializer=tf.truncated_normal_initializer( | |||
| stddev=0.5)) | |||
| # Encoder | |||
| with tf.variable_scope('Encoder'): | |||
| Encoder = SelfAttentionEncoder( | |||
| num_layers=hp.encoder_num_layers, | |||
| num_units=hp.encoder_num_units, | |||
| num_heads=hp.encoder_num_heads, | |||
| ffn_inner_dim=hp.encoder_ffn_inner_dim, | |||
| dropout=hp.encoder_dropout, | |||
| attention_dropout=hp.encoder_attention_dropout, | |||
| relu_dropout=hp.encoder_relu_dropout) | |||
| encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode( | |||
| embedded_inputs, | |||
| sequence_length=input_lengths, | |||
| mode=is_training) | |||
| encoder_outputs = tf.layers.dense( | |||
| encoder_outputs, | |||
| hp.encoder_projection_units, | |||
| activation=None, | |||
| use_bias=False, | |||
| kernel_initializer=tf.truncated_normal_initializer( | |||
| stddev=0.5)) | |||
| # pitch and energy | |||
| var_inputs = tf.concat([ | |||
| encoder_outputs, embedded_inputs_speaker, | |||
| embedded_inputs_emotion | |||
| ], 2) | |||
| if input_mask is not None: | |||
| var_inputs = var_inputs * tf.expand_dims(input_mask, -1) | |||
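| # Pitch and energy predictors: each runs an FSMN encoder over the concatenated encoder, | |||
| # speaker and emotion features, followed by a bidirectional LSTM and a dense projection | |||
| # to one value per input frame. | |||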
| with tf.variable_scope('Pitch_Predictor'): | |||
| Pitch_Predictor_FSMN = FsmnEncoderV2( | |||
| filter_size=hp.predictor_filter_size, | |||
| fsmn_num_layers=hp.predictor_fsmn_num_layers, | |||
| dnn_num_layers=hp.predictor_dnn_num_layers, | |||
| num_memory_units=hp.predictor_num_memory_units, | |||
| ffn_inner_dim=hp.predictor_ffn_inner_dim, | |||
| dropout=hp.predictor_dropout, | |||
| shift=hp.predictor_shift, | |||
| position_encoder=None) | |||
| pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode( | |||
| tf.concat([ | |||
| encoder_outputs, embedded_inputs_speaker, | |||
| embedded_inputs_emotion | |||
| ], 2), | |||
| sequence_length=input_lengths, | |||
| mode=is_training) | |||
| pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| pitch_contour_outputs, | |||
| sequence_length=input_lengths, | |||
| dtype=tf.float32) | |||
| pitch_contour_outputs = tf.concat( | |||
| pitch_contour_outputs, axis=-1) | |||
| pitch_contour_outputs = tf.layers.dense( | |||
| pitch_contour_outputs, units=1) # [N, T_in, 1] | |||
| pitch_contour_outputs = tf.squeeze( | |||
| pitch_contour_outputs, axis=2) # [N, T_in] | |||
| with tf.variable_scope('Energy_Predictor'): | |||
| Energy_Predictor_FSMN = FsmnEncoderV2( | |||
| filter_size=hp.predictor_filter_size, | |||
| fsmn_num_layers=hp.predictor_fsmn_num_layers, | |||
| dnn_num_layers=hp.predictor_dnn_num_layers, | |||
| num_memory_units=hp.predictor_num_memory_units, | |||
| ffn_inner_dim=hp.predictor_ffn_inner_dim, | |||
| dropout=hp.predictor_dropout, | |||
| shift=hp.predictor_shift, | |||
| position_encoder=None) | |||
| energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode( | |||
| tf.concat([ | |||
| encoder_outputs, embedded_inputs_speaker, | |||
| embedded_inputs_emotion | |||
| ], 2), | |||
| sequence_length=input_lengths, | |||
| mode=is_training) | |||
| energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| energy_contour_outputs, | |||
| sequence_length=input_lengths, | |||
| dtype=tf.float32) | |||
| energy_contour_outputs = tf.concat( | |||
| energy_contour_outputs, axis=-1) | |||
| energy_contour_outputs = tf.layers.dense( | |||
| energy_contour_outputs, units=1) # [N, T_in, 1] | |||
| energy_contour_outputs = tf.squeeze( | |||
| energy_contour_outputs, axis=2) # [N, T_in] | |||
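| # Pitch/energy embeddings: during training the ground-truth contours are embedded, while at | |||
| # inference the predicted contours (scaled by pitch_scales / energy_scales) are used instead. | |||
| # The resulting conv1d embeddings are added to the encoder outputs below. | |||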
| if is_training: | |||
| pitch_embeddings = tf.expand_dims( | |||
| pitch_contours, axis=2) # [N, T_in, 1] | |||
| pitch_embeddings = tf.layers.conv1d( | |||
| pitch_embeddings, | |||
| filters=hp.encoder_projection_units, | |||
| kernel_size=9, | |||
| padding='same', | |||
| name='pitch_embeddings') # [N, T_in, 32] | |||
| energy_embeddings = tf.expand_dims( | |||
| energy_contours, axis=2) # [N, T_in, 1] | |||
| energy_embeddings = tf.layers.conv1d( | |||
| energy_embeddings, | |||
| filters=hp.encoder_projection_units, | |||
| kernel_size=9, | |||
| padding='same', | |||
| name='energy_embeddings') # [N, T_in, 32] | |||
| else: | |||
| pitch_contour_outputs *= pitch_scales | |||
| pitch_embeddings = tf.expand_dims( | |||
| pitch_contour_outputs, axis=2) # [N, T_in, 1] | |||
| pitch_embeddings = tf.layers.conv1d( | |||
| pitch_embeddings, | |||
| filters=hp.encoder_projection_units, | |||
| kernel_size=9, | |||
| padding='same', | |||
| name='pitch_embeddings') # [N, T_in, 32] | |||
| energy_contour_outputs *= energy_scales | |||
| energy_embeddings = tf.expand_dims( | |||
| energy_contour_outputs, axis=2) # [N, T_in, 1] | |||
| energy_embeddings = tf.layers.conv1d( | |||
| energy_embeddings, | |||
| filters=hp.encoder_projection_units, | |||
| kernel_size=9, | |||
| padding='same', | |||
| name='energy_embeddings') # [N, T_in, 32] | |||
| encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings | |||
| # duration | |||
| dur_inputs = tf.concat([ | |||
| encoder_outputs_, embedded_inputs_speaker, | |||
| embedded_inputs_emotion | |||
| ], 2) | |||
| if input_mask is not None: | |||
| dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1) | |||
| with tf.variable_scope('Duration_Predictor'): | |||
| duration_predictor_cell = MultiRNNCell([ | |||
| LSTMBlockCell(hp.predictor_lstm_units), | |||
| LSTMBlockCell(hp.predictor_lstm_units) | |||
| ], state_is_tuple=True) # yapf:disable | |||
| duration_output_cell = DurPredictorCell( | |||
| duration_predictor_cell, is_training, 1, | |||
| hp.predictor_prenet_units) | |||
| duration_predictor_init_state = duration_output_cell.zero_state( | |||
| batch_size=batch_size, dtype=tf.float32) | |||
| if is_training: | |||
| duration_helper = VarTrainingHelper( | |||
| tf.expand_dims( | |||
| tf.log(tf.cast(durations, tf.float32) + 1), | |||
| axis=2), dur_inputs, 1) | |||
| else: | |||
| duration_helper = VarTestHelper(batch_size, dur_inputs, 1) | |||
| ( | |||
| duration_outputs, _ | |||
| ), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode( | |||
| BasicDecoder(duration_output_cell, duration_helper, | |||
| duration_predictor_init_state), | |||
| maximum_iterations=1000) | |||
| duration_outputs = tf.squeeze( | |||
| duration_outputs, axis=2) # [N, T_in] | |||
| if input_mask is not None: | |||
| duration_outputs = duration_outputs * input_mask | |||
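| # duration_outputs is predicted in the log(1 + d) domain (the training helper above feeds | |||
| # tf.log(durations + 1)), so exp(x) - 1 recovers the duration in frames. | |||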
| duration_outputs_ = tf.exp(duration_outputs) - 1 | |||
| # Length Regulator | |||
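| # The length regulator repeats each encoder frame according to its duration (ground-truth | |||
| # durations during training, rounded predictions at inference) so the sequence matches the | |||
| # mel frame rate; it also builds a per-phone frame index (1..duration) that serves as the | |||
| # position signal for the batch positional encoding below. | |||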
| with tf.variable_scope('Length_Regulator'): | |||
| if is_training: | |||
| i = tf.constant(1) | |||
| # position embedding | |||
| j = tf.constant(1) | |||
| dur_len = tf.shape(durations)[-1] | |||
| embedded_position_i = tf.range(1, durations[0, 0] + 1) | |||
| def condition_pos(j, e): | |||
| return tf.less(j, dur_len) | |||
| def loop_body_pos(j, embedded_position_i): | |||
| embedded_position_i = tf.concat([ | |||
| embedded_position_i, | |||
| tf.range(1, durations[0, j] + 1) | |||
| ], axis=0) # yapf:disable | |||
| return [j + 1, embedded_position_i] | |||
| j, embedded_position_i = tf.while_loop( | |||
| condition_pos, | |||
| loop_body_pos, [j, embedded_position_i], | |||
| shape_invariants=[ | |||
| j.get_shape(), | |||
| tf.TensorShape([None]) | |||
| ]) | |||
| embedded_position = tf.reshape(embedded_position_i, | |||
| (1, -1)) | |||
| # others | |||
| LR_outputs = repeat( | |||
| encoder_outputs_[0:1, :, :], durations[0, :], axis=1) | |||
| embedded_outputs_speaker = repeat( | |||
| embedded_inputs_speaker[0:1, :, :], | |||
| durations[0, :], | |||
| axis=1) | |||
| embedded_outputs_emotion = repeat( | |||
| embedded_inputs_emotion[0:1, :, :], | |||
| durations[0, :], | |||
| axis=1) | |||
| def condition(i, pos, layer, s, e): | |||
| return tf.less(i, tf.shape(mel_targets)[0]) | |||
| def loop_body(i, embedded_position, LR_outputs, | |||
| embedded_outputs_speaker, | |||
| embedded_outputs_emotion): | |||
| # position embedding | |||
| jj = tf.constant(1) | |||
| embedded_position_i = tf.range(1, durations[i, 0] + 1) | |||
| def condition_pos_i(j, e): | |||
| return tf.less(j, dur_len) | |||
| def loop_body_pos_i(j, embedded_position_i): | |||
| embedded_position_i = tf.concat([ | |||
| embedded_position_i, | |||
| tf.range(1, durations[i, j] + 1) | |||
| ], axis=0) # yapf:disable | |||
| return [j + 1, embedded_position_i] | |||
| jj, embedded_position_i = tf.while_loop( | |||
| condition_pos_i, | |||
| loop_body_pos_i, [jj, embedded_position_i], | |||
| shape_invariants=[ | |||
| jj.get_shape(), | |||
| tf.TensorShape([None]) | |||
| ]) | |||
| embedded_position = tf.concat([ | |||
| embedded_position, | |||
| tf.reshape(embedded_position_i, (1, -1)) | |||
| ], 0) | |||
| # others | |||
| LR_outputs = tf.concat([ | |||
| LR_outputs, | |||
| repeat( | |||
| encoder_outputs_[i:i + 1, :, :], | |||
| durations[i, :], | |||
| axis=1) | |||
| ], 0) | |||
| embedded_outputs_speaker = tf.concat([ | |||
| embedded_outputs_speaker, | |||
| repeat( | |||
| embedded_inputs_speaker[i:i + 1, :, :], | |||
| durations[i, :], | |||
| axis=1) | |||
| ], 0) | |||
| embedded_outputs_emotion = tf.concat([ | |||
| embedded_outputs_emotion, | |||
| repeat( | |||
| embedded_inputs_emotion[i:i + 1, :, :], | |||
| durations[i, :], | |||
| axis=1) | |||
| ], 0) | |||
| return [ | |||
| i + 1, embedded_position, LR_outputs, | |||
| embedded_outputs_speaker, embedded_outputs_emotion | |||
| ] | |||
| i, embedded_position, LR_outputs, \ | |||
| embedded_outputs_speaker, \ | |||
| embedded_outputs_emotion = tf.while_loop( | |||
| condition, | |||
| loop_body, [ | |||
| i, embedded_position, LR_outputs, | |||
| embedded_outputs_speaker, embedded_outputs_emotion | |||
| ], | |||
| shape_invariants=[ | |||
| i.get_shape(), | |||
| tf.TensorShape([None, None]), | |||
| tf.TensorShape([None, None, None]), | |||
| tf.TensorShape([None, None, None]), | |||
| tf.TensorShape([None, None, None]) | |||
| ], | |||
| parallel_iterations=hp.batch_size) | |||
| ori_framenum = tf.shape(mel_targets)[1] | |||
| else: | |||
| # position | |||
| j = tf.constant(1) | |||
| dur_len = tf.shape(duration_outputs_)[-1] | |||
| embedded_position_i = tf.range( | |||
| 1, | |||
| tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32) | |||
| + 1) | |||
| def condition_pos(j, e): | |||
| return tf.less(j, dur_len) | |||
| def loop_body_pos(j, embedded_position_i): | |||
| embedded_position_i = tf.concat([ | |||
| embedded_position_i, | |||
| tf.range( | |||
| 1, | |||
| tf.cast( | |||
| tf.round(duration_outputs_)[0, j], | |||
| tf.int32) + 1) | |||
| ], axis=0) # yapf:disable | |||
| return [j + 1, embedded_position_i] | |||
| j, embedded_position_i = tf.while_loop( | |||
| condition_pos, | |||
| loop_body_pos, [j, embedded_position_i], | |||
| shape_invariants=[ | |||
| j.get_shape(), | |||
| tf.TensorShape([None]) | |||
| ]) | |||
| embedded_position = tf.reshape(embedded_position_i, | |||
| (1, -1)) | |||
| # others | |||
| duration_outputs_ *= duration_scales | |||
| LR_outputs = repeat( | |||
| encoder_outputs_[0:1, :, :], | |||
| tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
| axis=1) | |||
| embedded_outputs_speaker = repeat( | |||
| embedded_inputs_speaker[0:1, :, :], | |||
| tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
| axis=1) | |||
| embedded_outputs_emotion = repeat( | |||
| embedded_inputs_emotion[0:1, :, :], | |||
| tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
| axis=1) | |||
| ori_framenum = tf.shape(LR_outputs)[1] | |||
| left = hp.outputs_per_step - tf.mod( | |||
| ori_framenum, hp.outputs_per_step) | |||
| LR_outputs = tf.cond( | |||
| tf.equal(left, | |||
| hp.outputs_per_step), lambda: LR_outputs, | |||
| lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]], | |||
| 'CONSTANT')) | |||
| embedded_outputs_speaker = tf.cond( | |||
| tf.equal(left, hp.outputs_per_step), | |||
| lambda: embedded_outputs_speaker, lambda: tf.pad( | |||
| embedded_outputs_speaker, [[0, 0], [0, left], | |||
| [0, 0]], 'CONSTANT')) | |||
| embedded_outputs_emotion = tf.cond( | |||
| tf.equal(left, hp.outputs_per_step), | |||
| lambda: embedded_outputs_emotion, lambda: tf.pad( | |||
| embedded_outputs_emotion, [[0, 0], [0, left], | |||
| [0, 0]], 'CONSTANT')) | |||
| embedded_position = tf.cond( | |||
| tf.equal(left, hp.outputs_per_step), | |||
| lambda: embedded_position, | |||
| lambda: tf.pad(embedded_position, [[0, 0], [0, left]], | |||
| 'CONSTANT')) | |||
| # Pos_Embedding | |||
| with tf.variable_scope('Position_Embedding'): | |||
| Pos_Embedding = BatchSinusodalPositionalEncoding() | |||
| position_embeddings = Pos_Embedding.positional_encoding( | |||
| batch_size, | |||
| tf.shape(LR_outputs)[1], hp.encoder_projection_units, | |||
| embedded_position) | |||
| LR_outputs += position_embeddings | |||
| # multi-frame | |||
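| # Group outputs_per_step (the reduction factor r) consecutive frames into one decoder step; | |||
| # the speaker/emotion embeddings are reshaped the same way, keeping only the first 32 dims | |||
| # (one copy per step). | |||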
| LR_outputs = tf.reshape(LR_outputs, [ | |||
| batch_size, -1, | |||
| hp.outputs_per_step * hp.encoder_projection_units | |||
| ]) | |||
| embedded_outputs_speaker = tf.reshape( | |||
| embedded_outputs_speaker, | |||
| [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] | |||
| embedded_outputs_emotion = tf.reshape( | |||
| embedded_outputs_emotion, | |||
| [batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] | |||
| # [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64) | |||
| LR_outputs = tf.concat([ | |||
| LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion | |||
| ], -1) | |||
| # auto bandwidth | |||
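| # The attention band width is derived from the longest duration, measured in decoder steps | |||
| # (i.e. divided by outputs_per_step); it bounds how far the decoder's self-attention (X) and | |||
| # memory attention (H) masks extend around the current step. | |||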
| if is_training: | |||
| durations_mask = tf.cast(durations, | |||
| tf.float32) * input_mask # [N, T_in] | |||
| else: | |||
| durations_mask = duration_outputs_ | |||
| X_band_width = tf.cast( | |||
| tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step), | |||
| tf.int32) | |||
| H_band_width = X_band_width | |||
| with tf.variable_scope('Decoder'): | |||
| Decoder = SelfAttentionDecoder( | |||
| num_layers=hp.decoder_num_layers, | |||
| num_units=hp.decoder_num_units, | |||
| num_heads=hp.decoder_num_heads, | |||
| ffn_inner_dim=hp.decoder_ffn_inner_dim, | |||
| dropout=hp.decoder_dropout, | |||
| attention_dropout=hp.decoder_attention_dropout, | |||
| relu_dropout=hp.decoder_relu_dropout, | |||
| prenet_units=hp.prenet_units, | |||
| dense_units=hp.prenet_proj_units, | |||
| num_mels=hp.num_mels, | |||
| outputs_per_step=hp.outputs_per_step, | |||
| X_band_width=X_band_width, | |||
| H_band_width=H_band_width, | |||
| position_encoder=None) | |||
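| # Three decode modes follow: free-run training (autoregressive from a zero initial frame), | |||
| # teacher forcing (targets shifted right and subsampled every r-th frame), and inference | |||
| # (autoregressive over the full length-regulated memory). | |||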
| if is_training: | |||
| if hp.free_run: | |||
| r = hp.outputs_per_step | |||
| init_decoder_input = tf.expand_dims( | |||
| tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
| axis=1) # [N, 1, hp.num_mels] | |||
| decoder_input_lengths = tf.cast( | |||
| output_lengths / r, tf.int32) | |||
| decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( | |||
| init_decoder_input, | |||
| maximum_iterations=tf.shape(LR_outputs)[1], | |||
| mode=is_training, | |||
| memory=LR_outputs, | |||
| memory_sequence_length=decoder_input_lengths) | |||
| else: | |||
| r = hp.outputs_per_step | |||
| decoder_input = mel_targets[:, r - 1:: | |||
| r, :] # [N, T_out / r, hp.num_mels] | |||
| init_decoder_input = tf.expand_dims( | |||
| tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
| axis=1) # [N, 1, hp.num_mels] | |||
| decoder_input = tf.concat( | |||
| [init_decoder_input, decoder_input], | |||
| axis=1) # [N, T_out / r + 1, hp.num_mels] | |||
| decoder_input = decoder_input[:, : | |||
| -1, :] # [N, T_out / r, hp.num_mels] | |||
| decoder_input_lengths = tf.cast( | |||
| output_lengths / r, tf.int32) | |||
| decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs( | |||
| decoder_input, | |||
| decoder_input_lengths, | |||
| mode=is_training, | |||
| memory=LR_outputs, | |||
| memory_sequence_length=decoder_input_lengths) | |||
| else: | |||
| init_decoder_input = tf.expand_dims( | |||
| tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
| axis=1) # [N, 1, hp.num_mels] | |||
| decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( | |||
| init_decoder_input, | |||
| maximum_iterations=tf.shape(LR_outputs)[1], | |||
| mode=is_training, | |||
| memory=LR_outputs, | |||
| memory_sequence_length=tf.expand_dims( | |||
| tf.shape(LR_outputs)[1], axis=0)) | |||
| if is_training: | |||
| mel_outputs_ = tf.reshape(decoder_outputs, | |||
| [batch_size, -1, hp.num_mels]) | |||
| else: | |||
| mel_outputs_ = tf.reshape( | |||
| decoder_outputs, | |||
| [batch_size, -1, hp.num_mels])[:, :ori_framenum, :] | |||
| mel_outputs = mel_outputs_ | |||
| with tf.variable_scope('Postnet'): | |||
| Postnet_FSMN = FsmnEncoderV2( | |||
| filter_size=hp.postnet_filter_size, | |||
| fsmn_num_layers=hp.postnet_fsmn_num_layers, | |||
| dnn_num_layers=hp.postnet_dnn_num_layers, | |||
| num_memory_units=hp.postnet_num_memory_units, | |||
| ffn_inner_dim=hp.postnet_ffn_inner_dim, | |||
| dropout=hp.postnet_dropout, | |||
| shift=hp.postnet_shift, | |||
| position_encoder=None) | |||
| if is_training: | |||
| postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( | |||
| mel_outputs, | |||
| sequence_length=output_lengths, | |||
| mode=is_training) | |||
| hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( | |||
| LSTMBlockCell(hp.postnet_lstm_units), | |||
| postnet_fsmn_outputs, | |||
| sequence_length=output_lengths, | |||
| dtype=tf.float32) | |||
| else: | |||
| postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( | |||
| mel_outputs, | |||
| sequence_length=[tf.shape(mel_outputs_)[1]], | |||
| mode=is_training) | |||
| hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( | |||
| LSTMBlockCell(hp.postnet_lstm_units), | |||
| postnet_fsmn_outputs, | |||
| sequence_length=[tf.shape(mel_outputs_)[1]], | |||
| dtype=tf.float32) | |||
| mel_residual_outputs = tf.layers.dense( | |||
| hidden_lstm_outputs, units=hp.num_mels) | |||
| mel_outputs += mel_residual_outputs | |||
| self.inputs = inputs | |||
| self.inputs_speaker = inputs_speaker | |||
| self.inputs_emotion = inputs_emotion | |||
| self.input_lengths = input_lengths | |||
| self.durations = durations | |||
| self.output_lengths = output_lengths | |||
| self.mel_outputs_ = mel_outputs_ | |||
| self.mel_outputs = mel_outputs | |||
| self.mel_targets = mel_targets | |||
| self.duration_outputs = duration_outputs | |||
| self.duration_outputs_ = duration_outputs_ | |||
| self.duration_scales = duration_scales | |||
| self.pitch_contour_outputs = pitch_contour_outputs | |||
| self.pitch_contours = pitch_contours | |||
| self.pitch_scales = pitch_scales | |||
| self.energy_contour_outputs = energy_contour_outputs | |||
| self.energy_contours = energy_contours | |||
| self.energy_scales = energy_scales | |||
| self.uv_masks_ = uv_masks | |||
| self.embedded_inputs_emotion = embedded_inputs_emotion | |||
| self.embedding_fsmn_outputs = embedded_inputs | |||
| self.encoder_outputs = encoder_outputs | |||
| self.encoder_outputs_ = encoder_outputs_ | |||
| self.LR_outputs = LR_outputs | |||
| self.postnet_fsmn_outputs = postnet_fsmn_outputs | |||
| self.pitch_embeddings = pitch_embeddings | |||
| self.energy_embeddings = energy_embeddings | |||
| self.attns = attns | |||
| self.attention_x = attention_x | |||
| self.attention_h = attention_h | |||
| self.X_band_width = X_band_width | |||
| self.H_band_width = H_band_width | |||
| def add_loss(self): | |||
| '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' | |||
| with tf.variable_scope('loss') as _: | |||
| hp = self._hparams | |||
| mask = tf.sequence_mask( | |||
| self.output_lengths, | |||
| tf.shape(self.mel_targets)[1], | |||
| dtype=tf.float32) | |||
| valid_outputs = tf.reduce_sum(mask) | |||
| mask_input = tf.sequence_mask( | |||
| self.input_lengths, | |||
| tf.shape(self.durations)[1], | |||
| dtype=tf.float32) | |||
| valid_inputs = tf.reduce_sum(mask_input) | |||
| # mel loss | |||
| if self.uv_masks_ is not None: | |||
| valid_outputs_mask = tf.reduce_sum( | |||
| tf.expand_dims(mask, -1) * self.uv_masks_) | |||
| self.mel_loss_ = tf.reduce_sum( | |||
| tf.abs(self.mel_targets - self.mel_outputs_) | |||
| * tf.expand_dims(mask, -1) * self.uv_masks_) / ( | |||
| valid_outputs_mask * hp.num_mels) | |||
| self.mel_loss = tf.reduce_sum( | |||
| tf.abs(self.mel_targets - self.mel_outputs) | |||
| * tf.expand_dims(mask, -1) * self.uv_masks_) / ( | |||
| valid_outputs_mask * hp.num_mels) | |||
| else: | |||
| self.mel_loss_ = tf.reduce_sum( | |||
| tf.abs(self.mel_targets - self.mel_outputs_) | |||
| * tf.expand_dims(mask, -1)) / ( | |||
| valid_outputs * hp.num_mels) | |||
| self.mel_loss = tf.reduce_sum( | |||
| tf.abs(self.mel_targets - self.mel_outputs) | |||
| * tf.expand_dims(mask, -1)) / ( | |||
| valid_outputs * hp.num_mels) | |||
| # duration loss | |||
| self.duration_loss = tf.reduce_sum( | |||
| tf.abs( | |||
| tf.log(tf.cast(self.durations, tf.float32) + 1) | |||
| - self.duration_outputs) * mask_input) / valid_inputs | |||
| # pitch contour loss | |||
| self.pitch_contour_loss = tf.reduce_sum( | |||
| tf.abs(self.pitch_contours - self.pitch_contour_outputs) | |||
| * mask_input) / valid_inputs | |||
| # energy contour loss | |||
| self.energy_contour_loss = tf.reduce_sum( | |||
| tf.abs(self.energy_contours - self.energy_contour_outputs) | |||
| * mask_input) / valid_inputs | |||
| # final loss | |||
| self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \ | |||
| + self.pitch_contour_loss + self.energy_contour_loss | |||
| # guided attention loss | |||
| self.guided_attention_loss = tf.constant(0.0) | |||
| if hp.guided_attention: | |||
| i0 = tf.constant(0) | |||
| loss0 = tf.constant(0.0) | |||
| def c(i, _): | |||
| return tf.less(i, tf.shape(self.mel_targets)[0]) | |||
| def loop_body(i, loss): | |||
| decoder_input_lengths = tf.cast( | |||
| self.output_lengths / hp.outputs_per_step, tf.int32) | |||
| input_len = decoder_input_lengths[i] | |||
| output_len = decoder_input_lengths[i] | |||
| input_w = tf.expand_dims( | |||
| tf.range(tf.cast(input_len, dtype=tf.float32)), | |||
| axis=1) / tf.cast( | |||
| input_len, dtype=tf.float32) # [T_in, 1] | |||
| output_w = tf.expand_dims( | |||
| tf.range(tf.cast(output_len, dtype=tf.float32)), | |||
| axis=0) / tf.cast( | |||
| output_len, dtype=tf.float32) # [1, T_out] | |||
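| # Guided attention weight: w[n, t] = 1 - exp(-((n/N - t/T)^2) / (2g^2)), where | |||
| # hp.guided_attention_2g_squared is assumed to hold 2g^2; attention mass far from | |||
| # the diagonal is penalized. | |||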
| guided_attention_w = 1.0 - tf.exp( | |||
| -(1 / hp.guided_attention_2g_squared) | |||
| * tf.square(input_w - output_w)) # [T_in, T_out] | |||
| guided_attention_w = tf.expand_dims( | |||
| guided_attention_w, axis=0) # [1, T_in, T_out] | |||
| # [hp.decoder_num_heads, T_in, T_out] | |||
| guided_attention_w = tf.tile(guided_attention_w, | |||
| [hp.decoder_num_heads, 1, 1]) | |||
| loss_i = tf.constant(0.0) | |||
| for j in range(hp.decoder_num_layers): | |||
| loss_i += tf.reduce_mean( | |||
| self.attention_h[j][i, :, :input_len, :output_len] | |||
| * guided_attention_w) | |||
| return [tf.add(i, 1), tf.add(loss, loss_i)] | |||
| _, loss = tf.while_loop( | |||
| c, | |||
| loop_body, | |||
| loop_vars=[i0, loss0], | |||
| parallel_iterations=hp.batch_size) | |||
| self.guided_attention_loss = loss / hp.batch_size | |||
| self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss | |||
| def add_optimizer(self, global_step): | |||
| '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. | |||
| Args: | |||
| global_step: int32 scalar Tensor representing current global step in training | |||
| ''' | |||
| with tf.variable_scope('optimizer') as _: | |||
| hp = self._hparams | |||
| if hp.decay_learning_rate: | |||
| self.learning_rate = _learning_rate_decay( | |||
| hp.initial_learning_rate, global_step) | |||
| else: | |||
| self.learning_rate = tf.convert_to_tensor( | |||
| hp.initial_learning_rate) | |||
| optimizer = tf.train.AdamOptimizer(self.learning_rate, | |||
| hp.adam_beta1, hp.adam_beta2) | |||
| gradients, variables = zip(*optimizer.compute_gradients(self.loss)) | |||
| self.gradients = gradients | |||
| clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) | |||
| # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: | |||
| # https://github.com/tensorflow/tensorflow/issues/1122 | |||
| with tf.control_dependencies( | |||
| tf.get_collection(tf.GraphKeys.UPDATE_OPS)): | |||
| self.optimize = optimizer.apply_gradients( | |||
| zip(clipped_gradients, variables), global_step=global_step) | |||
| def _learning_rate_decay(init_lr, global_step): | |||
| # Noam scheme from tensor2tensor: | |||
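| # lr = init_lr * warmup_steps^0.5 * min(step * warmup_steps^-1.5, step^-0.5): | |||
| # linear warm-up for the first 4000 steps, then decay proportional to 1/sqrt(step). | |||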
| warmup_steps = 4000.0 | |||
| step = tf.cast(global_step + 1, dtype=tf.float32) | |||
| return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, | |||
| step**-0.5) | |||
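| # Usage sketch (illustrative only; the surrounding training script is not part of this file). | |||
| # A training graph is assumed to be assembled roughly as: | |||
| #   model = RobuTrans(hparams) | |||
| #   model.initialize(inputs, inputs_emotion, inputs_speaker, input_lengths, | |||
| #                    output_lengths=output_lengths, mel_targets=mel_targets, | |||
| #                    durations=durations, pitch_contours=pitch_contours, | |||
| #                    uv_masks=uv_masks, energy_contours=energy_contours) | |||
| #   model.add_loss() | |||
| #   model.add_optimizer(global_step) | |||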
| @@ -0,0 +1,817 @@ | |||
| """Define self-attention decoder.""" | |||
| import sys | |||
| import tensorflow as tf | |||
| from . import compat, transformer | |||
| from .modules import decoder_prenet | |||
| from .position import SinusoidalPositionEncoder | |||
| class SelfAttentionDecoder(): | |||
| """Decoder using self-attention as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def __init__(self, | |||
| num_layers, | |||
| num_units=512, | |||
| num_heads=8, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.1, | |||
| attention_dropout=0.1, | |||
| relu_dropout=0.1, | |||
| prenet_units=256, | |||
| dense_units=128, | |||
| num_mels=80, | |||
| outputs_per_step=3, | |||
| X_band_width=None, | |||
| H_band_width=None, | |||
| position_encoder=SinusoidalPositionEncoder(), | |||
| self_attention_type='scaled_dot'): | |||
| """Initializes the parameters of the decoder. | |||
| Args: | |||
| num_layers: The number of layers. | |||
| num_units: The number of hidden units. | |||
| num_heads: The number of heads in the multi-head attention. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| attention_dropout: The probability to drop units from the attention. | |||
| relu_dropout: The probability to drop units from the ReLU activation in | |||
| the feed forward layer. | |||
| position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| self_attention_type: Type of self attention, "scaled_dot" or "average" (case | |||
| insensitive). | |||
| Raises: | |||
| ValueError: if :obj:`self_attention_type` is invalid. | |||
| """ | |||
| super(SelfAttentionDecoder, self).__init__() | |||
| self.num_layers = num_layers | |||
| self.num_units = num_units | |||
| self.num_heads = num_heads | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.relu_dropout = relu_dropout | |||
| self.position_encoder = position_encoder | |||
| self.self_attention_type = self_attention_type.lower() | |||
| if self.self_attention_type not in ('scaled_dot', 'average'): | |||
| raise ValueError('invalid attention type %s' | |||
| % self.self_attention_type) | |||
| if self.self_attention_type == 'average': | |||
| tf.logging.warning( | |||
| 'Support for average attention network is experimental ' | |||
| 'and may change in future versions.') | |||
| self.prenet_units = prenet_units | |||
| self.dense_units = dense_units | |||
| self.num_mels = num_mels | |||
| self.outputs_per_step = outputs_per_step | |||
| self.X_band_width = X_band_width | |||
| self.H_band_width = H_band_width | |||
| @property | |||
| def output_size(self): | |||
| """Returns the decoder output size.""" | |||
| return self.num_units | |||
| @property | |||
| def support_alignment_history(self): | |||
| return True | |||
| @property | |||
| def support_multi_source(self): | |||
| return True | |||
| def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): | |||
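| # Builds per-layer caches of projected self/memory keys and values (shape | |||
| # [batch, heads, time, units / heads]) so incremental decoding can reuse past projections. | |||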
| cache = {} | |||
| for layer in range(self.num_layers): | |||
| proj_cache_shape = [ | |||
| batch_size, self.num_heads, 0, self.num_units // self.num_heads | |||
| ] | |||
| layer_cache = {} | |||
| layer_cache['memory'] = [{ | |||
| 'memory_keys': | |||
| tf.zeros(proj_cache_shape, dtype=dtype), | |||
| 'memory_values': | |||
| tf.zeros(proj_cache_shape, dtype=dtype) | |||
| } for _ in range(num_sources)] | |||
| if self.self_attention_type == 'scaled_dot': | |||
| layer_cache['self_keys'] = tf.zeros( | |||
| proj_cache_shape, dtype=dtype) | |||
| layer_cache['self_values'] = tf.zeros( | |||
| proj_cache_shape, dtype=dtype) | |||
| elif self.self_attention_type == 'average': | |||
| layer_cache['prev_g'] = tf.zeros( | |||
| [batch_size, 1, self.num_units], dtype=dtype) | |||
| cache['layer_{}'.format(layer)] = layer_cache | |||
| return cache | |||
| def _init_attn(self, dtype=tf.float32): | |||
| attn = [] | |||
| for layer in range(self.num_layers): | |||
| attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True)) | |||
| return attn | |||
| def _self_attention_stack(self, | |||
| inputs, | |||
| sequence_length=None, | |||
| mode=True, | |||
| cache=None, | |||
| memory=None, | |||
| memory_sequence_length=None, | |||
| step=None): | |||
| # [N, T_out, self.dense_units] or [N, 1, self.dense_units] | |||
| prenet_outputs = decoder_prenet(inputs, self.prenet_units, | |||
| self.dense_units, mode) | |||
| if step is None: | |||
| decoder_inputs = tf.concat( | |||
| [memory, prenet_outputs], | |||
| axis=-1) # [N, T_out, memory_size + self.dense_units] | |||
| else: | |||
| decoder_inputs = tf.concat( | |||
| [memory[:, step:step + 1, :], prenet_outputs], | |||
| axis=-1) # [N, 1, memory_size + self.dense_units] | |||
| decoder_inputs = tf.layers.dense( | |||
| decoder_inputs, units=self.dense_units) | |||
| inputs = decoder_inputs | |||
| inputs *= self.num_units**0.5 | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder( | |||
| inputs, position=step + 1 if step is not None else None) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| decoder_mask = None | |||
| memory_mask = None | |||
| # last_attention = None | |||
| X_band_width_tmp = -1 | |||
| H_band_width_tmp = -1 | |||
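| # A band value of -1 is taken to mean no banding; otherwise X_band_width limits the | |||
| # decoder self-attention and H_band_width limits attention over the memory, and the band | |||
| # is only applied once the memory is at least that long (see the tf.cond below). | |||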
| if self.X_band_width is not None: | |||
| X_band_width_tmp = tf.cast( | |||
| tf.cond( | |||
| tf.less(tf.shape(memory)[1], self.X_band_width), | |||
| lambda: -1, lambda: self.X_band_width), | |||
| dtype=tf.int64) | |||
| if self.H_band_width is not None: | |||
| H_band_width_tmp = tf.cast( | |||
| tf.cond( | |||
| tf.less(tf.shape(memory)[1], self.H_band_width), | |||
| lambda: -1, lambda: self.H_band_width), | |||
| dtype=tf.int64) | |||
| if self.self_attention_type == 'scaled_dot': | |||
| if sequence_length is not None: | |||
| decoder_mask = transformer.build_future_mask( | |||
| sequence_length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(inputs)[1], | |||
| band=X_band_width_tmp) # [N, 1, T_out, T_out] | |||
| elif self.self_attention_type == 'average': | |||
| if cache is None: | |||
| if sequence_length is None: | |||
| sequence_length = tf.fill([tf.shape(inputs)[0]], | |||
| tf.shape(inputs)[1]) | |||
| decoder_mask = transformer.cumulative_average_mask( | |||
| sequence_length, | |||
| maximum_length=tf.shape(inputs)[1], | |||
| dtype=inputs.dtype) | |||
| if memory is not None and not tf.contrib.framework.nest.is_sequence( | |||
| memory): | |||
| memory = (memory, ) | |||
| if memory_sequence_length is not None: | |||
| if not tf.contrib.framework.nest.is_sequence( | |||
| memory_sequence_length): | |||
| memory_sequence_length = (memory_sequence_length, ) | |||
| if step is None: | |||
| memory_mask = [ | |||
| transformer.build_history_mask( | |||
| length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(m)[1], | |||
| band=H_band_width_tmp) | |||
| for m, length in zip(memory, memory_sequence_length) | |||
| ] | |||
| else: | |||
| memory_mask = [ | |||
| transformer.build_history_mask( | |||
| length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(m)[1], | |||
| band=H_band_width_tmp)[:, :, step:step + 1, :] | |||
| for m, length in zip(memory, memory_sequence_length) | |||
| ] | |||
| # last_attention = None | |||
| attns_x = [] | |||
| attns_h = [] | |||
| for layer in range(self.num_layers): | |||
| layer_name = 'layer_{}'.format(layer) | |||
| layer_cache = cache[layer_name] if cache is not None else None | |||
| with tf.variable_scope(layer_name): | |||
| if memory is not None: | |||
| for i, (mem, mask) in enumerate(zip(memory, memory_mask)): | |||
| memory_cache = None | |||
| if layer_cache is not None: | |||
| memory_cache = layer_cache['memory'][i] | |||
| scope_name = 'multi_head_{}'.format(i) | |||
| if i == 0: | |||
| scope_name = 'multi_head' | |||
| with tf.variable_scope(scope_name): | |||
| encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA( | |||
| self.num_heads, | |||
| transformer.norm(inputs), | |||
| mem, | |||
| mode, | |||
| num_units=self.num_units, | |||
| mask=decoder_mask, | |||
| mask_h=mask, | |||
| cache=layer_cache, | |||
| cache_h=memory_cache, | |||
| dropout=self.attention_dropout, | |||
| return_attention=True, | |||
| layer_name=layer_name, | |||
| X_band_width=self.X_band_width) | |||
| attns_x.append(attn_x) | |||
| attns_h.append(attn_h) | |||
| context = transformer.drop_and_add( | |||
| inputs, encoded, mode, dropout=self.dropout) | |||
| with tf.variable_scope('ffn'): | |||
| transformed = transformer.feed_forward_ori( | |||
| transformer.norm(context), | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout) | |||
| transformed = transformer.drop_and_add( | |||
| context, transformed, mode, dropout=self.dropout) | |||
| inputs = transformed | |||
| outputs = transformer.norm(inputs) | |||
| outputs = tf.layers.dense( | |||
| outputs, units=self.num_mels * self.outputs_per_step) | |||
| return outputs, attns_x, attns_h | |||
| def decode_from_inputs(self, | |||
| inputs, | |||
| sequence_length, | |||
| initial_state=None, | |||
| mode=True, | |||
| memory=None, | |||
| memory_sequence_length=None): | |||
| outputs, attention_x, attention_h = self._self_attention_stack( | |||
| inputs, | |||
| sequence_length=sequence_length, | |||
| mode=mode, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length) | |||
| return outputs, attention_x, attention_h | |||
| def step_fn(self, | |||
| mode, | |||
| batch_size, | |||
| initial_state=None, | |||
| memory=None, | |||
| memory_sequence_length=None, | |||
| dtype=tf.float32): | |||
| if memory is None: | |||
| num_sources = 0 | |||
| elif tf.contrib.framework.nest.is_sequence(memory): | |||
| num_sources = len(memory) | |||
| else: | |||
| num_sources = 1 | |||
| cache = self._init_cache( | |||
| batch_size, dtype=dtype, num_sources=num_sources) | |||
| attention_x = self._init_attn(dtype=dtype) | |||
| attention_h = self._init_attn(dtype=dtype) | |||
| def _fn(step, inputs, cache): | |||
| outputs, attention_x, attention_h = self._self_attention_stack( | |||
| inputs, | |||
| mode=mode, | |||
| cache=cache, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length, | |||
| step=step) | |||
| attention_x_tmp = [] | |||
| for layer in range(len(attention_h)): | |||
| attention_x_tmp_l = tf.zeros_like(attention_h[layer]) | |||
| if self.X_band_width is not None: | |||
| pred = tf.less(step, self.X_band_width + 1) | |||
| attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable | |||
| lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer], | |||
| lambda: tf.concat([ | |||
| attention_x_tmp_l[:, :, :, | |||
| :step - self.X_band_width], | |||
| attention_x_tmp_l[:, :, :, | |||
| step - self.X_band_width:step + 1] | |||
| + attention_x[layer]], | |||
| axis=-1)) # yapf:disable | |||
| attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] | |||
| attention_x_tmp.append( | |||
| tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2], | |||
| axis=-1)) | |||
| else: | |||
| attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1] | |||
| attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] | |||
| attention_x_tmp.append( | |||
| tf.concat([ | |||
| attention_x_tmp_l_1 + attention_x[layer], | |||
| attention_x_tmp_l_2 | |||
| ], axis=-1)) # yapf:disable | |||
| attention_x = attention_x_tmp | |||
| return outputs, cache, attention_x, attention_h | |||
| return _fn, cache, attention_x, attention_h | |||
| def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations, | |||
| mode, memory, memory_sequence_length): | |||
| batch_size = tf.shape(init_decoder_input)[0] | |||
| step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( | |||
| mode, | |||
| batch_size, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length) | |||
| outputs, attention_x, attention_h, cache = self.dynamic_decode( | |||
| step_fn, | |||
| init_decoder_input, | |||
| init_cache=init_cache, | |||
| init_attn_x=init_attn_x, | |||
| init_attn_h=init_attn_h, | |||
| maximum_iterations=maximum_iterations, | |||
| batch_size=batch_size) | |||
| return outputs, attention_x, attention_h | |||
| def dynamic_decode_and_search_teacher_forcing(self, decoder_input, | |||
| maximum_iterations, mode, | |||
| memory, | |||
| memory_sequence_length): | |||
| batch_size = tf.shape(decoder_input)[0] | |||
| step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( | |||
| mode, | |||
| batch_size, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length) | |||
| outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing( | |||
| step_fn, | |||
| decoder_input, | |||
| init_cache=init_cache, | |||
| init_attn_x=init_attn_x, | |||
| init_attn_h=init_attn_h, | |||
| maximum_iterations=maximum_iterations, | |||
| batch_size=batch_size) | |||
| return outputs, attention_x, attention_h | |||
| def dynamic_decode(self, | |||
| step_fn, | |||
| init_decoder_input, | |||
| init_cache=None, | |||
| init_attn_x=None, | |||
| init_attn_h=None, | |||
| maximum_iterations=None, | |||
| batch_size=None): | |||
| def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument | |||
| return tf.less(step, maximum_iterations) | |||
| def _body(step, cache, inputs, outputs, attention_x, attention_h): | |||
| # output: [1, 1, num_mels * r] | |||
| # attn: [1, 1, T_out] | |||
| output, cache, attn_x, attn_h = step_fn( | |||
| step, inputs, cache) # outputs, cache, attention, attns | |||
| for layer in range(len(attention_x)): | |||
| attention_x[layer] = attention_x[layer].write( | |||
| step, tf.cast(attn_x[layer], tf.float32)) | |||
| for layer in range(len(attention_h)): | |||
| attention_h[layer] = attention_h[layer].write( | |||
| step, tf.cast(attn_h[layer], tf.float32)) | |||
| outputs = outputs.write(step, tf.cast(output, tf.float32)) | |||
| return step + 1, cache, output[:, :, -self. | |||
| num_mels:], outputs, attention_x, attention_h | |||
| step = tf.constant(0, dtype=tf.int32) | |||
| outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | |||
| _, cache, _, outputs, attention_x, attention_h = tf.while_loop( | |||
| _cond, | |||
| _body, | |||
| loop_vars=(step, init_cache, init_decoder_input, outputs, | |||
| init_attn_x, init_attn_h), | |||
| shape_invariants=(step.shape, | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_cache), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, | |||
| init_decoder_input), tf.TensorShape(None), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_attn_x), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_attn_h)), | |||
| parallel_iterations=1, | |||
| back_prop=False, | |||
| maximum_iterations=maximum_iterations) | |||
| # element of outputs: [N, 1, num_mels * r] | |||
| outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] | |||
| outputs_stack = tf.transpose( | |||
| outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] | |||
| outputs_stack = tf.squeeze( | |||
| outputs_stack, axis=0) # [N, T_out, num_mels * r] | |||
| attention_x_stack = [] | |||
| for layer in range(len(attention_x)): | |||
| attention_x_stack_tmp = attention_x[layer].stack( | |||
| ) # [T_out, N, H, 1, T_out] | |||
| attention_x_stack_tmp = tf.transpose( | |||
| attention_x_stack_tmp, perm=[3, 1, 2, 0, | |||
| 4]) # [1, N, H, T_out, T_out] | |||
| attention_x_stack_tmp = tf.squeeze( | |||
| attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
| attention_x_stack.append(attention_x_stack_tmp) | |||
| attention_h_stack = [] | |||
| for layer in range(len(attention_h)): | |||
| attention_h_stack_tmp = attention_h[layer].stack( | |||
| ) # [T_out, N, H, 1, T_out] | |||
| attention_h_stack_tmp = tf.transpose( | |||
| attention_h_stack_tmp, perm=[3, 1, 2, 0, | |||
| 4]) # [1, N, H, T_out, T_out] | |||
| attention_h_stack_tmp = tf.squeeze( | |||
| attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
| attention_h_stack.append(attention_h_stack_tmp) | |||
| return outputs_stack, attention_x_stack, attention_h_stack, cache | |||
| def dynamic_decode_teacher_forcing(self, | |||
| step_fn, | |||
| decoder_input, | |||
| init_cache=None, | |||
| init_attn_x=None, | |||
| init_attn_h=None, | |||
| maximum_iterations=None, | |||
| batch_size=None): | |||
| def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument | |||
| return tf.less(step, maximum_iterations) | |||
| def _body(step, cache, inputs, outputs, attention_x, attention_h): | |||
| # output: [1, 1, num_mels * r] | |||
| # attn: [1, 1, T_out] | |||
| output, cache, attn_x, attn_h = step_fn( | |||
| step, inputs[:, step:step + 1, :], | |||
| cache) # outputs, cache, attention, attns | |||
| for layer in range(len(attention_x)): | |||
| attention_x[layer] = attention_x[layer].write( | |||
| step, tf.cast(attn_x[layer], tf.float32)) | |||
| for layer in range(len(attention_h)): | |||
| attention_h[layer] = attention_h[layer].write( | |||
| step, tf.cast(attn_h[layer], tf.float32)) | |||
| outputs = outputs.write(step, tf.cast(output, tf.float32)) | |||
| return step + 1, cache, inputs, outputs, attention_x, attention_h | |||
| step = tf.constant(0, dtype=tf.int32) | |||
| outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | |||
| _, cache, _, outputs, attention_x, attention_h = tf.while_loop( | |||
| _cond, | |||
| _body, | |||
| loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x, | |||
| init_attn_h), | |||
| shape_invariants=(step.shape, | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, | |||
| init_cache), decoder_input.shape, | |||
| tf.TensorShape(None), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_attn_x), | |||
| compat.nest.map_structure( | |||
| self._get_shape_invariants, init_attn_h)), | |||
| parallel_iterations=1, | |||
| back_prop=False, | |||
| maximum_iterations=maximum_iterations) | |||
| # element of outputs: [N, 1, num_mels * r] | |||
| outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] | |||
| outputs_stack = tf.transpose( | |||
| outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] | |||
| outputs_stack = tf.squeeze( | |||
| outputs_stack, axis=0) # [N, T_out, num_mels * r] | |||
| attention_x_stack = [] | |||
| for layer in range(len(attention_x)): | |||
| attention_x_stack_tmp = attention_x[layer].stack( | |||
| ) # [T_out, N, H, 1, T_out] | |||
| attention_x_stack_tmp = tf.transpose( | |||
| attention_x_stack_tmp, perm=[3, 1, 2, 0, | |||
| 4]) # [1, N, H, T_out, T_out] | |||
| attention_x_stack_tmp = tf.squeeze( | |||
| attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
| attention_x_stack.append(attention_x_stack_tmp) | |||
| attention_h_stack = [] | |||
| for layer in range(len(attention_h)): | |||
| attention_h_stack_tmp = attention_h[layer].stack( | |||
| ) # [T_out, N, H, 1, T_out] | |||
| attention_h_stack_tmp = tf.transpose( | |||
| attention_h_stack_tmp, perm=[3, 1, 2, 0, | |||
| 4]) # [1, N, H, T_out, T_out] | |||
| attention_h_stack_tmp = tf.squeeze( | |||
| attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
| attention_h_stack.append(attention_h_stack_tmp) | |||
| return outputs_stack, attention_x_stack, attention_h_stack, cache | |||
| def _get_shape_invariants(self, tensor): | |||
| """Returns the shape of the tensor but sets middle dims to None.""" | |||
| if isinstance(tensor, tf.TensorArray): | |||
| shape = None | |||
| else: | |||
| shape = tensor.shape.as_list() | |||
| for i in range(1, len(shape) - 1): | |||
| shape[i] = None | |||
| return tf.TensorShape(shape) | |||
| class SelfAttentionDecoderOri(): | |||
| """Decoder using self-attention as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def __init__(self, | |||
| num_layers, | |||
| num_units=512, | |||
| num_heads=8, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.1, | |||
| attention_dropout=0.1, | |||
| relu_dropout=0.1, | |||
| position_encoder=SinusoidalPositionEncoder(), | |||
| self_attention_type='scaled_dot'): | |||
| """Initializes the parameters of the decoder. | |||
| Args: | |||
| num_layers: The number of layers. | |||
| num_units: The number of hidden units. | |||
| num_heads: The number of heads in the multi-head attention. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| attention_dropout: The probability to drop units from the attention. | |||
| relu_dropout: The probability to drop units from the ReLU activation in | |||
| the feed forward layer. | |||
| position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| self_attention_type: Type of self attention, "scaled_dot" or "average" (case | |||
| insensitive). | |||
| Raises: | |||
| ValueError: if :obj:`self_attention_type` is invalid. | |||
| """ | |||
| super(SelfAttentionDecoderOri, self).__init__() | |||
| self.num_layers = num_layers | |||
| self.num_units = num_units | |||
| self.num_heads = num_heads | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.relu_dropout = relu_dropout | |||
| self.position_encoder = position_encoder | |||
| self.self_attention_type = self_attention_type.lower() | |||
| if self.self_attention_type not in ('scaled_dot', 'average'): | |||
| raise ValueError('invalid attention type %s' | |||
| % self.self_attention_type) | |||
| if self.self_attention_type == 'average': | |||
| tf.logging.warning( | |||
| 'Support for average attention network is experimental ' | |||
| 'and may change in future versions.') | |||
| @property | |||
| def output_size(self): | |||
| """Returns the decoder output size.""" | |||
| return self.num_units | |||
| @property | |||
| def support_alignment_history(self): | |||
| return True | |||
| @property | |||
| def support_multi_source(self): | |||
| return True | |||
| def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): | |||
| cache = {} | |||
| for layer in range(self.num_layers): | |||
| proj_cache_shape = [ | |||
| batch_size, self.num_heads, 0, self.num_units // self.num_heads | |||
| ] | |||
| layer_cache = {} | |||
| layer_cache['memory'] = [{ | |||
| 'memory_keys': | |||
| tf.zeros(proj_cache_shape, dtype=dtype), | |||
| 'memory_values': | |||
| tf.zeros(proj_cache_shape, dtype=dtype) | |||
| } for _ in range(num_sources)] | |||
| if self.self_attention_type == 'scaled_dot': | |||
| layer_cache['self_keys'] = tf.zeros( | |||
| proj_cache_shape, dtype=dtype) | |||
| layer_cache['self_values'] = tf.zeros( | |||
| proj_cache_shape, dtype=dtype) | |||
| elif self.self_attention_type == 'average': | |||
| layer_cache['prev_g'] = tf.zeros( | |||
| [batch_size, 1, self.num_units], dtype=dtype) | |||
| cache['layer_{}'.format(layer)] = layer_cache | |||
| return cache | |||
| def _self_attention_stack(self, | |||
| inputs, | |||
| sequence_length=None, | |||
| mode=True, | |||
| cache=None, | |||
| memory=None, | |||
| memory_sequence_length=None, | |||
| step=None): | |||
| inputs *= self.num_units**0.5 | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder( | |||
| inputs, position=step + 1 if step is not None else None) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| decoder_mask = None | |||
| memory_mask = None | |||
| last_attention = None | |||
| if self.self_attention_type == 'scaled_dot': | |||
| if sequence_length is not None: | |||
| decoder_mask = transformer.build_future_mask( | |||
| sequence_length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(inputs)[1]) | |||
| elif self.self_attention_type == 'average': | |||
| if cache is None: | |||
| if sequence_length is None: | |||
| sequence_length = tf.fill([tf.shape(inputs)[0]], | |||
| tf.shape(inputs)[1]) | |||
| decoder_mask = transformer.cumulative_average_mask( | |||
| sequence_length, | |||
| maximum_length=tf.shape(inputs)[1], | |||
| dtype=inputs.dtype) | |||
| if memory is not None and not tf.contrib.framework.nest.is_sequence( | |||
| memory): | |||
| memory = (memory, ) | |||
| if memory_sequence_length is not None: | |||
| if not tf.contrib.framework.nest.is_sequence( | |||
| memory_sequence_length): | |||
| memory_sequence_length = (memory_sequence_length, ) | |||
| memory_mask = [ | |||
| transformer.build_sequence_mask( | |||
| length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(m)[1]) | |||
| for m, length in zip(memory, memory_sequence_length) | |||
| ] | |||
| for layer in range(self.num_layers): | |||
| layer_name = 'layer_{}'.format(layer) | |||
| layer_cache = cache[layer_name] if cache is not None else None | |||
| with tf.variable_scope(layer_name): | |||
| if self.self_attention_type == 'scaled_dot': | |||
| with tf.variable_scope('masked_multi_head'): | |||
| encoded = transformer.multi_head_attention( | |||
| self.num_heads, | |||
| transformer.norm(inputs), | |||
| None, | |||
| mode, | |||
| num_units=self.num_units, | |||
| mask=decoder_mask, | |||
| cache=layer_cache, | |||
| dropout=self.attention_dropout) | |||
| last_context = transformer.drop_and_add( | |||
| inputs, encoded, mode, dropout=self.dropout) | |||
| elif self.self_attention_type == 'average': | |||
| with tf.variable_scope('average_attention'): | |||
| # Cumulative average. | |||
| x = transformer.norm(inputs) | |||
| y = transformer.cumulative_average( | |||
| x, | |||
| decoder_mask if cache is None else step, | |||
| cache=layer_cache) | |||
| # FFN. | |||
| y = transformer.feed_forward( | |||
| y, | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout) | |||
| # Gating layer. | |||
| z = tf.layers.dense( | |||
| tf.concat([x, y], -1), self.num_units * 2) | |||
| i, f = tf.split(z, 2, axis=-1) | |||
| y = tf.sigmoid(i) * x + tf.sigmoid(f) * y | |||
| last_context = transformer.drop_and_add( | |||
| inputs, y, mode, dropout=self.dropout) | |||
| if memory is not None: | |||
| for i, (mem, mask) in enumerate(zip(memory, memory_mask)): | |||
| memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable | |||
| with tf.variable_scope('multi_head' if i | |||
| == 0 else 'multi_head_%d' % i): # yapf:disable | |||
| context, last_attention = transformer.multi_head_attention( | |||
| self.num_heads, | |||
| transformer.norm(last_context), | |||
| mem, | |||
| mode, | |||
| mask=mask, | |||
| cache=memory_cache, | |||
| dropout=self.attention_dropout, | |||
| return_attention=True) | |||
| last_context = transformer.drop_and_add( | |||
| last_context, | |||
| context, | |||
| mode, | |||
| dropout=self.dropout) | |||
| if i > 0: # Do not return attention in case of multi source. | |||
| last_attention = None | |||
| with tf.variable_scope('ffn'): | |||
| transformed = transformer.feed_forward_ori( | |||
| transformer.norm(last_context), | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout) | |||
| transformed = transformer.drop_and_add( | |||
| last_context, transformed, mode, dropout=self.dropout) | |||
| inputs = transformed | |||
| if last_attention is not None: | |||
| # The first head of the last layer is returned. | |||
| first_head_attention = last_attention[:, 0] | |||
| else: | |||
| first_head_attention = None | |||
| outputs = transformer.norm(inputs) | |||
| return outputs, first_head_attention | |||
| def decode_from_inputs(self, | |||
| inputs, | |||
| sequence_length, | |||
| initial_state=None, | |||
| mode=True, | |||
| memory=None, | |||
| memory_sequence_length=None): | |||
| outputs, attention = self._self_attention_stack( | |||
| inputs, | |||
| sequence_length=sequence_length, | |||
| mode=mode, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length) | |||
| return outputs, None, attention | |||
| def step_fn(self, | |||
| mode, | |||
| batch_size, | |||
| initial_state=None, | |||
| memory=None, | |||
| memory_sequence_length=None, | |||
| dtype=tf.float32): | |||
| if memory is None: | |||
| num_sources = 0 | |||
| elif tf.contrib.framework.nest.is_sequence(memory): | |||
| num_sources = len(memory) | |||
| else: | |||
| num_sources = 1 | |||
| cache = self._init_cache( | |||
| batch_size, dtype=dtype, num_sources=num_sources) | |||
| def _fn(step, inputs, cache, mode): | |||
| inputs = tf.expand_dims(inputs, 1) | |||
| outputs, attention = self._self_attention_stack( | |||
| inputs, | |||
| mode=mode, | |||
| cache=cache, | |||
| memory=memory, | |||
| memory_sequence_length=memory_sequence_length, | |||
| step=step) | |||
| outputs = tf.squeeze(outputs, axis=1) | |||
| if attention is not None: | |||
| attention = tf.squeeze(attention, axis=1) | |||
| return outputs, cache, attention | |||
| return _fn, cache | |||
| @@ -0,0 +1,182 @@ | |||
| """Define the self-attention encoder.""" | |||
| import tensorflow as tf | |||
| from . import transformer | |||
| from .position import SinusoidalPositionEncoder | |||
| class SelfAttentionEncoder(): | |||
| """Encoder using self-attention as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def __init__(self, | |||
| num_layers, | |||
| num_units=512, | |||
| num_heads=8, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.1, | |||
| attention_dropout=0.1, | |||
| relu_dropout=0.1, | |||
| position_encoder=SinusoidalPositionEncoder()): | |||
| """Initializes the parameters of the encoder. | |||
| Args: | |||
| num_layers: The number of layers. | |||
| num_units: The number of hidden units. | |||
| num_heads: The number of heads in the multi-head attention. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| attention_dropout: The probability to drop units from the attention. | |||
| relu_dropout: The probability to drop units from the ReLU activation in | |||
| the feed forward layer. | |||
| position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| """ | |||
| super(SelfAttentionEncoder, self).__init__() | |||
| self.num_layers = num_layers | |||
| self.num_units = num_units | |||
| self.num_heads = num_heads | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.relu_dropout = relu_dropout | |||
| self.position_encoder = position_encoder | |||
| def encode(self, inputs, sequence_length=None, mode=True): | |||
| inputs *= self.num_units**0.5 | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder(inputs) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| mask = transformer.build_sequence_mask( | |||
| sequence_length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(inputs)[1]) | |||
| mask_FF = tf.squeeze( | |||
| transformer.build_sequence_mask( | |||
| sequence_length, maximum_length=tf.shape(inputs)[1]), | |||
| axis=1) | |||
| state = () | |||
| attns = [] | |||
| for layer in range(self.num_layers): | |||
| with tf.variable_scope('layer_{}'.format(layer)): | |||
| with tf.variable_scope('multi_head'): | |||
| context, attn = transformer.multi_head_attention( | |||
| self.num_heads, | |||
| transformer.norm(inputs), | |||
| None, | |||
| mode, | |||
| num_units=self.num_units, | |||
| mask=mask, | |||
| dropout=self.attention_dropout, | |||
| return_attention=True) | |||
| attns.append(attn) | |||
| context = transformer.drop_and_add( | |||
| inputs, context, mode, dropout=self.dropout) | |||
| with tf.variable_scope('ffn'): | |||
| transformed = transformer.feed_forward( | |||
| transformer.norm(context), | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout, | |||
| mask=mask_FF) | |||
| transformed = transformer.drop_and_add( | |||
| context, transformed, mode, dropout=self.dropout) | |||
| inputs = transformed | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| outputs = transformer.norm(inputs) | |||
| return (outputs, state, sequence_length, attns) | |||
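| # Illustrative usage sketch (an assumption, not part of this diff): under a TF1 graph, | |||
| # with `embedded` of shape [batch, time, num_units] and `lengths` an int32 [batch] tensor: | |||
| #   encoder = SelfAttentionEncoder(num_layers=6, num_units=512, num_heads=8) | |||
| #   outputs, state, lengths, attns = encoder.encode(embedded, lengths, mode=is_training) | |||
| # `mode` is the boolean training flag forwarded to tf.layers.dropout. | |||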
| class SelfAttentionEncoderOri(): | |||
| """Encoder using self-attention as described in | |||
| https://arxiv.org/abs/1706.03762. | |||
| """ | |||
| def __init__(self, | |||
| num_layers, | |||
| num_units=512, | |||
| num_heads=8, | |||
| ffn_inner_dim=2048, | |||
| dropout=0.1, | |||
| attention_dropout=0.1, | |||
| relu_dropout=0.1, | |||
| position_encoder=SinusoidalPositionEncoder()): | |||
| """Initializes the parameters of the encoder. | |||
| Args: | |||
| num_layers: The number of layers. | |||
| num_units: The number of hidden units. | |||
| num_heads: The number of heads in the multi-head attention. | |||
| ffn_inner_dim: The number of units of the inner linear transformation | |||
| in the feed forward layer. | |||
| dropout: The probability to drop units from the outputs. | |||
| attention_dropout: The probability to drop units from the attention. | |||
| relu_dropout: The probability to drop units from the ReLU activation in | |||
| the feed forward layer. | |||
| position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
| apply on inputs or ``None``. | |||
| """ | |||
| super(SelfAttentionEncoderOri, self).__init__() | |||
| self.num_layers = num_layers | |||
| self.num_units = num_units | |||
| self.num_heads = num_heads | |||
| self.ffn_inner_dim = ffn_inner_dim | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.relu_dropout = relu_dropout | |||
| self.position_encoder = position_encoder | |||
| def encode(self, inputs, sequence_length=None, mode=True): | |||
| inputs *= self.num_units**0.5 | |||
| if self.position_encoder is not None: | |||
| inputs = self.position_encoder(inputs) | |||
| inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
| mask = transformer.build_sequence_mask( | |||
| sequence_length, | |||
| num_heads=self.num_heads, | |||
| maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out] | |||
| state = () | |||
| attns = [] | |||
| for layer in range(self.num_layers): | |||
| with tf.variable_scope('layer_{}'.format(layer)): | |||
| with tf.variable_scope('multi_head'): | |||
| context, attn = transformer.multi_head_attention( | |||
| self.num_heads, | |||
| transformer.norm(inputs), | |||
| None, | |||
| mode, | |||
| num_units=self.num_units, | |||
| mask=mask, | |||
| dropout=self.attention_dropout, | |||
| return_attention=True) | |||
| attns.append(attn) | |||
| context = transformer.drop_and_add( | |||
| inputs, context, mode, dropout=self.dropout) | |||
| with tf.variable_scope('ffn'): | |||
| transformed = transformer.feed_forward_ori( | |||
| transformer.norm(context), | |||
| self.ffn_inner_dim, | |||
| mode, | |||
| dropout=self.relu_dropout) | |||
| transformed = transformer.drop_and_add( | |||
| context, transformed, mode, dropout=self.dropout) | |||
| inputs = transformed | |||
| state += (tf.reduce_mean(inputs, axis=1), ) | |||
| outputs = transformer.norm(inputs) | |||
| return (outputs, state, sequence_length, attns) | |||
| @@ -0,0 +1,255 @@ | |||
| import io | |||
| import os | |||
| from typing import Any, Dict, Optional, Union | |||
| import numpy as np | |||
| import tensorflow as tf | |||
| from sklearn.preprocessing import MultiLabelBinarizer | |||
| from modelscope.models.base import Model | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from .models import create_model | |||
| from .text.symbols import load_symbols | |||
| from .text.symbols_dict import SymbolsDict | |||
| __all__ = ['SambertNetHifi16k'] | |||
| def multi_label_symbol_to_sequence(my_classes, my_symbol): | |||
| one_hot = MultiLabelBinarizer(my_classes) | |||
| tokens = my_symbol.strip().split(' ') | |||
| sequences = [] | |||
| for token in tokens: | |||
| sequences.append(tuple(token.split('&'))) | |||
| # sequences.append(tuple(['~'])) # sequence length minus 1 to ignore EOS ~ | |||
| return one_hot.fit_transform(sequences) | |||
| @MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k') | |||
| class SambertNetHifi16k(Model): | |||
| def __init__(self, | |||
| model_dir, | |||
| pitch_control_str='', | |||
| duration_control_str='', | |||
| energy_control_str='', | |||
| *args, | |||
| **kwargs): | |||
| tf.reset_default_graph() | |||
| local_ckpt_path = os.path.join(ModelFile.TF_CHECKPOINT_FOLDER, 'ckpt') | |||
| self._ckpt_path = os.path.join(model_dir, local_ckpt_path) | |||
| self._dict_path = os.path.join(model_dir, 'dicts') | |||
| self._hparams = tf.contrib.training.HParams(**kwargs) | |||
| values = self._hparams.values() | |||
| hp = [' {}:{}'.format(name, values[name]) for name in sorted(values)] | |||
| print('Hyperparameters:\n' + '\n'.join(hp)) | |||
| super().__init__(self._ckpt_path, *args, **kwargs) | |||
| model_name = 'robutrans' | |||
| self._lfeat_type_list = self._hparams.lfeat_type_list.strip().split( | |||
| ',') | |||
| sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols( | |||
| self._dict_path) | |||
| self._sy = sy | |||
| self._tone = tone | |||
| self._syllable_flag = syllable_flag | |||
| self._word_segment = word_segment | |||
| self._emo_category = emo_category | |||
| self._speaker = speaker | |||
| self._inputs_dim = dict() | |||
| for lfeat_type in self._lfeat_type_list: | |||
| if lfeat_type == 'sy': | |||
| self._inputs_dim[lfeat_type] = len(sy) | |||
| elif lfeat_type == 'tone': | |||
| self._inputs_dim[lfeat_type] = len(tone) | |||
| elif lfeat_type == 'syllable_flag': | |||
| self._inputs_dim[lfeat_type] = len(syllable_flag) | |||
| elif lfeat_type == 'word_segment': | |||
| self._inputs_dim[lfeat_type] = len(word_segment) | |||
| elif lfeat_type == 'emo_category': | |||
| self._inputs_dim[lfeat_type] = len(emo_category) | |||
| elif lfeat_type == 'speaker': | |||
| self._inputs_dim[lfeat_type] = len(speaker) | |||
| self._symbols_dict = SymbolsDict(sy, tone, syllable_flag, word_segment, | |||
| emo_category, speaker, | |||
| self._inputs_dim, | |||
| self._lfeat_type_list) | |||
| dim_inputs = sum(self._inputs_dim.values( | |||
| )) - self._inputs_dim['speaker'] - self._inputs_dim['emo_category'] | |||
| inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], 'inputs') | |||
| inputs_emotion = tf.placeholder( | |||
| tf.float32, [1, None, self._inputs_dim['emo_category']], | |||
| 'inputs_emotion') | |||
| inputs_speaker = tf.placeholder(tf.float32, | |||
| [1, None, self._inputs_dim['speaker']], | |||
| 'inputs_speaker') | |||
| input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') | |||
| pitch_contours_scale = tf.placeholder(tf.float32, [1, None], | |||
| 'pitch_contours_scale') | |||
| energy_contours_scale = tf.placeholder(tf.float32, [1, None], | |||
| 'energy_contours_scale') | |||
| duration_scale = tf.placeholder(tf.float32, [1, None], | |||
| 'duration_scale') | |||
| with tf.variable_scope('model') as _: | |||
| self._model = create_model(model_name, self._hparams) | |||
| self._model.initialize( | |||
| inputs, | |||
| inputs_emotion, | |||
| inputs_speaker, | |||
| input_lengths, | |||
| duration_scales=duration_scale, | |||
| pitch_scales=pitch_contours_scale, | |||
| energy_scales=energy_contours_scale) | |||
| self._mel_spec = self._model.mel_outputs[0] | |||
| self._duration_outputs = self._model.duration_outputs[0] | |||
| self._duration_outputs_ = self._model.duration_outputs_[0] | |||
| self._pitch_contour_outputs = self._model.pitch_contour_outputs[0] | |||
| self._energy_contour_outputs = self._model.energy_contour_outputs[ | |||
| 0] | |||
| self._embedded_inputs_emotion = self._model.embedded_inputs_emotion[ | |||
| 0] | |||
| self._embedding_fsmn_outputs = self._model.embedding_fsmn_outputs[ | |||
| 0] | |||
| self._encoder_outputs = self._model.encoder_outputs[0] | |||
| self._pitch_embeddings = self._model.pitch_embeddings[0] | |||
| self._energy_embeddings = self._model.energy_embeddings[0] | |||
| self._LR_outputs = self._model.LR_outputs[0] | |||
| self._postnet_fsmn_outputs = self._model.postnet_fsmn_outputs[0] | |||
| self._attention_h = self._model.attention_h | |||
| self._attention_x = self._model.attention_x | |||
| print('Loading checkpoint: %s' % self._ckpt_path) | |||
| config = tf.ConfigProto() | |||
| config.gpu_options.allow_growth = True | |||
| self._session = tf.Session(config=config) | |||
| self._session.run(tf.global_variables_initializer()) | |||
| saver = tf.train.Saver() | |||
| saver.restore(self._session, self._ckpt_path) | |||
| duration_cfg_lst = [] | |||
| if len(duration_control_str) != 0: | |||
| for item in duration_control_str.strip().split('|'): | |||
| percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
| duration_cfg_lst.append((float(percent), float(scale))) | |||
| self._duration_cfg_lst = duration_cfg_lst | |||
| pitch_contours_cfg_lst = [] | |||
| if len(pitch_control_str) != 0: | |||
| for item in pitch_control_str.strip().split('|'): | |||
| percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
| pitch_contours_cfg_lst.append( | |||
| (float(percent), float(scale))) | |||
| self._pitch_contours_cfg_lst = pitch_contours_cfg_lst | |||
| energy_contours_cfg_lst = [] | |||
| if len(energy_control_str) != 0: | |||
| for item in energy_control_str.strip().split('|'): | |||
| percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
| energy_contours_cfg_lst.append( | |||
| (float(percent), float(scale))) | |||
| self._energy_contours_cfg_lst = energy_contours_cfg_lst | |||
| def forward(self, text): | |||
| cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')] | |||
| lfeat_symbol = text.strip().split(' ') | |||
| lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list)) | |||
| for this_lfeat_symbol in lfeat_symbol: | |||
| this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( | |||
| '$') | |||
| if len(this_lfeat_symbol) != len(self._lfeat_type_list): | |||
| raise Exception( | |||
| 'Length of this_lfeat_symbol in training data' | |||
| + ' is not equal to the length of lfeat_type_list, ' | |||
| + str(len(this_lfeat_symbol)) + ' VS. ' | |||
| + str(len(self._lfeat_type_list))) | |||
| index = 0 | |||
| while index < len(lfeat_symbol_separate): | |||
| lfeat_symbol_separate[index] = lfeat_symbol_separate[ | |||
| index] + this_lfeat_symbol[index] + ' ' | |||
| index = index + 1 | |||
| index = 0 | |||
| lfeat_type = self._lfeat_type_list[index] | |||
| sequence = self._symbols_dict.symbol_to_sequence( | |||
| lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names) | |||
| sequence_array = np.asarray( | |||
| sequence[:-1], | |||
| dtype=np.int32) # sequence length minus 1 to ignore EOS ~ | |||
| inputs = np.eye( | |||
| self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] | |||
| index = index + 1 | |||
| while index < len(self._lfeat_type_list) - 2: | |||
| lfeat_type = self._lfeat_type_list[index] | |||
| sequence = self._symbols_dict.symbol_to_sequence( | |||
| lfeat_symbol_separate[index].strip(), lfeat_type, | |||
| cleaner_names) | |||
| sequence_array = np.asarray( | |||
| sequence[:-1], | |||
| dtype=np.int32) # sequence length minus 1 to ignore EOS ~ | |||
| inputs_temp = np.eye( | |||
| self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] | |||
| inputs = np.concatenate((inputs, inputs_temp), axis=1) | |||
| index = index + 1 | |||
| seq = inputs | |||
| lfeat_type = 'emo_category' | |||
| inputs_emotion = multi_label_symbol_to_sequence( | |||
| self._emo_category, lfeat_symbol_separate[index].strip()) | |||
| # inputs_emotion = inputs_emotion * 1.5 | |||
| index = index + 1 | |||
| lfeat_type = 'speaker' | |||
| inputs_speaker = multi_label_symbol_to_sequence( | |||
| self._speaker, lfeat_symbol_separate[index].strip()) | |||
| duration_scale = np.ones((len(seq), ), dtype=np.float32) | |||
| start_idx = 0 | |||
| for (percent, scale) in self._duration_cfg_lst: | |||
| duration_scale[start_idx:start_idx | |||
| + int(percent * len(seq))] = scale | |||
| start_idx += int(percent * len(seq)) | |||
| pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32) | |||
| start_idx = 0 | |||
| for (percent, scale) in self._pitch_contours_cfg_lst: | |||
| pitch_contours_scale[start_idx:start_idx | |||
| + int(percent * len(seq))] = scale | |||
| start_idx += int(percent * len(seq)) | |||
| energy_contours_scale = np.ones((len(seq), ), dtype=np.float32) | |||
| start_idx = 0 | |||
| for (percent, scale) in self._energy_contours_cfg_lst: | |||
| energy_contours_scale[start_idx:start_idx | |||
| + int(percent * len(seq))] = scale | |||
| start_idx += int(percent * len(seq)) | |||
| feed_dict = { | |||
| self._model.inputs: [np.asarray(seq, dtype=np.float32)], | |||
| self._model.inputs_emotion: | |||
| [np.asarray(inputs_emotion, dtype=np.float32)], | |||
| self._model.inputs_speaker: | |||
| [np.asarray(inputs_speaker, dtype=np.float32)], | |||
| self._model.input_lengths: | |||
| np.asarray([len(seq)], dtype=np.int32), | |||
| self._model.duration_scales: [duration_scale], | |||
| self._model.pitch_scales: [pitch_contours_scale], | |||
| self._model.energy_scales: [energy_contours_scale] | |||
| } | |||
| result = self._session.run([ | |||
| self._mel_spec, self._duration_outputs, self._duration_outputs_, | |||
| self._pitch_contour_outputs, self._embedded_inputs_emotion, | |||
| self._embedding_fsmn_outputs, self._encoder_outputs, | |||
| self._pitch_embeddings, self._LR_outputs, | |||
| self._postnet_fsmn_outputs, self._energy_contour_outputs, | |||
| self._energy_embeddings, self._attention_x, self._attention_h | |||
| ], feed_dict=feed_dict) # yapf:disable | |||
| return result[0] | |||
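| # Illustrative usage sketch (an assumption, not part of this diff): | |||
| #   am = SambertNetHifi16k(model_dir, **am_hparams)  # hparams must provide lfeat_type_list, cleaners, ... | |||
| #   mel = am.forward('{sy$tone$syllable_flag$word_segment$emo$speaker} ...') | |||
| # forward() expects one space-separated symbol string whose per-token fields are joined | |||
| # by '$' in lfeat_type_list order, and returns the predicted mel spectrogram (result[0]). | |||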
| @@ -0,0 +1,89 @@ | |||
| ''' | |||
| Cleaners are transformations that run over the input text at both training and eval time. | |||
| Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" | |||
| hyperparameter. Some cleaners are English-specific. You'll typically want to use: | |||
| 1. "english_cleaners" for English text | |||
| 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using | |||
| the Unidecode library (https://pypi.python.org/pypi/Unidecode) | |||
| 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update | |||
| the symbols in symbols.py to match your data). | |||
| ''' | |||
| import re | |||
| from unidecode import unidecode | |||
| from .numbers import normalize_numbers | |||
| # Regular expression matching whitespace: | |||
| _whitespace_re = re.compile(r'\s+') | |||
| # List of (regular expression, replacement) pairs for abbreviations: | |||
| _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) | |||
| for x in [ | |||
| ('mrs', 'misess'), | |||
| ('mr', 'mister'), | |||
| ('dr', 'doctor'), | |||
| ('st', 'saint'), | |||
| ('co', 'company'), | |||
| ('jr', 'junior'), | |||
| ('maj', 'major'), | |||
| ('gen', 'general'), | |||
| ('drs', 'doctors'), | |||
| ('rev', 'reverend'), | |||
| ('lt', 'lieutenant'), | |||
| ('hon', 'honorable'), | |||
| ('sgt', 'sergeant'), | |||
| ('capt', 'captain'), | |||
| ('esq', 'esquire'), | |||
| ('ltd', 'limited'), | |||
| ('col', 'colonel'), | |||
| ('ft', 'fort'), ]] # yapf:disable | |||
| def expand_abbreviations(text): | |||
| for regex, replacement in _abbreviations: | |||
| text = re.sub(regex, replacement, text) | |||
| return text | |||
| def expand_numbers(text): | |||
| return normalize_numbers(text) | |||
| def lowercase(text): | |||
| return text.lower() | |||
| def collapse_whitespace(text): | |||
| return re.sub(_whitespace_re, ' ', text) | |||
| def convert_to_ascii(text): | |||
| return unidecode(text) | |||
| def basic_cleaners(text): | |||
| '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' | |||
| text = lowercase(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
| def transliteration_cleaners(text): | |||
| '''Pipeline for non-English text that transliterates to ASCII.''' | |||
| text = convert_to_ascii(text) | |||
| text = lowercase(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
| def english_cleaners(text): | |||
| '''Pipeline for English text, including number and abbreviation expansion.''' | |||
| text = convert_to_ascii(text) | |||
| text = lowercase(text) | |||
| text = expand_numbers(text) | |||
| text = expand_abbreviations(text) | |||
| text = collapse_whitespace(text) | |||
| return text | |||
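| # Illustrative examples (assumptions, not part of this diff): | |||
| #   english_cleaners('Dr. Smith paid $3.50')  # -> 'doctor smith paid three dollars, fifty cents' | |||
| #   basic_cleaners('  Hello   WORLD ')        # -> ' hello world ' | |||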
| @@ -0,0 +1,64 @@ | |||
| import re | |||
| valid_symbols = [ | |||
| 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', | |||
| 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', | |||
| 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', | |||
| 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', | |||
| 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', | |||
| 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', | |||
| 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', | |||
| 'Y', 'Z', 'ZH' | |||
| ] | |||
| _valid_symbol_set = set(valid_symbols) | |||
| class CMUDict: | |||
| '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' | |||
| def __init__(self, file_or_path, keep_ambiguous=True): | |||
| if isinstance(file_or_path, str): | |||
| with open(file_or_path, encoding='latin-1') as f: | |||
| entries = _parse_cmudict(f) | |||
| else: | |||
| entries = _parse_cmudict(file_or_path) | |||
| if not keep_ambiguous: | |||
| entries = { | |||
| word: pron | |||
| for word, pron in entries.items() if len(pron) == 1 | |||
| } | |||
| self._entries = entries | |||
| def __len__(self): | |||
| return len(self._entries) | |||
| def lookup(self, word): | |||
| '''Returns list of ARPAbet pronunciations of the given word.''' | |||
| return self._entries.get(word.upper()) | |||
| _alt_re = re.compile(r'\([0-9]+\)') | |||
| def _parse_cmudict(file): | |||
| cmudict = {} | |||
| for line in file: | |||
| if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): | |||
| parts = line.split(' ') | |||
| word = re.sub(_alt_re, '', parts[0]) | |||
| pronunciation = _get_pronunciation(parts[1]) | |||
| if pronunciation: | |||
| if word in cmudict: | |||
| cmudict[word].append(pronunciation) | |||
| else: | |||
| cmudict[word] = [pronunciation] | |||
| return cmudict | |||
| def _get_pronunciation(s): | |||
| parts = s.strip().split(' ') | |||
| for part in parts: | |||
| if part not in _valid_symbol_set: | |||
| return None | |||
| return ' '.join(parts) | |||
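| # Illustrative usage sketch (the dictionary file path is hypothetical): | |||
| #   cmudict = CMUDict('/path/to/cmudict-0.7b') | |||
| #   cmudict.lookup('street')  # -> list of ARPAbet pronunciations, e.g. ['S T R IY1 T'] | |||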
| @@ -0,0 +1,70 @@ | |||
| import re | |||
| import inflect | |||
| _inflect = inflect.engine() | |||
| _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') | |||
| _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') | |||
| _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') | |||
| _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') | |||
| _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') | |||
| _number_re = re.compile(r'[0-9]+') | |||
| def _remove_commas(m): | |||
| return m.group(1).replace(',', '') | |||
| def _expand_decimal_point(m): | |||
| return m.group(1).replace('.', ' point ') | |||
| def _expand_dollars(m): | |||
| match = m.group(1) | |||
| parts = match.split('.') | |||
| if len(parts) > 2: | |||
| return match + ' dollars' # Unexpected format | |||
| dollars = int(parts[0]) if parts[0] else 0 | |||
| cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 | |||
| if dollars and cents: | |||
| dollar_unit = 'dollar' if dollars == 1 else 'dollars' | |||
| cent_unit = 'cent' if cents == 1 else 'cents' | |||
| return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) | |||
| elif dollars: | |||
| dollar_unit = 'dollar' if dollars == 1 else 'dollars' | |||
| return '%s %s' % (dollars, dollar_unit) | |||
| elif cents: | |||
| cent_unit = 'cent' if cents == 1 else 'cents' | |||
| return '%s %s' % (cents, cent_unit) | |||
| else: | |||
| return 'zero dollars' | |||
| def _expand_ordinal(m): | |||
| return _inflect.number_to_words(m.group(0)) | |||
| def _expand_number(m): | |||
| num = int(m.group(0)) | |||
| if num > 1000 and num < 3000: | |||
| if num == 2000: | |||
| return 'two thousand' | |||
| elif num > 2000 and num < 2010: | |||
| return 'two thousand ' + _inflect.number_to_words(num % 100) | |||
| elif num % 100 == 0: | |||
| return _inflect.number_to_words(num // 100) + ' hundred' | |||
| else: | |||
| return _inflect.number_to_words( | |||
| num, andword='', zero='oh', group=2).replace(', ', ' ') | |||
| else: | |||
| return _inflect.number_to_words(num, andword='') | |||
| def normalize_numbers(text): | |||
| text = re.sub(_comma_number_re, _remove_commas, text) | |||
| text = re.sub(_pounds_re, r'\1 pounds', text) | |||
| text = re.sub(_dollars_re, _expand_dollars, text) | |||
| text = re.sub(_decimal_number_re, _expand_decimal_point, text) | |||
| text = re.sub(_ordinal_re, _expand_ordinal, text) | |||
| text = re.sub(_number_re, _expand_number, text) | |||
| return text | |||
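| # Illustrative examples (assumptions, not part of this diff): | |||
| #   normalize_numbers('15 apples')   # -> 'fifteen apples' | |||
| #   normalize_numbers('the 3rd row') # -> 'the third row' | |||
| #   normalize_numbers('$2')          # -> 'two dollars' | |||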
| @@ -0,0 +1,95 @@ | |||
| ''' | |||
| Defines the set of symbols used in text input to the model. | |||
| The default is a set of ASCII characters that works well for English or text that has been run | |||
| through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. | |||
| ''' | |||
| import codecs | |||
| import os | |||
| _pad = '_' | |||
| _eos = '~' | |||
| _mask = '@[MASK]' | |||
| def load_symbols(dict_path): | |||
| _characters = '' | |||
| _ch_symbols = [] | |||
| sy_dict_name = 'sy_dict.txt' | |||
| sy_dict_path = os.path.join(dict_path, sy_dict_name) | |||
| f = codecs.open(sy_dict_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_symbols.append(line) | |||
| _arpabet = ['@' + s for s in _ch_symbols] | |||
| # Export all symbols: | |||
| sy = list(_characters) + _arpabet + [_pad, _eos, _mask] | |||
| _characters = '' | |||
| _ch_tones = [] | |||
| tone_dict_name = 'tone_dict.txt' | |||
| tone_dict_path = os.path.join(dict_path, tone_dict_name) | |||
| f = codecs.open(tone_dict_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_tones.append(line) | |||
| # Export all tones: | |||
| tone = list(_characters) + _ch_tones + [_pad, _eos, _mask] | |||
| _characters = '' | |||
| _ch_syllable_flags = [] | |||
| syllable_flag_name = 'syllable_flag_dict.txt' | |||
| syllable_flag_path = os.path.join(dict_path, syllable_flag_name) | |||
| f = codecs.open(syllable_flag_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_syllable_flags.append(line) | |||
| # Export all syllable_flags: | |||
| syllable_flag = list(_characters) + _ch_syllable_flags + [ | |||
| _pad, _eos, _mask | |||
| ] | |||
| _characters = '' | |||
| _ch_word_segments = [] | |||
| word_segment_name = 'word_segment_dict.txt' | |||
| word_segment_path = os.path.join(dict_path, word_segment_name) | |||
| f = codecs.open(word_segment_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_word_segments.append(line) | |||
| # Export all word_segments: | |||
| word_segment = list(_characters) + _ch_word_segments + [_pad, _eos, _mask] | |||
| _characters = '' | |||
| _ch_emo_types = [] | |||
| emo_category_name = 'emo_category_dict.txt' | |||
| emo_category_path = os.path.join(dict_path, emo_category_name) | |||
| f = codecs.open(emo_category_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_emo_types.append(line) | |||
| emo_category = list(_characters) + _ch_emo_types + [_pad, _eos, _mask] | |||
| _characters = '' | |||
| _ch_speakers = [] | |||
| speaker_name = 'speaker_dict.txt' | |||
| speaker_path = os.path.join(dict_path, speaker_name) | |||
| f = codecs.open(speaker_path, 'r') | |||
| for line in f: | |||
| line = line.strip('\r\n') | |||
| _ch_speakers.append(line) | |||
| # Export all speakers: | |||
| speaker = list(_characters) + _ch_speakers + [_pad, _eos, _mask] | |||
| return sy, tone, syllable_flag, word_segment, emo_category, speaker | |||
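| # Note (an assumption based on the loaders above): each *_dict.txt under dict_path holds | |||
| # one symbol per line; every table is then extended with the special tokens | |||
| # _pad ('_'), _eos ('~') and _mask ('@[MASK]'). | |||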
| @@ -0,0 +1,200 @@ | |||
| import re | |||
| import sys | |||
| from .cleaners import (basic_cleaners, english_cleaners, | |||
| transliteration_cleaners) | |||
| class SymbolsDict: | |||
| def __init__(self, sy, tone, syllable_flag, word_segment, emo_category, | |||
| speaker, inputs_dim, lfeat_type_list): | |||
| self._inputs_dim = inputs_dim | |||
| self._lfeat_type_list = lfeat_type_list | |||
| self._sy_to_id = {s: i for i, s in enumerate(sy)} | |||
| self._id_to_sy = {i: s for i, s in enumerate(sy)} | |||
| self._tone_to_id = {s: i for i, s in enumerate(tone)} | |||
| self._id_to_tone = {i: s for i, s in enumerate(tone)} | |||
| self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)} | |||
| self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)} | |||
| self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)} | |||
| self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)} | |||
| self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)} | |||
| self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)} | |||
| self._speaker_to_id = {s: i for i, s in enumerate(speaker)} | |||
| self._id_to_speaker = {i: s for i, s in enumerate(speaker)} | |||
| print('_sy_to_id: ') | |||
| print(self._sy_to_id) | |||
| print('_tone_to_id: ') | |||
| print(self._tone_to_id) | |||
| print('_syllable_flag_to_id: ') | |||
| print(self._syllable_flag_to_id) | |||
| print('_word_segment_to_id: ') | |||
| print(self._word_segment_to_id) | |||
| print('_emo_category_to_id: ') | |||
| print(self._emo_category_to_id) | |||
| print('_speaker_to_id: ') | |||
| print(self._speaker_to_id) | |||
| self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') | |||
| self._cleaners = { | |||
| basic_cleaners.__name__: basic_cleaners, | |||
| transliteration_cleaners.__name__: transliteration_cleaners, | |||
| english_cleaners.__name__: english_cleaners | |||
| } | |||
| def _clean_text(self, text, cleaner_names): | |||
| for name in cleaner_names: | |||
| cleaner = self._cleaners.get(name) | |||
| if not cleaner: | |||
| raise Exception('Unknown cleaner: %s' % name) | |||
| text = cleaner(text) | |||
| return text | |||
| def _sy_to_sequence(self, sy): | |||
| return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)] | |||
| def _arpabet_to_sequence(self, text): | |||
| return self._sy_to_sequence(['@' + s for s in text.split()]) | |||
| def _should_keep_sy(self, s): | |||
| return s in self._sy_to_id and s != '_' and s != '~' | |||
| def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names): | |||
| sequence = [] | |||
| if lfeat_type == 'sy': | |||
| this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') | |||
| this_lfeat_symbol_format = '' | |||
| index = 0 | |||
| while index < len(this_lfeat_symbol): | |||
| this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ | |||
| index] + '}' + ' ' | |||
| index = index + 1 | |||
| sequence = self.text_to_sequence(this_lfeat_symbol_format, | |||
| cleaner_names) | |||
| elif lfeat_type == 'tone': | |||
| sequence = self.tone_to_sequence(this_lfeat_symbol) | |||
| elif lfeat_type == 'syllable_flag': | |||
| sequence = self.syllable_flag_to_sequence(this_lfeat_symbol) | |||
| elif lfeat_type == 'word_segment': | |||
| sequence = self.word_segment_to_sequence(this_lfeat_symbol) | |||
| elif lfeat_type == 'emo_category': | |||
| sequence = self.emo_category_to_sequence(this_lfeat_symbol) | |||
| elif lfeat_type == 'speaker': | |||
| sequence = self.speaker_to_sequence(this_lfeat_symbol) | |||
| else: | |||
| raise Exception('Unknown lfeat type: %s' % lfeat_type) | |||
| return sequence | |||
| def text_to_sequence(self, text, cleaner_names): | |||
| '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. | |||
| The text can optionally have ARPAbet sequences enclosed in curly braces embedded | |||
| in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." | |||
| Args: | |||
| text: string to convert to a sequence | |||
| cleaner_names: names of the cleaner functions to run the text through | |||
| Returns: | |||
| List of integers corresponding to the symbols in the text | |||
| ''' | |||
| sequence = [] | |||
| # Check for curly braces and treat their contents as ARPAbet: | |||
| while len(text): | |||
| m = self._curly_re.match(text) | |||
| if not m: | |||
| sequence += self._sy_to_sequence( | |||
| self._clean_text(text, cleaner_names)) | |||
| break | |||
| sequence += self._sy_to_sequence( | |||
| self._clean_text(m.group(1), cleaner_names)) | |||
| sequence += self._arpabet_to_sequence(m.group(2)) | |||
| text = m.group(3) | |||
| # Append EOS token | |||
| sequence.append(self._sy_to_id['~']) | |||
| return sequence | |||
| def tone_to_sequence(self, tone): | |||
| tones = tone.strip().split(' ') | |||
| sequence = [] | |||
| for this_tone in tones: | |||
| sequence.append(self._tone_to_id[this_tone]) | |||
| sequence.append(self._tone_to_id['~']) | |||
| return sequence | |||
| def syllable_flag_to_sequence(self, syllable_flag): | |||
| syllable_flags = syllable_flag.strip().split(' ') | |||
| sequence = [] | |||
| for this_syllable_flag in syllable_flags: | |||
| sequence.append(self._syllable_flag_to_id[this_syllable_flag]) | |||
| sequence.append(self._syllable_flag_to_id['~']) | |||
| return sequence | |||
| def word_segment_to_sequence(self, word_segment): | |||
| word_segments = word_segment.strip().split(' ') | |||
| sequence = [] | |||
| for this_word_segment in word_segments: | |||
| sequence.append(self._word_segment_to_id[this_word_segment]) | |||
| sequence.append(self._word_segment_to_id['~']) | |||
| return sequence | |||
| def emo_category_to_sequence(self, emo_type): | |||
| emo_categories = emo_type.strip().split(' ') | |||
| sequence = [] | |||
| for this_category in emo_categories: | |||
| sequence.append(self._emo_category_to_id[this_category]) | |||
| sequence.append(self._emo_category_to_id['~']) | |||
| return sequence | |||
| def speaker_to_sequence(self, speaker): | |||
| speakers = speaker.strip().split(' ') | |||
| sequence = [] | |||
| for this_speaker in speakers: | |||
| sequence.append(self._speaker_to_id[this_speaker]) | |||
| sequence.append(self._speaker_to_id['~']) | |||
| return sequence | |||
| def sequence_to_symbol(self, sequence): | |||
| result = '' | |||
| pre_lfeat_dim = 0 | |||
| for lfeat_type in self._lfeat_type_list: | |||
| current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim | |||
| + self._inputs_dim[lfeat_type]] | |||
| current_sequence = current_one_hot_sequence.argmax(1) | |||
| length = current_sequence.shape[0] | |||
| index = 0 | |||
| while index < length: | |||
| this_sequence = current_sequence[index] | |||
| s = '' | |||
| if lfeat_type == 'sy': | |||
| s = self._id_to_sy[this_sequence] | |||
| if len(s) > 1 and s[0] == '@': | |||
| s = s[1:] | |||
| elif lfeat_type == 'tone': | |||
| s = self._id_to_tone[this_sequence] | |||
| elif lfeat_type == 'syllable_flag': | |||
| s = self._id_to_syllable_flag[this_sequence] | |||
| elif lfeat_type == 'word_segment': | |||
| s = self._id_to_word_segment[this_sequence] | |||
| elif lfeat_type == 'emo_category': | |||
| s = self._id_to_emo_category[this_sequence] | |||
| elif lfeat_type == 'speaker': | |||
| s = self._id_to_speaker[this_sequence] | |||
| else: | |||
| raise Exception('Unknown lfeat type: %s' % lfeat_type) | |||
| if index == 0: | |||
| result = result + lfeat_type + ': ' | |||
| result = result + '{' + s + '}' | |||
| if index == length - 1: | |||
| result = result + '; ' | |||
| index = index + 1 | |||
| pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type] | |||
| return result | |||
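| # Illustrative example (an assumption, not part of this diff): with a SymbolsDict built | |||
| # from load_symbols(), curly-brace spans are looked up as ARPAbet symbols, e.g. | |||
| #   d.text_to_sequence('{HH AW1 S} ', ['english_cleaners'])  # -> ids of @HH, @AW1, @S plus EOS '~' | |||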
| @@ -0,0 +1 @@ | |||
| from .generic_text_to_speech_frontend import * # noqa F403 | |||
| @@ -0,0 +1,39 @@ | |||
| import os | |||
| import zipfile | |||
| from typing import Any, Dict, List | |||
| import ttsfrd | |||
| from modelscope.models.base import Model | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.audio.tts_exceptions import ( | |||
| TtsFrontendInitializeFailedException, | |||
| TtsFrontendLanguageTypeInvalidException) | |||
| from modelscope.utils.constant import Tasks | |||
| __all__ = ['GenericTtsFrontend'] | |||
| @MODELS.register_module( | |||
| Tasks.text_to_speech, module_name=r'generic_tts_frontend') | |||
| class GenericTtsFrontend(Model): | |||
| def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs): | |||
| super().__init__(model_dir, *args, **kwargs) | |||
| frontend = ttsfrd.TtsFrontendEngine() | |||
| zip_file = os.path.join(model_dir, 'resource.zip') | |||
| self._res_path = os.path.join(model_dir, 'resource') | |||
| with zipfile.ZipFile(zip_file, 'r') as zip_ref: | |||
| zip_ref.extractall(model_dir) | |||
| if not frontend.initialize(self._res_path): | |||
| raise TtsFrontendInitializeFailedException( | |||
| 'resource invalid: {}'.format(self._res_path)) | |||
| if not frontend.set_lang_type(lang_type): | |||
| raise TtsFrontendLanguageTypeInvalidException( | |||
| 'language type invalid: {}, valid types are pinyin and chenmix'. | |||
| format(lang_type)) | |||
| self._frontend = frontend | |||
| def forward(self, data: str) -> Dict[str, List]: | |||
| result = self._frontend.gen_tacotron_symbols(data) | |||
| return {'texts': [s for s in result.splitlines() if s != '']} | |||
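| # Illustrative usage sketch (the model directory is hypothetical): | |||
| #   frontend = GenericTtsFrontend(model_dir='/path/to/frontend_model', lang_type='pinyin') | |||
| #   frontend.forward('text to synthesize')['texts']  # non-empty tacotron symbol lines | |||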
| @@ -0,0 +1 @@ | |||
| from .hifigan16k import * # noqa F403 | |||
| @@ -0,0 +1,73 @@ | |||
| from __future__ import (absolute_import, division, print_function, | |||
| unicode_literals) | |||
| import argparse | |||
| import glob | |||
| import os | |||
| import time | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| from scipy.io.wavfile import write | |||
| from modelscope.models.base import Model | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.audio.tts_exceptions import \ | |||
| TtsVocoderMelspecShapeMismatchException | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from .models import Generator | |||
| __all__ = ['Hifigan16k', 'AttrDict'] | |||
| MAX_WAV_VALUE = 32768.0 | |||
| def load_checkpoint(filepath, device): | |||
| assert os.path.isfile(filepath) | |||
| print("Loading '{}'".format(filepath)) | |||
| checkpoint_dict = torch.load(filepath, map_location=device) | |||
| print('Complete.') | |||
| return checkpoint_dict | |||
| class AttrDict(dict): | |||
| def __init__(self, *args, **kwargs): | |||
| super(AttrDict, self).__init__(*args, **kwargs) | |||
| self.__dict__ = self | |||
| @MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k') | |||
| class Hifigan16k(Model): | |||
| def __init__(self, model_dir, *args, **kwargs): | |||
| self._ckpt_path = os.path.join(model_dir, | |||
| ModelFile.TORCH_MODEL_BIN_FILE) | |||
| self._config = AttrDict(**kwargs) | |||
| super().__init__(self._ckpt_path, *args, **kwargs) | |||
| if torch.cuda.is_available(): | |||
| torch.manual_seed(self._config.seed) | |||
| self._device = torch.device('cuda') | |||
| else: | |||
| self._device = torch.device('cpu') | |||
| self._generator = Generator(self._config).to(self._device) | |||
| state_dict_g = load_checkpoint(self._ckpt_path, self._device) | |||
| self._generator.load_state_dict(state_dict_g['generator']) | |||
| self._generator.eval() | |||
| self._generator.remove_weight_norm() | |||
| def forward(self, melspec): | |||
| dim0 = list(melspec.shape)[-1] | |||
| if dim0 != 80: | |||
| raise TtsVocoderMelspecShapeMismatchException( | |||
| 'input melspec last dim must be 80, got {}'.format(dim0)) | |||
| with torch.no_grad(): | |||
| x = melspec.T | |||
| x = torch.FloatTensor(x).to(self._device) | |||
| if len(x.shape) == 2: | |||
| x = x.unsqueeze(0) | |||
| y_g_hat = self._generator(x) | |||
| audio = y_g_hat.squeeze() | |||
| audio = audio * MAX_WAV_VALUE | |||
| audio = audio.cpu().numpy().astype('int16') | |||
| return audio | |||
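| # Illustrative usage sketch (an assumption, not part of this diff): | |||
| #   vocoder = Hifigan16k(model_dir, **vocoder_config)  # config supplies seed, resblock, upsample_*, ... | |||
| #   audio = vocoder.forward(mel)  # mel: numpy array of shape (frames, 80); returns int16 samples | |||
| #   write('out.wav', 16000, audio)  # scipy.io.wavfile.write; 16 kHz as implied by the model name | |||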
| @@ -0,0 +1 @@ | |||
| from .models import Generator | |||
| @@ -0,0 +1,516 @@ | |||
| from distutils.version import LooseVersion | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from pytorch_wavelets import DWT1DForward | |||
| from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d | |||
| from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm | |||
| from .utils import get_padding, init_weights | |||
| is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') | |||
| def stft(x, fft_size, hop_size, win_length, window): | |||
| """Perform STFT and convert to magnitude spectrogram. | |||
| Args: | |||
| x (Tensor): Input signal tensor (B, T). | |||
| fft_size (int): FFT size. | |||
| hop_size (int): Hop size. | |||
| win_length (int): Window length. | |||
| window (str): Window function type. | |||
| Returns: | |||
| Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). | |||
| """ | |||
| if is_pytorch_17plus: | |||
| x_stft = torch.stft( | |||
| x, fft_size, hop_size, win_length, window, return_complex=False) | |||
| else: | |||
| x_stft = torch.stft(x, fft_size, hop_size, win_length, window) | |||
| real = x_stft[..., 0] | |||
| imag = x_stft[..., 1] | |||
| # NOTE(kan-bayashi): clamp is needed to avoid nan or inf | |||
| return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) | |||
| LRELU_SLOPE = 0.1 | |||
| def get_padding_casual(kernel_size, dilation=1): | |||
| return int(kernel_size * dilation - dilation) | |||
| class Conv1dCasual(torch.nn.Module): | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| groups=1, | |||
| bias=True, | |||
| padding_mode='zeros'): | |||
| super(Conv1dCasual, self).__init__() | |||
| self.pad = padding | |||
| self.conv1d = weight_norm( | |||
| Conv1d( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride, | |||
| padding=0, | |||
| dilation=dilation, | |||
| groups=groups, | |||
| bias=bias, | |||
| padding_mode=padding_mode)) | |||
| self.conv1d.apply(init_weights) | |||
| def forward(self, x): # bdt | |||
| # F.pad sizes are given starting from the last dimension and moving forward. | |||
| x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') | |||
| x = self.conv1d(x) | |||
| return x | |||
| def remove_weight_norm(self): | |||
| remove_weight_norm(self.conv1d) | |||
| class ConvTranspose1dCausal(torch.nn.Module): | |||
| """CausalConvTranspose1d module with customized initialization.""" | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride, | |||
| padding=0): | |||
| """Initialize CausalConvTranspose1d module.""" | |||
| super(ConvTranspose1dCausal, self).__init__() | |||
| self.deconv = weight_norm( | |||
| ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) | |||
| self.stride = stride | |||
| self.deconv.apply(init_weights) | |||
| self.pad = kernel_size - stride | |||
| def forward(self, x): | |||
| """Calculate forward propagation. | |||
| Args: | |||
| x (Tensor): Input tensor (B, in_channels, T_in). | |||
| Returns: | |||
| Tensor: Output tensor (B, out_channels, T_out). | |||
| """ | |||
| # x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") | |||
| return self.deconv(x)[:, :, :-self.pad] | |||
| def remove_weight_norm(self): | |||
| remove_weight_norm(self.deconv) | |||
| class ResBlock1(torch.nn.Module): | |||
| def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): | |||
| super(ResBlock1, self).__init__() | |||
| self.h = h | |||
| self.convs1 = nn.ModuleList([ | |||
| Conv1dCasual( | |||
| channels, | |||
| channels, | |||
| kernel_size, | |||
| 1, | |||
| dilation=dilation[i], | |||
| padding=get_padding_casual(kernel_size, dilation[i])) | |||
| for i in range(len(dilation)) | |||
| ]) | |||
| self.convs2 = nn.ModuleList([ | |||
| Conv1dCasual( | |||
| channels, | |||
| channels, | |||
| kernel_size, | |||
| 1, | |||
| dilation=1, | |||
| padding=get_padding_casual(kernel_size, 1)) | |||
| for i in range(len(dilation)) | |||
| ]) | |||
| def forward(self, x): | |||
| for c1, c2 in zip(self.convs1, self.convs2): | |||
| xt = F.leaky_relu(x, LRELU_SLOPE) | |||
| xt = c1(xt) | |||
| xt = F.leaky_relu(xt, LRELU_SLOPE) | |||
| xt = c2(xt) | |||
| x = xt + x | |||
| return x | |||
| def remove_weight_norm(self): | |||
| for layer in self.convs1: | |||
| layer.remove_weight_norm() | |||
| for layer in self.convs2: | |||
| layer.remove_weight_norm() | |||
| class Generator(torch.nn.Module): | |||
| def __init__(self, h): | |||
| super(Generator, self).__init__() | |||
| self.h = h | |||
| self.num_kernels = len(h.resblock_kernel_sizes) | |||
| self.num_upsamples = len(h.upsample_rates) | |||
| print('num_kernels={}, num_upsamples={}'.format( | |||
| self.num_kernels, self.num_upsamples)) | |||
| self.conv_pre = Conv1dCasual( | |||
| 80, h.upsample_initial_channel, 7, 1, padding=7 - 1) | |||
| resblock = ResBlock1 if h.resblock == '1' else ResBlock2 | |||
| self.ups = nn.ModuleList() | |||
| self.repeat_ups = nn.ModuleList() | |||
| for i, (u, k) in enumerate( | |||
| zip(h.upsample_rates, h.upsample_kernel_sizes)): | |||
| upsample = nn.Sequential( | |||
| nn.Upsample(mode='nearest', scale_factor=u), | |||
| nn.LeakyReLU(LRELU_SLOPE), | |||
| Conv1dCasual( | |||
| h.upsample_initial_channel // (2**i), | |||
| h.upsample_initial_channel // (2**(i + 1)), | |||
| kernel_size=7, | |||
| stride=1, | |||
| padding=7 - 1)) | |||
| self.repeat_ups.append(upsample) | |||
| self.ups.append( | |||
| ConvTranspose1dCausal( | |||
| h.upsample_initial_channel // (2**i), | |||
| h.upsample_initial_channel // (2**(i + 1)), | |||
| k, | |||
| u, | |||
| padding=(k - u) // 2)) | |||
| self.resblocks = nn.ModuleList() | |||
| for i in range(len(self.ups)): | |||
| ch = h.upsample_initial_channel // (2**(i + 1)) | |||
| for j, (k, d) in enumerate( | |||
| zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): | |||
| self.resblocks.append(resblock(h, ch, k, d)) | |||
| self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) | |||
| def forward(self, x): | |||
| x = self.conv_pre(x) | |||
| for i in range(self.num_upsamples): | |||
| x = torch.sin(x) + x | |||
| # transconv | |||
| x1 = F.leaky_relu(x, LRELU_SLOPE) | |||
| x1 = self.ups[i](x1) | |||
| # repeat | |||
| x2 = self.repeat_ups[i](x) | |||
| x = x1 + x2 | |||
| xs = None | |||
| for j in range(self.num_kernels): | |||
| if xs is None: | |||
| xs = self.resblocks[i * self.num_kernels + j](x) | |||
| else: | |||
| xs += self.resblocks[i * self.num_kernels + j](x) | |||
| x = xs / self.num_kernels | |||
| x = F.leaky_relu(x) | |||
| x = self.conv_post(x) | |||
| x = torch.tanh(x) | |||
| return x | |||
| def remove_weight_norm(self): | |||
| print('Removing weight norm...') | |||
| for layer in self.ups: | |||
| layer.remove_weight_norm() | |||
| for layer in self.repeat_ups: | |||
| layer[-1].remove_weight_norm() | |||
| for layer in self.resblocks: | |||
| layer.remove_weight_norm() | |||
| self.conv_pre.remove_weight_norm() | |||
| self.conv_post.remove_weight_norm() | |||
| class DiscriminatorP(torch.nn.Module): | |||
| def __init__(self, | |||
| period, | |||
| kernel_size=5, | |||
| stride=3, | |||
| use_spectral_norm=False): | |||
| super(DiscriminatorP, self).__init__() | |||
| self.period = period | |||
| norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
| self.convs = nn.ModuleList([ | |||
| norm_f( | |||
| Conv2d( | |||
| 1, | |||
| 32, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(5, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 32, | |||
| 128, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(5, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 128, | |||
| 512, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(5, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 512, | |||
| 1024, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(5, 1), 0))), | |||
| norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), | |||
| ]) | |||
| self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) | |||
| def forward(self, x): | |||
| fmap = [] | |||
| # 1d to 2d | |||
| b, c, t = x.shape | |||
| if t % self.period != 0: # pad first | |||
| n_pad = self.period - (t % self.period) | |||
| x = F.pad(x, (0, n_pad), 'reflect') | |||
| t = t + n_pad | |||
| x = x.view(b, c, t // self.period, self.period) | |||
| for layer in self.convs: | |||
| x = layer(x) | |||
| x = F.leaky_relu(x, LRELU_SLOPE) | |||
| fmap.append(x) | |||
| x = self.conv_post(x) | |||
| fmap.append(x) | |||
| x = torch.flatten(x, 1, -1) | |||
| return x, fmap | |||
| class MultiPeriodDiscriminator(torch.nn.Module): | |||
| def __init__(self): | |||
| super(MultiPeriodDiscriminator, self).__init__() | |||
| self.discriminators = nn.ModuleList([ | |||
| DiscriminatorP(2), | |||
| DiscriminatorP(3), | |||
| DiscriminatorP(5), | |||
| DiscriminatorP(7), | |||
| DiscriminatorP(11), | |||
| ]) | |||
| def forward(self, y, y_hat): | |||
| y_d_rs = [] | |||
| y_d_gs = [] | |||
| fmap_rs = [] | |||
| fmap_gs = [] | |||
| for i, d in enumerate(self.discriminators): | |||
| y_d_r, fmap_r = d(y) | |||
| y_d_g, fmap_g = d(y_hat) | |||
| y_d_rs.append(y_d_r) | |||
| fmap_rs.append(fmap_r) | |||
| y_d_gs.append(y_d_g) | |||
| fmap_gs.append(fmap_g) | |||
| return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
| class DiscriminatorS(torch.nn.Module): | |||
| def __init__(self, use_spectral_norm=False): | |||
| super(DiscriminatorS, self).__init__() | |||
| norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
| self.convs = nn.ModuleList([ | |||
| norm_f(Conv1d(1, 128, 15, 1, padding=7)), | |||
| norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), | |||
| norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), | |||
| norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), | |||
| norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), | |||
| norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), | |||
| norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), | |||
| ]) | |||
| self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) | |||
| def forward(self, x): | |||
| fmap = [] | |||
| for layer in self.convs: | |||
| x = layer(x) | |||
| x = F.leaky_relu(x, LRELU_SLOPE) | |||
| fmap.append(x) | |||
| x = self.conv_post(x) | |||
| fmap.append(x) | |||
| x = torch.flatten(x, 1, -1) | |||
| return x, fmap | |||
| class MultiScaleDiscriminator(torch.nn.Module): | |||
| def __init__(self): | |||
| super(MultiScaleDiscriminator, self).__init__() | |||
| self.discriminators = nn.ModuleList([ | |||
| DiscriminatorS(use_spectral_norm=True), | |||
| DiscriminatorS(), | |||
| DiscriminatorS(), | |||
| ]) | |||
| self.meanpools = nn.ModuleList( | |||
| [DWT1DForward(wave='db3', J=1), | |||
| DWT1DForward(wave='db3', J=1)]) | |||
| self.convs = nn.ModuleList([ | |||
| weight_norm(Conv1d(2, 1, 15, 1, padding=7)), | |||
| weight_norm(Conv1d(2, 1, 15, 1, padding=7)) | |||
| ]) | |||
| def forward(self, y, y_hat): | |||
| y_d_rs = [] | |||
| y_d_gs = [] | |||
| fmap_rs = [] | |||
| fmap_gs = [] | |||
| for i, d in enumerate(self.discriminators): | |||
| if i != 0: | |||
| yl, yh = self.meanpools[i - 1](y) | |||
| y = torch.cat([yl, yh[0]], dim=1) | |||
| y = self.convs[i - 1](y) | |||
| y = F.leaky_relu(y, LRELU_SLOPE) | |||
| yl_hat, yh_hat = self.meanpools[i - 1](y_hat) | |||
| y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1) | |||
| y_hat = self.convs[i - 1](y_hat) | |||
| y_hat = F.leaky_relu(y_hat, LRELU_SLOPE) | |||
| y_d_r, fmap_r = d(y) | |||
| y_d_g, fmap_g = d(y_hat) | |||
| y_d_rs.append(y_d_r) | |||
| fmap_rs.append(fmap_r) | |||
| y_d_gs.append(y_d_g) | |||
| fmap_gs.append(fmap_g) | |||
| return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
| class DiscriminatorSTFT(torch.nn.Module): | |||
| def __init__(self, | |||
| kernel_size=11, | |||
| stride=2, | |||
| use_spectral_norm=False, | |||
| fft_size=1024, | |||
| shift_size=120, | |||
| win_length=600, | |||
| window='hann_window'): | |||
| super(DiscriminatorSTFT, self).__init__() | |||
| self.fft_size = fft_size | |||
| self.shift_size = shift_size | |||
| self.win_length = win_length | |||
| norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
| self.convs = nn.ModuleList([ | |||
| norm_f( | |||
| Conv2d( | |||
| fft_size // 2 + 1, | |||
| 32, (15, 1), (1, 1), | |||
| padding=(get_padding(15, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 32, | |||
| 32, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(9, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 32, | |||
| 32, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(9, 1), 0))), | |||
| norm_f( | |||
| Conv2d( | |||
| 32, | |||
| 32, (kernel_size, 1), (stride, 1), | |||
| padding=(get_padding(9, 1), 0))), | |||
| norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))), | |||
| ]) | |||
| self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0))) | |||
| self.register_buffer('window', getattr(torch, window)(win_length)) | |||
| def forward(self, wav): | |||
| wav = torch.squeeze(wav, 1) | |||
| x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length, | |||
| self.window) | |||
| x = torch.transpose(x_mag, 2, 1).unsqueeze(-1) | |||
| fmap = [] | |||
| for layer in self.convs: | |||
| x = layer(x) | |||
| x = F.leaky_relu(x, LRELU_SLOPE) | |||
| fmap.append(x) | |||
| x = self.conv_post(x) | |||
| fmap.append(x) | |||
| x = x.squeeze(-1) | |||
| return x, fmap | |||
| class MultiSTFTDiscriminator(torch.nn.Module): | |||
| def __init__( | |||
| self, | |||
| fft_sizes=[1024, 2048, 512], | |||
| hop_sizes=[120, 240, 50], | |||
| win_lengths=[600, 1200, 240], | |||
| window='hann_window', | |||
| ): | |||
| super(MultiSTFTDiscriminator, self).__init__() | |||
| self.discriminators = nn.ModuleList() | |||
| for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): | |||
| self.discriminators += [ | |||
| DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl) | |||
| ] | |||
| def forward(self, y, y_hat): | |||
| y_d_rs = [] | |||
| y_d_gs = [] | |||
| fmap_rs = [] | |||
| fmap_gs = [] | |||
| for i, d in enumerate(self.discriminators): | |||
| y_d_r, fmap_r = d(y) | |||
| y_d_g, fmap_g = d(y_hat) | |||
| y_d_rs.append(y_d_r) | |||
| fmap_rs.append(fmap_r) | |||
| y_d_gs.append(y_d_g) | |||
| fmap_gs.append(fmap_g) | |||
| return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
| def feature_loss(fmap_r, fmap_g): | |||
| loss = 0 | |||
| for dr, dg in zip(fmap_r, fmap_g): | |||
| for rl, gl in zip(dr, dg): | |||
| loss += torch.mean(torch.abs(rl - gl)) | |||
| return loss * 2 | |||
| def discriminator_loss(disc_real_outputs, disc_generated_outputs): | |||
| loss = 0 | |||
| r_losses = [] | |||
| g_losses = [] | |||
| for dr, dg in zip(disc_real_outputs, disc_generated_outputs): | |||
| r_loss = torch.mean((1 - dr)**2) | |||
| g_loss = torch.mean(dg**2) | |||
| loss += (r_loss + g_loss) | |||
| r_losses.append(r_loss.item()) | |||
| g_losses.append(g_loss.item()) | |||
| return loss, r_losses, g_losses | |||
| def generator_loss(disc_outputs): | |||
| loss = 0 | |||
| gen_losses = [] | |||
| for dg in disc_outputs: | |||
| temp_loss = torch.mean((1 - dg)**2) | |||
| gen_losses.append(temp_loss) | |||
| loss += temp_loss | |||
| return loss, gen_losses | |||
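| # Illustrative note (an assumption): these are the usual least-squares GAN objectives, e.g. | |||
| #   y_d_rs, y_d_gs, fmap_rs, fmap_gs = MultiPeriodDiscriminator()(y, y_hat) | |||
| #   d_loss, _, _ = discriminator_loss(y_d_rs, y_d_gs) | |||
| #   g_loss, _ = generator_loss(y_d_gs) | |||
| #   fm_loss = feature_loss(fmap_rs, fmap_gs) | |||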
| @@ -0,0 +1,59 @@ | |||
| import glob | |||
| import os | |||
| import matplotlib | |||
| import matplotlib.pylab as plt | |||
| import torch | |||
| from torch.nn.utils import weight_norm | |||
| matplotlib.use('Agg') | |||
| def plot_spectrogram(spectrogram): | |||
| fig, ax = plt.subplots(figsize=(10, 2)) | |||
| im = ax.imshow( | |||
| spectrogram, aspect='auto', origin='lower', interpolation='none') | |||
| plt.colorbar(im, ax=ax) | |||
| fig.canvas.draw() | |||
| plt.close() | |||
| return fig | |||
| def init_weights(m, mean=0.0, std=0.01): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| m.weight.data.normal_(mean, std) | |||
| def apply_weight_norm(m): | |||
| classname = m.__class__.__name__ | |||
| if classname.find('Conv') != -1: | |||
| weight_norm(m) | |||
| def get_padding(kernel_size, dilation=1): | |||
| return int((kernel_size * dilation - dilation) / 2) | |||
| def load_checkpoint(filepath, device): | |||
| assert os.path.isfile(filepath) | |||
| print("Loading '{}'".format(filepath)) | |||
| checkpoint_dict = torch.load(filepath, map_location=device) | |||
| print('Complete.') | |||
| return checkpoint_dict | |||
| def save_checkpoint(filepath, obj): | |||
| print('Saving checkpoint to {}'.format(filepath)) | |||
| torch.save(obj, filepath) | |||
| print('Complete.') | |||
| def scan_checkpoint(cp_dir, prefix): | |||
| pattern = os.path.join(cp_dir, prefix + '????????') | |||
| cp_list = glob.glob(pattern) | |||
| if len(cp_list) == 0: | |||
| return None | |||
| return sorted(cp_list)[-1] | |||
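| # Illustrative example (an assumption): with checkpoints named like 'g_00050000', | |||
| #   scan_checkpoint('/ckpts', 'g_')  # -> '/ckpts/g_00050000' (latest by name) or None | |||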
| @@ -62,4 +62,6 @@ class Model(ABC): | |||
| if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): | |||
| model_cfg.type = model_cfg.model_type | |||
| model_cfg.model_dir = local_model_dir | |||
| for k, v in kwargs.items(): | |||
| setattr(model_cfg, k, v) | |||
| return build_model(model_cfg, task_name) | |||
| @@ -1 +1,2 @@ | |||
| from .linear_aec_pipeline import LinearAECPipeline | |||
| from .text_to_speech_pipeline import * # noqa F403 | |||
| @@ -0,0 +1,46 @@ | |||
| import time | |||
| from typing import Any, Dict, List | |||
| import numpy as np | |||
| from modelscope.models import Model | |||
| from modelscope.models.audio.tts.am import SambertNetHifi16k | |||
| from modelscope.models.audio.tts.vocoder import Hifigan16k | |||
| from modelscope.pipelines.base import Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import TextToTacotronSymbols, build_preprocessor | |||
| from modelscope.utils.constant import Fields, Tasks | |||
| __all__ = ['TextToSpeechSambertHifigan16kPipeline'] | |||
| @PIPELINES.register_module( | |||
| Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k') | |||
| class TextToSpeechSambertHifigan16kPipeline(Pipeline): | |||
| def __init__(self, | |||
| config_file: str = None, | |||
| model: List[Model] = None, | |||
| preprocessor: TextToTacotronSymbols = None, | |||
| **kwargs): | |||
| super().__init__( | |||
| config_file=config_file, | |||
| model=model, | |||
| preprocessor=preprocessor, | |||
| **kwargs) | |||
| assert len(model) == 2, 'the model list must contain exactly 2 models: am and vocoder' | |||
| self._am = model[0] | |||
| self._vocoder = model[1] | |||
| self._preprocessor = preprocessor | |||
| def forward(self, inputs: Dict[str, Any]) -> Dict[str, np.ndarray]: | |||
| texts = inputs['texts'] | |||
| audio_total = np.empty((0), dtype='int16') | |||
| for line in texts: | |||
| line = line.strip().split('\t') | |||
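| # the second tab-separated field holds the symbol sequence fed to the acoustic model | |||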
| audio = self._vocoder.forward(self._am.forward(line[1])) | |||
| audio_total = np.append(audio_total, audio, axis=0) | |||
| return {'output': audio_total} | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -8,3 +8,4 @@ from .image import LoadImage, load_image | |||
| from .nlp import * # noqa F403 | |||
| from .space.dialog_intent_prediction_preprocessor import * # noqa F403 | |||
| from .space.dialog_modeling_preprocessor import * # noqa F403 | |||
| from .text_to_speech import * # noqa F403 | |||
| @@ -5,7 +5,6 @@ from typing import Any, Dict | |||
| import numpy as np | |||
| import scipy.io.wavfile as wav | |||
| import torch | |||
| import torchaudio.compliance.kaldi as kaldi | |||
| from numpy.ctypeslib import ndpointer | |||
| from modelscope.utils.constant import Fields | |||
| @@ -123,6 +122,8 @@ class Feature: | |||
| if self.feat_type == 'raw': | |||
| return utt | |||
| elif self.feat_type == 'fbank': | |||
| # have to use a local import until the modelscope framework supports lazy loading | |||
| import torchaudio.compliance.kaldi as kaldi | |||
| if len(utt.shape) == 1: | |||
| utt = utt.unsqueeze(0) | |||
| feat = kaldi.fbank(utt, **self.fbank_config) | |||
| @@ -0,0 +1,53 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import io | |||
| from typing import Any, Dict, Union | |||
| import ttsfrd | |||
| from modelscope.fileio import File | |||
| from modelscope.models.audio.tts.frontend import GenericTtsFrontend | |||
| from modelscope.models.base import Model | |||
| from modelscope.utils.audio.tts_exceptions import * # noqa F403 | |||
| from modelscope.utils.constant import Fields | |||
| from .base import Preprocessor | |||
| from .builder import PREPROCESSORS | |||
| __all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols'] | |||
| @PREPROCESSORS.register_module( | |||
| Fields.audio, module_name=r'text_to_tacotron_symbols') | |||
| class TextToTacotronSymbols(Preprocessor): | |||
| """extract tacotron symbols from text. | |||
| Args: | |||
| res_path (str): TTS frontend resource url | |||
| lang_type (str): language type, valid values are "pinyin" and "chenmix" | |||
| """ | |||
| def __init__(self, model_name, lang_type='pinyin'): | |||
| self._frontend_model = Model.from_pretrained( | |||
| model_name, lang_type=lang_type) | |||
| assert self._frontend_model is not None, 'failed to load model from pretrained' | |||
| def __call__(self, data: str) -> Dict[str, Any]: | |||
| """Call functions to load text and get tacotron symbols. | |||
| Args: | |||
| data (str): utf-8 encoded input text | |||
| Returns: | |||
| symbols (list[str]): texts in tacotron symbols format. | |||
| """ | |||
| return self._frontend_model.forward(data) | |||
| def text_to_tacotron_symbols(text='', path='./', lang='pinyin'): | |||
| """ simple interface to transform text to tacotron symbols | |||
| Args: | |||
| text (str): input text | |||
| path (str): resource path | |||
| lang (str): language type, one of "pinyin" and "chenmix" | |||
| """ | |||
| transform = TextToTacotronSymbols(path, lang) | |||
| return transform(text) | |||
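| A minimal usage sketch of the helper above; the frontend resource id mirrors the tests in this change, and the 'texts' key follows the preprocessor output format used there: | |||
| ```python | |||
| # Convert a sentence to tacotron symbols via the one-shot helper. | |||
| result = text_to_tacotron_symbols( | |||
|     text='明天天气怎么样', | |||
|     path='damo/speech_binary_tts_frontend_resource', | |||
|     lang='pinyin') | |||
| for line in result['texts']: | |||
|     print(line) | |||
| ``` | |||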
| @@ -0,0 +1,42 @@ | |||
| """ | |||
| Define TTS exceptions | |||
| """ | |||
| class TtsException(Exception): | |||
| """ | |||
| TTS exception class. | |||
| """ | |||
| pass | |||
| class TtsFrontendException(TtsException): | |||
| """ | |||
| TTS frontend module level exceptions. | |||
| """ | |||
| pass | |||
| class TtsFrontendInitializeFailedException(TtsFrontendException): | |||
| """ | |||
| If the TTS frontend resource is invalid or does not exist, this exception will be raised. | |||
| """ | |||
| pass | |||
| class TtsFrontendLanguageTypeInvalidException(TtsFrontendException): | |||
| """ | |||
| If the language type is invalid, this exception will be raised. | |||
| """ | |||
| class TtsVocoderException(TtsException): | |||
| """ | |||
| Vocoder exception | |||
| """ | |||
| class TtsVocoderMelspecShapeMismatchException(TtsVocoderException): | |||
| """ | |||
| If the vocoder's input melspec shape does not match the expected shape, this exception will be raised. | |||
| """ | |||
| @@ -67,7 +67,6 @@ class Registry(object): | |||
| if module_name in self._modules[group_key]: | |||
| raise KeyError(f'{module_name} is already registered in ' | |||
| f'{self._name}[{group_key}]') | |||
| self._modules[group_key][module_name] = module_cls | |||
| module_cls.group_key = group_key | |||
| @@ -2,4 +2,5 @@ | |||
| -r requirements/pipeline.txt | |||
| -r requirements/multi-modal.txt | |||
| -r requirements/nlp.txt | |||
| -r requirements/audio.txt | |||
| -r requirements/cv.txt | |||
| @@ -0,0 +1,26 @@ | |||
| #tts | |||
| h5py==2.10.0 | |||
| #https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl | |||
| https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl | |||
| https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D | |||
| #https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl | |||
| #https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl | |||
| inflect | |||
| keras==2.2.4 | |||
| librosa | |||
| lxml | |||
| matplotlib | |||
| nara_wpe | |||
| numpy==1.18.* | |||
| protobuf==3.20.* | |||
| ptflops | |||
| PyWavelets>=1.0.0 | |||
| scikit-learn==0.23.2 | |||
| sox | |||
| tensorboard | |||
| tensorflow==1.15.* | |||
| torch==1.10.* | |||
| torchaudio | |||
| torchvision | |||
| tqdm | |||
| unidecode | |||
| @@ -0,0 +1,60 @@ | |||
| import time | |||
| import unittest | |||
| import json | |||
| import tensorflow as tf | |||
| # NOTICE: Tensorflow 1.15 is not fully compatible with pytorch. | |||
| # A segmentation fault may be raised by the pytorch cpp library | |||
| # if 'import tensorflow' appears before 'import torch'. | |||
| # Putting an 'import torch' here bypasses this incompatibility. | |||
| import torch | |||
| from scipy.io.wavfile import write | |||
| from modelscope.fileio import File | |||
| from modelscope.models import Model, build_model | |||
| from modelscope.models.audio.tts.am import SambertNetHifi16k | |||
| from modelscope.models.audio.tts.vocoder import AttrDict, Hifigan16k | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.preprocessors import build_preprocessor | |||
| from modelscope.utils.constant import Fields, InputFields, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase): | |||
| def test_pipeline(self): | |||
| lang_type = 'pinyin' | |||
| text = '明天天气怎么样' | |||
| preprocessor_model_id = 'damo/speech_binary_tts_frontend_resource' | |||
| am_model_id = 'damo/speech_sambert16k_tts_zhitian_emo' | |||
| voc_model_id = 'damo/speech_hifigan16k_tts_zhitian_emo' | |||
| cfg_preprocessor = dict( | |||
| type='text_to_tacotron_symbols', | |||
| model_name=preprocessor_model_id, | |||
| lang_type=lang_type) | |||
| preprocessor = build_preprocessor(cfg_preprocessor, Fields.audio) | |||
| self.assertTrue(preprocessor is not None) | |||
| am = Model.from_pretrained(am_model_id) | |||
| self.assertTrue(am is not None) | |||
| voc = Model.from_pretrained(voc_model_id) | |||
| self.assertTrue(voc is not None) | |||
| sambert_tts = pipeline( | |||
| pipeline_name='tts-sambert-hifigan-16k', | |||
| config_file='', | |||
| model=[am, voc], | |||
| preprocessor=preprocessor) | |||
| self.assertTrue(sambert_tts is not None) | |||
| output = sambert_tts(text) | |||
| self.assertTrue(len(output['output']) > 0) | |||
| write('output.wav', 16000, output['output']) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,28 @@ | |||
| import shutil | |||
| import unittest | |||
| from modelscope.preprocessors import build_preprocessor | |||
| from modelscope.utils.constant import Fields, InputFields | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| class TtsPreprocessorTest(unittest.TestCase): | |||
| def test_preprocess(self): | |||
| lang_type = 'pinyin' | |||
| text = '今天天气不错,我们去散步吧。' | |||
| cfg = dict( | |||
| type='text_to_tacotron_symbols', | |||
| model_name='damo/speech_binary_tts_frontend_resource', | |||
| lang_type=lang_type) | |||
| preprocessor = build_preprocessor(cfg, Fields.audio) | |||
| output = preprocessor(text) | |||
| self.assertTrue(output) | |||
| for line in output['texts']: | |||
| print(line) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -7,6 +7,12 @@ import sys | |||
| import unittest | |||
| from fnmatch import fnmatch | |||
| # NOTICE: Tensorflow 1.15 is not fully compatible with pytorch. | |||
| # A segmentation fault may be raised by the pytorch cpp library | |||
| # if 'import tensorflow' appears before 'import torch'. | |||
| # Putting an 'import torch' here bypasses this incompatibility. | |||
| import torch | |||
| from modelscope.utils.logger import get_logger | |||
| from modelscope.utils.test_utils import set_test_level, test_level | |||