From 7a09b16311c3197c484bf05181cb4c6c4c5e3d73 Mon Sep 17 00:00:00 2001 From: wanyiming Date: Sat, 30 Jan 2021 10:31:58 +0800 Subject: [PATCH] deepspeech cpu training --- .../research/audio/deepspeech2/README.md | 8 ++++++-- model_zoo/research/audio/deepspeech2/eval.py | 5 +++-- .../research/audio/deepspeech2/src/dataset.py | 2 +- .../audio/deepspeech2/src/deepspeech2.py | 19 +++++++++++++------ model_zoo/research/audio/deepspeech2/train.py | 18 +++++++++--------- model_zoo/research/audio/wavenet/README.md | 6 +++--- 6 files changed, 35 insertions(+), 23 deletions(-) diff --git a/model_zoo/research/audio/deepspeech2/README.md b/model_zoo/research/audio/deepspeech2/README.md index c2a1708c63..682f443eb3 100644 --- a/model_zoo/research/audio/deepspeech2/README.md +++ b/model_zoo/research/audio/deepspeech2/README.md @@ -19,7 +19,7 @@ # [DeepSpeech2 Description](#contents) DeepSpeech2 is a speech recognition models which is trained with CTC loss. It replaces entire pipelines of hand-engineered components with neural networks and can handle a diverse variety of speech including noisy -environments, accents and different languages. We support training and evaluation on GPU. +environments, accents and different languages. We support training and evaluation on CPU and GPU. [Paper](https://arxiv.org/pdf/1512.02595v1.pdf): Amodei, Dario, et al. Deep speech 2: End-to-end speech recognition in english and mandarin. @@ -97,10 +97,12 @@ usage: train.py [--use_pretrained USE_PRETRAINED] [--pre_trained_model_path PRE_TRAINED_MODEL_PATH] [--is_distributed IS_DISTRIBUTED] [--bidirectional BIDIRECTIONAL] + [--device_target DEVICE_TARGET] options: --pre_trained_model_path pretrained checkpoint path, default is '' --is_distributed distributed training, default is False --bidirectional whether or not to use bidirectional RNN, default is True. Currently, only bidirectional model is implemented + --device_target device where the code will be implemented: "GPU" | "CPU", default is "GPU" ``` ### Evaluation @@ -108,10 +110,12 @@ options: ```text usage: eval.py [--bidirectional BIDIRECTIONAL] [--pretrain_ckpt PRETRAIN_CKPT] + [--device_target DEVICE_TARGET] options: --bidirectional whether to use bidirectional RNN, default is True. Currently, only bidirectional model is implemented --pretrain_ckpt saved checkpoint path, default is '' + --device_target device where the code will be implemented: "GPU" | "CPU", default is "GPU" ``` ### Options and Parameters @@ -210,7 +214,7 @@ for evaluation configuration ``` - The three*.csv files will be used in training and evaluation process. Before training, some requirements should be installed, including `librosa` and `Levenshtein` +Before training, some requirements should be installed, including `librosa` and `Levenshtein` After installing MindSpore via the official website and finishing dataset processing, you can start training as follows: ```shell diff --git a/model_zoo/research/audio/deepspeech2/eval.py b/model_zoo/research/audio/deepspeech2/eval.py index e804993651..a0de22efee 100644 --- a/model_zoo/research/audio/deepspeech2/eval.py +++ b/model_zoo/research/audio/deepspeech2/eval.py @@ -24,17 +24,18 @@ from src.config import eval_config from src.deepspeech2 import DeepSpeechModel, PredictWithSoftmax from src.dataset import create_dataset from src.greedydecoder import MSGreedyDecoder - from mindspore import context from mindspore.train.serialization import load_checkpoint, load_param_into_net -context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False) parser = argparse.ArgumentParser(description='DeepSpeech evaluation') parser.add_argument('--bidirectional', action="store_false", default=True, help='Use bidirectional RNN') parser.add_argument('--pretrain_ckpt', type=str, default='', help='Pretrained checkpoint path') +parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", "CPU"), + help='Device target, support GPU and CPU, Default: GPU') args = parser.parse_args() if __name__ == '__main__': + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) config = eval_config with open(config.DataConfig.labels_path) as label_file: labels = json.load(label_file) diff --git a/model_zoo/research/audio/deepspeech2/src/dataset.py b/model_zoo/research/audio/deepspeech2/src/dataset.py index 30725ee43f..8409ce718a 100644 --- a/model_zoo/research/audio/deepspeech2/src/dataset.py +++ b/model_zoo/research/audio/deepspeech2/src/dataset.py @@ -115,7 +115,7 @@ class ASRDataset(LoadAudioAndTranscript): batch_idx = self.bins[index] batch_size = len(batch_idx) batch_spect, batch_script, target_indices = [], [], [] - input_length = np.zeros(batch_size, np.int32) + input_length = np.zeros(batch_size, np.float32) for data in batch_idx: audio_path, transcript_path = data[0], data[1] spect = self.parse_audio(audio_path) diff --git a/model_zoo/research/audio/deepspeech2/src/deepspeech2.py b/model_zoo/research/audio/deepspeech2/src/deepspeech2.py index 5851e03a41..1e7d34cace 100644 --- a/model_zoo/research/audio/deepspeech2/src/deepspeech2.py +++ b/model_zoo/research/audio/deepspeech2/src/deepspeech2.py @@ -19,6 +19,7 @@ DeepSpeech2 model import math import numpy as np +import mindspore.common.dtype as mstype from mindspore.ops import operations as P from mindspore import nn, Tensor, ParameterTuple, Parameter from mindspore.common.initializer import initializer @@ -112,7 +113,7 @@ class BatchRNN(nn.Cell): """ def __init__(self, batch_size, input_size, hidden_size, num_layers, bidirectional=False, batch_norm=False, - rnn_type='LSTM'): + rnn_type='LSTM', device_target="GPU"): super(BatchRNN, self).__init__() self.batch_size = batch_size self.input_size = input_size @@ -141,7 +142,10 @@ class BatchRNN(nn.Cell): for i in range(num_layers): weight_size = (input_size_list[i] + hidden_size) * hidden_size * self.num_directions * 4 if self.has_bias: - bias_size = self.num_directions * hidden_size * 4 * 2 + if device_target == "GPU": + bias_size = self.num_directions * hidden_size * 4 * 2 + else: + bias_size = self.num_directions * hidden_size * 4 weight_size = weight_size + bias_size stdv = 1 / math.sqrt(hidden_size) @@ -195,7 +199,8 @@ class DeepSpeechModel(nn.Cell): bidirectional(bool): use bidirectional rnn (default=True) """ - def __init__(self, batch_size, labels, rnn_hidden_size, nb_layers, audio_conf, rnn_type='LSTM', bidirectional=True): + def __init__(self, batch_size, labels, rnn_hidden_size, nb_layers, audio_conf, rnn_type='LSTM', + bidirectional=True, device_target='GPU'): super(DeepSpeechModel, self).__init__() self.batch_size = batch_size self.hidden_size = rnn_hidden_size @@ -226,7 +231,7 @@ class DeepSpeechModel(nn.Cell): self.RNN = BatchRNN(batch_size=self.batch_size, input_size=rnn_input_size, num_layers=nb_layers, hidden_size=rnn_hidden_size, bidirectional=bidirectional, batch_norm=False, - rnn_type=self.rnn_type) + rnn_type=self.rnn_type, device_target=device_target) fully_connected = nn.Dense(rnn_hidden_size, num_classes, has_bias=False) self.fc = SequenceWise(fully_connected) @@ -275,10 +280,11 @@ class NetWithLossClass(nn.Cell): self.network = network self.ReduceMean_false = P.ReduceMean(keep_dims=False) self.squeeze_op = P.Squeeze(0) + self.cast_op = P.Cast() def construct(self, inputs, input_length, target_indices, label_values): predict, output_length = self.network(inputs, input_length) - loss = self.loss(predict, target_indices, label_values, output_length) + loss = self.loss(predict, target_indices, label_values, self.cast_op(output_length, mstype.int32)) return self.ReduceMean_false(loss[0]) @@ -292,9 +298,10 @@ class PredictWithSoftmax(nn.Cell): self.network = network self.inference_softmax = P.Softmax(axis=-1) self.transpose_op = P.Transpose() + self.cast_op = P.Cast() def construct(self, inputs, input_length): - x, output_sizes = self.network(inputs, input_length) + x, output_sizes = self.network(inputs, self.cast_op(input_length, mstype.int32)) x = self.inference_softmax(x) x = self.transpose_op(x, (1, 0, 2)) return x, output_sizes diff --git a/model_zoo/research/audio/deepspeech2/train.py b/model_zoo/research/audio/deepspeech2/train.py index 824f178b1d..0ebed19296 100644 --- a/model_zoo/research/audio/deepspeech2/train.py +++ b/model_zoo/research/audio/deepspeech2/train.py @@ -21,7 +21,7 @@ import argparse from mindspore import context, Tensor, ParameterTuple from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank, get_group_size -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.nn.optim import Adam from mindspore.nn import TrainOneStepCell @@ -29,7 +29,6 @@ from mindspore.train import Model from src.deepspeech2 import DeepSpeechModel, NetWithLossClass from src.lr_generator import get_lr -from src.callback import Monitor from src.config import train_config from src.dataset import create_dataset @@ -37,22 +36,23 @@ parser = argparse.ArgumentParser(description='DeepSpeech2 training') parser.add_argument('--pre_trained_model_path', type=str, default='', help='Pretrained checkpoint path') parser.add_argument('--is_distributed', action="store_true", default=False, help='Distributed training') parser.add_argument('--bidirectional', action="store_false", default=True, help='Use bidirectional RNN') +parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", "CPU"), + help='Device target, support GPU and CPU, Default: GPU') args = parser.parse_args() if __name__ == '__main__': rank_id = 0 group_size = 1 config = train_config + data_sink = (args.device_target == "GPU") + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False) if args.is_distributed: init('nccl') rank_id = get_rank() group_size = get_group_size() - context.set_context(mode=context.GRAPH_MODE, device_target='GPU', save_graphs=False) context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) - else: - context.set_context(mode=context.GRAPH_MODE, device_target='GPU', save_graphs=False) with open(config.DataConfig.labels_path) as label_file: labels = json.load(label_file) @@ -73,7 +73,8 @@ if __name__ == '__main__': labels=labels, rnn_type=config.ModelConfig.rnn_type, audio_conf=config.DataConfig.SpectConfig, - bidirectional=True) + bidirectional=True, + device_target=args.device_target) loss_net = NetWithLossClass(deepspeech_net) weights = ParameterTuple(deepspeech_net.trainable_params()) @@ -88,8 +89,7 @@ if __name__ == '__main__': print('Successfully loading the pre-trained model') model = Model(train_net) - lr_cb = Monitor(lr) - callback_list = [lr_cb] + callback_list = [LossMonitor()] if args.is_distributed: config.CheckpointConfig.ckpt_file_name_prefix = config.CheckpointConfig.ckpt_file_name_prefix + str(get_rank()) @@ -100,4 +100,4 @@ if __name__ == '__main__': ckpt_cb = ModelCheckpoint(prefix=config.CheckpointConfig.ckpt_file_name_prefix, directory=config.CheckpointConfig.ckpt_path, config=config_ck) callback_list.append(ckpt_cb) - model.train(config.TrainingConfig.epochs, ds_train, callbacks=callback_list) + model.train(config.TrainingConfig.epochs, ds_train, callbacks=callback_list, dataset_sink_mode=data_sink) diff --git a/model_zoo/research/audio/wavenet/README.md b/model_zoo/research/audio/wavenet/README.md index d6c02ee58e..56ce97e755 100644 --- a/model_zoo/research/audio/wavenet/README.md +++ b/model_zoo/research/audio/wavenet/README.md @@ -50,10 +50,10 @@ Dataset used: [The LJ Speech Dataset]() - Hardware(GPU) - Prepare hardware environment with GPU processor. - Framework - - [MindSpore](https://cmc-szv.clouddragon.huawei.com/cmcversion/index/search?searchKey=Do-MindSpore%20V100R001C00B622) + - [MindSpore](https://www.mindspore.cn/install/en) - For more information, please check the resources below: - - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html) - - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html) + - [MindSpore tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html) + - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html) # [Script Description](#contents)