From 7a09b16311c3197c484bf05181cb4c6c4c5e3d73 Mon Sep 17 00:00:00 2001
From: wanyiming <wanyiming@huawei.com>
Date: Sat, 30 Jan 2021 10:31:58 +0800
Subject: [PATCH] deepspeech cpu training

---
 .../research/audio/deepspeech2/README.md      |  8 ++++++--
 model_zoo/research/audio/deepspeech2/eval.py  |  5 +++--
 .../research/audio/deepspeech2/src/dataset.py |  2 +-
 .../audio/deepspeech2/src/deepspeech2.py      | 19 +++++++++++++------
 model_zoo/research/audio/deepspeech2/train.py | 18 +++++++++---------
 model_zoo/research/audio/wavenet/README.md    |  6 +++---
 6 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/model_zoo/research/audio/deepspeech2/README.md b/model_zoo/research/audio/deepspeech2/README.md
index c2a1708c63..682f443eb3 100644
--- a/model_zoo/research/audio/deepspeech2/README.md
+++ b/model_zoo/research/audio/deepspeech2/README.md
@@ -19,7 +19,7 @@
 # [DeepSpeech2 Description](#contents)
 
 DeepSpeech2 is a speech recognition models which is trained with CTC loss. It replaces entire pipelines of hand-engineered components with neural networks and can handle a diverse variety of speech including noisy
-environments, accents and different languages. We support training and evaluation on GPU.
+environments, accents and different languages. We support training and evaluation on CPU and GPU.
 
 [Paper](https://arxiv.org/pdf/1512.02595v1.pdf): Amodei, Dario, et al. Deep speech 2: End-to-end speech recognition in english and mandarin.
 
@@ -97,10 +97,12 @@ usage: train.py  [--use_pretrained USE_PRETRAINED]
                  [--pre_trained_model_path PRE_TRAINED_MODEL_PATH]
                  [--is_distributed IS_DISTRIBUTED]
                  [--bidirectional BIDIRECTIONAL]
+                 [--device_target DEVICE_TARGET]
 options:
     --pre_trained_model_path    pretrained checkpoint path, default is ''
     --is_distributed            distributed training, default is False
     --bidirectional             whether or not to use bidirectional RNN, default is True. Currently, only bidirectional model is implemented
+    --device_target             device where the code will be implemented: "GPU" | "CPU", default is "GPU"
 ```
 
 ### Evaluation
@@ -108,10 +110,12 @@ options:
 ```text
 usage: eval.py  [--bidirectional BIDIRECTIONAL]
                 [--pretrain_ckpt PRETRAIN_CKPT]
+                [--device_target DEVICE_TARGET]
 
 options:
     --bidirectional              whether to use bidirectional RNN, default is True. Currently, only bidirectional model is implemented
     --pretrain_ckpt              saved checkpoint path, default is ''
+    --device_target              device where the code will be implemented: "GPU" | "CPU", default is "GPU"
 ```
 
 ### Options and Parameters
@@ -210,7 +214,7 @@ for evaluation configuration
 
 ```
 
- The three*.csv files will be used in training and evaluation process. Before training, some requirements should be installed, including `librosa` and `Levenshtein`
+Before training, some requirements should be installed, including `librosa` and `Levenshtein`
 After installing MindSpore via the official website and finishing dataset processing, you can start training as follows:
 
 ```shell
diff --git a/model_zoo/research/audio/deepspeech2/eval.py b/model_zoo/research/audio/deepspeech2/eval.py
index e804993651..a0de22efee 100644
--- a/model_zoo/research/audio/deepspeech2/eval.py
+++ b/model_zoo/research/audio/deepspeech2/eval.py
@@ -24,17 +24,18 @@ from src.config import eval_config
 from src.deepspeech2 import DeepSpeechModel, PredictWithSoftmax
 from src.dataset import create_dataset
 from src.greedydecoder import MSGreedyDecoder
-
 from mindspore import context
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
 
 parser = argparse.ArgumentParser(description='DeepSpeech evaluation')
 parser.add_argument('--bidirectional', action="store_false", default=True, help='Use bidirectional RNN')
 parser.add_argument('--pretrain_ckpt', type=str, default='', help='Pretrained checkpoint path')
+parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", "CPU"),
+                    help='Device target, support GPU and CPU, Default: GPU')
 args = parser.parse_args()
 
 if __name__ == '__main__':
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False)
     config = eval_config
     with open(config.DataConfig.labels_path) as label_file:
         labels = json.load(label_file)
diff --git a/model_zoo/research/audio/deepspeech2/src/dataset.py b/model_zoo/research/audio/deepspeech2/src/dataset.py
index 30725ee43f..8409ce718a 100644
--- a/model_zoo/research/audio/deepspeech2/src/dataset.py
+++ b/model_zoo/research/audio/deepspeech2/src/dataset.py
@@ -115,7 +115,7 @@ class ASRDataset(LoadAudioAndTranscript):
         batch_idx = self.bins[index]
         batch_size = len(batch_idx)
         batch_spect, batch_script, target_indices = [], [], []
-        input_length = np.zeros(batch_size, np.int32)
+        input_length = np.zeros(batch_size, np.float32)
         for data in batch_idx:
             audio_path, transcript_path = data[0], data[1]
             spect = self.parse_audio(audio_path)
diff --git a/model_zoo/research/audio/deepspeech2/src/deepspeech2.py b/model_zoo/research/audio/deepspeech2/src/deepspeech2.py
index 5851e03a41..1e7d34cace 100644
--- a/model_zoo/research/audio/deepspeech2/src/deepspeech2.py
+++ b/model_zoo/research/audio/deepspeech2/src/deepspeech2.py
@@ -19,6 +19,7 @@ DeepSpeech2 model
 
 import math
 import numpy as np
+import mindspore.common.dtype as mstype
 from mindspore.ops import operations as P
 from mindspore import nn, Tensor, ParameterTuple, Parameter
 from mindspore.common.initializer import initializer
@@ -112,7 +113,7 @@ class BatchRNN(nn.Cell):
     """
 
     def __init__(self, batch_size, input_size, hidden_size, num_layers, bidirectional=False, batch_norm=False,
-                 rnn_type='LSTM'):
+                 rnn_type='LSTM', device_target="GPU"):
         super(BatchRNN, self).__init__()
         self.batch_size = batch_size
         self.input_size = input_size
@@ -141,7 +142,10 @@ class BatchRNN(nn.Cell):
         for i in range(num_layers):
             weight_size = (input_size_list[i] + hidden_size) * hidden_size * self.num_directions * 4
             if self.has_bias:
-                bias_size = self.num_directions * hidden_size * 4 * 2
+                if device_target == "GPU":
+                    bias_size = self.num_directions * hidden_size * 4 * 2
+                else:
+                    bias_size = self.num_directions * hidden_size * 4
                 weight_size = weight_size + bias_size
 
             stdv = 1 / math.sqrt(hidden_size)
@@ -195,7 +199,8 @@ class DeepSpeechModel(nn.Cell):
         bidirectional(bool): use bidirectional rnn (default=True)
     """
 
-    def __init__(self, batch_size, labels, rnn_hidden_size, nb_layers, audio_conf, rnn_type='LSTM', bidirectional=True):
+    def __init__(self, batch_size, labels, rnn_hidden_size, nb_layers, audio_conf, rnn_type='LSTM',
+                 bidirectional=True, device_target='GPU'):
         super(DeepSpeechModel, self).__init__()
         self.batch_size = batch_size
         self.hidden_size = rnn_hidden_size
@@ -226,7 +231,7 @@ class DeepSpeechModel(nn.Cell):
 
         self.RNN = BatchRNN(batch_size=self.batch_size, input_size=rnn_input_size, num_layers=nb_layers,
                             hidden_size=rnn_hidden_size, bidirectional=bidirectional, batch_norm=False,
-                            rnn_type=self.rnn_type)
+                            rnn_type=self.rnn_type, device_target=device_target)
         fully_connected = nn.Dense(rnn_hidden_size, num_classes, has_bias=False)
         self.fc = SequenceWise(fully_connected)
 
@@ -275,10 +280,11 @@ class NetWithLossClass(nn.Cell):
         self.network = network
         self.ReduceMean_false = P.ReduceMean(keep_dims=False)
         self.squeeze_op = P.Squeeze(0)
+        self.cast_op = P.Cast()
 
     def construct(self, inputs, input_length, target_indices, label_values):
         predict, output_length = self.network(inputs, input_length)
-        loss = self.loss(predict, target_indices, label_values, output_length)
+        loss = self.loss(predict, target_indices, label_values, self.cast_op(output_length, mstype.int32))
         return self.ReduceMean_false(loss[0])
 
 
@@ -292,9 +298,10 @@ class PredictWithSoftmax(nn.Cell):
         self.network = network
         self.inference_softmax = P.Softmax(axis=-1)
         self.transpose_op = P.Transpose()
+        self.cast_op = P.Cast()
 
     def construct(self, inputs, input_length):
-        x, output_sizes = self.network(inputs, input_length)
+        x, output_sizes = self.network(inputs, self.cast_op(input_length, mstype.int32))
         x = self.inference_softmax(x)
         x = self.transpose_op(x, (1, 0, 2))
         return x, output_sizes
diff --git a/model_zoo/research/audio/deepspeech2/train.py b/model_zoo/research/audio/deepspeech2/train.py
index 824f178b1d..0ebed19296 100644
--- a/model_zoo/research/audio/deepspeech2/train.py
+++ b/model_zoo/research/audio/deepspeech2/train.py
@@ -21,7 +21,7 @@ import argparse
 from mindspore import context, Tensor, ParameterTuple
 from mindspore.context import ParallelMode
 from mindspore.communication.management import init, get_rank, get_group_size
-from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.nn.optim import Adam
 from mindspore.nn import TrainOneStepCell
@@ -29,7 +29,6 @@ from mindspore.train import Model
 
 from src.deepspeech2 import DeepSpeechModel, NetWithLossClass
 from src.lr_generator import get_lr
-from src.callback import Monitor
 from src.config import train_config
 from src.dataset import create_dataset
 
@@ -37,22 +36,23 @@ parser = argparse.ArgumentParser(description='DeepSpeech2 training')
 parser.add_argument('--pre_trained_model_path', type=str, default='', help='Pretrained checkpoint path')
 parser.add_argument('--is_distributed', action="store_true", default=False, help='Distributed training')
 parser.add_argument('--bidirectional', action="store_false", default=True, help='Use bidirectional RNN')
+parser.add_argument('--device_target', type=str, default="GPU", choices=("GPU", "CPU"),
+                    help='Device target, support GPU and CPU, Default: GPU')
 args = parser.parse_args()
 
 if __name__ == '__main__':
     rank_id = 0
     group_size = 1
     config = train_config
+    data_sink = (args.device_target == "GPU")
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False)
     if args.is_distributed:
         init('nccl')
         rank_id = get_rank()
         group_size = get_group_size()
-        context.set_context(mode=context.GRAPH_MODE, device_target='GPU', save_graphs=False)
         context.reset_auto_parallel_context()
         context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
-    else:
-        context.set_context(mode=context.GRAPH_MODE, device_target='GPU', save_graphs=False)
 
     with open(config.DataConfig.labels_path) as label_file:
         labels = json.load(label_file)
@@ -73,7 +73,8 @@ if __name__ == '__main__':
                                      labels=labels,
                                      rnn_type=config.ModelConfig.rnn_type,
                                      audio_conf=config.DataConfig.SpectConfig,
-                                     bidirectional=True)
+                                     bidirectional=True,
+                                     device_target=args.device_target)
 
     loss_net = NetWithLossClass(deepspeech_net)
     weights = ParameterTuple(deepspeech_net.trainable_params())
@@ -88,8 +89,7 @@ if __name__ == '__main__':
         print('Successfully loading the pre-trained model')
 
     model = Model(train_net)
-    lr_cb = Monitor(lr)
-    callback_list = [lr_cb]
+    callback_list = [LossMonitor()]
 
     if args.is_distributed:
         config.CheckpointConfig.ckpt_file_name_prefix = config.CheckpointConfig.ckpt_file_name_prefix + str(get_rank())
@@ -100,4 +100,4 @@ if __name__ == '__main__':
     ckpt_cb = ModelCheckpoint(prefix=config.CheckpointConfig.ckpt_file_name_prefix,
                               directory=config.CheckpointConfig.ckpt_path, config=config_ck)
     callback_list.append(ckpt_cb)
-    model.train(config.TrainingConfig.epochs, ds_train, callbacks=callback_list)
+    model.train(config.TrainingConfig.epochs, ds_train, callbacks=callback_list, dataset_sink_mode=data_sink)
diff --git a/model_zoo/research/audio/wavenet/README.md b/model_zoo/research/audio/wavenet/README.md
index d6c02ee58e..56ce97e755 100644
--- a/model_zoo/research/audio/wavenet/README.md
+++ b/model_zoo/research/audio/wavenet/README.md
@@ -50,10 +50,10 @@ Dataset used: [The LJ Speech Dataset](<https://keithito.com/LJ-Speech-Dataset>)
 - Hardware（GPU）
     - Prepare hardware environment with GPU processor.
 - Framework
-    - [MindSpore](https://cmc-szv.clouddragon.huawei.com/cmcversion/index/search?searchKey=Do-MindSpore%20V100R001C00B622)
+    - [MindSpore](https://www.mindspore.cn/install/en)
 - For more information, please check the resources below：
-    - [MindSpore tutorials](https://www.mindspore.cn/tutorial/zh-CN/master/index.html)
-    - [MindSpore API](https://www.mindspore.cn/api/zh-CN/master/index.html)
+    - [MindSpore tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
+    - [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
 
 # [Script Description](#contents)