From 28cb0da168a9f02197bbb9ac833fc8f55cf2b318 Mon Sep 17 00:00:00 2001 From: chenhaozhe Date: Fri, 11 Sep 2020 16:19:49 +0800 Subject: [PATCH] remove os.system in launch scripts --- model_zoo/official/nlp/bert/README.md | 33 ++++++++---- .../ascend_distributed_launcher/README.md | 9 ++-- ...rain.py => get_distribute_pretrain_cmd.py} | 52 ++++++++++++------- .../run_distributed_pretrain_ascend.sh | 7 ++- .../nlp/bert/src/bert_for_pre_training.py | 2 +- 5 files changed, 68 insertions(+), 35 deletions(-) rename model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/{run_distribute_pretrain.py => get_distribute_pretrain_cmd.py} (74%) diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md index 922a1f8054..4d02c8abfa 100644 --- a/model_zoo/official/nlp/bert/README.md +++ b/model_zoo/official/nlp/bert/README.md @@ -1,19 +1,32 @@ # Contents +- [Contents](#contents) - [BERT Description](#bert-description) - [Model Architecture](#model-architecture) - [Dataset](#dataset) - [Environment Requirements](#environment-requirements) - [Quick Start](#quick-start) - [Script Description](#script-description) - - [Script and Sample Code](#script-and-sample-code) - - [Script Parameters](#script-parameters) - - [Dataset Preparation](#dataset-preparation) - - [Training Process](#training-process) - - [Evaluation Process](#evaluation-process) -- [Model Description](#model-description) - - [Performance](#performance) - - [Training Performance](#training-performance) - - [Evaluation Performance](#evaluation-performance) + - [Script and Sample Code](#script-and-sample-code) + - [Script Parameters](#script-parameters) + - [Pre-Training](#pre-training) + - [Fine-Tuning and Evaluation](#fine-tuning-and-evaluation) + - [Options and Parameters](#options-and-parameters) + - [Options:](#options) + - [Parameters:](#parameters) + - [Training Process](#training-process) + - [Training](#training) + - [Running on Ascend](#running-on-ascend) + - [Distributed 
Training](#distributed-training) + - [Running on Ascend](#running-on-ascend-1) + - [Evaluation Process](#evaluation-process) + - [Evaluation](#evaluation) + - [evaluation on cola dataset when running on Ascend](#evaluation-on-cola-dataset-when-running-on-ascend) + - [evaluation on cluener dataset when running on Ascend](#evaluation-on-cluener-dataset-when-running-on-ascend) + - [evaluation on squad v1.1 dataset when running on Ascend](#evaluation-on-squad-v11-dataset-when-running-on-ascend) + - [Model Description](#model-description) + - [Performance](#performance) + - [Pretraining Performance](#pretraining-performance) + - [Inference Performance](#inference-performance) - [Description of Random Situation](#description-of-random-situation) - [ModelZoo Homepage](#modelzoo-homepage) @@ -139,7 +152,7 @@ For example, the schema file of cn-wiki-128 dataset for pretraining shows as fol ├─ascend_distributed_launcher ├─__init__.py ├─hyper_parameter_config.ini # hyper paramter for distributed pretraining - ├─run_distribute_pretrain.py # script for distributed pretraining + ├─get_distribute_pretrain_cmd.py # script for distributed pretraining ├─README.md ├─run_classifier.sh # shell script for standalone classifier task on ascend or gpu ├─run_ner.sh # shell script for standalone NER task on ascend or gpu diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md index b492c4c309..18a6532fbf 100644 --- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md +++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md @@ -5,9 +5,9 @@ The number of D chips can be automatically allocated based on the device_num set ## how to use -For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir: +For example, if we want to generate the launch command of the distributed training of Bert model on D chip, 
we can run the following command in `/bert/` dir: ``` -python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json +python ./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json ``` output: @@ -42,7 +42,8 @@ log file dir: ./LOG6/log.txt 1. Note that `hccl_2p_56_x.x.x.x.json` can use [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) to generate. 2. For hyper parameter, please note that you should customize the scripts `hyper_parameter_config.ini`. Please note that these two hyper parameters are not allowed to be configured here: - device_id - device_num + - device_id + - device_num + - data_dir 3. For Other Model, please note that you should customize the option `run_script` and Corresponding `hyper_parameter_config.ini`. 
diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py similarity index 74% rename from model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py rename to model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py index 794aaf7234..e2a62ba95d 100644 --- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py +++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py @@ -42,11 +42,21 @@ def parse_args(): help="Data path, it is better to use absolute path") parser.add_argument("--hccl_config_dir", type=str, default="", help="Hccl config path, it is better to use absolute path") + parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh", + help="Path of the generated cmd file.") args = parser.parse_args() return args +def append_cmd(cmd, s): + cmd += s + cmd += "\n" + return cmd + +def append_cmd_env(cmd, key, value): + return append_cmd(cmd, "export " + str(key) + "=" + str(value)) + def distribute_pretrain(): """ distribute pretrain scripts. 
The number of D chips can be automatically allocated @@ -92,6 +102,7 @@ def distribute_pretrain(): print("avg_core_per_rank:", avg_core_per_rank) count = 0 + cmd = "" for instance in this_server["device"]: device_id = instance["device_id"] rank_id = instance["rank_id"] @@ -104,39 +115,44 @@ end = start + core_gap cmdopt = str(start) + "-" + str(end) - os.environ["DEVICE_ID"] = device_id - os.environ["RANK_ID"] = rank_id - os.environ["DEPLOY_MODE"] = "0" - os.environ["GE_USE_STATIC_MEMORY"] = "1" + cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id)) + cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id)) + cmd = append_cmd(cmd, "export DEPLOY_MODE=0") + cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1") - os.system("rm -rf LOG" + str(device_id)) - os.system("mkdir ./LOG" + str(device_id)) - os.system("cp *.py ./LOG" + str(device_id)) - os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log") - os.system("env > ./LOG" + str(device_id) + "/env.log") + cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id)) + cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id)) + cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id)) + cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log") + cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log") cur_dir = os.getcwd() - os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log" - os.environ["GLOG_logtostderr"] = "0" + cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log") + cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0") print("core_nums:", cmdopt) print("epoch_size:", str(cfg['epoch_size'])) print("data_dir:", data_dir) print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt") - os.chdir(cur_dir + "/LOG" + str(device_id)) - cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " " + cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id)) + + run_cmd = 'taskset -c ' + cmdopt 
+ ' nohup python ' + run_script + " " opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt): raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," " 'device_num' or 'data_dir'! ") - cmd += opt - cmd += " --data_dir=" + data_dir - cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ + run_cmd += opt + run_cmd += " --data_dir=" + data_dir + run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ + str(rank_size) + ' >./pretraining_log.txt 2>&1 &' - os.system(cmd) - os.chdir(cur_dir) + cmd = append_cmd(cmd, run_cmd) + cmd = append_cmd(cmd, "cd -") + cmd += "\n" + + with open(args.cmd_file, "w") as f: + f.write(cmd) if __name__ == "__main__": distribute_pretrain() diff --git a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh index 1f7309a24f..0843b01c9d 100644 --- a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh +++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh @@ -24,8 +24,11 @@ echo "For hyper parameter, please note that you should customize the scripts: echo "==============================================================================================================" CUR_DIR=`pwd` -python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \ +python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py \ --run_script_dir=${CUR_DIR}/run_pretrain.py \ --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \ --data_dir=$1 \ - --hccl_config_dir=$2 + --hccl_config_dir=$2 \ + --cmd_file=distributed_cmd.sh + +bash distributed_cmd.sh diff --git a/model_zoo/official/nlp/bert/src/bert_for_pre_training.py b/model_zoo/official/nlp/bert/src/bert_for_pre_training.py index 8a4f3272a9..fae88f92a5 100644 --- 
a/model_zoo/official/nlp/bert/src/bert_for_pre_training.py +++ b/model_zoo/official/nlp/bert/src/bert_for_pre_training.py @@ -590,7 +590,7 @@ class BertTrainAccumulateStepsWithLossScaleCell(nn.Cell): scaling = scaling_sens * self.degree * self.accumulation_steps grads = self.hyper_map(F.partial(grad_scale, scaling), grads) if self.enable_global_norm: - grads = ClipByGlobalNorm()(grad) + grads = ClipByGlobalNorm()(grads) else: grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) accu_overflow = self.overflow_reducer(accu_overflow)