From 28cb0da168a9f02197bbb9ac833fc8f55cf2b318 Mon Sep 17 00:00:00 2001 From: chenhaozhe Date: Fri, 11 Sep 2020 16:19:49 +0800 Subject: [PATCH] remove os.system in launch scripts --- model_zoo/official/nlp/bert/README.md | 33 ++++++++---- .../ascend_distributed_launcher/README.md | 9 ++-- ...rain.py => get_distribute_pretrain_cmd.py} | 52 ++++++++++++------- .../run_distributed_pretrain_ascend.sh | 7 ++- .../nlp/bert/src/bert_for_pre_training.py | 2 +- 5 files changed, 68 insertions(+), 35 deletions(-) rename model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/{run_distribute_pretrain.py => get_distribute_pretrain_cmd.py} (74%) diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md index 922a1f8054..4d02c8abfa 100644 --- a/model_zoo/official/nlp/bert/README.md +++ b/model_zoo/official/nlp/bert/README.md @@ -1,19 +1,32 @@ # Contents +- [Contents](#contents) - [BERT Description](#bert-description) - [Model Architecture](#model-architecture) - [Dataset](#dataset) - [Environment Requirements](#environment-requirements) - [Quick Start](#quick-start) - [Script Description](#script-description) - - [Script and Sample Code](#script-and-sample-code) - - [Script Parameters](#script-parameters) - - [Dataset Preparation](#dataset-preparation) - - [Training Process](#training-process) - - [Evaluation Process](#evaluation-process) -- [Model Description](#model-description) - - [Performance](#performance) - - [Training Performance](#training-performance) - - [Evaluation Performance](#evaluation-performance) + - [Script and Sample Code](#script-and-sample-code) + - [Script Parameters](#script-parameters) + - [Pre-Training](#pre-training) + - [Fine-Tuning and Evaluation](#fine-tuning-and-evaluation) + - [Options and Parameters](#options-and-parameters) + - [Options:](#options) + - [Parameters:](#parameters) + - [Training Process](#training-process) + - [Training](#training) + - [Running on Ascend](#running-on-ascend) + - [Distributed 
Training](#distributed-training) + - [Running on Ascend](#running-on-ascend-1) + - [Evaluation Process](#evaluation-process) + - [Evaluation](#evaluation) + - [evaluation on cola dataset when running on Ascend](#evaluation-on-cola-dataset-when-running-on-ascend) + - [evaluation on cluener dataset when running on Ascend](#evaluation-on-cluener-dataset-when-running-on-ascend) + - [evaluation on squad v1.1 dataset when running on Ascend](#evaluation-on-squad-v11-dataset-when-running-on-ascend) + - [Model Description](#model-description) + - [Performance](#performance) + - [Pretraining Performance](#pretraining-performance) + - [Inference Performance](#inference-performance) - [Description of Random Situation](#description-of-random-situation) - [ModelZoo Homepage](#modelzoo-homepage) @@ -139,7 +152,7 @@ For example, the schema file of cn-wiki-128 dataset for pretraining shows as fol ├─ascend_distributed_launcher ├─__init__.py ├─hyper_parameter_config.ini # hyper paramter for distributed pretraining - ├─run_distribute_pretrain.py # script for distributed pretraining + ├─get_distribute_pretrain_cmd.py # script for distributed pretraining ├─README.md ├─run_classifier.sh # shell script for standalone classifier task on ascend or gpu ├─run_ner.sh # shell script for standalone NER task on ascend or gpu diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md index b492c4c309..18a6532fbf 100644 --- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md +++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md @@ -5,9 +5,9 @@ The number of D chips can be automatically allocated based on the device_num set ## how to use -For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir: +For example, if we want to generate the launch command of the distributed training of Bert model on D chip, 
we can run the following command in `/bert/` dir: ``` -python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json +python ./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json ``` output: @@ -42,7 +42,8 @@ log file dir: ./LOG6/log.txt 1. Note that `hccl_2p_56_x.x.x.x.json` can use [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) to generate. 2. For hyper parameter, please note that you should customize the scripts `hyper_parameter_config.ini`. Please note that these two hyper parameters are not allowed to be configured here: - device_id - device_num + - device_id + - device_num + - data_dir 3. For Other Model, please note that you should customize the option `run_script` and Corresponding `hyper_parameter_config.ini`. 
diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py similarity index 74% rename from model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py rename to model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py index 794aaf7234..e2a62ba95d 100644 --- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py +++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py @@ -42,11 +42,21 @@ def parse_args(): help="Data path, it is better to use absolute path") parser.add_argument("--hccl_config_dir", type=str, default="", help="Hccl config path, it is better to use absolute path") + parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh", + help="Path of the generated cmd file.") args = parser.parse_args() return args +def append_cmd(cmd, s): + cmd += s + cmd += "\n" + return cmd + +def append_cmd_env(cmd, key, value): + return append_cmd(cmd, "export " + str(key) + "=" + str(value)) + def distribute_pretrain(): """ distribute pretrain scripts. 
The number of D chips can be automatically allocated @@ -92,6 +102,7 @@ def distribute_pretrain(): print("avg_core_per_rank:", avg_core_per_rank) count = 0 + cmd = "" for instance in this_server["device"]: device_id = instance["device_id"] rank_id = instance["rank_id"] @@ -104,39 +115,44 @@ end = start + core_gap cmdopt = str(start) + "-" + str(end) - os.environ["DEVICE_ID"] = device_id - os.environ["RANK_ID"] = rank_id - os.environ["DEPLOY_MODE"] = "0" - os.environ["GE_USE_STATIC_MEMORY"] = "1" + cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id)) + cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id)) + cmd = append_cmd(cmd, "export DEPLOY_MODE=0") + cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1") - os.system("rm -rf LOG" + str(device_id)) - os.system("mkdir ./LOG" + str(device_id)) - os.system("cp *.py ./LOG" + str(device_id)) - os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log") - os.system("env > ./LOG" + str(device_id) + "/env.log") + cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id)) + cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id)) + cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id)) + cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log") + cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log") cur_dir = os.getcwd() - os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log" - os.environ["GLOG_logtostderr"] = "0" + cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log") + cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0") print("core_nums:", cmdopt) print("epoch_size:", str(cfg['epoch_size'])) print("data_dir:", data_dir) print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt") - os.chdir(cur_dir + "/LOG" + str(device_id)) - cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " " + cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id)) + + run_cmd = 'taskset -c ' + cmdopt 
+ ' nohup python ' + run_script + " " opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt): raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," " 'device_num' or 'data_dir'! ") - cmd += opt - cmd += " --data_dir=" + data_dir - cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ + run_cmd += opt + run_cmd += " --data_dir=" + data_dir + run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ + str(rank_size) + ' >./pretraining_log.txt 2>&1 &' - os.system(cmd) - os.chdir(cur_dir) + cmd = append_cmd(cmd, run_cmd) + cmd = append_cmd(cmd, "cd -") + cmd += "\n" + + with open(args.cmd_file, "w") as f: + f.write(cmd) if __name__ == "__main__": distribute_pretrain() diff --git a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh index 1f7309a24f..0843b01c9d 100644 --- a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh +++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh @@ -24,8 +24,11 @@ echo "For hyper parameter, please note that you should customize the scripts: echo "==============================================================================================================" CUR_DIR=`pwd` -python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \ +python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py \ --run_script_dir=${CUR_DIR}/run_pretrain.py \ --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \ --data_dir=$1 \ - --hccl_config_dir=$2 + --hccl_config_dir=$2 \ + --cmd_file=distributed_cmd.sh + +bash distributed_cmd.sh diff --git a/model_zoo/official/nlp/bert/src/bert_for_pre_training.py b/model_zoo/official/nlp/bert/src/bert_for_pre_training.py index 8a4f3272a9..fae88f92a5 100644 --- 
a/model_zoo/official/nlp/bert/src/bert_for_pre_training.py +++ b/model_zoo/official/nlp/bert/src/bert_for_pre_training.py @@ -590,7 +590,7 @@ class BertTrainAccumulateStepsWithLossScaleCell(nn.Cell): scaling = scaling_sens * self.degree * self.accumulation_steps grads = self.hyper_map(F.partial(grad_scale, scaling), grads) if self.enable_global_norm: - grads = ClipByGlobalNorm()(grad) + grads = ClipByGlobalNorm()(grads) else: grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads) accu_overflow = self.overflow_reducer(accu_overflow)