@@ -1,19 +1,32 @@
# Contents
- [Contents](#contents)
- [BERT Description](#bert-description)
- [Model Architecture](#model-architecture)
- [Dataset](#dataset)
- [Environment Requirements](#environment-requirements)
- [Quick Start](#quick-start)
- [Script Description](#script-description)
- [Script and Sample Code](#script-and-sample-code)
- [Script Parameters](#script-parameters)
- [Dataset Preparation](#dataset-preparation)
- [Training Process](#training-process)
- [Evaluation Process](#evaluation-process)
- [Model Description](#model-description)
- [Performance](#performance)
- [Training Performance](#training-performance)
- [Evaluation Performance](#evaluation-performance)
- [Script and Sample Code](#script-and-sample-code)
- [Script Parameters](#script-parameters)
- [Pre-Training](#pre-training)
- [Fine-Tuning and Evaluation](#fine-tuning-and-evaluation)
- [Options and Parameters](#options-and-parameters)
- [Options:](#options)
- [Parameters:](#parameters)
- [Training Process](#training-process)
- [Training](#training)
- [Running on Ascend](#running-on-ascend)
- [Distributed Training](#distributed-training)
- [Running on Ascend](#running-on-ascend-1)
- [Evaluation Process](#evaluation-process)
- [Evaluation](#evaluation)
- [evaluation on cola dataset when running on Ascend](#evaluation-on-cola-dataset-when-running-on-ascend)
- [evaluation on cluener dataset when running on Ascend](#evaluation-on-cluener-dataset-when-running-on-ascend)
- [evaluation on squad v1.1 dataset when running on Ascend](#evaluation-on-squad-v11-dataset-when-running-on-ascend)
- [Model Description](#model-description)
- [Performance](#performance)
- [Pretraining Performance](#pretraining-performance)
- [Inference Performance](#inference-performance)
- [Description of Random Situation](#description-of-random-situation)
- [ModelZoo Homepage](#modelzoo-homepage)
@@ -139,7 +152,7 @@ For example, the schema file of cn-wiki-128 dataset for pretraining shows as fol
├─ascend_distributed_launcher
├─__init__.py
├─hyper_parameter_config.ini # hyper parameter for distributed pretraining
├─run_distribute_pretrain.py # script for distributed pretraining
├─get_distribute_pretrain_cmd.py # script for distributed pretraining
├─README.md
├─run_classifier.sh # shell script for standalone classifier task on ascend or gpu
├─run_ner.sh # shell script for standalone NER task on ascend or gpu
@@ -5,9 +5,9 @@ The number of D chips can be automatically allocated based on the device_num set
## how to use
For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir:
For example, if we want to generate the launch command for distributed training of the BERT model on D chips, we can run the following command in the `/bert/` dir:
```
python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
python ./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
```
output:
@@ -42,7 +42,8 @@ log file dir: ./LOG6/log.txt
1. Note that `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
2. For the hyper parameters, please note that you should customize the script `hyper_parameter_config.ini`. Please note that the following hyper parameters are not allowed to be configured there (a quick check is sketched after this list):
device_id
device_num
- device_id
- device_num
- data_dir
3. For other models, please note that you should customize the option `run_script` and the corresponding `hyper_parameter_config.ini`.
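The launcher forwards every key in `hyper_parameter_config.ini` to the training script as a `--key=value` option and rejects the forbidden keys listed above. A minimal pre-flight check you could run yourself, a sketch that is not part of the repo and that simply collects keys from every section of the ini file (the actual section layout may differ):
```
import configparser

# device_id, device_num and data_dir are injected by the launcher itself,
# so the ini file must not define them.
FORBIDDEN = {"device_id", "device_num", "data_dir"}

parser = configparser.ConfigParser()
parser.read("scripts/ascend_distributed_launcher/hyper_parameter_config.ini")

# Collect keys from every section; the exact section name in the repo may differ.
keys = {key for section in parser.sections() for key in parser[section]}
bad = keys & FORBIDDEN
if bad:
    raise ValueError("hyper_parameter_config.ini must not set: " + ", ".join(sorted(bad)))
print("hyper_parameter_config.ini is OK; its keys will be forwarded to run_pretrain.py")
```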
@@ -42,11 +42,21 @@ def parse_args():
help="Data path, it is better to use absolute path")
parser.add_argument("--hccl_config_dir", type=str, default="",
help="Hccl config path, it is better to use absolute path")
parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh",
help="Path of the generated cmd file.")
args = parser.parse_args()
return args
def append_cmd(cmd, s):
cmd += s
cmd += "\n"
return cmd
return append_cmd(cmd, "export " + str(key) + "=" + str(value))
def distribute_pretrain():
"""
distribute pretrain scripts. The number of D chips can be automatically allocated
@@ -92,6 +102,7 @@ def distribute_pretrain():
print("avg_core_per_rank:", avg_core_per_rank)
count = 0
cmd = ""
for instance in this_server["device"]:
device_id = instance["device_id"]
rank_id = instance["rank_id"]
@@ -104,39 +115,44 @@ def distribute_pretrain():
end = start + core_gap
cmdopt = str(start) + "-" + str(end)
os.environ["DEVICE_ID"] = device_id
os.environ["RANK_ID"] = rank_id
os.environ["DEPLOY_MODE"] = "0"
os.environ["GE_USE_STATIC_MEMORY"] = "1"
cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id))
cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id))
cmd = append_cmd(cmd, "export DEPLOY_MODE=0")
cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1")
os.system("rm -rf LOG" + str(device_id))
os.system("mkdir ./LOG" + str(device_id))
os.system("cp *.py ./LOG" + str(device_id))
os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
os.system("env > ./LOG" + str(device_id) + "/env.log")
cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id))
cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id))
cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id))
cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log")
cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log")
cur_dir = os.getcwd()
os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
os.environ["GLOG_logtostderr"] = "0"
cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log")
cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0")
print("core_nums:", cmdopt)
print("epoch_size:", str(cfg['epoch_size']))
print("data_dir:", data_dir)
print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt")
os.chdir(cur_dir + "/LOG" + str(device_id))
cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id))
run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
" 'device_num' or 'data_dir'! ")
cmd += opt
cmd += " --data_dir=" + data_dir
cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
run_cmd += opt
run_cmd += " --data_dir=" + data_dir
run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
+ str(rank_size) + ' >./pretraining_log.txt 2>&1 &'
os.system(cmd)
os.chdir(cur_dir)
cmd = append_cmd(cmd, run_cmd)
cmd = append_cmd(cmd, "cd -")
cmd += "\n"
with open(args.cmd_file, "w") as f:
f.write(cmd)
if __name__ == "__main__":
distribute_pretrain()
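In short, the renamed script no longer exports environment variables and launches training directly via `os.environ`/`os.system`/`os.chdir`; it accumulates every per-device shell line with `append_cmd`/`append_cmd_env` and writes the result to `--cmd_file`. A stripped-down sketch of that pattern (the helper `build_device_cmd`, the two fake devices and the reduced option set are all illustrative, not the repo's code):
```
# Sketch only: illustrates the cmd-file pattern, not the full launcher logic.
def append_cmd(cmd, s):
    """Append one shell line to the accumulated script text."""
    return cmd + s + "\n"

def build_device_cmd(device_id, rank_id, rank_size, data_dir, run_script="./run_pretrain.py"):
    """Build the shell lines for a single device (hypothetical helper)."""
    cmd = ""
    cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id))
    cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id))
    cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log")
    cmd = append_cmd(cmd, "cd ./LOG" + str(device_id))
    cmd = append_cmd(cmd, "nohup python " + run_script
                     + " --data_dir=" + data_dir
                     + " --device_id=" + str(device_id)
                     + " --device_num=" + str(rank_size)
                     + " >./pretraining_log.txt 2>&1 &")
    cmd = append_cmd(cmd, "cd -")
    return cmd + "\n"

if __name__ == "__main__":
    # Two fake devices; the real script reads device_id/rank_id from the HCCL json.
    script = "".join(build_device_cmd(d, d, 2, "/path/dataset/") for d in (0, 1))
    with open("distributed_cmd.sh", "w") as f:
        f.write(script)
```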
@@ -24,8 +24,11 @@ echo "For hyper parameter, please note that you should customize the scripts:
echo "=============================================================================================================="
CUR_DIR=`pwd`
python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \
python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py \
--run_script_dir=${CUR_DIR}/run_pretrain.py \
--hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
--data_dir=$1 \
--hccl_config_dir=$2
--hccl_config_dir=$2 \
--cmd_file=distributed_cmd.sh
bash distributed_cmd.sh
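The wrapper is therefore a two-step flow: generate `distributed_cmd.sh`, then execute it. Driven from Python instead of shell, the same flow might look like the sketch below (the dataset path and `hccl_2p.json` are placeholders for your own paths):
```
import subprocess

# Step 1: generate the launch script (placeholder paths; adjust to your setup).
subprocess.run(
    ["python", "./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py",
     "--run_script_dir", "./run_pretrain.py",
     "--hyper_parameter_config_dir", "./scripts/ascend_distributed_launcher/hyper_parameter_config.ini",
     "--data_dir", "/path/dataset/",
     "--hccl_config_dir", "./hccl_2p.json",
     "--cmd_file", "distributed_cmd.sh"],
    check=True,
)

# Step 2: the generator only writes commands, so launching is a separate call.
subprocess.run(["bash", "distributed_cmd.sh"], check=True)
```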
@@ -590,7 +590,7 @@ class BertTrainAccumulateStepsWithLossScaleCell(nn.Cell):
scaling = scaling_sens * self.degree * self.accumulation_steps
grads = self.hyper_map(F.partial(grad_scale, scaling), grads)
if self.enable_global_norm:
grads = ClipByGlobalNorm()(grad)
grads = ClipByGlobalNorm()(grads)
else:
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
accu_overflow = self.overflow_reducer(accu_overflow)
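For context on the one-character fix above: `grad` was an undefined name at that point, while `grads` is the tuple of scaled gradients that `ClipByGlobalNorm` is meant to rescale as a whole. A NumPy sketch of standard global-norm clipping (not the repo's Cell) shows what that call computes:
```
import numpy as np

def clip_by_global_norm(grads, clip_norm=1.0):
    """Rescale every gradient when the combined (global) L2 norm exceeds clip_norm."""
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    if global_norm > clip_norm:
        grads = [g * (clip_norm / global_norm) for g in grads]
    return grads

# The whole gradient tuple must be passed in, which is why the argument is `grads`.
grads = [np.full((2, 2), 3.0), np.full(4, 2.0)]
clipped = clip_by_global_norm(grads, clip_norm=1.0)
print(np.sqrt(sum(np.sum(np.square(g)) for g in clipped)))  # ~1.0 after clipping
```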