Merge pull request !6073 from chenhaozhe/fix-doc-error
@@ -377,6 +377,12 @@ epoch: 0.0, current epoch percent: 0.000, step: 2, outpus are (Tensor(shape=[1],
 ...
 ```
+
+> **Attention:** If you are running with a huge dataset, it is better to set an extra environment variable so that HCCL does not time out.
+> ```
+> export HCCL_CONNECT_TIMEOUT=600
+> ```
+> This extends the HCCL connection timeout from the default 120 seconds to 600 seconds.
 ### Distributed Training
 #### Running on Ascend
 ```
@@ -178,8 +178,7 @@ def run_pretrain():
     if args_opt.accumulation_steps <= 1:
         net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
-                                                           scale_update_cell=update_cell,
-                                                           enable_global_norm=cfg.enable_global_norm)
+                                                           scale_update_cell=update_cell)
     else:
         accumulation_steps = args_opt.accumulation_steps
         net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(net_with_loss, optimizer=optimizer,
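Context for this hunk: when `accumulation_steps` is greater than 1, gradients are accumulated over several micro-batches before a single optimizer update. A framework-agnostic sketch of that idea (all names here are illustrative, not from the repo):

```
# Illustrative sketch of gradient accumulation: sum gradients over n
# micro-batches, then apply one averaged update.
def accumulate_and_step(micro_batches, compute_grads, apply_update, n):
    acc = None
    for i, batch in enumerate(micro_batches, 1):
        grads = compute_grads(batch)
        acc = grads if acc is None else [a + g for a, g in zip(acc, grads)]
        if i % n == 0:
            apply_update([a / n for a in acc])  # one update per n micro-batches
            acc = None
```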
@@ -350,13 +350,12 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
         optimizer (Optimizer): Optimizer for updating the weights.
         scale_update_cell (Cell): Cell to do the loss scale. Default: None.
     """
-    def __init__(self, network, optimizer, scale_update_cell=None, enable_global_norm=False):
+    def __init__(self, network, optimizer, scale_update_cell=None):
         super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
         self.network = network
         self.network.set_grad()
         self.weights = optimizer.parameters
         self.optimizer = optimizer
-        self.enable_global_norm = enable_global_norm
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
         self.reducer_flag = False
@@ -423,10 +422,7 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
-        if self.enable_global_norm:
-            grads = ClipByGlobalNorm()(grads)
-        else:
-            grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         self.get_status(init)
         flag_sum = self.reduce_sum(init, (0,))
         if self.is_distributed:
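For readers unfamiliar with `clip_grad`: it clips each gradient individually, by value or by L2 norm depending on `GRADIENT_CLIP_TYPE`. A minimal sketch of that behavior, assuming MindSpore's `ops.clip_by_value` and `nn.ClipByNorm` (the real helper lives in the model's utils; this simplified function is only for illustration):

```
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
import mindspore.common.dtype as mstype

def clip_grad_sketch(clip_type, clip_value, grad):
    """Clip one gradient: by value if clip_type == 0, else by L2 norm."""
    if clip_type == 0:
        return ops.clip_by_value(grad,
                                 Tensor(-clip_value, mstype.float32),
                                 Tensor(clip_value, mstype.float32))
    return nn.ClipByNorm()(grad, Tensor(clip_value, mstype.float32))
```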
@@ -5,9 +5,9 @@ The number of D chips can be automatically allocated based on the device_num set
 ## how to use
-For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir:
+For example, if we want to generate the launch command of the distributed training of Bert model on D chip, we can run the following command in `/bert/` dir:
 ```
-python model_zoo/utils/ascend_distributed_launcher/run_distributed.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+python ./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
 ```
 output:
@@ -42,7 +42,8 @@ log file dir: ./LOG6/log.txt
 1. Note that `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
 2. For hyper parameters, please note that you should customize the script `hyper_parameter_config.ini`. Please note that these hyper parameters are not allowed to be configured here:
-device_id
-device_num
+- device_id
+- device_num
+- data_dir
 3. For other models, please note that you should customize the option `run_script` and the corresponding `hyper_parameter_config.ini`.
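As a reference, a minimal `hyper_parameter_config.ini` might look like the excerpt below. The keys are taken from the config shown later in this PR (the `epoch_size` value is illustrative); `device_id`, `device_num`, and `data_dir` are intentionally absent because the launcher injects them:

```
epoch_size=40
enable_lossscale=true
do_shuffle=true
enable_data_sink=true
data_sink_steps=100
accumulation_steps=1
save_checkpoint_path=./checkpoint/
save_checkpoint_steps=10000
save_checkpoint_num=1
```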
@@ -42,11 +42,21 @@ def parse_args():
                         help="Data path, it is better to use absolute path")
     parser.add_argument("--hccl_config_dir", type=str, default="",
                         help="Hccl config path, it is better to use absolute path")
+    parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh",
+                        help="Path of the generated cmd file.")
     args = parser.parse_args()
     return args
+
+
+def append_cmd(cmd, s):
+    # Append one shell command line, terminated by a newline.
+    cmd += s
+    cmd += "\n"
+    return cmd
+
+
+def append_cmd_env(cmd, key, value):
+    # Append an `export KEY=value` line (note the space after "export").
+    return append_cmd(cmd, "export " + str(key) + "=" + str(value))
+
+
 def distribute_pretrain():
     """
     distribute pretrain scripts. The number of D chips can be automatically allocated
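A quick usage sketch of the two helpers above (values are illustrative):

```
# Each call appends one shell line; the result is later written to a script file.
cmd = ""
cmd = append_cmd(cmd, "mkdir -p ./LOG0")   # adds "mkdir -p ./LOG0\n"
cmd = append_cmd_env(cmd, "RANK_ID", 0)    # adds "export RANK_ID=0\n"
assert cmd == "mkdir -p ./LOG0\nexport RANK_ID=0\n"
```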
@@ -92,6 +102,7 @@ def distribute_pretrain():
     print("avg_core_per_rank:", avg_core_per_rank)
     count = 0
+    cmd = ""
     for instance in this_server["device"]:
         device_id = instance["device_id"]
         rank_id = instance["rank_id"]
@@ -104,38 +115,44 @@ def distribute_pretrain():
         end = start + core_gap
         cmdopt = str(start) + "-" + str(end)
-        os.environ["DEVICE_ID"] = device_id
-        os.environ["RANK_ID"] = rank_id
-        os.environ["DEPLOY_MODE"] = "0"
-        os.environ["GE_USE_STATIC_MEMORY"] = "1"
+        cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id))
+        cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id))
+        cmd = append_cmd(cmd, "export DEPLOY_MODE=0")
+        cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1")
-        os.system("rm -rf LOG" + str(device_id))
-        os.system("mkdir ./LOG" + str(device_id))
-        os.system("cp *.py ./LOG" + str(device_id))
-        os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
-        os.system("env > ./LOG" + str(device_id) + "/env.log")
+        cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id))
+        cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id))
+        cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id))
+        cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log")
+        cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log")
         cur_dir = os.getcwd()
-        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
-        os.environ["GLOG_logtostderr"] = "0"
+        cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log")
+        cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0")
         print("core_nums:", cmdopt)
         print("epoch_size:", str(cfg['epoch_size']))
         print("data_dir:", data_dir)
-        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")
+        print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt")
+        cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id))
-        cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
+        run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
         opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
         if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
             raise ValueError("hyper_parameter_config.ini can not set 'device_id',"
                              " 'device_num' or 'data_dir'! ")
-        cmd += opt
-        cmd += " --data_dir=" + data_dir
-        cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
-               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'
-        os.system(cmd)
+        run_cmd += opt
+        run_cmd += " --data_dir=" + data_dir
+        run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
+                   + str(rank_size) + ' >./pretraining_log.txt 2>&1 &'
+        cmd = append_cmd(cmd, run_cmd)
+        cmd = append_cmd(cmd, "cd -")
+        cmd += "\n"
+
+    with open(args.cmd_file, "w") as f:
+        f.write(cmd)

 if __name__ == "__main__":
     distribute_pretrain()
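Net effect of this hunk: instead of launching each training process directly with `os.system`, the script now accumulates every per-device command into `cmd` and writes it to the file given by `--cmd_file` (default `distributed_cmd.sh`). Training is then started by executing the generated script, e.g.:

```
bash distributed_cmd.sh
```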
@@ -6,6 +6,7 @@ enable_lossscale=true
 do_shuffle=true
 enable_data_sink=true
 data_sink_steps=100
+accumulation_steps=1
 save_checkpoint_path=./checkpoint/
 save_checkpoint_steps=10000
 save_checkpoint_num=1
@@ -5,7 +5,7 @@ mindspore distributed training launch helper utility that will generate hccl conf
 # use
 ```
-python hccl_tools.py --device_num [0,8)
+python hccl_tools.py --device_num "[0,8)"
 ```
 output: