@@ -377,6 +377,12 @@ epoch: 0.0, current epoch percent: 0.000, step: 2, outpus are (Tensor(shape=[1],
 ...
 ```
+> **Attention** If you are running with a huge dataset, it is better to set an external environment variable so that HCCL does not time out:
+> ```
+> export HCCL_CONNECT_TIMEOUT=600
+> ```
+> This extends the HCCL connection timeout from the default 120 seconds to 600 seconds.
 ### Distributed Training
 #### Running on Ascend
 ```
@@ -178,8 +178,7 @@ def run_pretrain():
     if args_opt.accumulation_steps <= 1:
         net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
-                                                           scale_update_cell=update_cell,
-                                                           enable_global_norm=cfg.enable_global_norm)
+                                                           scale_update_cell=update_cell)
     else:
         accumulation_steps = args_opt.accumulation_steps
         net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(net_with_loss, optimizer=optimizer,
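The `accumulation_steps` option that selects this branch (and that the `hyper_parameter_config.ini` hunk further below defaults to 1) controls gradient accumulation: with a value N greater than 1, gradients from N micro-batches are summed before a single optimizer update, so the effective batch size is roughly `batch_size * N`. A framework-agnostic sketch of the idea (plain Python, illustrative only, not the model-zoo cell):

```python
# Minimal sketch of gradient accumulation (illustrative, not BertTrainAccumulateStepsWithLossScaleCell).
def train_with_accumulation(batches, grad_fn, apply_update, accumulation_steps=4):
    """grad_fn(batch) returns per-parameter gradients; apply_update(grads) does one optimizer step."""
    accumulated = None
    for step, batch in enumerate(batches, start=1):
        grads = grad_fn(batch)
        # Sum gradients across micro-batches instead of updating immediately.
        accumulated = list(grads) if accumulated is None else [a + g for a, g in zip(accumulated, grads)]
        if step % accumulation_steps == 0:
            apply_update(accumulated)  # one update per `accumulation_steps` micro-batches
            accumulated = None
```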
@@ -350,13 +350,12 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
         optimizer (Optimizer): Optimizer for updating the weights.
         scale_update_cell (Cell): Cell to do the loss scale. Default: None.
     """
-    def __init__(self, network, optimizer, scale_update_cell=None, enable_global_norm=False):
+    def __init__(self, network, optimizer, scale_update_cell=None):
         super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
         self.network = network
         self.network.set_grad()
         self.weights = optimizer.parameters
         self.optimizer = optimizer
-        self.enable_global_norm = enable_global_norm
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
         self.reducer_flag = False
@@ -423,10 +422,7 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
-        if self.enable_global_norm:
-            grads = ClipByGlobalNorm()(grads)
-        else:
-            grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         self.get_status(init)
         flag_sum = self.reduce_sum(init, (0,))
         if self.is_distributed:
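The hunk above drops the optional `ClipByGlobalNorm` branch, so this cell now always clips each gradient through `clip_grad` with the fixed `GRADIENT_CLIP_TYPE`/`GRADIENT_CLIP_VALUE` constants instead of optionally rescaling the whole gradient set by its global norm. A rough NumPy illustration of the two behaviours (not the MindSpore ops used here; `clip_grad`'s exact semantics depend on `GRADIENT_CLIP_TYPE`):

```python
import numpy as np

def clip_each_by_norm(grads, clip_value=1.0):
    # Per-tensor clipping: each gradient is rescaled independently if its own norm is too large.
    return [g * min(1.0, clip_value / (np.linalg.norm(g) + 1e-12)) for g in grads]

def clip_by_global_norm(grads, clip_norm=1.0):
    # Global-norm clipping: one shared scale factor computed from the norm of all gradients together.
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    return [g * min(1.0, clip_norm / (global_norm + 1e-12)) for g in grads]

grads = [np.array([3.0, -0.5]), np.array([0.2])]
print(clip_each_by_norm(grads))    # each tensor limited on its own
print(clip_by_global_norm(grads))  # the whole set scaled by a single factor
```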
@@ -5,9 +5,9 @@ The number of D chips can be automatically allocated based on the device_num set
 ## how to use
-For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir:
+For example, to generate the launch command for distributed training of the Bert model on D chips, run the following command in the `/bert/` dir:
 ```
-python model_zoo/utils/ascend_distributed_launcher/run_distributed.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+python ./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
 ```
 output:
@@ -42,7 +42,8 @@ log file dir: ./LOG6/log.txt
 1. Note that `hccl_2p_56_x.x.x.x.json` can use [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools) to generate.
-2. For hyper parameter, please note that you should customize the scripts `hyper_parameter_config.ini`. Please note that these two hyper parameters are not allowed to be configured here:
-    device_id
-    device_num
+2. For hyper parameter, please note that you should customize the script `hyper_parameter_config.ini`. The following hyper parameters are not allowed to be configured here:
+    - device_id
+    - device_num
+    - data_dir
 3. For Other Model, please note that you should customize the option `run_script` and Corresponding `hyper_parameter_config.ini`.
@@ -42,11 +42,21 @@ def parse_args():
                         help="Data path, it is better to use absolute path")
     parser.add_argument("--hccl_config_dir", type=str, default="",
                         help="Hccl config path, it is better to use absolute path")
+    parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh",
+                        help="Path of the generated cmd file.")
     args = parser.parse_args()
     return args


+def append_cmd(cmd, s):
+    cmd += s
+    cmd += "\n"
+    return cmd
+
+
+def append_cmd_env(cmd, key, value):
+    return append_cmd(cmd, "export " + str(key) + "=" + str(value))
+
+
 def distribute_pretrain():
     """
     distribute pretrain scripts. The number of D chips can be automatically allocated
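The two helpers introduced above only concatenate shell lines into a growing string that is later written to the generated command file. A small standalone demonstration (the helper bodies are repeated here so the snippet runs on its own):

```python
def append_cmd(cmd, s):
    # Append one shell line to the accumulated command string.
    return cmd + s + "\n"

def append_cmd_env(cmd, key, value):
    # Append an `export KEY=VALUE` line.
    return append_cmd(cmd, "export " + str(key) + "=" + str(value))

cmd = ""
cmd = append_cmd(cmd, "mkdir ./LOG0")
cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0")
print(cmd, end="")
# mkdir ./LOG0
# export GLOG_logtostderr=0
```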
@@ -92,6 +102,7 @@ def distribute_pretrain():
     print("avg_core_per_rank:", avg_core_per_rank)
     count = 0
+    cmd = ""
     for instance in this_server["device"]:
         device_id = instance["device_id"]
         rank_id = instance["rank_id"]
@@ -104,38 +115,44 @@
         end = start + core_gap
         cmdopt = str(start) + "-" + str(end)

-        os.environ["DEVICE_ID"] = device_id
-        os.environ["RANK_ID"] = rank_id
-        os.environ["DEPLOY_MODE"] = "0"
-        os.environ["GE_USE_STATIC_MEMORY"] = "1"
+        cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id))
+        cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id))
+        cmd = append_cmd(cmd, "export DEPLOY_MODE=0")
+        cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1")

-        os.system("rm -rf LOG" + str(device_id))
-        os.system("mkdir ./LOG" + str(device_id))
-        os.system("cp *.py ./LOG" + str(device_id))
-        os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
-        os.system("env > ./LOG" + str(device_id) + "/env.log")
+        cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id))
+        cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id))
+        cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id))
+        cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log")
+        cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log")

         cur_dir = os.getcwd()
-        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
-        os.environ["GLOG_logtostderr"] = "0"
+        cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log")
+        cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0")

         print("core_nums:", cmdopt)
         print("epoch_size:", str(cfg['epoch_size']))
         print("data_dir:", data_dir)
-        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")
+        print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt")
+        cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id))

-        cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
+        run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
         opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
         if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
             raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
                              " 'device_num' or 'data_dir'! ")
-        cmd += opt
-        cmd += " --data_dir=" + data_dir
-        cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
-               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'
+        run_cmd += opt
+        run_cmd += " --data_dir=" + data_dir
+        run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
+                   + str(rank_size) + ' >./pretraining_log.txt 2>&1 &'

-        os.system(cmd)
+        cmd = append_cmd(cmd, run_cmd)
+        cmd = append_cmd(cmd, "cd -")
+        cmd += "\n"
+
+    with open(args.cmd_file, "w") as f:
+        f.write(cmd)


 if __name__ == "__main__":
     distribute_pretrain()
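With this change the launcher no longer runs anything through `os.system`; it only writes the per-device commands into the file given by `--cmd_file` (default `distributed_cmd.sh`), which you then execute yourself. A sketch of the resulting two-step flow, reusing the paths from the README example above (adjust them to your environment; running the generated file with `bash` is an assumption):

```python
import subprocess

# Step 1: generate the per-device launch commands into distributed_cmd.sh.
subprocess.run(
    ["python", "./scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py",
     "--run_script_dir", "./run_pretrain.py",
     "--hyper_parameter_config_dir", "./scripts/ascend_distributed_launcher/hyper_parameter_config.ini",
     "--data_dir", "/path/dataset/",
     "--hccl_config_dir", "model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json",
     "--cmd_file", "distributed_cmd.sh"],
    check=True,
)

# Step 2: execute the generated script to actually start the training processes.
subprocess.run(["bash", "distributed_cmd.sh"], check=True)
```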
@@ -6,6 +6,7 @@ enable_lossscale=true
 do_shuffle=true
 enable_data_sink=true
 data_sink_steps=100
+accumulation_steps=1
 save_checkpoint_path=./checkpoint/
 save_checkpoint_steps=10000
 save_checkpoint_num=1
@@ -5,7 +5,7 @@ mindspore distributed training launch helper utilty that will generate hccl conf
 # use
 ```
-python hccl_tools.py --device_num [0,8)
+python hccl_tools.py --device_num "[0,8)"
 ```
 output: