From 1464bf3dd85afe000f02fba0756d791edd11d7f6 Mon Sep 17 00:00:00 2001 From: chenhaozhe Date: Wed, 14 Oct 2020 14:14:12 +0800 Subject: [PATCH] add hccl_time_out options in bert distribute launcher --- .../get_distribute_pretrain_cmd.py | 5 +++++ .../nlp/bert/scripts/run_distributed_pretrain_ascend.sh | 1 + 2 files changed, 6 insertions(+) diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py index 9a0338af6f..74afd6608a 100644 --- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py +++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py @@ -44,6 +44,9 @@ def parse_args(): help="Hccl config path, it is better to use absolute path") parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh", help="Path of the generated cmd file.") + parser.add_argument("--hccl_time_out", type=int, default=120, + help="Seconds to determine the hccl time out," + "default: 120, which is the same as hccl default config") args = parser.parse_args() return args @@ -73,6 +76,8 @@ def distribute_pretrain(): cfg = dict(cf.items("config")) print("hccl_config_dir:", args.hccl_config_dir) + print("hccl_time_out:", args.hccl_time_out) + cmd = append_cmd_env(cmd, 'HCCL_CONNECTION_TIMEOUT', args.hccl_time_out) cmd = append_cmd_env(cmd, 'RANK_TABLE_FILE', args.hccl_config_dir) cores = multiprocessing.cpu_count() diff --git a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh index 40b214ab06..d69190af81 100644 --- a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh +++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh @@ -29,6 +29,7 @@ python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cm --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \ --data_dir=$1 \ --hccl_config_dir=$2 \ + --hccl_time_out=600 \ --cmd_file=distributed_cmd.sh bash distributed_cmd.sh