
!7285 Add hccl_time_out options in BERT distributed launcher

Merge pull request !7285 from chenhaozhe/add-hccl_time_out-options
tags/v1.1.0
mindspore-ci-bot committed 5 years ago
commit c6ba8d2d84
2 changed files with 6 additions and 0 deletions
  1. +5 -0  model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py
  2. +1 -0  model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh

+5 -0  model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py

@@ -44,6 +44,9 @@ def parse_args():
                         help="Hccl config path, it is better to use absolute path")
     parser.add_argument("--cmd_file", type=str, default="distributed_cmd.sh",
                         help="Path of the generated cmd file.")
+    parser.add_argument("--hccl_time_out", type=int, default=120,
+                        help="Seconds to determine the hccl time out, "
+                             "default: 120, which is the same as hccl default config")
 
     args = parser.parse_args()
     return args
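For illustration, a standalone reproduction of just the new argument (not the full parse_args from the script), showing that the flag parses as an int and falls back to the 120-second default:

import argparse

# Reproduce only the new flag to show how it parses.
parser = argparse.ArgumentParser()
parser.add_argument("--hccl_time_out", type=int, default=120,
                    help="Seconds to determine the hccl time out, "
                         "default: 120, which is the same as hccl default config")

print(parser.parse_args([]).hccl_time_out)                          # 120 (default)
print(parser.parse_args(["--hccl_time_out", "600"]).hccl_time_out)  # 600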
@@ -73,6 +76,8 @@ def distribute_pretrain():
     cfg = dict(cf.items("config"))
 
     print("hccl_config_dir:", args.hccl_config_dir)
+    print("hccl_time_out:", args.hccl_time_out)
+    cmd = append_cmd_env(cmd, 'HCCL_CONNECTION_TIMEOUT', args.hccl_time_out)
     cmd = append_cmd_env(cmd, 'RANK_TABLE_FILE', args.hccl_config_dir)
 
     cores = multiprocessing.cpu_count()
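The timeout is injected through append_cmd_env, a helper defined elsewhere in get_distribute_pretrain_cmd.py and not shown in this diff. A minimal sketch of what such a helper might look like, assuming it appends a plain export statement to the command string that is later written out as distributed_cmd.sh:

# Hypothetical sketch; the real append_cmd_env may format the line differently.
def append_cmd_env(cmd, key, value):
    # Append an `export KEY=value` line to the accumulated command string.
    return cmd + "export " + str(key) + "=" + str(value) + "\n"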


+1 -0  model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_ascend.sh

@@ -29,6 +29,7 @@ python ${CUR_DIR}/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py \
     --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
     --data_dir=$1 \
     --hccl_config_dir=$2 \
+    --hccl_time_out=600 \
     --cmd_file=distributed_cmd.sh
 
 bash distributed_cmd.sh
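The launcher takes the data directory and the RANK_TABLE_FILE path as positional arguments, e.g. bash run_distributed_pretrain_ascend.sh /path/to/data /path/to/hccl.json (both paths are placeholders here). Assuming append_cmd_env emits plain export statements as sketched above, the generated distributed_cmd.sh should then carry, ahead of each per-device training command, lines roughly like:

export HCCL_CONNECTION_TIMEOUT=600
export RANK_TABLE_FILE=/path/to/hccl.json

The hardcoded --hccl_time_out=600 raises the HCCL connection timeout for this launch script from the 120-second default to 600 seconds.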
