Browse Source

fix hccl environ in get_distribute_pretrain_cmd.py

tags/v1.0.0
chenhaozhe 5 years ago
parent
commit
4e587420c0
2 changed files with 18 additions and 18 deletions
  1. +9
    -9
      model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py
  2. +9
    -9
      model_zoo/utils/ascend_distributed_launcher/get_distribute_pretrain_cmd.py

+ 9
- 9
model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py View File

@@ -55,13 +55,14 @@ def append_cmd(cmd, s):
return cmd return cmd


def append_cmd_env(cmd, key, value): def append_cmd_env(cmd, key, value):
return append_cmd(cmd, "export" + str(key) + "=" + str(value))
return append_cmd(cmd, "export " + str(key) + "=" + str(value))


def distribute_pretrain(): def distribute_pretrain():
""" """
distribute pretrain scripts. The number of D chips can be automatically allocated distribute pretrain scripts. The number of D chips can be automatically allocated
based on the device_num set in hccl config file, you do not need to specify that. based on the device_num set in hccl config file, you do not need to specify that.
""" """
cmd = ""
print("start", __file__) print("start", __file__)
args = parse_args() args = parse_args()


@@ -72,7 +73,7 @@ def distribute_pretrain():
cfg = dict(cf.items("config")) cfg = dict(cf.items("config"))


print("hccl_config_dir:", args.hccl_config_dir) print("hccl_config_dir:", args.hccl_config_dir)
os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir
cmd = append_cmd_env(cmd, 'RANK_TABLE_FILE', args.hccl_config_dir)


cores = multiprocessing.cpu_count() cores = multiprocessing.cpu_count()
print("the number of logical core:", cores) print("the number of logical core:", cores)
@@ -94,7 +95,7 @@ def distribute_pretrain():
if server["device"][0]["device_ip"] in device_ips.values(): if server["device"][0]["device_ip"] in device_ips.values():
this_server = server this_server = server


os.environ['RANK_SIZE'] = str(rank_size)
cmd = append_cmd_env(cmd, "RANK_SIZE", str(rank_size))
print("total rank size:", rank_size) print("total rank size:", rank_size)
print("this server rank size:", len(this_server["device"])) print("this server rank size:", len(this_server["device"]))
avg_core_per_rank = int(int(cores) / len(this_server["device"])) avg_core_per_rank = int(int(cores) / len(this_server["device"]))
@@ -102,7 +103,6 @@ def distribute_pretrain():
print("avg_core_per_rank:", avg_core_per_rank) print("avg_core_per_rank:", avg_core_per_rank)


count = 0 count = 0
cmd = ""
for instance in this_server["device"]: for instance in this_server["device"]:
device_id = instance["device_id"] device_id = instance["device_id"]
rank_id = instance["rank_id"] rank_id = instance["rank_id"]
@@ -115,10 +115,10 @@ def distribute_pretrain():
end = start + core_gap end = start + core_gap
cmdopt = str(start) + "-" + str(end) cmdopt = str(start) + "-" + str(end)


cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id))
cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id))
cmd = append_cmd(cmd, "export DEPLOY_MODE=0")
cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1")
cmd = append_cmd_env(cmd, "DEVICE_ID", str(device_id))
cmd = append_cmd_env(cmd, "RANK_ID", str(rank_id))
cmd = append_cmd_env(cmd, "DEPLOY_MODE", '0')
cmd = append_cmd_env(cmd, "GE_USE_STATIC_MEMORY", '1')


cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id)) cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id))
cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id)) cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id))
@@ -127,7 +127,7 @@ def distribute_pretrain():
cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log") cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log")


cur_dir = os.getcwd() cur_dir = os.getcwd()
cmd = append_cmd_env(cmd, "GLOG_LOG_DIR", cur_dir + "/LOG" + str(device_id) + "/ms_log")
cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log")
cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0") cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0")


print("core_nums:", cmdopt) print("core_nums:", cmdopt)


+ 9
- 9
model_zoo/utils/ascend_distributed_launcher/get_distribute_pretrain_cmd.py View File

@@ -55,13 +55,14 @@ def append_cmd(cmd, s):
return cmd return cmd


def append_cmd_env(cmd, key, value): def append_cmd_env(cmd, key, value):
return append_cmd(cmd, "export" + str(key) + "=" + str(value))
return append_cmd(cmd, "export " + str(key) + "=" + str(value))


def distribute_pretrain(): def distribute_pretrain():
""" """
distribute pretrain scripts. The number of D chips can be automatically allocated distribute pretrain scripts. The number of D chips can be automatically allocated
based on the device_num set in hccl config file, you do not need to specify that. based on the device_num set in hccl config file, you do not need to specify that.
""" """
cmd = ""
print("start", __file__) print("start", __file__)
args = parse_args() args = parse_args()


@@ -72,7 +73,7 @@ def distribute_pretrain():
cfg = dict(cf.items("config")) cfg = dict(cf.items("config"))


print("hccl_config_dir:", args.hccl_config_dir) print("hccl_config_dir:", args.hccl_config_dir)
os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir
cmd = append_cmd_env(cmd, 'RANK_TABLE_FILE', args.hccl_config_dir)


cores = multiprocessing.cpu_count() cores = multiprocessing.cpu_count()
print("the number of logical core:", cores) print("the number of logical core:", cores)
@@ -94,7 +95,7 @@ def distribute_pretrain():
if server["device"][0]["device_ip"] in device_ips.values(): if server["device"][0]["device_ip"] in device_ips.values():
this_server = server this_server = server


os.environ['RANK_SIZE'] = str(rank_size)
cmd = append_cmd_env(cmd, "RANK_SIZE", str(rank_size))
print("total rank size:", rank_size) print("total rank size:", rank_size)
print("this server rank size:", len(this_server["device"])) print("this server rank size:", len(this_server["device"]))
avg_core_per_rank = int(int(cores) / len(this_server["device"])) avg_core_per_rank = int(int(cores) / len(this_server["device"]))
@@ -102,7 +103,6 @@ def distribute_pretrain():
print("avg_core_per_rank:", avg_core_per_rank) print("avg_core_per_rank:", avg_core_per_rank)


count = 0 count = 0
cmd = ""
for instance in this_server["device"]: for instance in this_server["device"]:
device_id = instance["device_id"] device_id = instance["device_id"]
rank_id = instance["rank_id"] rank_id = instance["rank_id"]
@@ -115,10 +115,10 @@ def distribute_pretrain():
end = start + core_gap end = start + core_gap
cmdopt = str(start) + "-" + str(end) cmdopt = str(start) + "-" + str(end)


cmd = append_cmd(cmd, "export DEVICE_ID=" + str(device_id))
cmd = append_cmd(cmd, "export RANK_ID=" + str(rank_id))
cmd = append_cmd(cmd, "export DEPLOY_MODE=0")
cmd = append_cmd(cmd, "export GE_USE_STATIC_MEMORY=1")
cmd = append_cmd_env(cmd, "DEVICE_ID", str(device_id))
cmd = append_cmd_env(cmd, "RANK_ID", str(rank_id))
cmd = append_cmd_env(cmd, "DEPLOY_MODE", '0')
cmd = append_cmd_env(cmd, "GE_USE_STATIC_MEMORY", '1')


cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id)) cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id))
cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id)) cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id))
@@ -127,7 +127,7 @@ def distribute_pretrain():
cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log") cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log")


cur_dir = os.getcwd() cur_dir = os.getcwd()
cmd = append_cmd_env(cmd, "GLOG_LOG_DIR", cur_dir + "/LOG" + str(device_id) + "/ms_log")
cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log")
cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0") cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0")


print("core_nums:", cmdopt) print("core_nums:", cmdopt)


Loading…
Cancel
Save