|
|
|
@@ -85,6 +85,7 @@ def distribute_pretrain(): |
|
|
|
|
|
|
|
# get device_ips |
|
|
|
device_ips = {} |
|
|
|
physic_logic_ids = {} |
|
|
|
with open('/etc/hccn.conf', 'r') as fin: |
|
|
|
for hccn_item in fin.readlines(): |
|
|
|
if hccn_item.strip().startswith('address_'): |
|
|
|
@@ -92,6 +93,12 @@ def distribute_pretrain(): |
|
|
|
device_id = device_id.split('_')[1] |
|
|
|
device_ips[device_id] = device_ip.strip() |
|
|
|
|
|
|
|
if not device_ips: |
|
|
|
raise ValueError("There is no address in /etc/hccn.conf") |
|
|
|
|
|
|
|
for logic_id, device_id in enumerate(sorted(device_ips.keys())): |
|
|
|
physic_logic_ids[device_id] = logic_id |
|
|
|
|
|
|
|
with open(args.hccl_config_dir, "r", encoding="utf-8") as fin: |
|
|
|
hccl_config = json.loads(fin.read()) |
|
|
|
rank_size = 0 |
|
|
|
@@ -109,38 +116,42 @@ def distribute_pretrain(): |
|
|
|
|
|
|
|
count = 0 |
|
|
|
for instance in this_server["device"]: |
|
|
|
# device_id is the physical id, we use logic id to sepcific the selected device. |
|
|
|
# While running on a server with 8 pcs, the logic ids are equal to the device ids. |
|
|
|
device_id = instance["device_id"] |
|
|
|
rank_id = instance["rank_id"] |
|
|
|
logic_id = physic_logic_ids[device_id] |
|
|
|
print("\nstart training for rank " + str(rank_id) + ", device " + str(device_id) + ":") |
|
|
|
print("rank_id:", rank_id) |
|
|
|
print("device_id:", device_id) |
|
|
|
print("logic_id", logic_id) |
|
|
|
|
|
|
|
start = count * int(avg_core_per_rank) |
|
|
|
count += 1 |
|
|
|
end = start + core_gap |
|
|
|
cmdopt = str(start) + "-" + str(end) |
|
|
|
|
|
|
|
cmd = append_cmd_env(cmd, "DEVICE_ID", str(device_id)) |
|
|
|
cmd = append_cmd_env(cmd, "DEVICE_ID", str(logic_id)) |
|
|
|
cmd = append_cmd_env(cmd, "RANK_ID", str(rank_id)) |
|
|
|
cmd = append_cmd_env(cmd, "DEPLOY_MODE", '0') |
|
|
|
cmd = append_cmd_env(cmd, "GE_USE_STATIC_MEMORY", '1') |
|
|
|
|
|
|
|
cmd = append_cmd(cmd, "rm -rf LOG" + str(device_id)) |
|
|
|
cmd = append_cmd(cmd, "mkdir ./LOG" + str(device_id)) |
|
|
|
cmd = append_cmd(cmd, "cp *.py ./LOG" + str(device_id)) |
|
|
|
cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(device_id) + "/ms_log") |
|
|
|
cmd = append_cmd(cmd, "env > ./LOG" + str(device_id) + "/env.log") |
|
|
|
cmd = append_cmd(cmd, "rm -rf LOG" + str(logic_id)) |
|
|
|
cmd = append_cmd(cmd, "mkdir ./LOG" + str(logic_id)) |
|
|
|
cmd = append_cmd(cmd, "cp *.py ./LOG" + str(logic_id)) |
|
|
|
cmd = append_cmd(cmd, "mkdir -p ./LOG" + str(logic_id) + "/ms_log") |
|
|
|
cmd = append_cmd(cmd, "env > ./LOG" + str(logic_id) + "/env.log") |
|
|
|
|
|
|
|
cur_dir = os.getcwd() |
|
|
|
cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(device_id) + "/ms_log") |
|
|
|
cmd = append_cmd_env(cmd, "GLOG_log_dir", cur_dir + "/LOG" + str(logic_id) + "/ms_log") |
|
|
|
cmd = append_cmd_env(cmd, "GLOG_logtostderr", "0") |
|
|
|
|
|
|
|
print("core_nums:", cmdopt) |
|
|
|
print("epoch_size:", str(cfg['epoch_size'])) |
|
|
|
print("data_dir:", data_dir) |
|
|
|
print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/pretraining_log.txt") |
|
|
|
print("log_file_dir: " + cur_dir + "/LOG" + str(logic_id) + "/pretraining_log.txt") |
|
|
|
|
|
|
|
cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(device_id)) |
|
|
|
cmd = append_cmd(cmd, "cd " + cur_dir + "/LOG" + str(logic_id)) |
|
|
|
|
|
|
|
run_cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " " |
|
|
|
opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) |
|
|
|
@@ -149,11 +160,15 @@ def distribute_pretrain(): |
|
|
|
" 'device_num' or 'data_dir'! ") |
|
|
|
run_cmd += opt |
|
|
|
run_cmd += " --data_dir=" + data_dir |
|
|
|
run_cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ |
|
|
|
run_cmd += ' --device_id=' + str(logic_id) + ' --device_num=' \ |
|
|
|
+ str(rank_size) + ' >./pretraining_log.txt 2>&1 &' |
|
|
|
|
|
|
|
cmd = append_cmd(cmd, run_cmd) |
|
|
|
cmd = append_cmd(cmd, "cd -") |
|
|
|
cmd = append_cmd(cmd, "echo \"run with" + |
|
|
|
" rank_id=" + str(rank_id) + |
|
|
|
" device_id=" + str(device_id) + |
|
|
|
" logic_id=" + str(logic_id) + "\"") |
|
|
|
cmd += "\n" |
|
|
|
|
|
|
|
with open(args.cmd_file, "w") as f: |
|
|
|
|