Merge pull request !3512 from Guomenghao319/add_python_distribute_pretrain_script (tags/v0.7.0-beta)
@@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
``` bash
sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
sh scripts/run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH
```
### Fine-Tuning and Evaluation
@@ -0,0 +1,48 @@
# Run distribute pretrain
## Description
The number of D chips to use is allocated automatically based on the device_num set in the HCCL config file; you do not need to specify it.
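For reference, a minimal two-device rank table consumed by the launcher could look roughly like the following; the server id, device ids and IPs are illustrative placeholders, not values from this PR:

```json
{
    "version": "1.0",
    "server_count": "1",
    "server_list": [
        {
            "server_id": "10.155.111.140",
            "device": [
                {"device_id": "5", "device_ip": "192.98.92.105", "rank_id": "0"},
                {"device_id": "6", "device_ip": "192.98.93.106", "rank_id": "1"}
            ],
            "host_nic_ip": "reserve"
        }
    ],
    "status": "completed"
}
```

The launcher sums the `device` entries across `server_list` to derive the rank size, which is why the number of chips does not have to be passed on the command line.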
## How to use
For example, to run distributed training of the BERT model on D chips, run the following from the `/bert/` directory:
```
python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
```
output:
```
hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
the number of logical core: 192
avg_core_per_rank: 96
rank_size: 2
start training for rank 0, device 5:
rank_id: 0
device_id: 5
core nums: 0-95
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG5/log.txt
start training for rank 1, device 6:
rank_id: 1
device_id: 6
core nums: 96-191
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG6/log.txt
```
## Note
1. The rank table file `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
2. Hyper-parameters are customized in `hyper_parameter_config.ini`. The following two parameters must not be configured there:
   device_id
   device_num
3. For other models, customize the `run_script` option and the corresponding `hyper_parameter_config.ini` (see the example after this list).
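As a sketch of note 3, the launcher can be pointed at a different model's training script; the script name and paths below are hypothetical and only illustrate the required options:

```bash
# Hypothetical reuse of the launcher for another model (paths are placeholders).
# hyper_parameter_config.ini must only contain options the target script accepts,
# and must not set device_id, device_num or data_dir (the launcher adds those itself).
python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py \
    --run_script_dir /path/to/other_model/run_train.py \
    --hyper_parameter_config_dir /path/to/other_model/hyper_parameter_config.ini \
    --data_dir /path/dataset/ \
    --hccl_config_dir /path/hccl.json
```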
@@ -0,0 +1,11 @@
[config]
distribute=true
epoch_size=40
enable_save_ckpt=true
enable_lossscale=true
do_shuffle=true
enable_data_sink=true
data_sink_steps=100
save_checkpoint_path=./checkpoint/
save_checkpoint_steps=10000
save_checkpoint_num=1
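Each key under `[config]` is forwarded to the training script as a `--key=value` flag by the launcher shown below; a minimal standalone sketch of that conversion, assuming the ini file above sits in the working directory:

```python
import configparser

# Read the [config] section and turn every key into a --key=value flag,
# mirroring what run_distribute_pretrain.py does before launching each rank.
cf = configparser.ConfigParser()
cf.read("hyper_parameter_config.ini")
cfg = dict(cf.items("config"))
flags = " ".join("--{}={}".format(key, value) for key, value in cfg.items())
print(flags)
# Expected (roughly): --distribute=true --epoch_size=40 --enable_save_ckpt=true ...
```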
@@ -0,0 +1,142 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""distribute pretrain script"""
import os
import json
import configparser
import multiprocessing
from argparse import ArgumentParser


def parse_args():
    """
    parse args.

    Args:

    Returns:
        args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training")

    parser.add_argument("--run_script_dir", type=str, default="",
                        help="Run script path, it is better to use absolute path")
    parser.add_argument("--hyper_parameter_config_dir", type=str, default="",
                        help="Hyper parameter config path, it is better to use absolute path")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--hccl_config_dir", type=str, default="",
                        help="HCCL config path, it is better to use absolute path")

    args = parser.parse_args()
    return args


def distribute_pretrain():
    """
    distribute pretrain scripts. The number of D chips is allocated automatically
    based on the device_num set in the HCCL config file; you do not need to specify it.
    """
    print("start", __file__)
    args = parse_args()
    run_script = args.run_script_dir
    data_dir = args.data_dir
    cf = configparser.ConfigParser()
    cf.read(args.hyper_parameter_config_dir)
    cfg = dict(cf.items("config"))

    print("hccl_config_dir:", args.hccl_config_dir)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir
    os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir

    cores = multiprocessing.cpu_count()
    print("the number of logical core:", cores)

    # get device_ips: map each local device id to its IP from /etc/hccn.conf
    # (lines of the form "address_<id>=<ip>")
    device_ips = {}
    with open('/etc/hccn.conf', 'r') as fin:
        for hccn_item in fin.readlines():
            if hccn_item.strip().startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()

    # read the rank table and find the entry that describes this server
    with open(args.hccl_config_dir, "r", encoding="utf-8") as fin:
        hccl_config = json.loads(fin.read())
        rank_size = 0
        for server in hccl_config["server_list"]:
            rank_size += len(server["device"])
            if server["device"][0]["device_ip"] in device_ips.values():
                this_server = server

    os.environ['RANK_SIZE'] = str(rank_size)
    print("total rank size:", rank_size)
    print("this server rank size:", len(this_server["device"]))
    avg_core_per_rank = int(int(cores) / len(this_server["device"]))
    core_gap = avg_core_per_rank - 1
    print("avg_core_per_rank:", avg_core_per_rank)

    count = 0
    for instance in this_server["device"]:
        device_id = instance["device_id"]
        rank_id = instance["rank_id"]
        print("\nstart training for rank " + str(rank_id) + ", device " + str(device_id) + ":")
        print("rank_id:", rank_id)
        print("device_id:", device_id)

        # bind each rank to its own slice of CPU cores via taskset
        start = count * int(avg_core_per_rank)
        count += 1
        end = start + core_gap
        cmdopt = str(start) + "-" + str(end)

        os.environ["DEVICE_ID"] = device_id
        os.environ["RANK_ID"] = rank_id
        os.environ["DEPLOY_MODE"] = "0"
        os.environ["GE_USE_STATIC_MEMORY"] = "1"

        # prepare a per-device log directory and record the launch environment
        os.system("rm -rf LOG" + str(device_id))
        os.system("mkdir ./LOG" + str(device_id))
        os.system("cp *.py ./LOG" + str(device_id))
        os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
        os.system("env > ./LOG" + str(device_id) + "/env.log")
        cur_dir = os.getcwd()
        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
        os.environ["GLOG_logtostderr"] = "0"

        print("core_nums:", cmdopt)
        print("epoch_size:", str(cfg['epoch_size']))
        print("data_dir:", data_dir)
        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")

        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
        opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
        if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
            raise ValueError("hyper_parameter_config.ini cannot set 'device_id',"
                             " 'device_num' or 'data_dir'!")
        cmd += opt
        cmd += " --data_dir=" + data_dir
        cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'
        os.system(cmd)


if __name__ == "__main__":
    distribute_pretrain()
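Each rank launched by the script above runs in the background with its output redirected, so progress is easiest to follow through the per-device log directory; the device id below is just an example:

```bash
# Follow the training log of the rank bound to device 5 (use your own device id).
tail -f ./LOG5/log.txt
# The environment each rank was started with is captured next to it.
cat ./LOG5/env.log
```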
@@ -16,57 +16,16 @@
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
echo "bash run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH"
echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
echo "It is better to use absolute path."
echo "For hyper parameter, please note that you should customize the script:
          '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
echo "=============================================================================================================="
CUR_DIR=`pwd`
EPOCH_SIZE=$2
DATA_DIR=$3
SCHEMA_DIR=$4
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
export RANK_TABLE_FILE=$5
export RANK_SIZE=$1
cores=`cat /proc/cpuinfo|grep "processor" |wc -l`
echo "the number of logical core" $cores
avg_core_per_rank=`expr $cores \/ $RANK_SIZE`
core_gap=`expr $avg_core_per_rank \- 1`
echo "avg_core_per_rank" $avg_core_per_rank
echo "core_gap" $core_gap
for((i=0;i<RANK_SIZE;i++))
do
    start=`expr $i \* $avg_core_per_rank`
    export DEVICE_ID=$i
    export RANK_ID=$i
    export DEPLOY_MODE=0
    export GE_USE_STATIC_MEMORY=1
    end=`expr $start \+ $core_gap`
    cmdopt=$start"-"$end
    rm -rf LOG$i
    mkdir ./LOG$i
    cp *.py ./LOG$i
    cd ./LOG$i || exit
    echo "start training for rank $i, device $DEVICE_ID"
    mkdir -p ms_log
    CUR_DIR=`pwd`
    export GLOG_log_dir=${CUR_DIR}/ms_log
    export GLOG_logtostderr=0
    env > env.log
    taskset -c $cmdopt python ${PROJECT_DIR}/../run_pretrain.py \
        --distribute="true" \
        --epoch_size=$EPOCH_SIZE \
        --device_id=$DEVICE_ID \
        --device_num=$RANK_SIZE \
        --enable_save_ckpt="true" \
        --enable_lossscale="true" \
        --do_shuffle="true" \
        --enable_data_sink="true" \
        --data_sink_steps=100 \
        --load_checkpoint_path="" \
        --save_checkpoint_steps=10000 \
        --save_checkpoint_num=1 \
        --data_dir=$DATA_DIR \
        --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
    cd ../
done
python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \
    --run_script_dir=${CUR_DIR}/run_pretrain.py \
    --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
    --data_dir=$1 \
    --hccl_config_dir=$2
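A minimal invocation of the rewritten wrapper, assuming it is executed from the BERT example directory so that `${CUR_DIR}` points at the directory containing `run_pretrain.py` (the repository path is illustrative):

```bash
cd /path/to/model_zoo/official/nlp/bert   # illustrative location of the BERT example
bash scripts/run_distribute_pretrain.sh /path/dataset /path/hccl.json
```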
@@ -0,0 +1,48 @@
# Run distribute pretrain
## Description
The number of D chips to use is allocated automatically based on the device_num set in the HCCL config file; you do not need to specify it.
## How to use
For example, to run distributed training of the BERT model on D chips, run the following from the `/bert/` directory:
```
python model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
```
output:
```
hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
the number of logical core: 192
avg_core_per_rank: 96
rank_size: 2
start training for rank 0, device 5:
rank_id: 0
device_id: 5
core nums: 0-95
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG5/log.txt
start training for rank 1, device 6:
rank_id: 1
device_id: 6
core nums: 96-191
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG6/log.txt
```
## Note
1. The rank table file `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
2. Hyper-parameters are customized in `hyper_parameter_config.ini`. The following two parameters must not be configured there:
   device_id
   device_num
3. For other models, customize the `run_script` option and the corresponding `hyper_parameter_config.ini`.
@@ -0,0 +1,11 @@
[config]
distribute=true
epoch_size=40
enable_save_ckpt=true
enable_lossscale=true
do_shuffle=true
enable_data_sink=true
data_sink_steps=100
save_checkpoint_path=./checkpoint/
save_checkpoint_steps=10000
save_checkpoint_num=1
@@ -0,0 +1,142 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""distribute pretrain script"""
import os
import json
import configparser
import multiprocessing
from argparse import ArgumentParser


def parse_args():
    """
    parse args.

    Args:

    Returns:
        args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training")

    parser.add_argument("--run_script_dir", type=str, default="",
                        help="Run script path, it is better to use absolute path")
    parser.add_argument("--hyper_parameter_config_dir", type=str, default="",
                        help="Hyper parameter config path, it is better to use absolute path")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--hccl_config_dir", type=str, default="",
                        help="HCCL config path, it is better to use absolute path")

    args = parser.parse_args()
    return args


def distribute_pretrain():
    """
    distribute pretrain scripts. The number of D chips is allocated automatically
    based on the device_num set in the HCCL config file; you do not need to specify it.
    """
    print("start", __file__)
    args = parse_args()
    run_script = args.run_script_dir
    data_dir = args.data_dir
    cf = configparser.ConfigParser()
    cf.read(args.hyper_parameter_config_dir)
    cfg = dict(cf.items("config"))

    print("hccl_config_dir:", args.hccl_config_dir)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir
    os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir

    cores = multiprocessing.cpu_count()
    print("the number of logical core:", cores)

    # get device_ips: map each local device id to its IP from /etc/hccn.conf
    # (lines of the form "address_<id>=<ip>")
    device_ips = {}
    with open('/etc/hccn.conf', 'r') as fin:
        for hccn_item in fin.readlines():
            if hccn_item.strip().startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()

    # read the rank table and find the entry that describes this server
    with open(args.hccl_config_dir, "r", encoding="utf-8") as fin:
        hccl_config = json.loads(fin.read())
        rank_size = 0
        for server in hccl_config["server_list"]:
            rank_size += len(server["device"])
            if server["device"][0]["device_ip"] in device_ips.values():
                this_server = server

    os.environ['RANK_SIZE'] = str(rank_size)
    print("total rank size:", rank_size)
    print("this server rank size:", len(this_server["device"]))
    avg_core_per_rank = int(int(cores) / len(this_server["device"]))
    core_gap = avg_core_per_rank - 1
    print("avg_core_per_rank:", avg_core_per_rank)

    count = 0
    for instance in this_server["device"]:
        device_id = instance["device_id"]
        rank_id = instance["rank_id"]
        print("\nstart training for rank " + str(rank_id) + ", device " + str(device_id) + ":")
        print("rank_id:", rank_id)
        print("device_id:", device_id)

        # bind each rank to its own slice of CPU cores via taskset
        start = count * int(avg_core_per_rank)
        count += 1
        end = start + core_gap
        cmdopt = str(start) + "-" + str(end)

        os.environ["DEVICE_ID"] = device_id
        os.environ["RANK_ID"] = rank_id
        os.environ["DEPLOY_MODE"] = "0"
        os.environ["GE_USE_STATIC_MEMORY"] = "1"

        # prepare a per-device log directory and record the launch environment
        os.system("rm -rf LOG" + str(device_id))
        os.system("mkdir ./LOG" + str(device_id))
        os.system("cp *.py ./LOG" + str(device_id))
        os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
        os.system("env > ./LOG" + str(device_id) + "/env.log")
        cur_dir = os.getcwd()
        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
        os.environ["GLOG_logtostderr"] = "0"

        print("core_nums:", cmdopt)
        print("epoch_size:", str(cfg['epoch_size']))
        print("data_dir:", data_dir)
        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")

        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
        opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
        if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
            raise ValueError("hyper_parameter_config.ini cannot set 'device_id',"
                             " 'device_num' or 'data_dir'!")
        cmd += opt
        cmd += " --data_dir=" + data_dir
        cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'
        os.system(cmd)


if __name__ == "__main__":
    distribute_pretrain()
@@ -17,7 +17,6 @@ import os
import sys
import json
import socket
import platform
from argparse import ArgumentParser
from typing import Dict, Any
@@ -114,40 +113,25 @@ def main():
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()
    arch = platform.processor()
    hccn_table = {'board_id': {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch],
                  'chip_info': '910',
                  'deploy_mode': 'lab',
                  'group_count': '1',
                  'group_list': []}
    instance_list = []
    hccn_table = {'version': '1.0',
                  'server_count': '1',
                  'server_list': []}
    device_list = []
    rank_id = 0
    for instance_id in device_num_list:
        instance = {'devices': []}
        device_id = visible_devices[instance_id]
        device_ip = device_ips[device_id]
        instance['devices'].append({
            'device_id': device_id,
            'device_ip': device_ip,
        })
        device = {'device_id': device_id,
                  'device_ip': device_ip,
                  'rank_id': str(rank_id)}
        print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip))
        instance['rank_id'] = str(rank_id)
        rank_id += 1
        instance['server_id'] = server_id
        instance_list.append(instance)
    hccn_table['group_list'].append({
        'device_num': str(len(device_num_list)),
        'server_num': '1',
        'group_name': '',
        'instance_count': str(len(device_num_list)),
        'instance_list': instance_list,
        device_list.append(device)
    hccn_table['server_list'].append({
        'server_id': server_id,
        'device': device_list,
        'host_nic_ip': 'reserve'
    })
    hccn_table['para_plane_nic_location'] = 'device'
    hccn_table['para_plane_nic_name'] = []
    for instance_id in device_num_list:
        eth_id = visible_devices[instance_id]
        hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
    hccn_table['para_plane_nic_num'] = str(len(device_num_list))
    hccn_table['status'] = 'completed'
    # save hccn_table to file
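The hunk ends just before the table is written out. A minimal sketch of that save step, continuing the variables from `main()` above (`json` is already imported in this file); the output-file naming convention is an assumption inferred from the README's `hccl_2p_56_x.x.x.x.json` example, not taken from the diff:

```python
    # Illustrative save step (assumed naming: hccl_<device count>p_<device ids>_<server ip>.json).
    table_fn = 'hccl_{}p_{}_{}.json'.format(
        len(device_num_list),
        ''.join(visible_devices[i] for i in device_num_list),
        server_id)
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    print('rank table saved to', table_fn)
```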