From 74fcbd290032b6fb24a0451458bae3066a5a47e6 Mon Sep 17 00:00:00 2001 From: wandongdong Date: Wed, 8 Jul 2020 21:29:09 +0800 Subject: [PATCH] add hccl_config --- model_zoo/mobilenetv2/Readme.md | 4 +- model_zoo/mobilenetv2/scripts/run_train.sh | 11 ++-- model_zoo/mobilenetv2/src/launch.py | 66 ---------------------- 3 files changed, 8 insertions(+), 73 deletions(-) diff --git a/model_zoo/mobilenetv2/Readme.md b/model_zoo/mobilenetv2/Readme.md index 5b36a63fe4..1687d2cbdc 100644 --- a/model_zoo/mobilenetv2/Readme.md +++ b/model_zoo/mobilenetv2/Readme.md @@ -60,14 +60,14 @@ Dataset used: [imagenet](http://www.image-net.org/) ### Usage -- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] +- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH] - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] ### Launch ``` # training example - Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt + Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ mobilenet_199.ckpt GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ ``` diff --git a/model_zoo/mobilenetv2/scripts/run_train.sh b/model_zoo/mobilenetv2/scripts/run_train.sh index f1d80aeac6..a6e2a79477 100644 --- a/model_zoo/mobilenetv2/scripts/run_train.sh +++ b/model_zoo/mobilenetv2/scripts/run_train.sh @@ -22,14 +22,16 @@ run_ascend() exit 1 fi - if [ ! -d $5 ] + if [ ! -d $5 ] && [ ! -f $5 ] then - echo "error: DATASET_PATH=$5 is not a directory" + echo "error: DATASET_PATH=$5 is not a directory or file" exit 1 fi BASEPATH=$(cd "`dirname $0`" || exit; pwd) export PYTHONPATH=${BASEPATH}:$PYTHONPATH + export MINDSPORE_HCCL_CONFIG_PATH=$4 + export RANK_TABLE_FILE=$4 if [ -d "../train" ]; then rm -rf ../train @@ -38,8 +40,7 @@ run_ascend() cd ../train || exit python ${BASEPATH}/../src/launch.py \ --nproc_per_node=$2 \ - --visible_devices=$4 \ - --server_id=$3 \ + --visible_devices=$3 \ --training_script=${BASEPATH}/../train.py \ --dataset_path=$5 \ --pre_trained=$6 \ @@ -80,7 +81,7 @@ run_gpu() if [ $# -gt 6 ] || [ $# -lt 4 ] then echo "Usage:\n \ - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \ + Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \ GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \ " exit 1 diff --git a/model_zoo/mobilenetv2/src/launch.py b/model_zoo/mobilenetv2/src/launch.py index 48c8159664..f5c97b0bd7 100644 --- a/model_zoo/mobilenetv2/src/launch.py +++ b/model_zoo/mobilenetv2/src/launch.py @@ -15,7 +15,6 @@ """launch train script""" import os import sys -import json import subprocess import shutil from argparse import ArgumentParser @@ -42,8 +41,6 @@ def parse_args(): "each process can be bound to a single D.") parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", help="will use the visible devices sequentially") - parser.add_argument("--server_id", type=str, default="", - help="server ip") parser.add_argument("--training_script", type=str, help="The full path to the single D training " "program/script to be launched in parallel, " @@ -63,66 +60,6 @@ def main(): assert os.path.isfile(args.training_script) assert len(visible_devices) >= args.nproc_per_node print('visible_devices:{}'.format(visible_devices)) - if not args.server_id: - print('pleaser input server ip!!!') - exit(0) - print('server_id:{}'.format(args.server_id)) - - # construct hccn_table - hccn_configs = open('/etc/hccn.conf', 'r').readlines() - device_ips = {} - for hccn_item in hccn_configs: - hccn_item = hccn_item.strip() - if hccn_item.startswith('address_'): - device_id, device_ip = hccn_item.split('=') - device_id = device_id.split('_')[1] - device_ips[device_id] = device_ip - print('device_id:{}, device_ip:{}'.format(device_id, device_ip)) - hccn_table = {} - hccn_table['board_id'] = '0x0000' - hccn_table['chip_info'] = '910' - hccn_table['deploy_mode'] = 'lab' - hccn_table['group_count'] = '1' - hccn_table['group_list'] = [] - instance_list = [] - usable_dev = '' - for instance_id in range(args.nproc_per_node): - instance = {} - instance['devices'] = [] - device_id = visible_devices[instance_id] - device_ip = device_ips[device_id] - usable_dev += str(device_id) - instance['devices'].append({ - 'device_id': device_id, - 'device_ip': device_ip, - }) - instance['rank_id'] = str(instance_id) - instance['server_id'] = args.server_id - instance_list.append(instance) - hccn_table['group_list'].append({ - 'device_num': str(args.nproc_per_node), - 'server_num': '1', - 'group_name': '', - 'instance_count': str(args.nproc_per_node), - 'instance_list': instance_list, - }) - hccn_table['para_plane_nic_location'] = 'device' - hccn_table['para_plane_nic_name'] = [] - for instance_id in range(args.nproc_per_node): - eth_id = visible_devices[instance_id] - hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) - hccn_table['para_plane_nic_num'] = str(args.nproc_per_node) - hccn_table['status'] = 'completed' - - # save hccn_table to file - table_path = os.getcwd() - if not os.path.exists(table_path): - os.mkdir(table_path) - table_fn = os.path.join(table_path, - 'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id)) - with open(table_fn, 'w') as table_fp: - json.dump(hccn_table, table_fp, indent=4) - sys.stdout.flush() # spawn the processes processes = [] @@ -137,9 +74,6 @@ def main(): device_dir = os.path.join(cur_path, 'device{}'.format(rank_id)) env['RANK_ID'] = str(rank_id) env['DEVICE_ID'] = str(device_id) - if args.nproc_per_node > 1: - env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn - env['RANK_TABLE_FILE'] = table_fn if os.path.exists(device_dir): shutil.rmtree(device_dir) os.mkdir(device_dir)