| @@ -70,7 +70,6 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil | |||||
| │ ├──args.py # parse args | │ ├──args.py # parse args | ||||
| │ ├──config.py # parameter configuration | │ ├──config.py # parameter configuration | ||||
| │ ├──dataset.py # creating dataset | │ ├──dataset.py # creating dataset | ||||
| │ ├──launch.py # start python script | |||||
| │ ├──lr_generator.py # learning rate config | │ ├──lr_generator.py # learning rate config | ||||
| │ ├──mobilenetV2.py # MobileNetV2 architecture | │ ├──mobilenetV2.py # MobileNetV2 architecture | ||||
| │ ├──models.py # contain define_net and Loss, Monitor | │ ├──models.py # contain define_net and Loss, Monitor | ||||
| @@ -31,21 +31,32 @@ run_ascend() | |||||
| BASEPATH=$(cd "`dirname $0`" || exit; pwd) | BASEPATH=$(cd "`dirname $0`" || exit; pwd) | ||||
| export PYTHONPATH=${BASEPATH}:$PYTHONPATH | export PYTHONPATH=${BASEPATH}:$PYTHONPATH | ||||
| export RANK_TABLE_FILE=$4 | export RANK_TABLE_FILE=$4 | ||||
| DEVICE_NUM=$2 | |||||
| if [ -d "../train" ]; | if [ -d "../train" ]; | ||||
| then | then | ||||
| rm -rf ../train | rm -rf ../train | ||||
| fi | fi | ||||
| mkdir ../train | mkdir ../train | ||||
| cd ../train || exit | cd ../train || exit | ||||
| python ${BASEPATH}/../src/launch.py \ | |||||
| for((i=0; i<${DEVICE_NUM}; i++)) | |||||
| do | |||||
| export DEVICE_ID=$i | |||||
| export RANK_ID=$i | |||||
| rm -rf ./rank$i | |||||
| mkdir ./rank$i | |||||
| cp ../*.py ./rank$i | |||||
| cp -r ../src ./rank$i | |||||
| cd ./rank$i || exit | |||||
| echo "start training for rank $RANK_ID, device $DEVICE_ID" | |||||
| env > env.log | |||||
| python train.py \ | |||||
| --platform=$1 \ | --platform=$1 \ | ||||
| --nproc_per_node=$2 \ | |||||
| --visible_devices=$3 \ | |||||
| --training_script=${BASEPATH}/../train.py \ | |||||
| --dataset_path=$5 \ | --dataset_path=$5 \ | ||||
| --pretrain_ckpt=$6 \ | --pretrain_ckpt=$6 \ | ||||
| --freeze_layer=$7 \ | --freeze_layer=$7 \ | ||||
| &> ../train.log & # dataset train folder | |||||
| &> log$i.log & | |||||
| cd .. | |||||
| done | |||||
| } | } | ||||
| run_gpu() | run_gpu() | ||||
| @@ -1,64 +0,0 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """launch train script""" | |||||
| import os | |||||
| import sys | |||||
| import subprocess | |||||
| import shutil | |||||
| from args import launch_parse_args | |||||
| def main(): | |||||
| print("start", __file__) | |||||
| args = launch_parse_args() | |||||
| print(args) | |||||
| visible_devices = args.visible_devices.split(',') | |||||
| assert os.path.isfile(args.training_script) | |||||
| assert len(visible_devices) >= args.nproc_per_node | |||||
| print('visible_devices:{}'.format(visible_devices)) | |||||
| # spawn the processes | |||||
| processes = [] | |||||
| cmds = [] | |||||
| log_files = [] | |||||
| env = os.environ.copy() | |||||
| env['RANK_SIZE'] = str(args.nproc_per_node) | |||||
| cur_path = os.getcwd() | |||||
| for rank_id in range(0, args.nproc_per_node): | |||||
| os.chdir(cur_path) | |||||
| device_id = visible_devices[rank_id] | |||||
| rank_dir = os.path.join(cur_path, 'rank{}'.format(rank_id)) | |||||
| env['RANK_ID'] = str(rank_id) | |||||
| env['DEVICE_ID'] = str(device_id) | |||||
| if os.path.exists(rank_dir): | |||||
| shutil.rmtree(rank_dir) | |||||
| os.mkdir(rank_dir) | |||||
| os.chdir(rank_dir) | |||||
| cmd = [sys.executable, '-u'] | |||||
| cmd.append(args.training_script) | |||||
| cmd.extend(args.training_script_args) | |||||
| log_file = open(f'{rank_dir}/log{rank_id}.log', 'w') | |||||
| process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env) | |||||
| processes.append(process) | |||||
| cmds.append(cmd) | |||||
| log_files.append(log_file) | |||||
| for process, cmd, log_file in zip(processes, cmds, log_files): | |||||
| process.wait() | |||||
| if process.returncode != 0: | |||||
| raise subprocess.CalledProcessError(returncode=process, cmd=cmd) | |||||
| log_file.close() | |||||
| if __name__ == "__main__": | |||||
| main() | |||||
| @@ -64,7 +64,6 @@ Dataset used: [imagenet](http://www.image-net.org/) | |||||
| ├── src | ├── src | ||||
| │ ├──config.py # parameter configuration | │ ├──config.py # parameter configuration | ||||
| │ ├──dataset.py # creating dataset | │ ├──dataset.py # creating dataset | ||||
| │ ├──launch.py # start python script | |||||
| │ ├──lr_generator.py # learning rate config | │ ├──lr_generator.py # learning rate config | ||||
| │ ├──mobilenetV3.py # MobileNetV3 architecture | │ ├──mobilenetV3.py # MobileNetV3 architecture | ||||
| ├── train.py # training script | ├── train.py # training script | ||||
| @@ -1,162 +0,0 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """launch train script""" | |||||
| import os | |||||
| import sys | |||||
| import json | |||||
| import subprocess | |||||
| import shutil | |||||
| from argparse import ArgumentParser | |||||
| def parse_args(): | |||||
| """ | |||||
| parse args . | |||||
| Args: | |||||
| Returns: | |||||
| args. | |||||
| Examples: | |||||
| >>> parse_args() | |||||
| """ | |||||
| parser = ArgumentParser(description="mindspore distributed training launch " | |||||
| "helper utilty that will spawn up " | |||||
| "multiple distributed processes") | |||||
| parser.add_argument("--nproc_per_node", type=int, default=1, | |||||
| help="The number of processes to launch on each node, " | |||||
| "for D training, this is recommended to be set " | |||||
| "to the number of D in your system so that " | |||||
| "each process can be bound to a single D.") | |||||
| parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", | |||||
| help="will use the visible devices sequentially") | |||||
| parser.add_argument("--server_id", type=str, default="", | |||||
| help="server ip") | |||||
| parser.add_argument("--training_script", type=str, | |||||
| help="The full path to the single D training " | |||||
| "program/script to be launched in parallel, " | |||||
| "followed by all the arguments for the " | |||||
| "training script") | |||||
| # rest from the training program | |||||
| args, unknown = parser.parse_known_args() | |||||
| args.training_script_args = unknown | |||||
| return args | |||||
| def main(): | |||||
| print("start", __file__) | |||||
| args = parse_args() | |||||
| print(args) | |||||
| visible_devices = args.visible_devices.split(',') | |||||
| assert os.path.isfile(args.training_script) | |||||
| assert len(visible_devices) >= args.nproc_per_node | |||||
| print('visible_devices:{}'.format(visible_devices)) | |||||
| if not args.server_id: | |||||
| print('pleaser input server ip!!!') | |||||
| exit(0) | |||||
| print('server_id:{}'.format(args.server_id)) | |||||
| # construct hccn_table | |||||
| hccn_configs = open('/etc/hccn.conf', 'r').readlines() | |||||
| device_ips = {} | |||||
| for hccn_item in hccn_configs: | |||||
| hccn_item = hccn_item.strip() | |||||
| if hccn_item.startswith('address_'): | |||||
| device_id, device_ip = hccn_item.split('=') | |||||
| device_id = device_id.split('_')[1] | |||||
| device_ips[device_id] = device_ip | |||||
| print('device_id:{}, device_ip:{}'.format(device_id, device_ip)) | |||||
| hccn_table = {} | |||||
| hccn_table['board_id'] = '0x0000' | |||||
| hccn_table['chip_info'] = '910' | |||||
| hccn_table['deploy_mode'] = 'lab' | |||||
| hccn_table['group_count'] = '1' | |||||
| hccn_table['group_list'] = [] | |||||
| instance_list = [] | |||||
| usable_dev = '' | |||||
| for instance_id in range(args.nproc_per_node): | |||||
| instance = {} | |||||
| instance['devices'] = [] | |||||
| device_id = visible_devices[instance_id] | |||||
| device_ip = device_ips[device_id] | |||||
| usable_dev += str(device_id) | |||||
| instance['devices'].append({ | |||||
| 'device_id': device_id, | |||||
| 'device_ip': device_ip, | |||||
| }) | |||||
| instance['rank_id'] = str(instance_id) | |||||
| instance['server_id'] = args.server_id | |||||
| instance_list.append(instance) | |||||
| hccn_table['group_list'].append({ | |||||
| 'device_num': str(args.nproc_per_node), | |||||
| 'server_num': '1', | |||||
| 'group_name': '', | |||||
| 'instance_count': str(args.nproc_per_node), | |||||
| 'instance_list': instance_list, | |||||
| }) | |||||
| hccn_table['para_plane_nic_location'] = 'device' | |||||
| hccn_table['para_plane_nic_name'] = [] | |||||
| for instance_id in range(args.nproc_per_node): | |||||
| eth_id = visible_devices[instance_id] | |||||
| hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) | |||||
| hccn_table['para_plane_nic_num'] = str(args.nproc_per_node) | |||||
| hccn_table['status'] = 'completed' | |||||
| # save hccn_table to file | |||||
| table_path = os.getcwd() | |||||
| if not os.path.exists(table_path): | |||||
| os.mkdir(table_path) | |||||
| table_fn = os.path.join(table_path, | |||||
| 'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id)) | |||||
| with open(table_fn, 'w') as table_fp: | |||||
| json.dump(hccn_table, table_fp, indent=4) | |||||
| sys.stdout.flush() | |||||
| # spawn the processes | |||||
| processes = [] | |||||
| cmds = [] | |||||
| log_files = [] | |||||
| env = os.environ.copy() | |||||
| env['RANK_SIZE'] = str(args.nproc_per_node) | |||||
| cur_path = os.getcwd() | |||||
| for rank_id in range(0, args.nproc_per_node): | |||||
| os.chdir(cur_path) | |||||
| device_id = visible_devices[rank_id] | |||||
| device_dir = os.path.join(cur_path, 'device{}'.format(rank_id)) | |||||
| env['RANK_ID'] = str(rank_id) | |||||
| env['DEVICE_ID'] = str(device_id) | |||||
| if args.nproc_per_node > 1: | |||||
| env['RANK_TABLE_FILE'] = table_fn | |||||
| if os.path.exists(device_dir): | |||||
| shutil.rmtree(device_dir) | |||||
| os.mkdir(device_dir) | |||||
| os.chdir(device_dir) | |||||
| cmd = [sys.executable, '-u'] | |||||
| cmd.append(args.training_script) | |||||
| cmd.extend(args.training_script_args) | |||||
| log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w') | |||||
| process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env) | |||||
| processes.append(process) | |||||
| cmds.append(cmd) | |||||
| log_files.append(log_file) | |||||
| for process, cmd, log_file in zip(processes, cmds, log_files): | |||||
| process.wait() | |||||
| if process.returncode != 0: | |||||
| raise subprocess.CalledProcessError(returncode=process, cmd=cmd) | |||||
| log_file.close() | |||||
| if __name__ == "__main__": | |||||
| main() | |||||
| @@ -66,7 +66,7 @@ default_boxes_ltrb = GeneratDefaultBoxes().default_boxes_ltrb | |||||
| default_boxes = GeneratDefaultBoxes().default_boxes | default_boxes = GeneratDefaultBoxes().default_boxes | ||||
| y1, x1, y2, x2 = np.split(default_boxes_ltrb[:, :4], 4, axis=-1) | y1, x1, y2, x2 = np.split(default_boxes_ltrb[:, :4], 4, axis=-1) | ||||
| vol_anchors = (x2 - x1) * (y2 - y1) | vol_anchors = (x2 - x1) * (y2 - y1) | ||||
| matching_threshold = config.match_thershold | |||||
| matching_threshold = config.match_threshold | |||||
| def ssd_bboxes_encode(boxes): | def ssd_bboxes_encode(boxes): | ||||
| @@ -100,7 +100,7 @@ def metrics(pred_data): | |||||
| class_boxes = pred_boxes[score_mask] * [h, w, h, w] | class_boxes = pred_boxes[score_mask] * [h, w, h, w] | ||||
| if score_mask.any(): | if score_mask.any(): | ||||
| nms_index = apply_nms(class_boxes, class_box_scores, config.nms_thershold, config.max_boxes) | |||||
| nms_index = apply_nms(class_boxes, class_box_scores, config.nms_threshold, config.max_boxes) | |||||
| class_boxes = class_boxes[nms_index] | class_boxes = class_boxes[nms_index] | ||||
| class_box_scores = class_box_scores[nms_index] | class_box_scores = class_box_scores[nms_index] | ||||
| @@ -21,8 +21,8 @@ config = ed({ | |||||
| "img_shape": [300, 300], | "img_shape": [300, 300], | ||||
| "num_ssd_boxes": 1917, | "num_ssd_boxes": 1917, | ||||
| "neg_pre_positive": 3, | "neg_pre_positive": 3, | ||||
| "match_thershold": 0.5, | |||||
| "nms_thershold": 0.6, | |||||
| "match_threshold": 0.5, | |||||
| "nms_threshold": 0.6, | |||||
| "min_score": 0.1, | "min_score": 0.1, | ||||
| "max_boxes": 100, | "max_boxes": 100, | ||||
| @@ -38,7 +38,7 @@ config = ed({ | |||||
| "num_default": [3, 6, 6, 6, 6, 6], | "num_default": [3, 6, 6, 6, 6, 6], | ||||
| "extras_in_channels": [256, 576, 1280, 512, 256, 256], | "extras_in_channels": [256, 576, 1280, 512, 256, 256], | ||||
| "extras_out_channels": [576, 1280, 512, 256, 256, 128], | "extras_out_channels": [576, 1280, 512, 256, 256, 128], | ||||
| "extras_srides": [1, 1, 2, 2, 2, 2], | |||||
| "extras_strides": [1, 1, 2, 2, 2, 2], | |||||
| "extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25], | "extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25], | ||||
| "feature_size": [19, 10, 5, 3, 2, 1], | "feature_size": [19, 10, 5, 3, 2, 1], | ||||
| "min_scale": 0.2, | "min_scale": 0.2, | ||||
| @@ -228,7 +228,7 @@ class SSD300(nn.Cell): | |||||
| in_channels = config.extras_in_channels | in_channels = config.extras_in_channels | ||||
| out_channels = config.extras_out_channels | out_channels = config.extras_out_channels | ||||
| ratios = config.extras_ratio | ratios = config.extras_ratio | ||||
| strides = config.extras_srides | |||||
| strides = config.extras_strides | |||||
| residual_list = [] | residual_list = [] | ||||
| for i in range(2, len(in_channels)): | for i in range(2, len(in_channels)): | ||||
| residual = InvertedResidual(in_channels[i], out_channels[i], stride=strides[i], | residual = InvertedResidual(in_channels[i], out_channels[i], stride=strides[i], | ||||