| @@ -70,7 +70,6 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil | |||
| │ ├──args.py # parse args | |||
| │ ├──config.py # parameter configuration | |||
| │ ├──dataset.py # creating dataset | |||
| │ ├──launch.py # start python script | |||
| │ ├──lr_generator.py # learning rate config | |||
| │ ├──mobilenetV2.py # MobileNetV2 architecture | |||
| │ ├──models.py # contains define_net, Loss and Monitor | |||
| @@ -31,21 +31,32 @@ run_ascend() | |||
| BASEPATH=$(cd "`dirname $0`" || exit; pwd) | |||
| export PYTHONPATH=${BASEPATH}:$PYTHONPATH | |||
| export RANK_TABLE_FILE=$4 | |||
| DEVICE_NUM=$2 | |||
| if [ -d "../train" ]; | |||
| then | |||
| rm -rf ../train | |||
| fi | |||
| mkdir ../train | |||
| cd ../train || exit | |||
| python ${BASEPATH}/../src/launch.py \ | |||
| for((i=0; i<${DEVICE_NUM}; i++)) | |||
| do | |||
| export DEVICE_ID=$i | |||
| export RANK_ID=$i | |||
| rm -rf ./rank$i | |||
| mkdir ./rank$i | |||
| cp ../*.py ./rank$i | |||
| cp -r ../src ./rank$i | |||
| cd ./rank$i || exit | |||
| echo "start training for rank $RANK_ID, device $DEVICE_ID" | |||
| env > env.log | |||
| python train.py \ | |||
| --platform=$1 \ | |||
| --nproc_per_node=$2 \ | |||
| --visible_devices=$3 \ | |||
| --training_script=${BASEPATH}/../train.py \ | |||
| --dataset_path=$5 \ | |||
| --pretrain_ckpt=$6 \ | |||
| --freeze_layer=$7 \ | |||
| &> ../train.log & # dataset train folder | |||
| &> log$i.log & | |||
| cd .. | |||
| done | |||
| } | |||
| run_gpu() | |||
| @@ -1,64 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """launch train script""" | |||
| import os | |||
| import sys | |||
| import subprocess | |||
| import shutil | |||
| from args import launch_parse_args | |||
def main():
    """Launch one training process per rank and wait for all of them.

    Options come from ``launch_parse_args()``.  For every rank a fresh
    ``rank<N>`` directory is created under the current working directory and
    the training script is started from inside it, with ``RANK_SIZE``,
    ``RANK_ID`` and ``DEVICE_ID`` set in the child's environment.  Each
    child's stdout and stderr are written to ``rank<N>/log<N>.log``.

    Raises:
        AssertionError: if the training script is not an existing file, or
            fewer visible devices are listed than processes requested.
        subprocess.CalledProcessError: if a child exits with a non-zero
            status; ``returncode`` carries that child's exit status.
    """
    print("start", __file__)
    args = launch_parse_args()
    print(args)
    visible_devices = args.visible_devices.split(',')
    assert os.path.isfile(args.training_script)
    assert len(visible_devices) >= args.nproc_per_node
    print('visible_devices:{}'.format(visible_devices))

    # spawn the processes
    processes = []
    cmds = []
    log_files = []
    env = os.environ.copy()
    env['RANK_SIZE'] = str(args.nproc_per_node)
    cur_path = os.getcwd()
    try:
        for rank_id in range(args.nproc_per_node):
            # Always start from the launch directory before entering rank<N>.
            os.chdir(cur_path)
            device_id = visible_devices[rank_id]
            rank_dir = os.path.join(cur_path, 'rank{}'.format(rank_id))
            env['RANK_ID'] = str(rank_id)
            env['DEVICE_ID'] = str(device_id)
            if os.path.exists(rank_dir):
                shutil.rmtree(rank_dir)
            os.mkdir(rank_dir)
            os.chdir(rank_dir)

            cmd = [sys.executable, '-u', args.training_script]
            cmd.extend(args.training_script_args)
            log_file = open(f'{rank_dir}/log{rank_id}.log', 'w')
            # Register the handle before spawning so it is closed even if
            # Popen itself raises.
            log_files.append(log_file)
            process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env)
            processes.append(process)
            cmds.append(cmd)

        for process, cmd in zip(processes, cmds):
            process.wait()
            if process.returncode != 0:
                # CalledProcessError expects the integer exit status, not the
                # Popen object.
                raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
    finally:
        # Close every log handle on both the success and the failure path.
        for log_file in log_files:
            log_file.close()
| if __name__ == "__main__": | |||
| main() | |||
| @@ -64,7 +64,6 @@ Dataset used: [imagenet](http://www.image-net.org/) | |||
| ├── src | |||
| │ ├──config.py # parameter configuration | |||
| │ ├──dataset.py # creating dataset | |||
| │ ├──launch.py # start python script | |||
| │ ├──lr_generator.py # learning rate config | |||
| │ ├──mobilenetV3.py # MobileNetV3 architecture | |||
| ├── train.py # training script | |||
| @@ -1,162 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """launch train script""" | |||
| import os | |||
| import sys | |||
| import json | |||
| import subprocess | |||
| import shutil | |||
| from argparse import ArgumentParser | |||
def parse_args(argv=None):
    """Parse the launcher's command-line options.

    Options the parser does not recognise are not treated as errors; they are
    collected, in order, into ``args.training_script_args`` so they can be
    forwarded to the training script.

    Args:
        argv: Optional list of argument strings to parse.  ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what the
            function did before this parameter existed.

    Returns:
        argparse.Namespace with ``nproc_per_node``, ``visible_devices``,
        ``server_id``, ``training_script`` and ``training_script_args``.

    Examples:
        >>> args = parse_args(["--nproc_per_node", "2", "--lr", "0.1"])
        >>> args.nproc_per_node, args.training_script_args
        (2, ['--lr', '0.1'])
    """
    parser = ArgumentParser(description="mindspore distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node, "
                             "for D training, this is recommended to be set "
                             "to the number of D in your system so that "
                             "each process can be bound to a single D.")
    parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                        help="will use the visible devices sequentially")
    parser.add_argument("--server_id", type=str, default="",
                        help="server ip")
    parser.add_argument("--training_script", type=str,
                        help="The full path to the single D training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")
    # Everything the launcher does not know about belongs to the training
    # program.
    args, unknown = parser.parse_known_args(argv)
    args.training_script_args = unknown
    return args
def _read_device_ips(conf_path='/etc/hccn.conf'):
    """Return {device_id: device_ip} from the ``address_<id>=<ip>`` lines of conf_path."""
    device_ips = {}
    with open(conf_path, 'r') as conf:
        for hccn_item in conf:
            hccn_item = hccn_item.strip()
            if hccn_item.startswith('address_'):
                key, device_ip = hccn_item.split('=', 1)
                device_id = key.split('_')[1]
                device_ips[device_id] = device_ip
                print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
    return device_ips


def _build_hccn_table(devices, device_ips, server_id):
    """Build the rank-table dict with one instance per entry of devices (rank = position)."""
    count = str(len(devices))
    instance_list = [
        {
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ips[device_id],
            }],
            'rank_id': str(rank_id),
            'server_id': server_id,
        }
        for rank_id, device_id in enumerate(devices)
    ]
    return {
        'board_id': '0x0000',
        'chip_info': '910',
        'deploy_mode': 'lab',
        'group_count': '1',
        'group_list': [{
            'device_num': count,
            'server_num': '1',
            'group_name': '',
            'instance_count': count,
            'instance_list': instance_list,
        }],
        'para_plane_nic_location': 'device',
        'para_plane_nic_name': ['eth{}'.format(device_id) for device_id in devices],
        'para_plane_nic_num': count,
        'status': 'completed',
    }


def main():
    """Write a rank table, then launch one training process per rank.

    Steps, in order:
      1. Parse options with ``parse_args()`` and check that the training
         script exists and enough visible devices were listed.
      2. Read the device IPs from ``/etc/hccn.conf`` and dump a rank table
         to ``rank_table_<n>p_<devices>_<server_id>.json`` in the current
         working directory.
      3. For every rank, recreate ``device<N>`` under the current working
         directory and start the training script from inside it with
         ``RANK_SIZE``, ``RANK_ID`` and ``DEVICE_ID`` set (plus
         ``RANK_TABLE_FILE`` when more than one process is requested).
         Child stdout and stderr go to ``device<N>/log<N>.log``.
      4. Wait for the children in rank order.

    Raises:
        AssertionError: if the training script is not an existing file, or
            fewer visible devices are listed than processes requested.
        SystemExit: with status 1 if ``--server_id`` is empty.
        KeyError: if a selected device has no ``address_<id>`` entry in
            ``/etc/hccn.conf``.
        subprocess.CalledProcessError: if a child exits with a non-zero
            status; ``returncode`` carries that child's exit status.
    """
    print("start", __file__)
    args = parse_args()
    print(args)
    visible_devices = args.visible_devices.split(',')
    assert os.path.isfile(args.training_script), \
        'training script not found: {}'.format(args.training_script)
    assert len(visible_devices) >= args.nproc_per_node, \
        'nproc_per_node exceeds the number of visible devices'
    print('visible_devices:{}'.format(visible_devices))
    if not args.server_id:
        print('please input server ip!!!')
        # A missing required option is a failure: exit non-zero so calling
        # scripts can detect it.
        sys.exit(1)
    print('server_id:{}'.format(args.server_id))

    # construct hccn_table
    devices = [visible_devices[rank_id] for rank_id in range(args.nproc_per_node)]
    hccn_table = _build_hccn_table(devices, _read_device_ips(), args.server_id)

    # save hccn_table to file
    cur_path = os.getcwd()
    table_fn = os.path.join(
        cur_path,
        'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, ''.join(devices), args.server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()

    # spawn the processes
    processes = []
    cmds = []
    log_files = []
    env = os.environ.copy()
    env['RANK_SIZE'] = str(args.nproc_per_node)
    try:
        for rank_id, device_id in enumerate(devices):
            # Always start from the launch directory before entering device<N>.
            os.chdir(cur_path)
            device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
            env['RANK_ID'] = str(rank_id)
            env['DEVICE_ID'] = str(device_id)
            if args.nproc_per_node > 1:
                env['RANK_TABLE_FILE'] = table_fn
            if os.path.exists(device_dir):
                shutil.rmtree(device_dir)
            os.mkdir(device_dir)
            os.chdir(device_dir)

            cmd = [sys.executable, '-u', args.training_script]
            cmd.extend(args.training_script_args)
            log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w')
            # Register the handle before spawning so it is closed even if
            # Popen itself raises.
            log_files.append(log_file)
            process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env)
            processes.append(process)
            cmds.append(cmd)

        for process, cmd in zip(processes, cmds):
            process.wait()
            if process.returncode != 0:
                # CalledProcessError expects the integer exit status, not the
                # Popen object.
                raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
    finally:
        # Close every log handle on both the success and the failure path.
        for log_file in log_files:
            log_file.close()
| if __name__ == "__main__": | |||
| main() | |||
| @@ -66,7 +66,7 @@ default_boxes_ltrb = GeneratDefaultBoxes().default_boxes_ltrb | |||
| default_boxes = GeneratDefaultBoxes().default_boxes | |||
| y1, x1, y2, x2 = np.split(default_boxes_ltrb[:, :4], 4, axis=-1) | |||
| vol_anchors = (x2 - x1) * (y2 - y1) | |||
| matching_threshold = config.match_thershold | |||
| matching_threshold = config.match_threshold | |||
| def ssd_bboxes_encode(boxes): | |||
| @@ -100,7 +100,7 @@ def metrics(pred_data): | |||
| class_boxes = pred_boxes[score_mask] * [h, w, h, w] | |||
| if score_mask.any(): | |||
| nms_index = apply_nms(class_boxes, class_box_scores, config.nms_thershold, config.max_boxes) | |||
| nms_index = apply_nms(class_boxes, class_box_scores, config.nms_threshold, config.max_boxes) | |||
| class_boxes = class_boxes[nms_index] | |||
| class_box_scores = class_box_scores[nms_index] | |||
| @@ -21,8 +21,8 @@ config = ed({ | |||
| "img_shape": [300, 300], | |||
| "num_ssd_boxes": 1917, | |||
| "neg_pre_positive": 3, | |||
| "match_thershold": 0.5, | |||
| "nms_thershold": 0.6, | |||
| "match_threshold": 0.5, | |||
| "nms_threshold": 0.6, | |||
| "min_score": 0.1, | |||
| "max_boxes": 100, | |||
| @@ -38,7 +38,7 @@ config = ed({ | |||
| "num_default": [3, 6, 6, 6, 6, 6], | |||
| "extras_in_channels": [256, 576, 1280, 512, 256, 256], | |||
| "extras_out_channels": [576, 1280, 512, 256, 256, 128], | |||
| "extras_srides": [1, 1, 2, 2, 2, 2], | |||
| "extras_strides": [1, 1, 2, 2, 2, 2], | |||
| "extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25], | |||
| "feature_size": [19, 10, 5, 3, 2, 1], | |||
| "min_scale": 0.2, | |||
| @@ -228,7 +228,7 @@ class SSD300(nn.Cell): | |||
| in_channels = config.extras_in_channels | |||
| out_channels = config.extras_out_channels | |||
| ratios = config.extras_ratio | |||
| strides = config.extras_srides | |||
| strides = config.extras_strides | |||
| residual_list = [] | |||
| for i in range(2, len(in_channels)): | |||
| residual = InvertedResidual(in_channels[i], out_channels[i], stride=strides[i], | |||