diff --git a/model_zoo/official/cv/mobilenetv2/README.md b/model_zoo/official/cv/mobilenetv2/README.md index 527a390ee7..181dfecb22 100644 --- a/model_zoo/official/cv/mobilenetv2/README.md +++ b/model_zoo/official/cv/mobilenetv2/README.md @@ -70,7 +70,6 @@ For FP16 operators, if the input data type is FP32, the backend of MindSpore wil │ ├──args.py # parse args │ ├──config.py # parameter configuration │ ├──dataset.py # creating dataset - │ ├──launch.py # start python script │ ├──lr_generator.py # learning rate config │ ├──mobilenetV2.py # MobileNetV2 architecture │ ├──models.py # contain define_net and Loss, Monitor diff --git a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh index 9245861398..db84aed3b0 100644 --- a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh @@ -31,21 +31,32 @@ run_ascend() BASEPATH=$(cd "`dirname $0`" || exit; pwd) export PYTHONPATH=${BASEPATH}:$PYTHONPATH export RANK_TABLE_FILE=$4 + DEVICE_NUM=$2 if [ -d "../train" ]; then rm -rf ../train fi mkdir ../train cd ../train || exit - python ${BASEPATH}/../src/launch.py \ + for((i=0; i<${DEVICE_NUM}; i++)) + do + export DEVICE_ID=$i + export RANK_ID=$i + rm -rf ./rank$i + mkdir ./rank$i + cp ../*.py ./rank$i + cp -r ../src ./rank$i + cd ./rank$i || exit + echo "start training for rank $RANK_ID, device $DEVICE_ID" + env > env.log + python train.py \ --platform=$1 \ - --nproc_per_node=$2 \ - --visible_devices=$3 \ - --training_script=${BASEPATH}/../train.py \ --dataset_path=$5 \ --pretrain_ckpt=$6 \ --freeze_layer=$7 \ - &> ../train.log & # dataset train folder + &> log$i.log & + cd .. + done } run_gpu() diff --git a/model_zoo/official/cv/mobilenetv2/src/launch.py b/model_zoo/official/cv/mobilenetv2/src/launch.py deleted file mode 100644 index 793dfe1d9e..0000000000 --- a/model_zoo/official/cv/mobilenetv2/src/launch.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""launch train script""" -import os -import sys -import subprocess -import shutil -from args import launch_parse_args - -def main(): - print("start", __file__) - args = launch_parse_args() - print(args) - visible_devices = args.visible_devices.split(',') - assert os.path.isfile(args.training_script) - assert len(visible_devices) >= args.nproc_per_node - print('visible_devices:{}'.format(visible_devices)) - - # spawn the processes - processes = [] - cmds = [] - log_files = [] - env = os.environ.copy() - env['RANK_SIZE'] = str(args.nproc_per_node) - cur_path = os.getcwd() - for rank_id in range(0, args.nproc_per_node): - os.chdir(cur_path) - device_id = visible_devices[rank_id] - rank_dir = os.path.join(cur_path, 'rank{}'.format(rank_id)) - env['RANK_ID'] = str(rank_id) - env['DEVICE_ID'] = str(device_id) - if os.path.exists(rank_dir): - shutil.rmtree(rank_dir) - os.mkdir(rank_dir) - os.chdir(rank_dir) - cmd = [sys.executable, '-u'] - cmd.append(args.training_script) - cmd.extend(args.training_script_args) - log_file = open(f'{rank_dir}/log{rank_id}.log', 'w') - process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env) - processes.append(process) - cmds.append(cmd) - log_files.append(log_file) - for process, cmd, log_file in zip(processes, cmds, log_files): - process.wait() - if process.returncode != 0: - raise subprocess.CalledProcessError(returncode=process, cmd=cmd) - log_file.close() - - -if __name__ == "__main__": - main() diff --git a/model_zoo/official/cv/mobilenetv3/Readme.md b/model_zoo/official/cv/mobilenetv3/Readme.md index 5fb65d2513..1138975d6a 100644 --- a/model_zoo/official/cv/mobilenetv3/Readme.md +++ b/model_zoo/official/cv/mobilenetv3/Readme.md @@ -64,7 +64,6 @@ Dataset used: [imagenet](http://www.image-net.org/) ├── src │ ├──config.py # parameter configuration │ ├──dataset.py # creating dataset - │ ├──launch.py # start python script │ ├──lr_generator.py # learning rate config │ ├──mobilenetV3.py # MobileNetV3 architecture ├── train.py # training script diff --git a/model_zoo/official/cv/mobilenetv3/src/launch.py b/model_zoo/official/cv/mobilenetv3/src/launch.py deleted file mode 100644 index df5f4e65f0..0000000000 --- a/model_zoo/official/cv/mobilenetv3/src/launch.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""launch train script""" -import os -import sys -import json -import subprocess -import shutil -from argparse import ArgumentParser - -def parse_args(): - """ - parse args . - - Args: - - Returns: - args. - - Examples: - >>> parse_args() - """ - parser = ArgumentParser(description="mindspore distributed training launch " - "helper utilty that will spawn up " - "multiple distributed processes") - parser.add_argument("--nproc_per_node", type=int, default=1, - help="The number of processes to launch on each node, " - "for D training, this is recommended to be set " - "to the number of D in your system so that " - "each process can be bound to a single D.") - parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", - help="will use the visible devices sequentially") - parser.add_argument("--server_id", type=str, default="", - help="server ip") - parser.add_argument("--training_script", type=str, - help="The full path to the single D training " - "program/script to be launched in parallel, " - "followed by all the arguments for the " - "training script") - # rest from the training program - args, unknown = parser.parse_known_args() - args.training_script_args = unknown - return args - - -def main(): - print("start", __file__) - args = parse_args() - print(args) - visible_devices = args.visible_devices.split(',') - assert os.path.isfile(args.training_script) - assert len(visible_devices) >= args.nproc_per_node - print('visible_devices:{}'.format(visible_devices)) - if not args.server_id: - print('pleaser input server ip!!!') - exit(0) - print('server_id:{}'.format(args.server_id)) - - # construct hccn_table - hccn_configs = open('/etc/hccn.conf', 'r').readlines() - device_ips = {} - for hccn_item in hccn_configs: - hccn_item = hccn_item.strip() - if hccn_item.startswith('address_'): - device_id, device_ip = hccn_item.split('=') - device_id = device_id.split('_')[1] - device_ips[device_id] = device_ip - print('device_id:{}, device_ip:{}'.format(device_id, device_ip)) - hccn_table = {} - hccn_table['board_id'] = '0x0000' - hccn_table['chip_info'] = '910' - hccn_table['deploy_mode'] = 'lab' - hccn_table['group_count'] = '1' - hccn_table['group_list'] = [] - instance_list = [] - usable_dev = '' - for instance_id in range(args.nproc_per_node): - instance = {} - instance['devices'] = [] - device_id = visible_devices[instance_id] - device_ip = device_ips[device_id] - usable_dev += str(device_id) - instance['devices'].append({ - 'device_id': device_id, - 'device_ip': device_ip, - }) - instance['rank_id'] = str(instance_id) - instance['server_id'] = args.server_id - instance_list.append(instance) - hccn_table['group_list'].append({ - 'device_num': str(args.nproc_per_node), - 'server_num': '1', - 'group_name': '', - 'instance_count': str(args.nproc_per_node), - 'instance_list': instance_list, - }) - hccn_table['para_plane_nic_location'] = 'device' - hccn_table['para_plane_nic_name'] = [] - for instance_id in range(args.nproc_per_node): - eth_id = visible_devices[instance_id] - hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) - hccn_table['para_plane_nic_num'] = str(args.nproc_per_node) - hccn_table['status'] = 'completed' - - # save hccn_table to file - table_path = os.getcwd() - if not os.path.exists(table_path): - os.mkdir(table_path) - table_fn = os.path.join(table_path, - 'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id)) - with open(table_fn, 'w') as table_fp: - json.dump(hccn_table, table_fp, indent=4) - sys.stdout.flush() - - # spawn the processes - processes = [] - cmds = [] - log_files = [] - env = os.environ.copy() - env['RANK_SIZE'] = str(args.nproc_per_node) - cur_path = os.getcwd() - for rank_id in range(0, args.nproc_per_node): - os.chdir(cur_path) - device_id = visible_devices[rank_id] - device_dir = os.path.join(cur_path, 'device{}'.format(rank_id)) - env['RANK_ID'] = str(rank_id) - env['DEVICE_ID'] = str(device_id) - if args.nproc_per_node > 1: - env['RANK_TABLE_FILE'] = table_fn - if os.path.exists(device_dir): - shutil.rmtree(device_dir) - os.mkdir(device_dir) - os.chdir(device_dir) - cmd = [sys.executable, '-u'] - cmd.append(args.training_script) - cmd.extend(args.training_script_args) - log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w') - process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env) - processes.append(process) - cmds.append(cmd) - log_files.append(log_file) - for process, cmd, log_file in zip(processes, cmds, log_files): - process.wait() - if process.returncode != 0: - raise subprocess.CalledProcessError(returncode=process, cmd=cmd) - log_file.close() - - -if __name__ == "__main__": - main() diff --git a/model_zoo/official/cv/ssd/src/box_utils.py b/model_zoo/official/cv/ssd/src/box_utils.py index 87688d72f2..34b655d1f5 100644 --- a/model_zoo/official/cv/ssd/src/box_utils.py +++ b/model_zoo/official/cv/ssd/src/box_utils.py @@ -66,7 +66,7 @@ default_boxes_ltrb = GeneratDefaultBoxes().default_boxes_ltrb default_boxes = GeneratDefaultBoxes().default_boxes y1, x1, y2, x2 = np.split(default_boxes_ltrb[:, :4], 4, axis=-1) vol_anchors = (x2 - x1) * (y2 - y1) -matching_threshold = config.match_thershold +matching_threshold = config.match_threshold def ssd_bboxes_encode(boxes): diff --git a/model_zoo/official/cv/ssd/src/coco_eval.py b/model_zoo/official/cv/ssd/src/coco_eval.py index eb36618089..4c190bc5ef 100644 --- a/model_zoo/official/cv/ssd/src/coco_eval.py +++ b/model_zoo/official/cv/ssd/src/coco_eval.py @@ -100,7 +100,7 @@ def metrics(pred_data): class_boxes = pred_boxes[score_mask] * [h, w, h, w] if score_mask.any(): - nms_index = apply_nms(class_boxes, class_box_scores, config.nms_thershold, config.max_boxes) + nms_index = apply_nms(class_boxes, class_box_scores, config.nms_threshold, config.max_boxes) class_boxes = class_boxes[nms_index] class_box_scores = class_box_scores[nms_index] diff --git a/model_zoo/official/cv/ssd/src/config.py b/model_zoo/official/cv/ssd/src/config.py index ff0cd21963..d2d3ddcef9 100644 --- a/model_zoo/official/cv/ssd/src/config.py +++ b/model_zoo/official/cv/ssd/src/config.py @@ -21,8 +21,8 @@ config = ed({ "img_shape": [300, 300], "num_ssd_boxes": 1917, "neg_pre_positive": 3, - "match_thershold": 0.5, - "nms_thershold": 0.6, + "match_threshold": 0.5, + "nms_threshold": 0.6, "min_score": 0.1, "max_boxes": 100, @@ -38,7 +38,7 @@ config = ed({ "num_default": [3, 6, 6, 6, 6, 6], "extras_in_channels": [256, 576, 1280, 512, 256, 256], "extras_out_channels": [576, 1280, 512, 256, 256, 128], - "extras_srides": [1, 1, 2, 2, 2, 2], + "extras_strides": [1, 1, 2, 2, 2, 2], "extras_ratio": [0.2, 0.2, 0.2, 0.25, 0.5, 0.25], "feature_size": [19, 10, 5, 3, 2, 1], "min_scale": 0.2, diff --git a/model_zoo/official/cv/ssd/src/ssd.py b/model_zoo/official/cv/ssd/src/ssd.py index 3a65a51a8a..a91e9d819a 100644 --- a/model_zoo/official/cv/ssd/src/ssd.py +++ b/model_zoo/official/cv/ssd/src/ssd.py @@ -228,7 +228,7 @@ class SSD300(nn.Cell): in_channels = config.extras_in_channels out_channels = config.extras_out_channels ratios = config.extras_ratio - strides = config.extras_srides + strides = config.extras_strides residual_list = [] for i in range(2, len(in_channels)): residual = InvertedResidual(in_channels[i], out_channels[i], stride=strides[i],