Merge pull request !7878 from linqingke/cnnctctags/v1.1.0
| @@ -150,7 +150,6 @@ The entire code structure is as following: | |||
| |---callback.py // loss callback file | |||
| |---dataset.py // process dataset | |||
| |---util.py // routine operation | |||
| |---generate_hccn_file.py // generate distribute json file | |||
| |---preprocess_dataset.py // preprocess dataset | |||
| ``` | |||
| @@ -31,7 +31,6 @@ echo $PATH1 | |||
| PATH2=$(get_real_path $2) | |||
| echo $PATH2 | |||
| python ${current_exec_path}/src/generate_hccn_file.py --rank_file=$PATH1 | |||
| export RANK_TABLE_FILE=$PATH1 | |||
| export RANK_SIZE=8 | |||
| ulimit -u unlimited | |||
| @@ -1,88 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """generate ascend rank file""" | |||
| import os | |||
| import socket | |||
| import argparse | |||
# Command-line interface: the single option selects where the rank table
# JSON file is written.
parser = argparse.ArgumentParser(description="ascend distribute rank.")
parser.add_argument(
    "--rank_file",
    type=str,
    default="scripts/rank_table_8p.json",
    help="Path of the rank table JSON file to generate.",
)
def main(rank_table_file, hccn_conf_path='/etc/hccn.conf', server_id=None):
    """Build a single-server, 8-device rank table and write it as JSON.

    Device IPs are taken from the ``address_<id>=<ip>`` lines of the HCCN
    configuration file; every other line of that file is ignored.

    Args:
        rank_table_file: Path of the JSON file to write.
        hccn_conf_path: HCCN configuration file to read device IPs from.
        server_id: Value stored as ``server_id`` of every instance. When
            None, the address the local host name resolves to is used.

    Raises:
        OSError: If the configuration file cannot be read, the host name
            cannot be resolved, or the output file cannot be written.
        KeyError: If the configuration has no address entry for one of the
            devices '0'..'7'.
    """
    import json

    nproc_per_node = 8
    visible_devices = [str(index) for index in range(nproc_per_node)]
    if server_id is None:
        server_id = socket.gethostbyname(socket.gethostname())

    # Collect {device_id: device_ip}; the context manager closes the file
    # even if a line fails to parse.
    device_ips = {}
    with open(hccn_conf_path, 'r') as hccn_fp:
        for hccn_item in hccn_fp:
            hccn_item = hccn_item.strip()
            if not hccn_item.startswith('address_'):
                continue
            # Split on the first '=' only and tolerate spaces around it.
            key, device_ip = hccn_item.split('=', 1)
            device_id = key.split('_')[1].strip()
            device_ip = device_ip.strip()
            device_ips[device_id] = device_ip
            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))

    # One instance per device; the rank id is the device's position in
    # visible_devices.
    instance_list = [
        {
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ips[device_id],
            }],
            'rank_id': str(rank_id),
            'server_id': server_id,
        }
        for rank_id, device_id in enumerate(visible_devices)
    ]

    hccn_table = {
        'board_id': '0x002f',  # A+K ('0x0000' was noted for A+X)
        'chip_info': '910',
        'deploy_mode': 'lab',
        'group_count': '1',
        'group_list': [{
            'device_num': str(nproc_per_node),
            'server_num': '1',
            'group_name': '',
            'instance_count': str(nproc_per_node),
            'instance_list': instance_list,
        }],
        'para_plane_nic_location': 'device',
        'para_plane_nic_name': ['eth{}'.format(eth_id) for eth_id in visible_devices],
        'para_plane_nic_num': str(nproc_per_node),
        'status': 'completed',
    }

    with open(rank_table_file, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
if __name__ == '__main__':
    # Script entry point: generate the rank table only when the target file
    # does not exist yet, so an existing table is never overwritten.
    rank_table = parser.parse_args().rank_file
    if not os.path.exists(rank_table):
        print('Generating rank table file.')
        main(rank_table)
        print('Rank table file generated')
    else:
        print('Rank table file exists.')
| @@ -58,7 +58,7 @@ A testing set containing about 2000 readable words | |||
| After installing MindSpore via the official website, you can start training and evaluation as follows: | |||
| ```python | |||
| # run distributed training example | |||
| sh scripts/run_distribute_train.sh pretrained_model.ckpt | |||
| sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt | |||
| #download opencv library | |||
| download pybind11, opencv3.4 | |||
| @@ -91,7 +91,6 @@ sh scripts/run_eval_ascend.sh | |||
| └── run_eval_ascend.sh // shell script for evaluation | |||
| ├── src | |||
| ├── __init__.py | |||
| ├── generate_hccn_file.py // creating rank.json | |||
| ├── ETSNET | |||
| ├── __init__.py | |||
| ├── base.py // convolution and BN operator | |||
| @@ -130,7 +129,7 @@ Major parameters in train.py and config.py are: | |||
| ### Distributed Training | |||
| ``` | |||
| sh scripts/run_distribute_train.sh pretrained_model.ckpt | |||
| sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt | |||
| ``` | |||
| The above shell script will run distributed training in the background. You can view the results through the file | |||
| @@ -169,18 +168,18 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean | |||
| | Parameters | PSENet | | |||
| | -------------------------- | ----------------------------------------------------------- | | |||
| | Model Version | Inception V1 | | |||
| | Model Version | V1 | | |||
| | Resource | Ascend 910 ;CPU 2.60GHz,192cores;Memory,755G | | |||
| | uploaded Date | 09/15/2020 (month/day/year) | | |||
| | MindSpore Version | 1.0-alpha | | |||
| | uploaded Date | 09/30/2020 (month/day/year) | | |||
| | MindSpore Version | 1.0.0 | | |||
| | Dataset | ICDAR2015 | | |||
| | Training Parameters | start_lr=0.1; lr_scale=0.1 | | |||
| | Optimizer | SGD | | |||
| | Loss Function | LossCallBack | | |||
| | outputs | probability | | |||
| | Loss | 0.35 | | |||
| | Speed | 1pc: 444 ms/step; 4pcs: 446 ms/step | | |||
| | Total time | 1pc: 75.48 h; 4pcs: 18.87 h | | |||
| | Speed | 1pc: 444 ms/step; 8pcs: 446 ms/step | | |||
| | Total time | 1pc: 75.48 h; 8pcs: 10.01 h | | |||
| | Parameters (M) | 27.36 | | |||
| | Checkpoint for Fine tuning | 109.44M (.ckpt file) | | |||
| | Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/psenet | | |||
| @@ -190,13 +189,13 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean | |||
| | Parameters | PSENet | | |||
| | ------------------- | --------------------------- | | |||
| | Model Version | Inception V1 | | |||
| | Model Version | V1 | | |||
| | Resource | Ascend 910 | | |||
| | Uploaded Date | 09/15/2020 (month/day/year) | | |||
| | MindSpore Version | 1.0-alpha | | |||
| | Uploaded Date | 09/30/2020 (month/day/year) | | |||
| | MindSpore Version | 1.0.0 | | |||
| | Dataset | ICDAR2015 | | |||
| | outputs | probability | | |||
| | Accuracy | 1pc: 81%; 4pcs: 81% | | |||
| | Accuracy | 1pc: 81%; 8pcs: 81% | | |||
| ## [How to use](#contents) | |||
| @@ -17,9 +17,9 @@ | |||
| current_exec_path=$(pwd) | |||
| echo 'current_exec_path: '${current_exec_path} | |||
| if [ $# != 1 ] | |||
| if [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train.sh [PRETRAINED_PATH]" | |||
| echo "Usage: sh run_distribute_train.sh [RANK_FILE] [PRETRAINED_PATH]" | |||
| exit 1 | |||
| fi | |||
| @@ -30,20 +30,24 @@ get_real_path(){ | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -f $PATH1 ] | |||
| then | |||
| echo "error: PRETRAINED_PATH=$PATH1 is not a file" | |||
| echo "error: RANK_TABLE_FILE=$PATH1 is not a file" | |||
| exit 1 | |||
| fi | |||
| python ${current_exec_path}/src/generate_hccn_file.py | |||
| PATH2=$(get_real_path $2) | |||
| if [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| export DEVICE_NUM=8 | |||
| export RANK_SIZE=8 | |||
| export RANK_TABLE_FILE=${current_exec_path}/rank_table_8p.json | |||
| export RANK_TABLE_FILE=$PATH1 | |||
| for((i=0; i<${DEVICE_NUM}; i++)) | |||
| do | |||
| @@ -70,7 +74,7 @@ do | |||
| cd ${current_exec_path}/device_$i || exit | |||
| export RANK_ID=$i | |||
| export DEVICE_ID=$i | |||
| python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH1 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 & | |||
| python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH2 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 & | |||
| cd ${current_exec_path} || exit | |||
| done | |||
| @@ -1,85 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| import os | |||
| import socket | |||
| RANK_TABLE_SAVE_PATH = './rank_table_8p.json' | |||
def main(rank_table_file=None, hccn_conf_path='/etc/hccn.conf', server_id=None,
         nproc_per_node=8):
    """Build a single-server rank table and write it as JSON.

    Device IPs are taken from the ``address_<id>=<ip>`` lines of the HCCN
    configuration file; every other line of that file is ignored.

    The device count defaults to 8: the output file is named
    ``rank_table_8p.json``, and a table listing only 4 devices does not
    match that name.

    Args:
        rank_table_file: Path of the JSON file to write. When None, the
            module-level RANK_TABLE_SAVE_PATH is used.
        hccn_conf_path: HCCN configuration file to read device IPs from.
        server_id: Value stored as ``server_id`` of every instance. When
            None, the address the local host name resolves to is used.
        nproc_per_node: Number of devices (ids '0'..'n-1') to list.

    Raises:
        OSError: If the configuration file cannot be read, the host name
            cannot be resolved, or the output file cannot be written.
        KeyError: If the configuration has no address entry for one of the
            requested devices.
    """
    import json

    if rank_table_file is None:
        rank_table_file = RANK_TABLE_SAVE_PATH
    if server_id is None:
        server_id = socket.gethostbyname(socket.gethostname())
    visible_devices = [str(index) for index in range(nproc_per_node)]

    # Collect {device_id: device_ip}; the context manager closes the file
    # even if a line fails to parse.
    device_ips = {}
    with open(hccn_conf_path, 'r') as hccn_fp:
        for hccn_item in hccn_fp:
            hccn_item = hccn_item.strip()
            if not hccn_item.startswith('address_'):
                continue
            # Split on the first '=' only and tolerate spaces around it.
            key, device_ip = hccn_item.split('=', 1)
            device_id = key.split('_')[1].strip()
            device_ip = device_ip.strip()
            device_ips[device_id] = device_ip
            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))

    # One instance per device; the rank id is the device's position in
    # visible_devices.
    instance_list = [
        {
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ips[device_id],
            }],
            'rank_id': str(rank_id),
            'server_id': server_id,
        }
        for rank_id, device_id in enumerate(visible_devices)
    ]

    hccn_table = {
        'board_id': '0x002f',  # A+K
        'chip_info': '910',
        'deploy_mode': 'lab',
        'group_count': '1',
        'group_list': [{
            'device_num': str(nproc_per_node),
            'server_num': '1',
            'group_name': '',
            'instance_count': str(nproc_per_node),
            'instance_list': instance_list,
        }],
        'para_plane_nic_location': 'device',
        'para_plane_nic_name': ['eth{}'.format(eth_id) for eth_id in visible_devices],
        'para_plane_nic_num': str(nproc_per_node),
        'status': 'completed',
    }

    with open(rank_table_file, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
if __name__ == '__main__':
    # Script entry point: generate the rank table only when the target file
    # does not exist yet, so an existing table is never overwritten.
    if not os.path.exists(RANK_TABLE_SAVE_PATH):
        print('Generating rank table file.')
        main()
        print('Rank table file generated')
    else:
        print('Rank table file exists.')