Merge pull request !7878 from linqingke/cnnctctags/v1.1.0
| @@ -150,7 +150,6 @@ The entire code structure is as following: | |||||
| |---callback.py // loss callback file | |---callback.py // loss callback file | ||||
| |---dataset.py // process dataset | |---dataset.py // process dataset | ||||
| |---util.py // routine operation | |---util.py // routine operation | ||||
| |---generate_hccn_file.py // generate distribute json file | |||||
| |---preprocess_dataset.py // preprocess dataset | |---preprocess_dataset.py // preprocess dataset | ||||
| ``` | ``` | ||||
| @@ -31,7 +31,6 @@ echo $PATH1 | |||||
| PATH2=$(get_real_path $2) | PATH2=$(get_real_path $2) | ||||
| echo $PATH2 | echo $PATH2 | ||||
| python ${current_exec_path}/src/generate_hccn_file.py --rank_file=$PATH1 | |||||
| export RANK_TABLE_FILE=$PATH1 | export RANK_TABLE_FILE=$PATH1 | ||||
| export RANK_SIZE=8 | export RANK_SIZE=8 | ||||
| ulimit -u unlimited | ulimit -u unlimited | ||||
| @@ -1,88 +0,0 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """generate ascend rank file""" | |||||
| import os | |||||
| import socket | |||||
| import argparse | |||||
| parser = argparse.ArgumentParser(description="ascend distribute rank.") | |||||
| parser.add_argument("--rank_file", type=str, default="scripts/rank_table_8p.json", help="rank_tabel_file_path.") | |||||
| def main(rank_table_file): | |||||
| nproc_per_node = 8 | |||||
| visible_devices = ['0', '1', '2', '3', '4', '5', '6', '7'] | |||||
| server_id = socket.gethostbyname(socket.gethostname()) | |||||
| hccn_configs = open('/etc/hccn.conf', 'r').readlines() | |||||
| device_ips = {} | |||||
| for hccn_item in hccn_configs: | |||||
| hccn_item = hccn_item.strip() | |||||
| if hccn_item.startswith('address_'): | |||||
| device_id, device_ip = hccn_item.split('=') | |||||
| device_id = device_id.split('_')[1] | |||||
| device_ips[device_id] = device_ip | |||||
| print('device_id:{}, device_ip:{}'.format(device_id, device_ip)) | |||||
| hccn_table = {} | |||||
| hccn_table['board_id'] = '0x002f' # A+K | |||||
| # hccn_table['board_id'] = '0x0000' # A+X | |||||
| hccn_table['chip_info'] = '910' | |||||
| hccn_table['deploy_mode'] = 'lab' | |||||
| hccn_table['group_count'] = '1' | |||||
| hccn_table['group_list'] = [] | |||||
| instance_list = [] | |||||
| for instance_id in range(nproc_per_node): | |||||
| instance = {} | |||||
| instance['devices'] = [] | |||||
| device_id = visible_devices[instance_id] | |||||
| device_ip = device_ips[device_id] | |||||
| instance['devices'].append({ | |||||
| 'device_id': device_id, | |||||
| 'device_ip': device_ip, | |||||
| }) | |||||
| instance['rank_id'] = str(instance_id) | |||||
| instance['server_id'] = server_id | |||||
| instance_list.append(instance) | |||||
| hccn_table['group_list'].append({ | |||||
| 'device_num': str(nproc_per_node), | |||||
| 'server_num': '1', | |||||
| 'group_name': '', | |||||
| 'instance_count': str(nproc_per_node), | |||||
| 'instance_list': instance_list, | |||||
| }) | |||||
| hccn_table['para_plane_nic_location'] = 'device' | |||||
| hccn_table['para_plane_nic_name'] = [] | |||||
| for instance_id in range(nproc_per_node): | |||||
| eth_id = visible_devices[instance_id] | |||||
| hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) | |||||
| hccn_table['para_plane_nic_num'] = str(nproc_per_node) | |||||
| hccn_table['status'] = 'completed' | |||||
| import json | |||||
| with open(rank_table_file, 'w') as table_fp: | |||||
| json.dump(hccn_table, table_fp, indent=4) | |||||
| if __name__ == '__main__': | |||||
| args_opt = parser.parse_args() | |||||
| rank_table = args_opt.rank_file | |||||
| if os.path.exists(rank_table): | |||||
| print('Rank table file exists.') | |||||
| else: | |||||
| print('Generating rank table file.') | |||||
| main(rank_table) | |||||
| print('Rank table file generated') | |||||
| @@ -58,7 +58,7 @@ A testing set containing about 2000 readable words | |||||
| After installing MindSpore via the official website, you can start training and evaluation as follows: | After installing MindSpore via the official website, you can start training and evaluation as follows: | ||||
| ```python | ```python | ||||
| # run distributed training example | # run distributed training example | ||||
| sh scripts/run_distribute_train.sh pretrained_model.ckpt | |||||
| sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt | |||||
| #download opencv library | #download opencv library | ||||
| download pybind11, opencv3.4 | download pybind11, opencv3.4 | ||||
| @@ -91,7 +91,6 @@ sh scripts/run_eval_ascend.sh | |||||
| └── run_eval_ascend.sh // shell script for evaluation | └── run_eval_ascend.sh // shell script for evaluation | ||||
| ├── src | ├── src | ||||
| ├── __init__.py | ├── __init__.py | ||||
| ├── generate_hccn_file.py // creating rank.json | |||||
| ├── ETSNET | ├── ETSNET | ||||
| ├── __init__.py | ├── __init__.py | ||||
| ├── base.py // convolution and BN operator | ├── base.py // convolution and BN operator | ||||
| @@ -130,7 +129,7 @@ Major parameters in train.py and config.py are: | |||||
| ### Distributed Training | ### Distributed Training | ||||
| ``` | ``` | ||||
| sh scripts/run_distribute_train.sh pretrained_model.ckpt | |||||
| sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt | |||||
| ``` | ``` | ||||
| The above shell script will run distribute training in the background. You can view the results through the file | The above shell script will run distribute training in the background. You can view the results through the file | ||||
| @@ -169,18 +168,18 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean | |||||
| | Parameters | PSENet | | | Parameters | PSENet | | ||||
| | -------------------------- | ----------------------------------------------------------- | | | -------------------------- | ----------------------------------------------------------- | | ||||
| | Model Version | Inception V1 | | |||||
| | Model Version | V1 | | |||||
| | Resource | Ascend 910; CPU 2.60GHz, 192 cores; Memory 755G | | | Resource | Ascend 910; CPU 2.60GHz, 192 cores; Memory 755G | | ||||
| | uploaded Date | 09/15/2020 (month/day/year) | | |||||
| | MindSpore Version | 1.0-alpha | | |||||
| | uploaded Date | 09/30/2020 (month/day/year) | | |||||
| | MindSpore Version | 1.0.0 | | |||||
| | Dataset | ICDAR2015 | | | Dataset | ICDAR2015 | | ||||
| | Training Parameters | start_lr=0.1; lr_scale=0.1 | | | Training Parameters | start_lr=0.1; lr_scale=0.1 | | ||||
| | Optimizer | SGD | | | Optimizer | SGD | | ||||
| | Loss Function | LossCallBack | | | Loss Function | LossCallBack | | ||||
| | outputs | probability | | | outputs | probability | | ||||
| | Loss | 0.35 | | | Loss | 0.35 | | ||||
| | Speed | 1pc: 444 ms/step; 4pcs: 446 ms/step | | |||||
| | Total time | 1pc: 75.48 h; 4pcs: 18.87 h | | |||||
| | Speed | 1pc: 444 ms/step; 8pcs: 446 ms/step | | |||||
| | Total time | 1pc: 75.48 h; 8pcs: 10.01 h | | |||||
| | Parameters (M) | 27.36 | | | Parameters (M) | 27.36 | | ||||
| | Checkpoint for Fine tuning | 109.44M (.ckpt file) | | | Checkpoint for Fine tuning | 109.44M (.ckpt file) | | ||||
| | Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/psenet | | | Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/psenet | | ||||
| @@ -190,13 +189,13 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean | |||||
| | Parameters | PSENet | | | Parameters | PSENet | | ||||
| | ------------------- | --------------------------- | | | ------------------- | --------------------------- | | ||||
| | Model Version | Inception V1 | | |||||
| | Model Version | V1 | | |||||
| | Resource | Ascend 910 | | | Resource | Ascend 910 | | ||||
| | Uploaded Date | 09/15/2020 (month/day/year) | | |||||
| | MindSpore Version | 1.0-alpha | | |||||
| | Uploaded Date | 09/30/2020 (month/day/year) | | |||||
| | MindSpore Version | 1.0.0 | | |||||
| | Dataset | ICDAR2015 | | | Dataset | ICDAR2015 | | ||||
| | outputs | probability | | | outputs | probability | | ||||
| | Accuracy | 1pc: 81%; 4pcs: 81% | | |||||
| | Accuracy | 1pc: 81%; 8pcs: 81% | | |||||
| ## [How to use](#contents) | ## [How to use](#contents) | ||||
| @@ -17,9 +17,9 @@ | |||||
| current_exec_path=$(pwd) | current_exec_path=$(pwd) | ||||
| echo 'current_exec_path: '${current_exec_path} | echo 'current_exec_path: '${current_exec_path} | ||||
| if [ $# != 1 ] | |||||
| if [ $# != 2 ] | |||||
| then | then | ||||
| echo "Usage: sh run_distribute_train.sh [PRETRAINED_PATH]" | |||||
| echo "Usage: sh run_distribute_train.sh [RANK_FILE] [PRETRAINED_PATH]" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| @@ -30,20 +30,24 @@ get_real_path(){ | |||||
| echo "$(realpath -m $PWD/$1)" | echo "$(realpath -m $PWD/$1)" | ||||
| fi | fi | ||||
| } | } | ||||
| PATH1=$(get_real_path $1) | |||||
| PATH1=$(get_real_path $1) | |||||
| if [ ! -f $PATH1 ] | if [ ! -f $PATH1 ] | ||||
| then | then | ||||
| echo "error: PRETRAINED_PATH=$PATH1 is not a file" | |||||
| echo "error: RANK_TABLE_FILE=$PATH1 is not a file" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| python ${current_exec_path}/src/generate_hccn_file.py | |||||
| PATH2=$(get_real_path $2) | |||||
| if [ ! -f $PATH2 ] | |||||
| then | |||||
| echo "error: PRETRAINED_PATH=$PATH2 is not a file" | |||||
| exit 1 | |||||
| fi | |||||
| export DEVICE_NUM=8 | export DEVICE_NUM=8 | ||||
| export RANK_SIZE=8 | export RANK_SIZE=8 | ||||
| export RANK_TABLE_FILE=${current_exec_path}/rank_table_8p.json | |||||
| export RANK_TABLE_FILE=$PATH1 | |||||
| for((i=0; i<${DEVICE_NUM}; i++)) | for((i=0; i<${DEVICE_NUM}; i++)) | ||||
| do | do | ||||
| @@ -70,7 +74,7 @@ do | |||||
| cd ${current_exec_path}/device_$i || exit | cd ${current_exec_path}/device_$i || exit | ||||
| export RANK_ID=$i | export RANK_ID=$i | ||||
| export DEVICE_ID=$i | export DEVICE_ID=$i | ||||
| python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH1 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 & | |||||
| python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH2 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 & | |||||
| cd ${current_exec_path} || exit | cd ${current_exec_path} || exit | ||||
| done | done | ||||
| @@ -1,85 +0,0 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| import os | |||||
| import socket | |||||
| RANK_TABLE_SAVE_PATH = './rank_table_8p.json' | |||||
| def main(): | |||||
| nproc_per_node = 4 | |||||
| visible_devices = ['0', '1', '2', '3'] | |||||
| server_id = socket.gethostbyname(socket.gethostname()) | |||||
| hccn_configs = open('/etc/hccn.conf', 'r').readlines() | |||||
| device_ips = {} | |||||
| for hccn_item in hccn_configs: | |||||
| hccn_item = hccn_item.strip() | |||||
| if hccn_item.startswith('address_'): | |||||
| device_id, device_ip = hccn_item.split('=') | |||||
| device_id = device_id.split('_')[1] | |||||
| device_ips[device_id] = device_ip | |||||
| print('device_id:{}, device_ip:{}'.format(device_id, device_ip)) | |||||
| hccn_table = {} | |||||
| hccn_table['board_id'] = '0x002f' # A+K | |||||
| hccn_table['chip_info'] = '910' | |||||
| hccn_table['deploy_mode'] = 'lab' | |||||
| hccn_table['group_count'] = '1' | |||||
| hccn_table['group_list'] = [] | |||||
| instance_list = [] | |||||
| for instance_id in range(nproc_per_node): | |||||
| instance = {} | |||||
| instance['devices'] = [] | |||||
| device_id = visible_devices[instance_id] | |||||
| device_ip = device_ips[device_id] | |||||
| instance['devices'].append({ | |||||
| 'device_id': device_id, | |||||
| 'device_ip': device_ip, | |||||
| }) | |||||
| instance['rank_id'] = str(instance_id) | |||||
| instance['server_id'] = server_id | |||||
| instance_list.append(instance) | |||||
| hccn_table['group_list'].append({ | |||||
| 'device_num': str(nproc_per_node), | |||||
| 'server_num': '1', | |||||
| 'group_name': '', | |||||
| 'instance_count': str(nproc_per_node), | |||||
| 'instance_list': instance_list, | |||||
| }) | |||||
| hccn_table['para_plane_nic_location'] = 'device' | |||||
| hccn_table['para_plane_nic_name'] = [] | |||||
| for instance_id in range(nproc_per_node): | |||||
| eth_id = visible_devices[instance_id] | |||||
| hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) | |||||
| hccn_table['para_plane_nic_num'] = str(nproc_per_node) | |||||
| hccn_table['status'] = 'completed' | |||||
| import json | |||||
| with open(RANK_TABLE_SAVE_PATH, 'w') as table_fp: | |||||
| json.dump(hccn_table, table_fp, indent=4) | |||||
| if __name__ == '__main__': | |||||
| if os.path.exists(RANK_TABLE_SAVE_PATH): | |||||
| print('Rank table file exists.') | |||||
| else: | |||||
| print('Generating rank table file.') | |||||
| main() | |||||
| print('Rank table file generated') | |||||