| @@ -41,20 +41,22 @@ ImageNet2012 | |||||
| └──resnet | └──resnet | ||||
| ├── README.md | ├── README.md | ||||
| ├── script | ├── script | ||||
| ├── run_distribute_train.sh # launch distributed training(8 pcs) | |||||
| ├── run_eval.sh # launch evaluation | |||||
| └── run_standalone_train.sh # launch standalone training(1 pcs) | |||||
| ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) | |||||
| ├── run_eval_gpu.sh # launch gpu evaluation | |||||
| └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) | |||||
| ├── run_distribute_train.sh # launch distributed training(8 pcs) | |||||
| ├── run_parameter_server_train.sh # launch Ascend parameter server training(8 pcs) | |||||
| ├── run_eval.sh # launch evaluation | |||||
| └── run_standalone_train.sh # launch standalone training(1 pcs) | |||||
| ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) | |||||
| ├── run_parameter_server_train_gpu.sh # launch gpu parameter server training(8 pcs) | |||||
| ├── run_eval_gpu.sh # launch gpu evaluation | |||||
| └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) | |||||
| ├── src | ├── src | ||||
| ├── config.py # parameter configuration | |||||
| ├── dataset.py # data preprocessing | |||||
| ├── crossentropy.py # loss definition for ImageNet2012 dataset | |||||
| ├── lr_generator.py # generate learning rate for each step | |||||
| └── resnet.py # resnet backbone, including resnet50 and resnet101 | |||||
| ├── eval.py # eval net | |||||
| └── train.py # train net | |||||
| ├── config.py # parameter configuration | |||||
| ├── dataset.py # data preprocessing | |||||
| ├── crossentropy.py # loss definition for ImageNet2012 dataset | |||||
| ├── lr_generator.py # generate learning rate for each step | |||||
| └── resnet.py # resnet backbone, including resnet50 and resnet101 | |||||
| ├── eval.py # eval net | |||||
| └── train.py # train net | |||||
| ``` | ``` | ||||
| @@ -252,3 +254,14 @@ sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATA | |||||
| # infer example | # infer example | ||||
| sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] | sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] | ||||
| ``` | ``` | ||||
| ### Running parameter server mode training | |||||
| ``` | |||||
| # parameter server training Ascend example | |||||
| sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||||
| # parameter server training GPU example | |||||
| sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||||
| ``` | |||||
| > The way to evaluate is the same as the examples above. | |||||
| @@ -0,0 +1,158 @@ | |||||
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch Ascend parameter-server training: one scheduler, one parameter
# server, and DEVICE_NUM workers, each running train.py in its own directory.

if [ $# != 4 ] && [ $# != 5 ]
then
    # Fixed: usage previously referred to run_distribute_train.sh.
    echo "Usage: sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi

if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
    echo "error: the selected net is neither resnet50 nor resnet101"
    exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
    exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

PATH1=$(get_real_path $3)   # RANK_TABLE_FILE
PATH2=$(get_real_path $4)   # DATASET_PATH
if [ $# == 5 ]
then
    PATH3=$(get_real_path $5)   # optional PRETRAINED_CKPT_PATH
fi

if [ ! -f $PATH1 ]
then
    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
    exit 1
fi

if [ ! -d $PATH2 ]
then
    echo "error: DATASET_PATH=$PATH2 is not a directory"
    exit 1
fi

if [ $# == 5 ] && [ ! -f $PATH3 ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
    exit 1
fi

ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export RANK_TABLE_FILE=$PATH1

# Parameter-server topology: one scheduler, one server, RANK_SIZE workers.
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=1
export MS_SCHED_HOST=127.0.0.1
export MS_SCHED_PORT=8081

# --- Scheduler process ---
export MS_ROLE=MS_SCHED
export DEVICE_ID=0
export RANK_ID=0
rm -rf ./sched
mkdir ./sched
cp ../*.py ./sched
cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
echo "start scheduler"
if [ $# == 4 ]
then
    python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> sched.log &
fi
if [ $# == 5 ]
then
    python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> sched.log &
fi
cd ..

# --- Parameter-server process(es); loop bound matches MS_SERVER_NUM ---
export MS_ROLE=MS_PSERVER
for((i=0; i<1; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    rm -rf ./server_$i
    mkdir ./server_$i
    cp ../*.py ./server_$i
    cp *.sh ./server_$i
    cp -r ../src ./server_$i
    cd ./server_$i || exit
    echo "start server"
    if [ $# == 4 ]
    then
        python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> server_$i.log &
    fi
    if [ $# == 5 ]
    then
        python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> server_$i.log &
    fi
    cd ..
done

# --- Worker processes, one per device ---
export MS_ROLE=MS_WORKER
for((i=0; i<${DEVICE_NUM}; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    rm -rf ./worker_$i
    mkdir ./worker_$i
    cp ../*.py ./worker_$i
    cp *.sh ./worker_$i
    cp -r ../src ./worker_$i
    cd ./worker_$i || exit
    echo "start training for worker rank $RANK_ID, device $DEVICE_ID"
    env > env.log
    if [ $# == 4 ]
    then
        python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True &> worker_$i.log &
    fi
    if [ $# == 5 ]
    then
        python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> worker_$i.log &
    fi
    cd ..
done
| @@ -0,0 +1,144 @@ | |||||
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch GPU parameter-server training via mpirun: one scheduler, one
# parameter server, and RANK_SIZE workers.

if [ $# != 3 ] && [ $# != 4 ]
then
    # Fixed: usage previously referred to run_distribute_train_gpu.sh.
    echo "Usage: sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi

if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
    echo "error: the selected net is neither resnet50 nor resnet101"
    exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
    exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

PATH1=$(get_real_path $3)   # DATASET_PATH
if [ $# == 4 ]
then
    PATH2=$(get_real_path $4)   # optional PRETRAINED_CKPT_PATH
fi

# Fixed: dataset check previously tested $PATH2 (the optional checkpoint)
# instead of $PATH1 (the dataset directory).
if [ ! -d $PATH1 ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

# Fixed: previously tested "$# == 5", which can never be true for this
# script (it accepts 3 or 4 arguments), so the checkpoint was never checked.
if [ $# == 4 ] && [ ! -f $PATH2 ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

export DEVICE_NUM=8
export RANK_SIZE=8

# Parameter-server topology: one scheduler, one server, 8 workers.
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=8
export MS_SERVER_NUM=1
export MS_SCHED_HOST=127.0.0.1
export MS_SCHED_PORT=8081

# --- Scheduler process ---
export MS_ROLE=MS_SCHED
rm -rf ./sched
mkdir ./sched
cp ../*.py ./sched
cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
if [ $# == 3 ]
then
    mpirun --allow-run-as-root -n 1 \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
fi
if [ $# == 4 ]
then
    mpirun --allow-run-as-root -n 1 \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
fi
cd ..

# --- Parameter-server process ---
export MS_ROLE=MS_PSERVER
rm -rf ./server
mkdir ./server
cp ../*.py ./server
cp *.sh ./server
cp -r ../src ./server
cd ./server || exit
if [ $# == 3 ]
then
    mpirun --allow-run-as-root -n 1 \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server.log &
fi
if [ $# == 4 ]
then
    mpirun --allow-run-as-root -n 1 \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server.log &
fi
cd ..

# --- Worker processes (mpirun spawns RANK_SIZE ranks) ---
export MS_ROLE=MS_WORKER
rm -rf ./worker
mkdir ./worker
cp ../*.py ./worker
cp *.sh ./worker
cp -r ../src ./worker
cd ./worker || exit
if [ $# == 3 ]
then
    mpirun --allow-run-as-root -n $RANK_SIZE \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log &
fi
if [ $# == 4 ]
then
    mpirun --allow-run-as-root -n $RANK_SIZE \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log &
fi
cd ..
| @@ -41,6 +41,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.') | |||||
| parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') | parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') | ||||
| parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') | parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') | ||||
| parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') | parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') | ||||
| parser.add_argument('--parameter_server', type=bool, default=False, help='Run parameter server train') | |||||
| args_opt = parser.parse_args() | args_opt = parser.parse_args() | ||||
| random.seed(1) | random.seed(1) | ||||
| @@ -92,6 +93,8 @@ if __name__ == '__main__': | |||||
| # define net | # define net | ||||
| net = resnet(class_num=config.class_num) | net = resnet(class_num=config.class_num) | ||||
| if args_opt.parameter_server: | |||||
| net.set_param_ps() | |||||
| # init weight | # init weight | ||||
| if args_opt.pre_trained: | if args_opt.pre_trained: | ||||
| @@ -181,4 +184,4 @@ if __name__ == '__main__': | |||||
| cb += [ckpt_cb] | cb += [ckpt_cb] | ||||
| # train model | # train model | ||||
| model.train(config.epoch_size, dataset, callbacks=cb) | |||||
| model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=(not args_opt.parameter_server)) | |||||