| @@ -41,20 +41,22 @@ ImageNet2012 | |||
| └──resnet | |||
| ├── README.md | |||
| ├── script | |||
| ├── run_distribute_train.sh # launch distributed training(8 pcs) | |||
| ├── run_eval.sh # launch evaluation | |||
| └── run_standalone_train.sh # launch standalone training(1 pcs) | |||
| ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) | |||
| ├── run_eval_gpu.sh # launch gpu evaluation | |||
| └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) | |||
| ├── run_distribute_train.sh # launch distributed training(8 pcs) | |||
| ├── run_parameter_server_train.sh # launch Ascend parameter server training(8 pcs) | |||
| ├── run_eval.sh # launch evaluation | |||
| └── run_standalone_train.sh # launch standalone training(1 pcs) | |||
| ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) | |||
| ├── run_parameter_server_train_gpu.sh # launch gpu parameter server training(8 pcs) | |||
| ├── run_eval_gpu.sh # launch gpu evaluation | |||
| └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) | |||
| ├── src | |||
| ├── config.py # parameter configuration | |||
| ├── dataset.py # data preprocessing | |||
| ├── crossentropy.py # loss definition for ImageNet2012 dataset | |||
| ├── lr_generator.py # generate learning rate for each step | |||
| └── resnet.py # resnet backbone, including resnet50 and resnet101 | |||
| ├── eval.py # eval net | |||
| └── train.py # train net | |||
| ├── config.py # parameter configuration | |||
| ├── dataset.py # data preprocessing | |||
| ├── crossentropy.py # loss definition for ImageNet2012 dataset | |||
| ├── lr_generator.py # generate learning rate for each step | |||
| └── resnet.py # resnet backbone, including resnet50 and resnet101 | |||
| ├── eval.py # eval net | |||
| └── train.py # train net | |||
| ``` | |||
| @@ -252,3 +254,14 @@ sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATA | |||
| # infer example | |||
| sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| ### Running parameter server mode training | |||
| ``` | |||
| # parameter server training Ascend example | |||
| sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # parameter server training GPU example | |||
| sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| ``` | |||
| > The way to evaluate is the same as the examples above. | |||
| @@ -0,0 +1,158 @@ | |||
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch ResNet parameter-server training on Ascend: one scheduler, one
# parameter server, and DEVICE_NUM worker processes. Each process runs in
# its own working directory (sched/, server_<i>/, worker_<i>/) containing a
# fresh copy of the training sources, and logs to <dir>/<dir>.log.

if [ $# != 4 ] && [ $# != 5 ]
then
    # Fix: the original usage line named run_distribute_train.sh
    # (copy-paste leftover); this script is run_parameter_server_train.sh.
    echo "Usage: sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi

if [ "$1" != "resnet50" ] && [ "$1" != "resnet101" ]
then
    echo "error: the selected net is neither resnet50 nor resnet101"
    exit 1
fi

if [ "$2" != "cifar10" ] && [ "$2" != "imagenet2012" ]
then
    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
    exit 1
fi

if [ "$1" == "resnet101" ] && [ "$2" == "cifar10" ]
then
    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
    exit 1
fi

# Resolve a (possibly relative) path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        realpath -m "$PWD/$1"
    fi
}

PATH1=$(get_real_path "$3")      # RANK_TABLE_FILE
PATH2=$(get_real_path "$4")      # DATASET_PATH
if [ $# == 5 ]
then
    PATH3=$(get_real_path "$5")  # PRETRAINED_CKPT_PATH (optional)
fi

if [ ! -f "$PATH1" ]
then
    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
    exit 1
fi

if [ ! -d "$PATH2" ]
then
    echo "error: DATASET_PATH=$PATH2 is not a directory"
    exit 1
fi

if [ $# == 5 ] && [ ! -f "$PATH3" ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
    exit 1
fi

ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export RANK_TABLE_FILE=$PATH1
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=1
export MS_SCHED_HOST=127.0.0.1
export MS_SCHED_PORT=8081

# Forward the optional pretrained checkpoint only when it was supplied;
# an empty array expands to no arguments at all. This collapses the
# duplicated 4-arg/5-arg launch commands of the original into one.
PRETRAINED_ARG=()
if [ $# == 5 ]
then
    PRETRAINED_ARG=(--pre_trained="$PATH3")
fi

# prepare_dir DIR: recreate ./DIR with a fresh copy of the training sources.
prepare_dir(){
    rm -rf "./$1"
    mkdir "./$1"
    cp ../*.py "./$1"
    cp ./*.sh "./$1"
    cp -r ../src "./$1"
}

# Scheduler: a single coordinating process (device_num=1).
export MS_ROLE=MS_SCHED
export DEVICE_ID=0
export RANK_ID=0
prepare_dir sched
cd ./sched || exit
echo "start scheduler"
python train.py --net="$1" --dataset="$2" --run_distribute=True --device_num=1 \
    --dataset_path="$PATH2" --parameter_server=True "${PRETRAINED_ARG[@]}" &> sched.log &
cd ..

# Parameter server(s): one process (device_num=1 each).
export MS_ROLE=MS_PSERVER
for((i=0; i<1; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    prepare_dir "server_$i"
    cd "./server_$i" || exit
    echo "start server"
    python train.py --net="$1" --dataset="$2" --run_distribute=True --device_num=1 \
        --dataset_path="$PATH2" --parameter_server=True "${PRETRAINED_ARG[@]}" &> "server_$i.log" &
    cd ..
done

# Workers: one training process per device.
export MS_ROLE=MS_WORKER
for((i=0; i<DEVICE_NUM; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    prepare_dir "worker_$i"
    cd "./worker_$i" || exit
    echo "start training for worker rank $RANK_ID, device $DEVICE_ID"
    env > env.log
    python train.py --net="$1" --dataset="$2" --run_distribute=True --device_num="$DEVICE_NUM" \
        --dataset_path="$PATH2" --parameter_server=True "${PRETRAINED_ARG[@]}" &> "worker_$i.log" &
    cd ..
done
| @@ -0,0 +1,144 @@ | |||
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch ResNet parameter-server training on GPU via mpirun: one scheduler,
# one parameter server, and RANK_SIZE worker processes. Each role runs in
# its own working directory (sched/, server/, worker/) containing a fresh
# copy of the training sources, and logs to <dir>/<dir>.log.

if [ $# != 3 ] && [ $# != 4 ]
then
    # Fix: the original usage line named run_distribute_train_gpu.sh
    # (copy-paste leftover); this script is run_parameter_server_train_gpu.sh.
    echo "Usage: sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi

if [ "$1" != "resnet50" ] && [ "$1" != "resnet101" ]
then
    echo "error: the selected net is neither resnet50 nor resnet101"
    exit 1
fi

if [ "$2" != "cifar10" ] && [ "$2" != "imagenet2012" ]
then
    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
    exit 1
fi

if [ "$1" == "resnet101" ] && [ "$2" == "cifar10" ]
then
    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
    exit 1
fi

# Resolve a (possibly relative) path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        realpath -m "$PWD/$1"
    fi
}

PATH1=$(get_real_path "$3")      # DATASET_PATH
if [ $# == 4 ]
then
    PATH2=$(get_real_path "$4")  # PRETRAINED_CKPT_PATH (optional)
fi

# Fix: the original tested "[ ! -d $PATH2 ]" here (the optional checkpoint,
# unset in the 3-arg case) while printing $PATH1 in the message, so the
# dataset directory was never actually validated.
if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

# Fix: the original tested "$# == 5", which can never hold for a script
# taking at most 4 arguments, so the checkpoint file was never validated.
if [ $# == 4 ] && [ ! -f "$PATH2" ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

export DEVICE_NUM=8
export RANK_SIZE=8
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=8
export MS_SERVER_NUM=1
export MS_SCHED_HOST=127.0.0.1
export MS_SCHED_PORT=8081

NET=$1
DATASET=$2

# Forward the optional pretrained checkpoint only when it was supplied;
# an empty array expands to no arguments at all. This collapses the
# duplicated 3-arg/4-arg launch commands of the original into one.
PRETRAINED_ARG=()
if [ $# == 4 ]
then
    PRETRAINED_ARG=(--pre_trained="$PATH2")
fi

# launch_role DIR ROLE NPROC: recreate ./DIR with a fresh copy of the
# training sources, then start train.py there under mpirun with NPROC
# processes as MS_ROLE=ROLE, logging to DIR/DIR.log.
launch_role(){
    local dir=$1 role=$2 nproc=$3
    export MS_ROLE=$role
    rm -rf "./$dir"
    mkdir "./$dir"
    cp ../*.py "./$dir"
    cp ./*.sh "./$dir"
    cp -r ../src "./$dir"
    cd "./$dir" || exit
    mpirun --allow-run-as-root -n "$nproc" \
        python train.py --net="$NET" --dataset="$DATASET" --run_distribute=True \
        --device_num="$DEVICE_NUM" --device_target="GPU" --dataset_path="$PATH1" \
        --parameter_server=True "${PRETRAINED_ARG[@]}" &> "$dir.log" &
    cd ..
}

launch_role sched  MS_SCHED   1
launch_role server MS_PSERVER 1
launch_role worker MS_WORKER  "$RANK_SIZE"
| @@ -41,6 +41,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.') | |||
| parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') | |||
| parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') | |||
| parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') | |||
| parser.add_argument('--parameter_server', type=bool, default=False, help='Run parameter server train') | |||
| args_opt = parser.parse_args() | |||
| random.seed(1) | |||
| @@ -92,6 +93,8 @@ if __name__ == '__main__': | |||
| # define net | |||
| net = resnet(class_num=config.class_num) | |||
| if args_opt.parameter_server: | |||
| net.set_param_ps() | |||
| # init weight | |||
| if args_opt.pre_trained: | |||
| @@ -181,4 +184,4 @@ if __name__ == '__main__': | |||
| cb += [ckpt_cb] | |||
| # train model | |||
| model.train(config.epoch_size, dataset, callbacks=cb) | |||
| model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=(not args_opt.parameter_server)) | |||