| @@ -41,20 +41,22 @@ ImageNet2012 | |||||
| └──resnet | └──resnet | ||||
| ├── README.md | ├── README.md | ||||
| ├── script | ├── script | ||||
| ├── run_distribute_train.sh # launch distributed training(8 pcs) | |||||
| ├── run_eval.sh # launch evaluation | |||||
| └── run_standalone_train.sh # launch standalone training(1 pcs) | |||||
| ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) | |||||
| ├── run_eval_gpu.sh # launch gpu evaluation | |||||
| └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) | |||||
| ├── run_distribute_train.sh # launch distributed training(8 pcs) | |||||
| ├── run_parameter_server_train.sh # launch Ascend parameter server training(8 pcs) | |||||
| ├── run_eval.sh # launch evaluation | |||||
| └── run_standalone_train.sh # launch standalone training(1 pcs) | |||||
| ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) | |||||
| ├── run_parameter_server_train_gpu.sh # launch gpu parameter server training(8 pcs) | |||||
| ├── run_eval_gpu.sh # launch gpu evaluation | |||||
| └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) | |||||
| ├── src | ├── src | ||||
| ├── config.py # parameter configuration | |||||
| ├── dataset.py # data preprocessing | |||||
| ├── crossentropy.py # loss definition for ImageNet2012 dataset | |||||
| ├── lr_generator.py # generate learning rate for each step | |||||
| └── resnet.py # resnet backbone, including resnet50 and resnet101 | |||||
| ├── eval.py # eval net | |||||
| └── train.py # train net | |||||
| ├── config.py # parameter configuration | |||||
| ├── dataset.py # data preprocessing | |||||
| ├── crossentropy.py # loss definition for ImageNet2012 dataset | |||||
| ├── lr_generator.py # generate learning rate for each step | |||||
| └── resnet.py # resnet backbone, including resnet50 and resnet101 | |||||
| ├── eval.py # eval net | |||||
| └── train.py # train net | |||||
| ``` | ``` | ||||
| @@ -252,3 +254,14 @@ sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATA | |||||
| # infer example | # infer example | ||||
| sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] | sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] | ||||
| ``` | ``` | ||||
| ### Running parameter server mode training | |||||
| ``` | |||||
| # parameter server training Ascend example | |||||
| sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||||
| # parameter server training GPU example | |||||
| sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||||
| ``` | |||||
| > The way to evaluate is the same as the examples above. | |||||
| @@ -0,0 +1,158 @@ | |||||
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch Ascend parameter-server training: one scheduler, one parameter
# server, and DEVICE_NUM workers, each running train.py in its own directory.

if [ $# != 4 ] && [ $# != 5 ]
then
    # Fixed: usage previously referred to run_distribute_train.sh.
    echo "Usage: sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi

if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
    echo "error: the selected net is neither resnet50 nor resnet101"
    exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
    exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

PATH1=$(get_real_path $3)   # RANK_TABLE_FILE
PATH2=$(get_real_path $4)   # DATASET_PATH
if [ $# == 5 ]
then
    PATH3=$(get_real_path $5)   # optional PRETRAINED_CKPT_PATH
fi

if [ ! -f $PATH1 ]
then
    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
    exit 1
fi

if [ ! -d $PATH2 ]
then
    echo "error: DATASET_PATH=$PATH2 is not a directory"
    exit 1
fi

if [ $# == 5 ] && [ ! -f $PATH3 ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
    exit 1
fi

ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export RANK_TABLE_FILE=$PATH1

# Parameter-server topology: one scheduler, one server, RANK_SIZE workers.
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=1
export MS_SCHED_HOST=127.0.0.1
export MS_SCHED_PORT=8081

# --- Scheduler process ---
export MS_ROLE=MS_SCHED
export DEVICE_ID=0
export RANK_ID=0
rm -rf ./sched
mkdir ./sched
cp ../*.py ./sched
cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
echo "start scheduler"
if [ $# == 4 ]
then
    python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> sched.log &
fi
if [ $# == 5 ]
then
    python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> sched.log &
fi
cd ..

# --- Parameter-server process(es); loop bound matches MS_SERVER_NUM ---
export MS_ROLE=MS_PSERVER
for((i=0; i<1; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    rm -rf ./server_$i
    mkdir ./server_$i
    cp ../*.py ./server_$i
    cp *.sh ./server_$i
    cp -r ../src ./server_$i
    cd ./server_$i || exit
    echo "start server"
    if [ $# == 4 ]
    then
        python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> server_$i.log &
    fi
    if [ $# == 5 ]
    then
        python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> server_$i.log &
    fi
    cd ..
done

# --- Worker processes, one per device ---
export MS_ROLE=MS_WORKER
for((i=0; i<${DEVICE_NUM}; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    rm -rf ./worker_$i
    mkdir ./worker_$i
    cp ../*.py ./worker_$i
    cp *.sh ./worker_$i
    cp -r ../src ./worker_$i
    cd ./worker_$i || exit
    echo "start training for worker rank $RANK_ID, device $DEVICE_ID"
    env > env.log
    if [ $# == 4 ]
    then
        python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True &> worker_$i.log &
    fi
    if [ $# == 5 ]
    then
        python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> worker_$i.log &
    fi
    cd ..
done
| @@ -0,0 +1,144 @@ | |||||
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch GPU parameter-server training via mpirun: one scheduler, one
# parameter server, and RANK_SIZE workers.

if [ $# != 3 ] && [ $# != 4 ]
then
    # Fixed: usage previously referred to run_distribute_train_gpu.sh.
    echo "Usage: sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
    exit 1
fi

if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
    echo "error: the selected net is neither resnet50 nor resnet101"
    exit 1
fi

if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
    exit 1
fi

if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

PATH1=$(get_real_path $3)   # DATASET_PATH
if [ $# == 4 ]
then
    PATH2=$(get_real_path $4)   # optional PRETRAINED_CKPT_PATH
fi

# Fixed: dataset check previously tested $PATH2 (the optional checkpoint)
# instead of $PATH1 (the dataset directory).
if [ ! -d $PATH1 ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

# Fixed: previously tested "$# == 5", which can never be true for this
# script (it accepts 3 or 4 arguments), so the checkpoint was never checked.
if [ $# == 4 ] && [ ! -f $PATH2 ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

export DEVICE_NUM=8
export RANK_SIZE=8

# Parameter-server topology: one scheduler, one server, 8 workers.
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=8
export MS_SERVER_NUM=1
export MS_SCHED_HOST=127.0.0.1
export MS_SCHED_PORT=8081

# --- Scheduler process ---
export MS_ROLE=MS_SCHED
rm -rf ./sched
mkdir ./sched
cp ../*.py ./sched
cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
if [ $# == 3 ]
then
    mpirun --allow-run-as-root -n 1 \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
fi
if [ $# == 4 ]
then
    mpirun --allow-run-as-root -n 1 \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
fi
cd ..

# --- Parameter-server process ---
export MS_ROLE=MS_PSERVER
rm -rf ./server
mkdir ./server
cp ../*.py ./server
cp *.sh ./server
cp -r ../src ./server
cd ./server || exit
if [ $# == 3 ]
then
    mpirun --allow-run-as-root -n 1 \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server.log &
fi
if [ $# == 4 ]
then
    mpirun --allow-run-as-root -n 1 \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server.log &
fi
cd ..

# --- Worker processes (mpirun spawns RANK_SIZE ranks) ---
export MS_ROLE=MS_WORKER
rm -rf ./worker
mkdir ./worker
cp ../*.py ./worker
cp *.sh ./worker
cp -r ../src ./worker
cd ./worker || exit
if [ $# == 3 ]
then
    mpirun --allow-run-as-root -n $RANK_SIZE \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log &
fi
if [ $# == 4 ]
then
    mpirun --allow-run-as-root -n $RANK_SIZE \
    python train.py --net=$1 --dataset=$2 --run_distribute=True \
    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log &
fi
cd ..
| @@ -41,6 +41,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.') | |||||
| parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') | parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') | ||||
| parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') | parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') | ||||
| parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') | parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') | ||||
| parser.add_argument('--parameter_server', type=bool, default=False, help='Run parameter server train') | |||||
| args_opt = parser.parse_args() | args_opt = parser.parse_args() | ||||
| random.seed(1) | random.seed(1) | ||||
| @@ -92,6 +93,8 @@ if __name__ == '__main__': | |||||
| # define net | # define net | ||||
| net = resnet(class_num=config.class_num) | net = resnet(class_num=config.class_num) | ||||
| if args_opt.parameter_server: | |||||
| net.set_param_ps() | |||||
| # init weight | # init weight | ||||
| if args_opt.pre_trained: | if args_opt.pre_trained: | ||||
| @@ -181,4 +184,4 @@ if __name__ == '__main__': | |||||
| cb += [ckpt_cb] | cb += [ckpt_cb] | ||||
| # train model | # train model | ||||
| model.train(config.epoch_size, dataset, callbacks=cb) | |||||
| model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=(not args_opt.parameter_server)) | |||||