Merge pull request !5768 from linqingke/fasterrcnn (tags/v1.0.0)
@@ -154,7 +154,7 @@ sh run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 ### Result
-Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss.log.
+Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss_rankid.log.
 ```
@@ -39,7 +39,7 @@ class LossCallBack(Callback):
         per_print_times (int): Print loss every times. Default: 1.
     """
-    def __init__(self, per_print_times=1):
+    def __init__(self, per_print_times=1, rank_id=0):
         super(LossCallBack, self).__init__()
         if not isinstance(per_print_times, int) or per_print_times < 0:
             raise ValueError("print_step must be int and >= 0.")
@@ -51,6 +51,7 @@ class LossCallBack(Callback):
         self.rpn_reg_loss_sum = 0
         self.rcnn_cls_loss_sum = 0
         self.rcnn_reg_loss_sum = 0
+        self.rank_id = rank_id
         global time_stamp_init, time_stamp_first
         if not time_stamp_init:
@@ -91,7 +92,7 @@ class LossCallBack(Callback):
             total_loss = rpn_loss + rcnn_loss
-            loss_file = open("./loss.log", "a+")
+            loss_file = open("./loss_{}.log".format(self.rank_id), "a+")
             loss_file.write("%lu epoch: %s step: %s ,rpn_loss: %.5f, rcnn_loss: %.5f, rpn_cls_loss: %.5f, "
                             "rpn_reg_loss: %.5f, rcnn_cls_loss: %.5f, rcnn_reg_loss: %.5f, total_loss: %.5f" %
                             (time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cur_step_in_epoch,
@@ -131,7 +131,7 @@ if __name__ == '__main__':
     net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale)
     time_cb = TimeMonitor(data_size=dataset_size)
-    loss_cb = LossCallBack()
+    loss_cb = LossCallBack(rank_id=rank)
     cb = [time_cb, loss_cb]
     if config.save_checkpoint:
         ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
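
Taken together, these hunks apply one pattern: the callback takes a `rank_id`, and each rank appends to its own `loss_<rank_id>.log` instead of all ranks interleaving writes in a shared `loss.log`. A minimal, framework-free sketch of that pattern follows; the `RankAwareLossLogger` name and the plain-Python stand-in for MindSpore's `Callback` are illustrative assumptions, not the repo's actual class.

```python
import time

class RankAwareLossLogger:
    """Framework-free sketch of the per-rank loss-logging pattern above.

    Each training process appends to its own loss_<rank_id>.log, so
    concurrent ranks no longer interleave writes in one shared loss.log.
    """

    def __init__(self, per_print_times=1, rank_id=0):
        if not isinstance(per_print_times, int) or per_print_times < 0:
            raise ValueError("print_step must be int and >= 0.")
        self._per_print_times = per_print_times
        self.rank_id = rank_id
        self._time_first_ms = int(time.time() * 1000)

    def step_end(self, epoch, step, total_loss):
        elapsed_ms = int(time.time() * 1000) - self._time_first_ms
        # One file per rank: loss_0.log, loss_1.log, ...
        with open("./loss_{}.log".format(self.rank_id), "a+") as f:
            f.write("%d epoch: %s step: %s, total_loss: %.5f\n" %
                    (elapsed_ms, epoch, step, total_loss))

# In the real train.py the rank comes from the framework's communication
# API; here it is hard-coded just to exercise the sketch.
logger = RankAwareLossLogger(rank_id=0)
logger.step_end(epoch=1, step=100, total_loss=0.53421)
```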
@@ -44,7 +44,7 @@ export CUDA_VISIBLE_DEVICES="$2"
 if [ $1 -gt 1 ]
 then
-    mpirun -n $1 --allow-run-as-root \
+    mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
         python3 ${BASEPATH}/../train.py > train.log 2>&1 &
 else
     python3 ${BASEPATH}/../train.py > train.log 2>&1 &
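
The same pair of Open MPI options recurs in every launch script below: `--output-filename log_output` redirects each rank's output into per-process files under a `log_output` directory (the exact layout depends on the Open MPI version), and `--merge-stderr-to-stdout` folds each rank's stderr into its stdout, which pairs naturally with the per-rank loss files. Purely as an illustration of the flag combination, a Python launcher might look like this; the script name and training arguments are placeholders, only the mpirun flags mirror the diffs:

```python
import subprocess

# Hypothetical launcher showing the flag combination the scripts now share.
cmd = [
    "mpirun", "--allow-run-as-root", "-n", "8",
    "--output-filename", "log_output",   # per-rank stdout/stderr under ./log_output/
    "--merge-stderr-to-stdout",          # fold each rank's stderr into its stdout
    "python", "train.py", "--is_distributed", "--platform", "GPU",
]
subprocess.run(cmd, check=True)
```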
@@ -14,4 +14,5 @@
 # limitations under the License.
 # ============================================================================
 DATA_DIR=$1
-mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \
+    python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
@@ -329,7 +329,7 @@ sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 ### [Training Result](#content)
-Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss.log.
+Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss_rankid.log.
 ```
@@ -40,7 +40,7 @@ class LossCallBack(Callback):
         per_print_times (int): Print loss every times. Default: 1.
     """
-    def __init__(self, per_print_times=1):
+    def __init__(self, per_print_times=1, rank_id=0):
         super(LossCallBack, self).__init__()
         if not isinstance(per_print_times, int) or per_print_times < 0:
             raise ValueError("print_step must be int and >= 0.")
@@ -53,6 +53,7 @@ class LossCallBack(Callback):
         self.rcnn_cls_loss_sum = 0
         self.rcnn_reg_loss_sum = 0
         self.rcnn_mask_loss_sum = 0
+        self.rank_id = rank_id
         global time_stamp_init, time_stamp_first
         if not time_stamp_init:
@@ -96,7 +97,7 @@ class LossCallBack(Callback):
             total_loss = rpn_loss + rcnn_loss
-            loss_file = open("./loss.log", "a+")
+            loss_file = open("./loss_{}.log".format(self.rank_id), "a+")
             loss_file.write("%lu epoch: %s step: %s ,rpn_loss: %.5f, rcnn_loss: %.5f, rpn_cls_loss: %.5f, "
                             "rpn_reg_loss: %.5f, rcnn_cls_loss: %.5f, rcnn_reg_loss: %.5f, rcnn_mask_loss: %.5f, "
                             "total_loss: %.5f" %
@@ -126,7 +126,7 @@ if __name__ == '__main__':
     net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale)
     time_cb = TimeMonitor(data_size=dataset_size)
-    loss_cb = LossCallBack()
+    loss_cb = LossCallBack(rank_id=rank)
     cb = [time_cb, loss_cb]
     if config.save_checkpoint:
         ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
@@ -72,7 +72,7 @@ run_gpu()
     cd ../train || exit
     export CUDA_VISIBLE_DEVICES="$3"
-    mpirun -n $2 --allow-run-as-root \
+    mpirun -n $2 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
         python ${BASEPATH}/../train.py \
         --platform=$1 \
         --dataset_path=$4 \
@@ -167,14 +167,14 @@ run_gpu(){
     env > env.log
     if [ $# == 3 ]
     then
-        mpirun --allow-run-as-root -n ${RANK_SIZE} \
+        mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \
            python train.py --device_target=$1 --dataset_path=$PATH1 &> train.log &
     fi
     if [ $# == 4 ]
     then
-        mpirun --allow-run-as-root -n ${RANK_SIZE} \
+        mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \
            python train.py --device_target=$1 --dataset_path=$PATH1 --pre_trained=$PATH2 &> train.log &
     fi
     cd ..
@@ -38,7 +38,7 @@ run_gpu()
     cd ../train || exit
     export CUDA_VISIBLE_DEVICES="$3"
-    mpirun -n $2 --allow-run-as-root \
+    mpirun -n $2 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
         python ${BASEPATH}/../train.py \
         --dataset_path=$4 \
         --device_target=$1 \
@@ -14,4 +14,5 @@
 # limitations under the License.
 # ============================================================================
 DATA_DIR=$1
-mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \
+    python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
@@ -80,14 +80,14 @@ cd ./train_parallel || exit
 if [ $# == 3 ]
 then
-    mpirun --allow-run-as-root -n $RANK_SIZE \
+    mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
        python train.py --net=$1 --dataset=$2 --run_distribute=True \
        --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
 fi
 if [ $# == 4 ]
 then
-    mpirun --allow-run-as-root -n $RANK_SIZE \
+    mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
        python train.py --net=$1 --dataset=$2 --run_distribute=True \
        --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
 fi
@@ -86,14 +86,14 @@ cp -r ../src ./sched
 cd ./sched || exit
 if [ $# == 3 ]
 then
-    mpirun --allow-run-as-root -n 1 \
+    mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
        python train.py --net=$1 --dataset=$2 --run_distribute=True \
        --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
 fi
 if [ $# == 4 ]
 then
-    mpirun --allow-run-as-root -n 1 \
+    mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
        python train.py --net=$1 --dataset=$2 --run_distribute=True \
        --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
 fi
@@ -111,14 +111,14 @@ do
     cd ./server_$i || exit
     if [ $# == 3 ]
     then
-        mpirun --allow-run-as-root -n 1 \
+        mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
            python train.py --net=$1 --dataset=$2 --run_distribute=True \
            --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log &
     fi
     if [ $# == 4 ]
     then
-        mpirun --allow-run-as-root -n 1 \
+        mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
            python train.py --net=$1 --dataset=$2 --run_distribute=True \
            --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log &
     fi
@@ -134,14 +134,14 @@ cp -r ../src ./worker
 cd ./worker || exit
 if [ $# == 3 ]
 then
-    mpirun --allow-run-as-root -n $RANK_SIZE \
+    mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
        python train.py --net=$1 --dataset=$2 --run_distribute=True \
        --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log &
 fi
 if [ $# == 4 ]
 then
-    mpirun --allow-run-as-root -n $RANK_SIZE \
+    mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
        python train.py --net=$1 --dataset=$2 --run_distribute=True \
        --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log &
 fi
@@ -41,6 +41,7 @@ cp *.sh ./train_parallel
 cp -r ../src ./train_parallel
 cd ./train_parallel || exit
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py --run_distribute=True \
     --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
@@ -22,7 +22,7 @@ then
     PATH_CHECKPOINT=$2
 fi
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py \
     --is_distribute=1 \
     --platform="GPU" \
@@ -54,14 +54,14 @@ Dataset used: [imagenet](http://www.image-net.org/)
 +-- ShuffleNetV2
   +-- Readme.md                          # descriptions about ShuffleNetV2
   +-- scripts
-¦   +--run_distribute_train_for_gpu.sh   # shell script for distributed training
-¦   +--run_eval_for_gpu.sh               # shell script for evaluation
-¦   +--run_standalone_train_for_gpu.sh   # shell script for standalone training
+    +--run_distribute_train_for_gpu.sh   # shell script for distributed training
+    +--run_eval_for_gpu.sh               # shell script for evaluation
+    +--run_standalone_train_for_gpu.sh   # shell script for standalone training
 +-- src
-¦   +--config.py                         # parameter configuration
-¦   +--dataset.py                        # creating dataset
-¦   +--loss.py                           # loss function for network
-¦   +--lr_generator.py                   # learning rate config
+    +--config.py                         # parameter configuration
+    +--dataset.py                        # creating dataset
+    +--loss.py                           # loss function for network
+    +--lr_generator.py                   # learning rate config
 +-- train.py                             # training script
 +-- eval.py                              # evaluation script
 +-- blocks.py                            # ShuffleNetV2 blocks
@@ -83,7 +83,7 @@ You can start training using python or shell scripts. The usage of shell scripts
 ```
 # training example
 python:
-    GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 &
+    GPU: mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 &
 shell:
     GPU: cd scripts & sh run_distribute_train_for_gpu.sh 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
@@ -50,7 +50,7 @@ export CUDA_VISIBLE_DEVICES="$2"
 if [ $# == 3 ]
 then
-    mpirun -n $1 --allow-run-as-root \
+    mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
        python ${BASEPATH}/../train.py --platform='GPU' --is_distributed=True --dataset_path=$3 > train.log 2>&1 &
 fi
@@ -22,7 +22,8 @@ echo "==========================================================================
 DATA_PATH=$1
-mpirun -n 8 python train.py \
+mpirun -n 8 --output-filename log_output --merge-stderr-to-stdout \
+    python train.py \
     --device_target="GPU" \
     --dataset="imagenet2012" \
     --is_distributed=1 \
@@ -44,7 +44,7 @@ cp ../*.py ./distribute_train
 cp -r ../src ./distribute_train
 cd ./distribute_train || exit
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py \
     --dataset_path=$DATASET_PATH \
     --platform=GPU \
@@ -53,7 +53,8 @@ cp ../*.py ./train_parallel
 cp -r ../src ./train_parallel
 cd ./train_parallel || exit
 env > env.log
-mpirun --allow-run-as-root -n ${DEVICE_NUM} python train.py \
+mpirun --allow-run-as-root -n ${DEVICE_NUM} --output-filename log_output --merge-stderr-to-stdout \
+    python train.py \
     --data_dir=$DATASET_PATH \
     --pretrained_backbone=$PRETRAINED_BACKBONE \
     --device_target=GPU \
@@ -26,7 +26,7 @@ EPOCH_SIZE=$2
 DATA_DIR=$3
 SCHEMA_DIR=$4
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python run_pretrain.py \
     --device_target="GPU" \
     --distribute="true" \
@@ -146,7 +146,8 @@ if [ "$task" == "train" ]
 then
     if [ $RANK_SIZE -gt 1 ]
     then
-        mpirun -n $RANK_SIZE python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
+        mpirun -n $RANK_SIZE --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
+            python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
     fi
     python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
 elif [ "$task" == "infer" ]
@@ -33,12 +33,13 @@ class LossCallBack(Callback):
     time_stamp_init = False
     time_stamp_first = 0
-    def __init__(self, config: TransformerConfig, per_print_times: int = 1):
+    def __init__(self, config: TransformerConfig, per_print_times: int = 1, rank_id: int = 0):
         super(LossCallBack, self).__init__()
         if not isinstance(per_print_times, int) or per_print_times < 0:
             raise ValueError("print_step must be int and >= 0.")
         self.config = config
         self._per_print_times = per_print_times
+        self.rank_id = rank_id
         if not self.time_stamp_init:
             self.time_stamp_first = self._get_ms_timestamp()
@@ -46,7 +47,7 @@ class LossCallBack(Callback):
     def step_end(self, run_context):
         cb_params = run_context.original_args()
-        file_name = "./loss.log"
+        file_name = "./loss_{}.log".format(self.rank_id)
         with open(file_name, "a+") as f:
             time_stamp_current = self._get_ms_timestamp()
             f.write("time: {}, epoch: {}, step: {}, outputs are {},{},{}.\n".format(
@@ -199,24 +199,28 @@ def _build_training_pipeline(config: TransformerConfig,
                                   scale_update_cell=scale_manager.get_update_cell())
     net_with_grads.set_train(True)
     model = Model(net_with_grads)
-    loss_monitor = LossCallBack(config)
     ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                    keep_checkpoint_max=config.keep_ckpt_max)
     rank_size = os.getenv('RANK_SIZE')
-    callbacks = [loss_monitor]
-    if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
-        ckpt_callback = ModelCheckpoint(
-            prefix=config.ckpt_prefix,
-            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
-            config=ckpt_config)
-        callbacks.append(ckpt_callback)
+    callbacks = []
+    if rank_size is not None and int(rank_size) > 1:
+        loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank())
+        callbacks.append(loss_monitor)
+        if MultiAscend.get_rank() % 8 == 0:
+            ckpt_callback = ModelCheckpoint(
+                prefix=config.ckpt_prefix,
+                directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(MultiAscend.get_rank())),
+                config=ckpt_config)
+            callbacks.append(ckpt_callback)
     if rank_size is None or int(rank_size) == 1:
         ckpt_callback = ModelCheckpoint(
             prefix=config.ckpt_prefix,
             directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
             config=ckpt_config)
+        loss_monitor = LossCallBack(config, rank_id=os.getenv('DEVICE_ID'))
+        callbacks.append(loss_monitor)
         callbacks.append(ckpt_callback)
     print(f" | ALL SET, PREPARE TO TRAIN.")
@@ -29,7 +29,7 @@ TEACHER_CKPT_PATH=$5
 PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python ${PROJECT_DIR}/../run_general_distill.py \
     --distribute="true" \
     --device_target="GPU" \
@@ -53,11 +53,12 @@ class LossCallBack(Callback):
     Args:
         per_print_times (int): Print loss every times. Default: 1.
     """
-    def __init__(self, per_print_times=1):
+    def __init__(self, per_print_times=1, rank_id=0):
         super(LossCallBack, self).__init__()
         if not isinstance(per_print_times, int) or per_print_times < 0:
             raise ValueError("print_step must be int and >= 0.")
         self._per_print_times = per_print_times
+        self.rank_id = rank_id
         global time_stamp_init, time_stamp_first
         if not time_stamp_init:
             time_stamp_first = get_ms_timestamp()
@@ -71,7 +72,7 @@ class LossCallBack(Callback):
         print("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
                                                                      cb_params.cur_epoch_num, cb_params.cur_step_num,
                                                                      str(cb_params.net_outputs)))
-        with open("./loss.log", "a+") as f:
+        with open("./loss_{}.log".format(self.rank_id), "a+") as f:
             f.write("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
                                                                            cb_params.cur_epoch_num,
                                                                            cb_params.cur_step_num,
@@ -145,7 +146,7 @@ def run_transformer_train():
                                          min_lr=cfg.lr_schedule.min_lr), mstype.float32)
     optimizer = Adam(netwithloss.trainable_params(), lr)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)]
     if args.enable_save_ckpt == "true":
         if device_num == 1 or (device_num > 1 and rank_id == 0):
             ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
@@ -28,7 +28,7 @@ cp *.py ./log
 cp -r src ./log
 cd ./log || exit
 env > env.log
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python -u train.py \
     --dataset_path=$DATA_URL \
     --ckpt_path="checkpoint" \
@@ -21,7 +21,7 @@ RANK_SIZE=$1
 EPOCH_SIZE=$2
 DATASET=$3
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python -s ${self_path}/../train_and_eval_distribute.py \
     --device_target="GPU" \
     --data_path=$DATASET \
@@ -23,7 +23,7 @@ DATASET=$3
 VOCAB_SIZE=$4
 EMB_DIM=$5
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python -s ${self_path}/../train_and_eval_auto_parallel.py \
     --device_target="GPU" \
     --data_path=$DATASET \