| @@ -46,6 +46,7 @@ Parameters for both training and evaluating can be set in config.py. | |||
| "momentum": 0.9, # momentum optimizer | |||
| "weight_decay": 1e-4, # weight decay | |||
| "epoch_size": 120, # epoch sizes for training | |||
| "pretrain_epoch_size": 0, # epochs already trained by the pretrained checkpoint (0 when training from scratch) | |||
| "buffer_size": 1000, # number of queue size in data preprocessing | |||
| "image_height": 224, # image height | |||
| "image_width": 224, # image width | |||
| @@ -68,10 +69,10 @@ Parameters for both training and evaluating can be set in config.py. | |||
| ``` | |||
| # distributed training | |||
| sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] | |||
| sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional) | |||
| # standalone training | |||
| sh run_standalone_train.sh [DATASET_PATH] | |||
| sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional) | |||
| ``` | |||
| #### Launch | |||
| @@ -79,9 +80,15 @@ sh run_standalone_train.sh [DATASET_PATH] | |||
| ```bash | |||
| # distributed training example(8p) | |||
| sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc | |||
| # If you want to load a pretrained ckpt file: | |||
| sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./ckpt/pretrained.ckpt | |||
| # standalone training example(1p) | |||
| sh run_standalone_train.sh dataset/ilsvrc | |||
| # If you want to load a pretrained ckpt file: | |||
| sh run_standalone_train.sh dataset/ilsvrc ./ckpt/pretrained.ckpt | |||
| ``` | |||
| > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). | |||
| @@ -24,6 +24,7 @@ config = ed({ | |||
| "momentum": 0.9, | |||
| "weight_decay": 1e-4, | |||
| "epoch_size": 120, | |||
| "pretrain_epoch_size": 0, | |||
| "buffer_size": 1000, | |||
| "image_height": 224, | |||
| "image_width": 224, | |||
| @@ -21,7 +21,7 @@ def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr): | |||
| lr = float(init_lr) + lr_inc * current_step | |||
| return lr | |||
| def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch): | |||
| def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0): | |||
| """ | |||
| generate learning rate array with cosine | |||
| @@ -30,6 +30,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch): | |||
| steps_per_epoch(int): steps size of one epoch | |||
| warmup_epochs(int): number of warmup epochs | |||
| max_epoch(int): total epochs of training | |||
| global_step(int): the current start index of lr array | |||
| Returns: | |||
| np.array, learning rate array | |||
| """ | |||
| @@ -49,4 +50,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch): | |||
| decayed = linear_decay * cosine_decay + 0.00001 | |||
| lr = base_lr * decayed | |||
| lr_each_step.append(lr) | |||
| return np.array(lr_each_step).astype(np.float32) | |||
| lr_each_step = np.array(lr_each_step).astype(np.float32) | |||
| learning_rate = lr_each_step[global_step:] | |||
| return learning_rate | |||
| @@ -14,9 +14,9 @@ | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 2 ] | |||
| if [ $# != 2 ] && [ $# != 3 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]" | |||
| echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| @@ -31,6 +31,11 @@ PATH1=$(get_real_path $1) | |||
| PATH2=$(get_real_path $2) | |||
| echo $PATH1 | |||
| echo $PATH2 | |||
| if [ $# == 3 ] | |||
| then | |||
| PATH3=$(get_real_path $3) | |||
| echo $PATH3 | |||
| fi | |||
| if [ ! -f $PATH1 ] | |||
| then | |||
| @@ -44,6 +49,12 @@ then | |||
| exit 1 | |||
| fi | |||
| if [ $# == 3 ] && [ ! -f $PATH3 ] | |||
| then | |||
| echo "error: PRETRAINED_PATH=$PATH3 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=8 | |||
| export RANK_SIZE=8 | |||
| @@ -61,6 +72,15 @@ do | |||
| cd ./train_parallel$i || exit | |||
| echo "start training for rank $RANK_ID, device $DEVICE_ID" | |||
| env > env.log | |||
| python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log & | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log & | |||
| fi | |||
| if [ $# == 3 ] | |||
| then | |||
| python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log & | |||
| fi | |||
| cd .. | |||
| done | |||
| @@ -14,9 +14,9 @@ | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train.sh [DATASET_PATH]" | |||
| echo "Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| @@ -29,12 +29,23 @@ get_real_path(){ | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| echo $PATH1 | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| echo $PATH2 | |||
| fi | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| @@ -52,5 +63,13 @@ cp *.sh ./train | |||
| cd ./train || exit | |||
| echo "start training for device $DEVICE_ID" | |||
| env > env.log | |||
| python train.py --do_train=True --dataset_path=$PATH1 &> log & | |||
| if [ $# == 1 ] | |||
| then | |||
| python train.py --do_train=True --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --do_train=True --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & | |||
| fi | |||
| cd .. | |||
| @@ -44,6 +44,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.') | |||
| parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.') | |||
| parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.') | |||
| parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') | |||
| parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') | |||
| args_opt = parser.parse_args() | |||
| device_id = int(os.getenv('DEVICE_ID')) | |||
| @@ -64,11 +65,11 @@ if __name__ == '__main__': | |||
| if isinstance(cell, nn.Conv2d): | |||
| cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(), | |||
| cell.weight.default_input.shape(), | |||
| cell.weight.default_input.dtype()) | |||
| cell.weight.default_input.dtype()).to_tensor() | |||
| if isinstance(cell, nn.Dense): | |||
| cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(), | |||
| cell.weight.default_input.shape(), | |||
| cell.weight.default_input.dtype()) | |||
| cell.weight.default_input.dtype()).to_tensor() | |||
| if not config.label_smooth: | |||
| config.label_smooth_factor = 0.0 | |||
| loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) | |||
| @@ -77,9 +78,13 @@ if __name__ == '__main__': | |||
| repeat_num=epoch_size, batch_size=config.batch_size) | |||
| step_size = dataset.get_dataset_size() | |||
| loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) | |||
| if args_opt.pre_trained: | |||
| param_dict = load_checkpoint(args_opt.pre_trained) | |||
| load_param_into_net(net, param_dict) | |||
| # learning rate strategy with cosine | |||
| lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size)) | |||
| lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, 120, | |||
| config.pretrain_epoch_size*step_size)) | |||
| opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, | |||
| config.weight_decay, config.loss_scale) | |||
| model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False, | |||