Merge pull request !6906 from dessyang/master
@@ -23,7 +23,7 @@ nasnet_a_mobile_config_gpu = edict({
     'rank': 0,
     'group_size': 1,
     'work_nums': 8,
-    'epoch_size': 500,
+    'epoch_size': 600,
     'keep_checkpoint_max': 100,
     'ckpt_path': './checkpoint/',
     'is_save_on_master': 0,
@@ -39,7 +39,7 @@ nasnet_a_mobile_config_gpu = edict({
     ### Learning Rate Config
     # 'lr_decay_method': 'exponential',
-    'lr_init': 0.04,
+    'lr_init': 0.04*8,
     'lr_decay_rate': 0.97,
     'num_epoch_per_decay': 2.4,
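The `lr_init` bump above follows the linear learning-rate scaling rule: when the global batch grows with the number of data-parallel workers, the base learning rate is scaled by the same factor (the `*8` presumably matches an 8-GPU run). A minimal sketch of that rule, using the hypothetical names `base_lr` and `device_num` (not part of this change):

```python
# Linear LR scaling rule: scale the single-device learning rate by the
# number of data-parallel workers (assumption: 8 GPUs, hence 0.04*8).
base_lr = 0.04
device_num = 8
scaled_lr = base_lr * device_num
print(scaled_lr)  # 0.32, i.e. 'lr_init': 0.04*8
```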
@@ -29,7 +29,7 @@ from mindspore.common import dtype as mstype
 from src.config import nasnet_a_mobile_config_gpu as cfg
 from src.dataset import create_dataset
-from src.nasnet_a_mobile import NASNetAMobile, CrossEntropy
+from src.nasnet_a_mobile import NASNetAMobileWithLoss, NASNetAMobileTrainOneStepWithClipGradient
 from src.lr_generator import get_lr
@@ -69,13 +69,10 @@ if __name__ == '__main__':
     batches_per_epoch = dataset.get_dataset_size()
     # network
-    net = NASNetAMobile(cfg.num_classes)
+    net_with_loss = NASNetAMobileWithLoss(cfg)
     if args_opt.resume:
         ckpt = load_checkpoint(args_opt.resume)
-        load_param_into_net(net, ckpt)
-    #loss
-    loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes, factor=cfg.aux_factor)
+        load_param_into_net(net_with_loss, ckpt)
     # learning rate schedule
     lr = get_lr(lr_init=cfg.lr_init, lr_decay_rate=cfg.lr_decay_rate,
@@ -88,26 +85,28 @@ if __name__ == '__main__':
         resume = split_result[-2].split("-")
         resume_epoch = int(resume[-1])
         step_num_in_epoch = int(split_result[-1])
-        assert step_num_in_epoch == ds_train.get_dataset_size()\
+        assert step_num_in_epoch == dataset.get_dataset_size()\
             , "This script only supports resuming at the end of epoch"
-        lr = lr[(ds_train.get_dataset_size() * (resume_epoch - 1) + step_num_in_epoch):]
+        lr = lr[(dataset.get_dataset_size() * (resume_epoch - 1) + step_num_in_epoch):]
     lr = Tensor(lr, mstype.float32)
     # optimizer
     decayed_params = []
     no_decayed_params = []
-    for param in net.trainable_params():
+    for param in net_with_loss.trainable_params():
         if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
             decayed_params.append(param)
         else:
             no_decayed_params.append(param)
     group_params = [{'params': decayed_params, 'weight_decay': cfg.weight_decay},
                     {'params': no_decayed_params},
-                    {'order_params': net.trainable_params()}]
+                    {'order_params': net_with_loss.trainable_params()}]
     optimizer = RMSProp(group_params, lr, decay=cfg.rmsprop_decay, weight_decay=cfg.weight_decay,
                         momentum=cfg.momentum, epsilon=cfg.opt_eps, loss_scale=cfg.loss_scale)
-    model = Model(net, loss_fn=loss, optimizer=optimizer)
+    net_with_grads = NASNetAMobileTrainOneStepWithClipGradient(net_with_loss, optimizer)
+    net_with_grads.set_train()
+    model = Model(net_with_grads)
     print("============== Starting Training ==============")
     loss_cb = LossMonitor(per_print_times=batches_per_epoch)
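For readers unfamiliar with the resume path in the hunk above: the checkpoint filename encodes the epoch and the step within that epoch, and the already-consumed prefix of the learning-rate schedule is sliced off so training continues where it stopped. A rough standalone sketch of the same parsing and slicing, assuming filenames of the form `<prefix>-<epoch>_<step>.ckpt` and a `steps_per_epoch` value standing in for `dataset.get_dataset_size()` (both hypothetical, not project code):

```python
import numpy as np

def resume_lr_schedule(lr, ckpt_name, steps_per_epoch):
    """Skip the part of the LR schedule that was already trained before the checkpoint."""
    stem = ckpt_name.rsplit(".", 1)[0]                    # e.g. "nasnet-a-mobile-3_1251"
    split_result = stem.split("_")
    resume_epoch = int(split_result[-2].split("-")[-1])   # epoch number, here 3
    step_num_in_epoch = int(split_result[-1])             # step within that epoch, here 1251
    assert step_num_in_epoch == steps_per_epoch, \
        "This script only supports resuming at the end of epoch"
    return lr[steps_per_epoch * (resume_epoch - 1) + step_num_in_epoch:]

full_schedule = np.linspace(0.32, 0.0, 600 * 1251)        # hypothetical 600-epoch schedule
remaining = resume_lr_schedule(full_schedule, "nasnet-a-mobile-3_1251.ckpt", 1251)
```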
@@ -14,7 +14,7 @@
 # ============================================================================
 """Transformer evaluation script."""
-import os
+import argparse
 import numpy as np
 import mindspore.nn as nn
@@ -97,9 +97,14 @@ def run_transformer_eval():
     """
     Transformer evaluation.
     """
-    device_id = int(os.getenv('DEVICE_ID'))
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False,
-                        device_id=device_id)
+    parser = argparse.ArgumentParser(description='transformer')
+    parser.add_argument("--device_target", type=str, default="Ascend",
+                        help="device where the code will be implemented, default is Ascend")
+    parser.add_argument('--device_id', type=int, default=None, help='device id of GPU or Ascend. (Default: None)')
+    args = parser.parse_args()
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, reserve_class_name_in_scope=False,
+                        device_id=args.device_id)
     dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=cfg.data_file)
     tfm_model = TransformerModel(config=transformer_net_cfg, is_training=False, use_one_hot_embeddings=False)
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "sh run_distribute_pretrain_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_PATH"
+echo "for example: sh run_distribute_pretrain_gpu.sh 8 55 /path/ende-l128-mindrecord00"
+echo "It is better to use absolute path."
+echo "=============================================================================================================="
+rm -rf run_distribute_train
+mkdir run_distribute_train
+cp -rf ./src/ train.py ./run_distribute_train
+cd run_distribute_train || exit
+export RANK_SIZE=$1
+EPOCH_SIZE=$2
+DATA_PATH=$3
+echo $RANK_SIZE
+mpirun -n $RANK_SIZE \
+    python train.py \
+    --distribute="true" \
+    --device_target="GPU" \
+    --epoch_size=$EPOCH_SIZE \
+    --device_num=$RANK_SIZE \
+    --enable_save_ckpt="true" \
+    --enable_lossscale="true" \
+    --do_shuffle="true" \
+    --checkpoint_path="" \
+    --save_checkpoint_steps=2500 \
+    --save_checkpoint_num=30 \
+    --data_path=$DATA_PATH \
+    --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "sh run_eval.sh DEVICE_TARGET DEVICE_ID"
+echo "for example: sh run_eval.sh Ascend 0"
+echo "Note: set the checkpoint and dataset path in src/eval_config.py"
+echo "=============================================================================================================="
+export DEVICE_TARGET=$1
+DEVICE_ID=$2
+python eval.py \
+    --device_target=$DEVICE_TARGET \
+    --device_id=$DEVICE_ID \
@@ -16,8 +16,8 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE DATA_PATH"
-echo "for example: sh run_standalone_train.sh 0 52 /path/ende-l128-mindrecord00"
+echo "sh run_standalone_train.sh DEVICE_TARGET DEVICE_ID EPOCH_SIZE DATA_PATH"
+echo "for example: sh run_standalone_train.sh Ascend 0 52 /path/ende-l128-mindrecord00"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
@@ -26,13 +26,15 @@ mkdir run_standalone_train
 cp -rf ./src/ train.py ./run_standalone_train
 cd run_standalone_train || exit
-export DEVICE_ID=$1
-EPOCH_SIZE=$2
-DATA_PATH=$3
+export DEVICE_TARGET=$1
+DEVICE_ID=$2
+EPOCH_SIZE=$3
+DATA_PATH=$4
 python train.py \
     --distribute="false" \
     --epoch_size=$EPOCH_SIZE \
+    --device_target=$DEVICE_TARGET \
     --device_id=$DEVICE_ID \
     --enable_save_ckpt="true" \
     --enable_lossscale="true" \
@@ -42,4 +44,4 @@ python train.py \
     --save_checkpoint_num=30 \
     --data_path=$DATA_PATH \
     --bucket_boundaries=[16,32,48,64,128] > log.txt 2>&1 &
-cd ..
+cd ..
@@ -23,6 +23,7 @@ cfg = edict({
     'scale_factor': 2,
     'scale_window': 2000,
     'optimizer': 'Adam',
+    'optimizer_adam_beta2': 0.997,
     'lr_schedule': edict({
         'learning_rate': 2.0,
         'warmup_steps': 8000,
@@ -51,6 +52,23 @@ if cfg.transformer_network == 'large':
         input_mask_from_dataset=True,
         dtype=mstype.float32,
         compute_type=mstype.float16)
+    transformer_net_cfg_gpu = TransformerConfig(
+        batch_size=32,
+        seq_length=128,
+        vocab_size=36560,
+        hidden_size=1024,
+        num_hidden_layers=6,
+        num_attention_heads=16,
+        intermediate_size=4096,
+        hidden_act="relu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=128,
+        initializer_range=0.02,
+        label_smoothing=0.1,
+        input_mask_from_dataset=True,
+        dtype=mstype.float32,
+        compute_type=mstype.float16)
 if cfg.transformer_network == 'base':
     transformer_net_cfg = TransformerConfig(
         batch_size=96,
@@ -166,7 +166,7 @@ class TransformerTrainOneStepCell(nn.Cell):
         self.reducer_flag = False
         self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode not in ParallelMode.MODE_LIST:
-            raise ValueError("Parallel mode does not support: ", parallel_mode)
+            raise ValueError("Parallel mode does not support: ", self.parallel_mode)
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
             self.grad_reducer = None
@@ -228,6 +228,12 @@ reciprocal = P.Reciprocal()
 def tensor_grad_scale(scale, grad):
     return grad * F.cast(reciprocal(scale), F.dtype(grad))
 
+_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
+grad_overflow = P.FloatStatus()
+
+@_grad_overflow.register("Tensor")
+def _tensor_grad_overflow(grad):
+    return grad_overflow(grad)
 
 class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
     """
@@ -255,7 +261,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
         self.parallel_mode = _get_parallel_mode()
         if self.parallel_mode not in ParallelMode.MODE_LIST:
-            raise ValueError("Parallel mode does not support: ", parallel_mode)
+            raise ValueError("Parallel mode does not support: ", self.parallel_mode)
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
             self.reducer_flag = True
             self.grad_reducer = None
@@ -266,9 +272,16 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
         self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
         self.clip_gradients = ClipGradients()
         self.cast = P.Cast()
-        self.alloc_status = P.NPUAllocFloatStatus()
-        self.get_status = P.NPUGetFloatStatus()
-        self.clear_before_grad = P.NPUClearFloatStatus()
+        if context.get_context("device_target") == "GPU":
+            self.gpu_target = True
+            self.float_status = P.FloatStatus()
+            self.addn = P.AddN()
+            self.reshape = P.Reshape()
+        else:
+            self.gpu_target = False
+            self.alloc_status = P.NPUAllocFloatStatus()
+            self.get_status = P.NPUGetFloatStatus()
+            self.clear_before_grad = P.NPUClearFloatStatus()
         self.reduce_sum = P.ReduceSum(keep_dims=False)
         self.depend_parameter_use = P.ControlDepend(depend_mode=1)
         self.base = Tensor(1, mstype.float32)
@@ -305,10 +318,12 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
                             target_mask,
                             label_ids,
                             label_weights)
-        # alloc status
-        init = self.alloc_status()
-        # clear overflow buffer
-        self.clear_before_grad(init)
+        init = False
+        if not self.gpu_target:
+            # alloc status
+            init = self.alloc_status()
+            # clear overflow buffer
+            self.clear_before_grad(init)
         if sens is None:
             scaling_sens = self.loss_scale
         else:
@@ -327,8 +342,16 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
-        self.get_status(init)
-        flag_sum = self.reduce_sum(init, (0,))
+        if not self.gpu_target:
+            self.get_status(init)
+            # sum overflow buffer elements, 0: not overflow, >0: overflow
+            flag_sum = self.reduce_sum(init, (0,))
+        else:
+            flag_sum = self.hyper_map(F.partial(_grad_overflow), grads)
+            flag_sum = self.addn(flag_sum)
+            # convert flag_sum to scalar
+            flag_sum = self.reshape(flag_sum, (()))
         if self.is_distributed:
             # sum overflow flag over devices
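On GPU there are no NPU float-status registers, so the branch above derives the overflow flag from the gradients themselves: `FloatStatus` yields a non-zero value for any gradient containing inf/NaN, `AddN` sums the per-gradient flags, and `Reshape` collapses the result to a scalar that the existing overflow handling can consume. A plain-numpy illustration of that logic (an analogy only, not MindSpore ops):

```python
import numpy as np

def float_status(grad):
    # Rough analogue of P.FloatStatus: non-zero if the tensor holds inf or NaN.
    return np.float32(np.any(~np.isfinite(grad)))

def gpu_overflow_flag(grads):
    # Analogue of hyper_map(_grad_overflow, grads) -> AddN -> Reshape to a scalar.
    return float(sum(float_status(g) for g in grads))

grads = [np.array([1.0, 2.0], np.float32), np.array([np.inf, 0.0], np.float32)]
overflow = gpu_overflow_flag(grads) > 0
print(overflow)  # True -> the update is skipped and the loss scale is reduced
```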
@@ -35,7 +35,7 @@ from mindspore.common import set_seed
 from src.transformer_for_train import TransformerTrainOneStepCell, TransformerNetworkWithLoss, \
     TransformerTrainOneStepWithLossScaleCell
-from src.config import cfg, transformer_net_cfg
+from src.config import cfg, transformer_net_cfg, transformer_net_cfg_gpu
 from src.dataset import create_transformer_dataset
 from src.lr_schedule import create_dynamic_lr
@@ -73,13 +73,17 @@ class LossCallBack(Callback):
         time_stamp_current = get_ms_timestamp()
         cb_params = run_context.original_args()
         print("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
-                                                                     cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                                     cb_params.cur_epoch_num,
+                                                                     cb_params.cur_step_num,
                                                                      str(cb_params.net_outputs)))
         with open("./loss_{}.log".format(self.rank_id), "a+") as f:
-            f.write("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
-                                                                           cb_params.cur_epoch_num,
-                                                                           cb_params.cur_step_num,
-                                                                           str(cb_params.net_outputs)))
+            f.write("time: {}, epoch: {}, step: {}, loss: {}, overflow: {}, loss_scale: {}".format(
+                time_stamp_current - time_stamp_first,
+                cb_params.cur_epoch_num,
+                cb_params.cur_step_num,
+                str(cb_params.net_outputs[0].asnumpy()),
+                str(cb_params.net_outputs[1].asnumpy()),
+                str(cb_params.net_outputs[2].asnumpy())))
             f.write('\n')
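The rewritten log line relies on the loss-scale training cell returning a 3-tuple, `(loss, overflow, loss_scale)`, hence the indexing into `cb_params.net_outputs`. If you later need to pull those fields back out of `loss_<rank>.log`, a small parser along these lines works against the format string above (the regex itself is an assumption, not part of this change):

```python
import re

LOG_LINE = re.compile(
    r"time: (?P<time>\d+), epoch: (?P<epoch>\d+), step: (?P<step>\d+), "
    r"loss: (?P<loss>[^,]+), overflow: (?P<overflow>\w+), loss_scale: (?P<loss_scale>[^,\s]+)")

sample = "time: 1873, epoch: 1, step: 100, loss: 7.125, overflow: False, loss_scale: 1024.0"
m = LOG_LINE.match(sample)
if m:
    # loss as float, overflow as bool, current loss scale as float
    print(float(m["loss"]), m["overflow"] == "True", float(m["loss_scale"]))
```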
@@ -91,6 +95,8 @@ def argparse_init():
     parser.add_argument("--distribute", type=str, default="false", choices=['true', 'false'],
                         help="Run distribute, default is false.")
     parser.add_argument("--epoch_size", type=int, default=52, help="Epoch size, default is 52.")
+    parser.add_argument("--device_target", type=str, default="Ascend",
+                        help="device where the code will be implemented, default is Ascend")
     parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
     parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
     parser.add_argument("--enable_lossscale", type=str, default="true", choices=['true', 'false'],
@@ -116,15 +122,21 @@ def run_transformer_train():
     """
     parser = argparse_init()
     args, _ = parser.parse_known_args()
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)
     context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
     if args.distribute == "true":
-        device_num = args.device_num
+        if args.device_target == "Ascend":
+            device_num = args.device_num
+            D.init('hccl')
+        else:
+            D.init('nccl')
+            device_num = D.get_group_size()
+            rank = get_rank()
+            args.device_id = rank
         context.reset_auto_parallel_context()
         context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                           device_num=device_num)
-        D.init()
         rank_id = args.device_id % device_num
         save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
     else:
@@ -135,27 +147,39 @@ def run_transformer_train():
                                          rank_id=rank_id, do_shuffle=args.do_shuffle,
                                          dataset_path=args.data_path,
                                          bucket_boundaries=args.bucket_boundaries)
-    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
+    if args.device_target == "Ascend":
+        netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
+    else:
+        netwithloss = TransformerNetworkWithLoss(transformer_net_cfg_gpu, True)
 
     if args.checkpoint_path:
         parameter_dict = load_checkpoint(args.checkpoint_path)
         load_param_into_net(netwithloss, parameter_dict)
 
+    hidden_size = transformer_net_cfg.hidden_size if args.device_target == "Ascend" \
+        else transformer_net_cfg_gpu.hidden_size
     lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                   training_steps=dataset.get_dataset_size()*args.epoch_size,
                                   learning_rate=cfg.lr_schedule.learning_rate,
                                   warmup_steps=cfg.lr_schedule.warmup_steps,
-                                  hidden_size=transformer_net_cfg.hidden_size,
+                                  hidden_size=hidden_size,
                                   start_decay_step=cfg.lr_schedule.start_decay_step,
                                   min_lr=cfg.lr_schedule.min_lr), mstype.float32)
-    optimizer = Adam(netwithloss.trainable_params(), lr)
+    if args.device_target == "GPU" and cfg.transformer_network == "large":
+        optimizer = Adam(netwithloss.trainable_params(), lr, beta2=cfg.optimizer_adam_beta2)
+    else:
+        optimizer = Adam(netwithloss.trainable_params(), lr)
 
     callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)]
     if args.enable_save_ckpt == "true":
         if device_num == 1 or (device_num > 1 and rank_id == 0):
-            ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
-                                           keep_checkpoint_max=args.save_checkpoint_num)
+            if args.device_target == "Ascend":
+                ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
+                                               keep_checkpoint_max=args.save_checkpoint_num)
+            else:
+                ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset.get_dataset_size(),
+                                               keep_checkpoint_max=args.save_checkpoint_num)
             ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=save_ckpt_path, config=ckpt_config)
             callbacks.append(ckpoint_cb)