Merge pull request !4577 from chenhaozhe/optimize-print-of-bert

@@ -14,17 +14,30 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 ### Pre-Training
 - Set options in `config.py`, including lossscale, optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file.
-- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model.
+- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model on `Ascend`.
 ``` bash
-sh scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
+bash scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
 ```
+- Run `run_standalone_pretrain_for_gpu.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model on `GPU`.
+``` bash
+bash scripts/run_standalone_pretrain_for_gpu.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
+```
-- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
+- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model on `Ascend`.
 ``` bash
-sh scripts/run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE
+bash scripts/run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE
 ```
+- Run `run_distribute_pretrain_for_gpu.sh` for distributed pre-training of BERT-base and BERT-NEZHA model on `GPU`.
+```bash
+bash scripts/run_distribute_pretrain_for_gpu.sh RANK_SIZE EPOCH_SIZE DATA_DIR SCHEMA_DIR
+```
 ### Fine-Tuning and Evaluation
 - Including three kinds of task: Classification, NER(Named Entity Recognition) and SQuAD(Stanford Question Answering Dataset)
@@ -141,7 +141,7 @@ def run_pretrain():
     else:
         raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]".
                          format(cfg.optimizer))
-    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()]
+    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
     if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
         config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                      keep_checkpoint_max=args_opt.save_checkpoint_num)
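For orientation, here is a minimal sketch of how a callback list like this is typically handed to MindSpore's `Model.train`. The import path for `LossCallBack`, the `net_with_grads` object, and the use of `args_opt.epoch_size` are illustrative assumptions and are not shown in this hunk.

```python
from mindspore.train.model import Model
from mindspore.train.callback import TimeMonitor

from src.utils import LossCallBack  # assumed import path for the callback changed in this PR

# Assumed context from run_pretrain(): `net_with_grads` is the assembled training
# network, `ds` the pre-training dataset, and `args_opt` the parsed CLI arguments.
model = Model(net_with_grads)
callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]

# Each callback receives step_end events during training; LossCallBack now knows
# the dataset size, so it can report progress within the current epoch.
model.train(args_opt.epoch_size, ds,
            callbacks=callback,
            dataset_sink_mode=(args_opt.enable_data_sink == "true"))
```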
@@ -125,7 +125,7 @@ def distribute_pretrain():
         print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/log.txt")
         os.chdir(cur_dir + "/LOG" + str(device_id))
-        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
+        cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
         opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
         if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
             raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
@@ -18,6 +18,7 @@ Functional Cells used in Bert finetune and evaluation.
 """
 import os
+import math
 import numpy as np
 import mindspore.nn as nn
 from mindspore import log as logger
@@ -90,15 +91,14 @@ class LossCallBack(Callback):
     Args:
         per_print_times (int): Print loss every times. Default: 1.
     """
-    def __init__(self, per_print_times=1):
+    def __init__(self, dataset_size=1):
         super(LossCallBack, self).__init__()
-        if not isinstance(per_print_times, int) or per_print_times < 0:
-            raise ValueError("print_step must be int and >= 0")
-        self._per_print_times = per_print_times
+        self._dataset_size = dataset_size

     def step_end(self, run_context):
         cb_params = run_context.original_args()
-        print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
-                                                           str(cb_params.net_outputs)))
+        percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
+        print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
+              .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))

 def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix):
     """
@@ -7,7 +7,7 @@ The number of D chips can be automatically allocated based on the device_num set
 ## how to use
 For example, if we want to run the distributed training of Bert model on D chip, we can in `/bert/` dir:
 ```
-python model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+python model_zoo/utils/ascend_distributed_launcher/run_distributed.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
 ```
 output:
@@ -124,7 +124,7 @@ def distribute_pretrain():
         print("data_dir:", data_dir)
         print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")
-        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
+        cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
         opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
         if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
             raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"