Browse Source

!4577 Optimize the print of bert

Merge pull request !4577 from chenhaozhe/optimize-print-of-bert
tags/v0.7.0-beta
mindspore-ci-bot Gitee 5 years ago
parent
commit
7cec00b491
6 changed files with 27 additions and 14 deletions
  1. +17
    -4
      model_zoo/official/nlp/bert/README.md
  2. +1
    -1
      model_zoo/official/nlp/bert/run_pretrain.py
  3. +1
    -1
      model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
  4. +6
    -6
      model_zoo/official/nlp/bert/src/utils.py
  5. +1
    -1
      model_zoo/utils/ascend_distributed_launcher/README.md
  6. +1
    -1
      model_zoo/utils/ascend_distributed_launcher/run_distributed.py

+ 17
- 4
model_zoo/official/nlp/bert/README.md View File

@@ -14,17 +14,30 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
### Pre-Training ### Pre-Training
- Set options in `config.py`, including lossscale, optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file. - Set options in `config.py`, including lossscale, optimizer and network. Click [here](https://www.mindspore.cn/tutorial/zh-CN/master/use/data_preparation/loading_the_datasets.html#tfrecord) for more information about dataset and the json schema file.


- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model.
- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model on `Ascend`.


``` bash ``` bash
sh scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
bash scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
``` ```
- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.

- Run `run_standalone_pretrain_for_gpu.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model on `GPU`.

``` bash
bash scripts/run_standalone_pretrain_for_gpu.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
```

- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model on `Ascend`.


``` bash ``` bash
sh scripts/run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE
bash scripts/run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE
``` ```


- Run `run_distribute_pretrain_for_gpu.sh` for distributed pre-training of BERT-base and BERT-NEZHA model on `GPU`.

```bash
bash scripts/run_distribute_pretrain_for_gpu.sh RANK_SIZE EPOCH_SIZE DATA_DIR SCHEMA_DIR
```

### Fine-Tuning and Evaluation ### Fine-Tuning and Evaluation
- Including three kinds of task: Classification, NER(Named Entity Recognition) and SQuAD(Stanford Question Answering Dataset) - Including three kinds of task: Classification, NER(Named Entity Recognition) and SQuAD(Stanford Question Answering Dataset)




+ 1
- 1
model_zoo/official/nlp/bert/run_pretrain.py View File

@@ -141,7 +141,7 @@ def run_pretrain():
else: else:
raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]". raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]".
format(cfg.optimizer)) format(cfg.optimizer))
callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()]
callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0: if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps, config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
keep_checkpoint_max=args_opt.save_checkpoint_num) keep_checkpoint_max=args_opt.save_checkpoint_num)


+ 1
- 1
model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py View File

@@ -125,7 +125,7 @@ def distribute_pretrain():
print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/log.txt") print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/log.txt")


os.chdir(cur_dir + "/LOG" + str(device_id)) os.chdir(cur_dir + "/LOG" + str(device_id))
cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt): if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"


+ 6
- 6
model_zoo/official/nlp/bert/src/utils.py View File

@@ -18,6 +18,7 @@ Functional Cells used in Bert finetune and evaluation.
""" """


import os import os
import math
import numpy as np import numpy as np
import mindspore.nn as nn import mindspore.nn as nn
from mindspore import log as logger from mindspore import log as logger
@@ -90,15 +91,14 @@ class LossCallBack(Callback):
Args: Args:
per_print_times (int): Print loss every times. Default: 1. per_print_times (int): Print loss every times. Default: 1.
""" """
def __init__(self, per_print_times=1):
def __init__(self, dataset_size=1):
super(LossCallBack, self).__init__() super(LossCallBack, self).__init__()
if not isinstance(per_print_times, int) or per_print_times < 0:
raise ValueError("print_step must be int and >= 0")
self._per_print_times = per_print_times
self._dataset_size = dataset_size
def step_end(self, run_context): def step_end(self, run_context):
cb_params = run_context.original_args() cb_params = run_context.original_args()
print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
str(cb_params.net_outputs)))
percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
.format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))


def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix): def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix):
""" """


+ 1
- 1
model_zoo/utils/ascend_distributed_launcher/README.md View File

@@ -7,7 +7,7 @@ The number of D chips can be automatically allocated based on the device_num set
## how to use ## how to use
For example, if we want to run the distributed training of the BERT model on a D chip, we can run the following command in the `/bert/` dir: For example, if we want to run the distributed training of the BERT model on a D chip, we can run the following command in the `/bert/` dir:
``` ```
python model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
python model_zoo/utils/ascend_distributed_launcher/run_distributed.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
``` ```


output: output:


model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py → model_zoo/utils/ascend_distributed_launcher/run_distributed.py View File

@@ -124,7 +124,7 @@ def distribute_pretrain():
print("data_dir:", data_dir) print("data_dir:", data_dir)
print("log_file_dir: ./LOG" + str(device_id) + "/log.txt") print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")


cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
cmd = 'taskset -c ' + cmdopt + ' nohup python ' + run_script + " "
opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt): if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"

Loading…
Cancel
Save