diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md
index a54c1faf99..3a6221cac0 100644
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@@ -1,11 +1,12 @@
 # BERT Example
 ## Description
-This example implements pre-training, fine-tuning and evaluation of [BERT-base](https://github.com/google-research/bert)(the base version of BERT model) and [BERT-NEZHA](https://github.com/huawei-noah/Pretrained-Language-Model)(a Chinese pretrained language model developed by Huawei, which introduced a improvement of Functional Relative Positional Encoding as an effective positional encoding scheme).
+This example implements pre-training, fine-tuning and evaluation of [BERT-base](https://github.com/google-research/bert) and [BERT-NEZHA](https://github.com/huawei-noah/Pretrained-Language-Model).
 ## Requirements
 - Install [MindSpore](https://www.mindspore.cn/install/en).
 - Download the zhwiki dataset for pre-training. Extract and clean text in the dataset with [WikiExtractor](https://github.com/attardi/wikiextractor). Convert the dataset to TFRecord format and move the files to a specified path.
 - Download dataset for fine-tuning and evaluation such as CLUENER, TNEWS, SQuAD v1.1, etc.
+- To convert dataset files from JSON format to TFRecord format, refer to run_classifier.py in the [BERT](https://github.com/google-research/bert) repository.
 > Notes: If you are running a fine-tuning or evaluation task, prepare a checkpoint from pre-train.
@@ -25,14 +26,29 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 ```
 ### Fine-Tuning and Evaluation
+- Three kinds of tasks are included: Classification, NER (Named Entity Recognition) and SQuAD (Stanford Question Answering Dataset).
+
 - Set bert network config and optimizer hyperparameters in `finetune_eval_config.py`.
-- Set task related hyperparameters in scripts/run_XXX.sh.
+- Classification task: Set task-related hyperparameters in scripts/run_classifier.sh.
+- Run `bash scripts/run_classifier.sh` for fine-tuning of the BERT-base and BERT-NEZHA models.
+
+  ```bash
+  bash scripts/run_classifier.sh
+  ```
+
+- NER task: Set task-related hyperparameters in scripts/run_ner.sh.
+- Run `bash scripts/run_ner.sh` for fine-tuning of the BERT-base and BERT-NEZHA models.
-- Run `bash scripts/run_XXX.py` for fine-tuning of BERT-base and BERT-NEZHA model.
+
+  ```bash
+  bash scripts/run_ner.sh
+  ```
+
+- SQuAD task: Set task-related hyperparameters in scripts/run_squad.sh.
+- Run `bash scripts/run_squad.sh` for fine-tuning of the BERT-base and BERT-NEZHA models.
 ```bash
-  bash scripts/run_XXX.sh
+  bash scripts/run_squad.sh
 ```
 ## Usage
@@ -61,8 +77,86 @@ options:
     --data_dir                       path to dataset directory: PATH, default is ""
     --schema_dir                     path to schema.json file, PATH, default is ""
+### Fine-Tuning and Evaluation
+```
+usage: run_ner.py   [--device_target DEVICE_TARGET] [--do_train DO_TRAIN] [--do_eval DO_EVAL]
+                    [--assessment_method ASSESSMENT_METHOD] [--use_crf USE_CRF]
+                    [--device_id N] [--epoch_num N] [--num_class N]
+                    [--vocab_file_path VOCAB_FILE_PATH]
+                    [--label2id_file_path LABEL2ID_FILE_PATH]
+                    [--save_finetune_checkpoint_path SAVE_FINETUNE_CHECKPOINT_PATH]
+                    [--load_pretrain_checkpoint_path LOAD_PRETRAIN_CHECKPOINT_PATH]
+                    [--load_finetune_checkpoint_path LOAD_FINETUNE_CHECKPOINT_PATH]
+                    [--train_data_file_path TRAIN_DATA_FILE_PATH]
+                    [--eval_data_file_path EVAL_DATA_FILE_PATH]
+                    [--schema_file_path SCHEMA_FILE_PATH]
+options:
+    --device_target                  targeted device to run task: Ascend | GPU
+    --do_train                       whether to run training on training set: true | false
+    --do_eval                        whether to run eval on dev set: true | false
+    --assessment_method              assessment method to do evaluation: f1 | clue_benchmark
+    --use_crf                        whether to use crf to calculate loss: true | false
+    --device_id                      device id to run task
+    --epoch_num                      total number of training epochs to perform
+    --num_class                      number of classes to do labeling
+    --vocab_file_path                the vocabulary file that the BERT model was trained on
+    --label2id_file_path             label to id json file
+    --save_finetune_checkpoint_path  path to save generated finetuning checkpoint
+    --load_pretrain_checkpoint_path  initial checkpoint (usually from a pre-trained BERT model)
+    --load_finetune_checkpoint_path  give a finetuning checkpoint path if only do eval
+    --train_data_file_path           ner tfrecord for training. E.g., train.tfrecord
+    --eval_data_file_path            ner tfrecord for predictions if f1 is used to evaluate result, ner json for predictions if clue_benchmark is used to evaluate result
+    --schema_file_path               path to datafile schema file
+
+usage: run_squad.py [--device_target DEVICE_TARGET] [--do_train DO_TRAIN] [--do_eval DO_EVAL]
+                    [--device_id N] [--epoch_num N] [--num_class N]
+                    [--vocab_file_path VOCAB_FILE_PATH]
+                    [--eval_json_path EVAL_JSON_PATH]
+                    [--save_finetune_checkpoint_path SAVE_FINETUNE_CHECKPOINT_PATH]
+                    [--load_pretrain_checkpoint_path LOAD_PRETRAIN_CHECKPOINT_PATH]
+                    [--load_finetune_checkpoint_path LOAD_FINETUNE_CHECKPOINT_PATH]
+                    [--train_data_file_path TRAIN_DATA_FILE_PATH]
+                    [--eval_data_file_path EVAL_DATA_FILE_PATH]
+                    [--schema_file_path SCHEMA_FILE_PATH]
+options:
+    --device_target                  targeted device to run task: Ascend | GPU
+    --do_train                       whether to run training on training set: true | false
+    --do_eval                        whether to run eval on dev set: true | false
+    --device_id                      device id to run task
+    --epoch_num                      total number of training epochs to perform
+    --num_class                      number of classes to classify, usually 2 for squad task
+    --vocab_file_path                the vocabulary file that the BERT model was trained on
+    --eval_json_path                 path to squad dev json file
+    --save_finetune_checkpoint_path  path to save generated finetuning checkpoint
+    --load_pretrain_checkpoint_path  initial checkpoint (usually from a pre-trained BERT model)
+    --load_finetune_checkpoint_path  give a finetuning checkpoint path if only do eval
+    --train_data_file_path           squad tfrecord for training. E.g., train1.1.tfrecord
+    --eval_data_file_path            squad tfrecord for predictions. E.g., dev1.1.tfrecord
+    --schema_file_path               path to datafile schema file
+
+usage: run_classifier.py [--device_target DEVICE_TARGET] [--do_train DO_TRAIN] [--do_eval DO_EVAL]
+                         [--assessment_method ASSESSMENT_METHOD] [--device_id N] [--epoch_num N] [--num_class N]
+                         [--save_finetune_checkpoint_path SAVE_FINETUNE_CHECKPOINT_PATH]
+                         [--load_pretrain_checkpoint_path LOAD_PRETRAIN_CHECKPOINT_PATH]
+                         [--load_finetune_checkpoint_path LOAD_FINETUNE_CHECKPOINT_PATH]
+                         [--train_data_file_path TRAIN_DATA_FILE_PATH]
+                         [--eval_data_file_path EVAL_DATA_FILE_PATH]
+                         [--schema_file_path SCHEMA_FILE_PATH]
+options:
+    --device_target                  targeted device to run task: Ascend | GPU
+    --do_train                       whether to run training on training set: true | false
+    --do_eval                        whether to run eval on dev set: true | false
+    --assessment_method              assessment method to do evaluation: accuracy | f1 | mcc | spearman_correlation
+    --device_id                      device id to run task
+    --epoch_num                      total number of training epochs to perform
+    --num_class                      number of classes to do labeling
+    --save_finetune_checkpoint_path  path to save generated finetuning checkpoint
+    --load_pretrain_checkpoint_path  initial checkpoint (usually from a pre-trained BERT model)
+    --load_finetune_checkpoint_path  give a finetuning checkpoint path if only do eval
+    --train_data_file_path           tfrecord for training. E.g., train.tfrecord
+    --eval_data_file_path            tfrecord for predictions. E.g., dev.tfrecord
+    --schema_file_path               path to datafile schema file
+```
 ## Options and Parameters
-It contains of parameters of BERT model and options for training, which is set in file `config.py`, `finetune_config.py` and `evaluation_config.py` respectively.
+Parameters of the BERT model and options for training are set in `config.py` and `finetune_eval_config.py`, respectively.
 ### Options:
 ```
 config.py:
@@ -71,57 +165,6 @@ config.py:
     scale_factor                    factor used to update loss scale: N, default is 2
     scale_window                    steps for once updatation of loss scale: N, default is 1000
     optimizer                       optimizer used in the network: AdamWerigtDecayDynamicLR | Lamb | Momentum, default is "Lamb"
-
-scripts/run_ner.sh:
-    device_target                   targeted device to run task: Ascend | GPU
-    do_train                        whether to run training on training set: true | false
-    do_eval                         whether to run eval on dev set: true | false
-    assessment_method               assessment method to do evaluation: f1 | clue_benchmark
-    use_crf                         whether to use crf to calculate loss: true | false
-    device_id                       device id to run task
-    epoch_num                       total number of training epochs to perform
-    num_class                       number of classes to do labeling
-    vocab_file_path                 the vocabulary file that the BERT model was trained on
-    label2id_file_path              label to id json file
-    save_finetune_checkpoint_path   path to save generated finetuning checkpoint
-    load_pretrain_checkpoint_path   initial checkpoint (usually from a pre-trained BERT model)
-    load_finetune_checkpoint_path   give a finetuning checkpoint path if only do eval
-    train_data_file_path            ner tfrecord for training. E.g., train.tfrecord
-    eval_data_file_path             ner tfrecord for predictions if f1 is used to evaluate result, ner json for predictions if clue_benchmark is used to evaluate result
-    schema_file_path                path to datafile schema file
-
-scripts/run_squad.sh:
-    device_target                   targeted device to run task: Ascend | GPU
-    do_train                        whether to run training on training set: true | false
-    do_eval                         whether to run eval on dev set: true | false
-    device_id                       device id to run task
-    epoch_num                       total number of training epochs to perform
-    num_class                       number of classes to classify, usually 2 for squad task
-    vocab_file_path                 the vocabulary file that the BERT model was trained on
-    eval_json_path                  path to squad dev json file
-    save_finetune_checkpoint_path   path to save generated finetuning checkpoint
-    load_pretrain_checkpoint_path   initial checkpoint (usually from a pre-trained BERT model)
-    load_finetune_checkpoint_path   give a finetuning checkpoint path if only do eval
-    train_data_file_path            squad tfrecord for training. E.g., train1.1.tfrecord
-    eval_data_file_path             squad tfrecord for predictions. E.g., dev1.1.tfrecord
-    schema_file_path                path to datafile schema file
-
-scripts/run_classifier.sh
-    device_target                   targeted device to run task: Ascend | GPU
-    do_train                        whether to run training on training set: true | false
-    do_eval                         whether to run eval on dev set: true | false
-    assessment_method               assessment method to do evaluation: accuracy | f1 | mcc | spearman_correlation
-    device_id                       device id to run task
-    epoch_num                       total number of training epochs to perform
-    num_class                       number of classes to do labeling
-    save_finetune_checkpoint_path   path to save generated finetuning checkpoint
-    load_pretrain_checkpoint_path   initial checkpoint (usually from a pre-trained BERT model)
-    load_finetune_checkpoint_path   give a finetuning checkpoint path if only do eval
-    train_data_file_path            tfrecord for training. E.g., train.tfrecord
-    eval_data_file_path             tfrecord for predictions. E.g., dev.tfrecord
-    schema_file_path                path to datafile schema file
-
-
 ```
 ### Parameters:
diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
index b230f71fad..d725336df7 100644
--- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
+++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
@@ -123,8 +123,9 @@ def distribute_pretrain():
         print("core_nums:", cmdopt)
         print("epoch_size:", str(cfg['epoch_size']))
         print("data_dir:", data_dir)
-        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")
+        print("log_file_dir: " + cur_dir + "/LOG" + str(device_id) + "/log.txt")
 
+        os.chdir(cur_dir + "/LOG" + str(device_id))
         cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
         opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
         if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
@@ -133,10 +134,10 @@ def distribute_pretrain():
         cmd += opt
         cmd += " --data_dir=" + data_dir
         cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
-               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'
+               + str(rank_size) + ' >./log.txt 2>&1 &'
         os.system(cmd)
-
+        os.chdir(cur_dir)
 
 if __name__ == "__main__":
     distribute_pretrain()
diff --git a/model_zoo/official/nlp/bert/src/config.py b/model_zoo/official/nlp/bert/src/config.py
index 2341007bd4..c8692d1216 100644
--- a/model_zoo/official/nlp/bert/src/config.py
+++ b/model_zoo/official/nlp/bert/src/config.py
@@ -56,7 +56,7 @@ large: BERT-NEZHA(a Chinese pretrained language model developed by Huawei, which
 '''
 if cfg.bert_network == 'base':
     bert_net_cfg = BertConfig(
-        batch_size=32,
+        batch_size=64,
         seq_length=128,
         vocab_size=21128,
         hidden_size=768,
@@ -77,7 +77,7 @@ if cfg.bert_network == 'base':
     )
 if cfg.bert_network == 'nezha':
     bert_net_cfg = BertConfig(
-        batch_size=32,
+        batch_size=96,
         seq_length=128,
         vocab_size=21128,
         hidden_size=1024,
@@ -98,7 +98,7 @@ if cfg.bert_network == 'nezha':
     )
 if cfg.bert_network == 'large':
     bert_net_cfg = BertConfig(
-        batch_size=16,
+        batch_size=24,
         seq_length=512,
         vocab_size=30522,
         hidden_size=1024,
diff --git a/model_zoo/official/nlp/tinybert/README.md b/model_zoo/official/nlp/tinybert/README.md
index aa96d246e5..d4b708b408 100644
--- a/model_zoo/official/nlp/tinybert/README.md
+++ b/model_zoo/official/nlp/tinybert/README.md
@@ -1,6 +1,6 @@
 # TinyBERT Example
 ## Description
-[TinyBERT](https://github.com/huawei-noah/Pretrained-Model/tree/master/TinyBERT) is 7.5x smalller and 9.4x faster on inference than [BERT-base](https://github.com/google-research/bert)(the base version of BERT model) and achieves competitive performances in the tasks of natural language understanding. It performs a novel transformer distillation at both the pre-training and task-specific learning stages.
+[TinyBERT](https://github.com/huawei-noah/Pretrained-Model/tree/master/TinyBERT) is 7.5x smaller and 9.4x faster on inference than [BERT-base](https://github.com/google-research/bert) (the base version of the BERT model) and achieves competitive performance on natural language understanding tasks. It performs a novel transformer distillation at both the pre-training and task-specific learning stages.
 ## Requirements
 - Install [MindSpore](https://www.mindspore.cn/install/en).
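Note on the run_distribute_pretrain.py change above: the launcher now prints the absolute log path, switches into each device's LOG directory before spawning the worker (so relative outputs such as ./log.txt land in per-device folders), and switches back afterwards. Below is a minimal sketch of that launch pattern; the helper name `launch_per_device` and its arguments are illustrative, not the repo's actual function.

```python
import os

def launch_per_device(cur_dir, device_ids, run_script, common_opts):
    """Sketch only: spawn one training process per device, each running
    inside its own LOG<device_id> directory (names here are illustrative)."""
    for device_id in device_ids:
        log_dir = os.path.join(cur_dir, "LOG" + str(device_id))
        os.makedirs(log_dir, exist_ok=True)
        # Work from inside LOG<device_id> so './log.txt' and any relative
        # checkpoint paths are written per device.
        os.chdir(log_dir)
        cmd = ("python " + run_script + " " + common_opts
               + " --device_id=" + str(device_id)
               + " >./log.txt 2>&1 &")
        os.system(cmd)
        # Return to the launcher directory before starting the next worker.
        os.chdir(cur_dir)
```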
diff --git a/model_zoo/official/nlp/tinybert/run_general_distill.py b/model_zoo/official/nlp/tinybert/run_general_distill.py
index 42d852808b..c0e1044773 100644
--- a/model_zoo/official/nlp/tinybert/run_general_distill.py
+++ b/model_zoo/official/nlp/tinybert/run_general_distill.py
@@ -84,7 +84,7 @@ def run_general_distill():
                                      args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir)
     dataset_size = dataset.get_dataset_size()
-
+    print('dataset size: ', dataset_size)
     if args_opt.enable_data_sink == "true":
         repeat_count = args_opt.epoch_size * dataset.get_dataset_size() // args_opt.data_sink_steps
         time_monitor_steps = args_opt.data_sink_steps
diff --git a/model_zoo/official/nlp/tinybert/run_task_distill.py b/model_zoo/official/nlp/tinybert/run_task_distill.py
index 1483b22c0d..12a3acda48 100644
--- a/model_zoo/official/nlp/tinybert/run_task_distill.py
+++ b/model_zoo/official/nlp/tinybert/run_task_distill.py
@@ -90,6 +90,7 @@ def run_predistill():
                                      args_opt.train_data_dir, args_opt.schema_dir)
     dataset_size = dataset.get_dataset_size()
+    print('td1 dataset size: ', dataset_size)
     if args_opt.enable_data_sink == 'true':
         repeat_count = args_opt.td_phase1_epoch_size * dataset.get_dataset_size() // args_opt.data_sink_steps
         time_monitor_steps = args_opt.data_sink_steps
@@ -147,6 +148,7 @@ def run_task_distill(ckpt_file):
                                            args_opt.train_data_dir, args_opt.schema_dir)
     dataset_size = train_dataset.get_dataset_size()
+    print('td2 train dataset size: ', dataset_size)
     if args_opt.enable_data_sink == 'true':
         repeat_count = args_opt.td_phase2_epoch_size * train_dataset.get_dataset_size() // args_opt.data_sink_steps
         time_monitor_steps = args_opt.data_sink_steps
@@ -173,12 +175,9 @@ def run_task_distill(ckpt_file):
     eval_dataset = create_tinybert_dataset('td', td_teacher_net_cfg.batch_size,
                                            device_num, rank, args_opt.do_shuffle,
                                            args_opt.eval_data_dir, args_opt.schema_dir)
+
     if args_opt.do_eval.lower() == "true":
         callback = [TimeMonitor(time_monitor_steps), LossCallBack(),
-                    ModelSaveCkpt(netwithloss.bert,
-                                  args_opt.save_ckpt_step,
-                                  args_opt.max_ckpt_num,
-                                  td_phase2_save_ckpt_dir),
                     EvalCallBack(netwithloss.bert, eval_dataset)]
     else:
         callback = [TimeMonitor(time_monitor_steps), LossCallBack(),
diff --git a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd.sh b/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd.sh
index d45c280723..a1297b061f 100644
--- a/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd.sh
+++ b/model_zoo/official/nlp/tinybert/scripts/run_distribute_gd.sh
@@ -62,9 +62,8 @@ do
         --device_num=$RANK_SIZE \
         --enable_data_sink="true" \
         --data_sink_steps=100 \
-        --save_ckpt_step=100 \
+        --save_ckpt_step=10000 \
         --max_ckpt_num=1 \
-        --save_ckpt_path="" \
         --load_teacher_ckpt_path="" \
         --data_dir="" \
         --schema_dir="" > log.txt 2>&1 &
diff --git a/model_zoo/official/nlp/tinybert/src/dataset.py b/model_zoo/official/nlp/tinybert/src/dataset.py
index 576d2ee6d9..fdc0dfe21e 100644
--- a/model_zoo/official/nlp/tinybert/src/dataset.py
+++ b/model_zoo/official/nlp/tinybert/src/dataset.py
@@ -37,9 +37,6 @@ def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0,
     ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list,
                             shuffle=(do_shuffle == "true"), num_shards=device_num,
                             shard_id=rank, shard_equal_rows=True)
-
-    ori_dataset_size = ds.get_dataset_size()
-    print('origin dataset size: ', ori_dataset_size)
     type_cast_op = C.TypeCast(mstype.int32)
     ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
     ds = ds.map(input_columns="input_mask", operations=type_cast_op)
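For context on the dataset-size prints added in run_general_distill.py and run_task_distill.py above: with data sinking enabled, the scripts derive the number of sink cycles from the dataset size, so printing the size makes a wrong repeat_count easy to spot. A minimal sketch of that arithmetic follows; the function name is hypothetical, and it assumes each sink cycle covers data_sink_steps steps, as in the diffs above.

```python
def compute_repeat_count(epoch_size, dataset_size, data_sink_steps, enable_data_sink):
    """Sketch of the repeat-count logic above (function name is hypothetical)."""
    if enable_data_sink:
        # Total steps across all epochs, split into sink cycles of
        # data_sink_steps steps each.
        return epoch_size * dataset_size // data_sink_steps
    # Without data sinking, training proceeds epoch by epoch.
    return epoch_size

# E.g., 3 epochs over a 1000-batch dataset with 100-step sinks -> 30 cycles.
assert compute_repeat_count(3, 1000, 100, True) == 30
```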