From 9bdece71d4c9f59678d4f6f0e31c1fb4a3e772bc Mon Sep 17 00:00:00 2001
From: yoonlee666
Date: Fri, 31 Jul 2020 14:20:20 +0800
Subject: [PATCH] add switch for data shuffle

---
 model_zoo/official/nlp/bert/run_classifier.py | 19 ++++++++++++-------
 model_zoo/official/nlp/bert/run_ner.py        | 10 ++++++++--
 model_zoo/official/nlp/bert/run_squad.py      | 10 ++++++++--
 .../nlp/bert/scripts/run_classifier.sh        |  2 ++
 .../official/nlp/bert/scripts/run_ner.sh      |  2 ++
 .../official/nlp/bert/scripts/run_squad.sh    |  2 ++
 .../nlp/bert/src/assessment_method.py         |  1 -
 model_zoo/official/nlp/bert/src/dataset.py    | 17 +++++++++--------
 8 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/model_zoo/official/nlp/bert/run_classifier.py b/model_zoo/official/nlp/bert/run_classifier.py
index 97b01cceb9..c3663a5727 100644
--- a/model_zoo/official/nlp/bert/run_classifier.py
+++ b/model_zoo/official/nlp/bert/run_classifier.py
@@ -133,14 +133,17 @@ def run_classifier():
     """run classifier task"""
     parser = argparse.ArgumentParser(description="run classifier")
     parser.add_argument("--device_target", type=str, default="Ascend", help="Device type, default is Ascend")
-    parser.add_argument("--assessment_method", type=str, default="accuracy", help="assessment_method include: "
-                                                                                  "[MCC, Spearman_correlation, "
-                                                                                  "Accuracy], default is accuracy")
-    parser.add_argument("--do_train", type=str, default="false", help="Eable train, default is false")
-    parser.add_argument("--do_eval", type=str, default="false", help="Eable eval, default is false")
+    parser.add_argument("--assessment_method", type=str, default="accuracy",
+                        help="assessment_method including [MCC, Spearman_correlation, Accuracy], default is accuracy")
+    parser.add_argument("--do_train", type=str, default="false", help="Enable train, default is false")
+    parser.add_argument("--do_eval", type=str, default="false", help="Enable eval, default is false")
     parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
     parser.add_argument("--epoch_num", type=int, default="1", help="Epoch number, default is 1.")
     parser.add_argument("--num_class", type=int, default="2", help="The number of class, default is 2.")
+    parser.add_argument("--train_data_shuffle", type=str, default="true",
+                        help="Enable train data shuffle, default is true")
+    parser.add_argument("--eval_data_shuffle", type=str, default="false",
+                        help="Enable eval data shuffle, default is false")
     parser.add_argument("--save_finetune_checkpoint_path", type=str, default="", help="Save checkpoint path")
     parser.add_argument("--load_pretrain_checkpoint_path", type=str, default="", help="Load checkpoint file path")
     parser.add_argument("--load_finetune_checkpoint_path", type=str, default="", help="Load checkpoint file path")
@@ -182,7 +185,8 @@ def run_classifier():
         ds = create_classification_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                            assessment_method=assessment_method,
                                            data_file_path=args_opt.train_data_file_path,
-                                           schema_file_path=args_opt.schema_file_path)
+                                           schema_file_path=args_opt.schema_file_path,
+                                           do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
         do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)

     if args_opt.do_eval.lower() == "true":
@@ -197,7 +201,8 @@ def run_classifier():
         ds = create_classification_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                            assessment_method=assessment_method,
                                            data_file_path=args_opt.eval_data_file_path,
-                                           schema_file_path=args_opt.schema_file_path)
+                                           schema_file_path=args_opt.schema_file_path,
+                                           do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
         do_eval(ds, BertCLS, args_opt.num_class, assessment_method, load_finetune_checkpoint_path)

 if __name__ == "__main__":
diff --git a/model_zoo/official/nlp/bert/run_ner.py b/model_zoo/official/nlp/bert/run_ner.py
index c9314bf39c..1ea6893945 100644
--- a/model_zoo/official/nlp/bert/run_ner.py
+++ b/model_zoo/official/nlp/bert/run_ner.py
@@ -150,6 +150,10 @@ def run_ner():
     parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
     parser.add_argument("--epoch_num", type=int, default="1", help="Epoch number, default is 1.")
     parser.add_argument("--num_class", type=int, default="2", help="The number of class, default is 2.")
+    parser.add_argument("--train_data_shuffle", type=str, default="true",
+                        help="Enable train data shuffle, default is true")
+    parser.add_argument("--eval_data_shuffle", type=str, default="false",
+                        help="Enable eval data shuffle, default is false")
     parser.add_argument("--vocab_file_path", type=str, default="", help="Vocab file path, used in clue benchmark")
     parser.add_argument("--label2id_file_path", type=str, default="", help="label2id file path, used in clue benchmark")
     parser.add_argument("--save_finetune_checkpoint_path", type=str, default="", help="Save checkpoint path")
@@ -208,7 +212,8 @@ def run_ner():
     if args_opt.do_train.lower() == "true":
         ds = create_ner_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                 assessment_method=assessment_method, data_file_path=args_opt.train_data_file_path,
-                                schema_file_path=args_opt.schema_file_path)
+                                schema_file_path=args_opt.schema_file_path,
+                                do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
         do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)

     if args_opt.do_eval.lower() == "true":
@@ -222,7 +227,8 @@ def run_ner():
     if args_opt.do_eval.lower() == "true":
         ds = create_ner_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                 assessment_method=assessment_method, data_file_path=args_opt.eval_data_file_path,
-                                schema_file_path=args_opt.schema_file_path)
+                                schema_file_path=args_opt.schema_file_path,
+                                do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
         do_eval(ds, BertNER, args_opt.use_crf, number_labels, assessment_method, args_opt.eval_data_file_path,
                 load_finetune_checkpoint_path, args_opt.vocab_file_path, args_opt.label2id_file_path, tag_to_index)

diff --git a/model_zoo/official/nlp/bert/run_squad.py b/model_zoo/official/nlp/bert/run_squad.py
index bc2b75fa32..972f9dcdfc 100644
--- a/model_zoo/official/nlp/bert/run_squad.py
+++ b/model_zoo/official/nlp/bert/run_squad.py
@@ -140,6 +140,10 @@ def run_squad():
     parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
     parser.add_argument("--epoch_num", type=int, default="1", help="Epoch number, default is 1.")
     parser.add_argument("--num_class", type=int, default="2", help="The number of class, default is 2.")
+    parser.add_argument("--train_data_shuffle", type=str, default="true",
+                        help="Enable train data shuffle, default is true")
+    parser.add_argument("--eval_data_shuffle", type=str, default="false",
+                        help="Enable eval data shuffle, default is false")
     parser.add_argument("--vocab_file_path", type=str, default="", help="Vocab file path")
     parser.add_argument("--eval_json_path", type=str, default="", help="Evaluation json file path, can be eval.json")
     parser.add_argument("--save_finetune_checkpoint_path", type=str, default="", help="Save checkpoint path")
@@ -186,7 +190,8 @@ def run_squad():
     if args_opt.do_train.lower() == "true":
         ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                   data_file_path=args_opt.train_data_file_path,
-                                  schema_file_path=args_opt.schema_file_path)
+                                  schema_file_path=args_opt.schema_file_path,
+                                  do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
         do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)
     if args_opt.do_eval.lower() == "true":
         if save_finetune_checkpoint_path == "":
@@ -199,7 +204,8 @@ def run_squad():
     if args_opt.do_eval.lower() == "true":
         ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                   data_file_path=args_opt.eval_data_file_path,
-                                  schema_file_path=args_opt.schema_file_path, is_training=False)
+                                  schema_file_path=args_opt.schema_file_path, is_training=False,
+                                  do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
         do_eval(ds, args_opt.vocab_file_path, args_opt.eval_json_path,
                 load_finetune_checkpoint_path, bert_net_cfg.seq_length)
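Review note: the new --train_data_shuffle/--eval_data_shuffle flags follow the repository's existing convention of string-valued booleans (compare --do_train/--do_eval above), converted at each call site with .lower() == "true". This sidesteps argparse's type=bool pitfall: bool("false") is True, since any non-empty string is truthy. A minimal standalone sketch of the convention, reusing the flag names from this patch (the sketch is illustrative only, not part of the change):

    import argparse

    parser = argparse.ArgumentParser(description="shuffle switch sketch")
    parser.add_argument("--train_data_shuffle", type=str, default="true",
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle", type=str, default="false",
                        help="Enable eval data shuffle, default is false")
    args = parser.parse_args(["--train_data_shuffle", "FALSE"])

    # Conversion as done at the call sites: case-insensitive match on "true".
    train_shuffle = args.train_data_shuffle.lower() == "true"  # False here
    eval_shuffle = args.eval_data_shuffle.lower() == "true"    # False (default)
    print(train_shuffle, eval_shuffle)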
checkpoint path") @@ -186,7 +190,8 @@ def run_squad(): if args_opt.do_train.lower() == "true": ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1, data_file_path=args_opt.train_data_file_path, - schema_file_path=args_opt.schema_file_path) + schema_file_path=args_opt.schema_file_path, + do_shuffle=(args_opt.train_data_shuffle.lower() == "true")) do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num) if args_opt.do_eval.lower() == "true": if save_finetune_checkpoint_path == "": @@ -199,7 +204,8 @@ def run_squad(): if args_opt.do_eval.lower() == "true": ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1, data_file_path=args_opt.eval_data_file_path, - schema_file_path=args_opt.schema_file_path, is_training=False) + schema_file_path=args_opt.schema_file_path, is_training=False, + do_shuffle=(args_opt.eval_data_shuffle.lower() == "true")) do_eval(ds, args_opt.vocab_file_path, args_opt.eval_json_path, load_finetune_checkpoint_path, bert_net_cfg.seq_length) diff --git a/model_zoo/official/nlp/bert/scripts/run_classifier.sh b/model_zoo/official/nlp/bert/scripts/run_classifier.sh index 275324b950..39516fa419 100644 --- a/model_zoo/official/nlp/bert/scripts/run_classifier.sh +++ b/model_zoo/official/nlp/bert/scripts/run_classifier.sh @@ -34,6 +34,8 @@ python ${PROJECT_DIR}/../run_classifier.py \ --device_id=0 \ --epoch_num=1 \ --num_class=2 \ + --train_data_shuffle="true" \ + --eval_data_shuffle="false" \ --save_finetune_checkpoint_path="" \ --load_pretrain_checkpoint_path="" \ --load_finetune_checkpoint_path="" \ diff --git a/model_zoo/official/nlp/bert/scripts/run_ner.sh b/model_zoo/official/nlp/bert/scripts/run_ner.sh index ae401b2462..45c37be653 100644 --- a/model_zoo/official/nlp/bert/scripts/run_ner.sh +++ b/model_zoo/official/nlp/bert/scripts/run_ner.sh @@ -35,6 +35,8 @@ python ${PROJECT_DIR}/../run_ner.py \ --device_id=0 \ --epoch_num=1 \ --num_class=2 \ + --train_data_shuffle="true" \ + --eval_data_shuffle="false" \ --vocab_file_path="" \ --label2id_file_path="" \ --save_finetune_checkpoint_path="" \ diff --git a/model_zoo/official/nlp/bert/scripts/run_squad.sh b/model_zoo/official/nlp/bert/scripts/run_squad.sh index a33950cadb..efca61db1d 100644 --- a/model_zoo/official/nlp/bert/scripts/run_squad.sh +++ b/model_zoo/official/nlp/bert/scripts/run_squad.sh @@ -33,6 +33,8 @@ python ${PROJECT_DIR}/../run_squad.py \ --device_id=0 \ --epoch_num=1 \ --num_class=2 \ + --train_data_shuffle="true" \ + --eval_data_shuffle="false" \ --vocab_file_path="" \ --eval_json_path="" \ --save_finetune_checkpoint_path="" \ diff --git a/model_zoo/official/nlp/bert/src/assessment_method.py b/model_zoo/official/nlp/bert/src/assessment_method.py index ca6579cabf..dae4894129 100644 --- a/model_zoo/official/nlp/bert/src/assessment_method.py +++ b/model_zoo/official/nlp/bert/src/assessment_method.py @@ -34,7 +34,6 @@ class Accuracy(): logit_id = np.argmax(logits, axis=-1) self.acc_num += np.sum(labels == logit_id) self.total_num += len(labels) - print("=========================accuracy is ", self.acc_num / self.total_num) class F1(): ''' diff --git a/model_zoo/official/nlp/bert/src/dataset.py b/model_zoo/official/nlp/bert/src/dataset.py index 5b922b9f0b..8193ef83fa 100644 --- a/model_zoo/official/nlp/bert/src/dataset.py +++ b/model_zoo/official/nlp/bert/src/dataset.py @@ -53,11 +53,11 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, def create_ner_dataset(batch_size=1, repeat_count=1, 
assessment_method="accuracy", - data_file_path=None, schema_file_path=None): + data_file_path=None, schema_file_path=None, do_shuffle=True): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) ds = ds.map(input_columns="label_ids", operations=type_cast_op_float) @@ -76,11 +76,11 @@ def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", - data_file_path=None, schema_file_path=None): + data_file_path=None, schema_file_path=None, do_shuffle=True): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) ds = ds.map(input_columns="label_ids", operations=type_cast_op_float) @@ -98,14 +98,15 @@ def create_classification_dataset(batch_size=1, repeat_count=1, assessment_metho return ds -def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True): +def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, + is_training=True, do_shuffle=True): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) if is_training: ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", - "start_positions", "end_positions", - "unique_ids", "is_impossible"]) + columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", + "end_positions", "unique_ids", "is_impossible"], + shuffle=do_shuffle) ds = ds.map(input_columns="start_positions", operations=type_cast_op) ds = ds.map(input_columns="end_positions", operations=type_cast_op) else: