--- README.md
+++ README.md
@@ -80,7 +80,7 @@ Before running the shell script, please set the `task_name`, `model_dir` and `da
 ```text
 .
-└─bert
+└─ternarybert
   ├─README.md
   ├─scripts
     ├─run_train.sh                    # shell script for training phase
@@ -106,26 +106,12 @@ Before running the shell script, please set the `task_name`, `model_dir` and `da
 ```text
-usage: train.py [--h]
-                [--device_target {GPU,Ascend}]
-                [--do_eval {true,false}]
-                [--epoch_size EPOCH_SIZE]
-                [--device_id DEVICE_ID]
-                [--do_shuffle {true,false}]
-                [--enable_data_sink {true,false}]
-                [--save_ckpt_step SAVE_CKPT_STEP]
-                [--eval_ckpt_step EVAL_CKPT_STEP]
-                [--max_ckpt_num MAX_CKPT_NUM]
-                [--data_sink_steps DATA_SINK_STEPS]
-                [--teacher_model_dir TEACHER_MODEL_DIR]
-                [--student_model_dir STUDENT_MODEL_DIR]
-                [--data_dir DATA_DIR]
-                [--output_dir OUTPUT_DIR]
-                [--task_name {sts-b,qnli,mnli}]
-                [--dataset_type DATASET_TYPE]
-                [--seed SEED]
-                [--train_batch_size TRAIN_BATCH_SIZE]
-                [--eval_batch_size EVAL_BATCH_SIZE]
+usage: train.py [--h] [--device_target {GPU,Ascend}] [--do_eval {true,false}] [--epoch_size EPOCH_SIZE]
+                [--device_id DEVICE_ID] [--do_shuffle {true,false}] [--enable_data_sink {true,false}] [--save_ckpt_step SAVE_CKPT_STEP]
+                [--eval_ckpt_step EVAL_CKPT_STEP] [--max_ckpt_num MAX_CKPT_NUM] [--data_sink_steps DATA_SINK_STEPS]
+                [--teacher_model_dir TEACHER_MODEL_DIR] [--student_model_dir STUDENT_MODEL_DIR] [--data_dir DATA_DIR]
+                [--output_dir OUTPUT_DIR] [--task_name {sts-b,qnli,mnli}] [--dataset_type DATASET_TYPE] [--seed SEED]
+                [--train_batch_size TRAIN_BATCH_SIZE] [--eval_batch_size EVAL_BATCH_SIZE]
 
 options:
     --device_target            Device where the code will be implemented: "GPU" | "Ascend", default is "GPU"
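For context: the `{true,false}` options above are plain strings rather than argparse booleans, and the scripts in this diff compare against the literal string (see `if args_opt.enable_data_sink == 'true':` below). A minimal sketch of how such a parser can be wired; flag names mirror the usage text, while the defaults and the subset of flags shown are illustrative assumptions:

```python
import argparse

def parse_args():
    """Sketch of train.py's CLI; remaining flags from the usage string follow the same pattern."""
    parser = argparse.ArgumentParser(description='ternarybert task distill')
    parser.add_argument('--device_target', type=str, default='GPU', choices=['GPU', 'Ascend'],
                        help='Device where the code will be implemented.')
    # String-valued booleans, matching the usage text: callers test e.g. do_shuffle == 'true'.
    parser.add_argument('--do_eval', type=str, default='true', choices=['true', 'false'],
                        help='Whether to evaluate while training.')
    parser.add_argument('--do_shuffle', type=str, default='true', choices=['true', 'false'],
                        help='Whether to shuffle the training data.')
    parser.add_argument('--enable_data_sink', type=str, default='true', choices=['true', 'false'],
                        help='Whether to use data sink mode.')
    parser.add_argument('--epoch_size', type=int, default=3, help='Number of training epochs.')
    parser.add_argument('--task_name', type=str, default='sts-b', choices=['sts-b', 'qnli', 'mnli'],
                        help='GLUE task to distill on.')
    return parser.parse_args()
```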
@@ -154,14 +140,8 @@ options:
 ```text
-usage: eval.py [--h]
-               [--device_target {GPU,Ascend}]
-               [--device_id DEVICE_ID]
-               [--model_dir MODEL_DIR]
-               [--data_dir DATA_DIR]
-               [--task_name {sts-b,qnli,mnli}]
-               [--dataset_type DATASET_TYPE]
-               [--batch_size BATCH_SIZE]
+usage: eval.py [--h] [--device_target {GPU,Ascend}] [--device_id DEVICE_ID] [--model_dir MODEL_DIR] [--data_dir DATA_DIR]
+               [--task_name {sts-b,qnli,mnli}] [--dataset_type DATASET_TYPE] [--batch_size BATCH_SIZE]
 
 options:
     --device_target            Device where the code will be implemented: "GPU" | "Ascend", default is "GPU"
@@ -205,7 +185,7 @@ Parameters for eval:
 Parameters for teacher bert network:
     seq_length                      length of input sequence: N, default is 128
-    vocab_size                      size of each embedding vector: N, must be consistant with the dataset you use. Default is 30522
+    vocab_size                      size of each embedding vector: N, must be consistent with the dataset you use. Default is 30522
     hidden_size                     size of bert encoder layers: N
     num_hidden_layers               number of hidden layers: N
     num_attention_heads             number of attention heads: N, default is 12
@@ -224,7 +204,7 @@ Parameters for teacher bert network:
 Parameters for student bert network:
     seq_length                      length of input sequence: N, default is 128
-    vocab_size                      size of each embedding vector: N, must be consistant with the dataset you use. Default is 30522
+    vocab_size                      size of each embedding vector: N, must be consistent with the dataset you use. Default is 30522
     hidden_size                     size of bert encoder layers: N
     num_hidden_layers               number of hidden layers: N
     num_attention_heads             number of attention heads: N, default is 12
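The teacher and student parameter lists above share one schema. A hedged sketch of how these fields might be grouped; the repository's `src/config.py` defines `teacher_net_cfg` and `student_net_cfg`, but its actual structure is not shown in this diff, and the BERT-base shapes below are illustrative assumptions:

```python
from dataclasses import dataclass

@dataclass
class BertNetConfig:
    """Shape parameters shared by the teacher and student networks."""
    hidden_size: int                 # N, no documented default
    num_hidden_layers: int           # N, no documented default
    seq_length: int = 128            # documented default
    vocab_size: int = 30522          # must be consistent with the dataset's vocabulary
    num_attention_heads: int = 12    # documented default

# BERT-base shapes here are assumptions, not values taken from src/config.py.
teacher_net_cfg = BertNetConfig(hidden_size=768, num_hidden_layers=12)
student_net_cfg = BertNetConfig(hidden_size=768, num_hidden_layers=12)
```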
@@ -348,7 +328,7 @@ eval step: 0, Accuracy: 90.625
 eval step: 1, Accuracy: 81.25
 eval step: 2, Accuracy: 79.16666666666666
 ...
-The best Accuracy: 83.70860927152319
+The best Accuracy: 83.58388835685436
 ```
@@ -362,27 +342,13 @@ The best Accuracy: 83.70860927152319
 | ----------------- | :---------------------------------------------------- |
 | Model Version     | TernaryBERT                                            |
 | Resource          | NV SMX2 V100-32G                                       |
-| uploaded Date     | 08/20/2020                                             |
+| uploaded Date     | 02/01/2020                                             |
 | MindSpore Version | 1.1.0                                                  |
-| Dataset           | STS-B, QNLI, MNLI                                      |
-| batch_size        | 16, 16, 16                                             |
-| Metric value      | 87.58388835685437, 90.426505583013, 83.70860927152319  |
-| Speed             |                                                        |
-| Total time        |                                                        |
-
-### Inference Performance
-
-| Parameters        | GPU                                                    |
-| ----------------- | :---------------------------------------------------- |
-| Model Version     | TernaryBERT                                            |
-| Resource          | NV SMX2 V100-32G                                       |
-| uploaded Date     | 08/20/2020                                             |
-| MindSpore Version | 1.1.0                                                  |
-| Dataset           | STS-B, QNLI, MNLI                                      |
-| batch_size        | 32, 32, 32                                             |
-| Accuracy          | 87.58388835685437, 90.426505583013, 83.70860927152319  |
-| Speed             |                                                        |
-| Total time        |                                                        |
+| Dataset           | STS-B                                                  |
+| batch_size        | 16                                                     |
+| Metric value      | 87.5839                                                |
+| Speed             | 0.19s/step                                             |
+| Total time        | 6.7min(3epoch, 1p)                                     |
 
 # [Description of Random Situation](#contents)
--- eval.py
+++ eval.py
@@ -20,7 +20,7 @@ import re
 import argparse
 from mindspore import context
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from src.dataset import create_tinybert_dataset
+from src.dataset import create_dataset
 from src.config import eval_cfg, student_net_cfg, task_cfg
 from src.tinybert_model import BertModelCLS
@@ -66,15 +66,15 @@ def do_eval_standalone(args_opt):
     context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args.device_id)
 
-    eval_dataset = create_tinybert_dataset(batch_size=eval_cfg.batch_size,
-                                           device_num=1,
-                                           rank=0,
-                                           do_shuffle='false',
-                                           data_dir=eval_data_dir,
-                                           data_type=args_opt.dataset_type,
-                                           seq_length=task.seq_length,
-                                           task_type=task.task_type,
-                                           drop_remainder=False)
+    eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
+                                  device_num=1,
+                                  rank=0,
+                                  do_shuffle='false',
+                                  data_dir=eval_data_dir,
+                                  data_type=args_opt.dataset_type,
+                                  seq_length=task.seq_length,
+                                  task_type=task.task_type,
+                                  drop_remainder=False)
     print('eval dataset size:', eval_dataset.get_dataset_size())
     print('eval dataset batch size:', eval_dataset.get_batch_size())
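The README's evaluation log ("eval step: N, Accuracy: ...") reports a running accuracy over the batches of this dataset. A sketch of bookkeeping that would produce such a log for the classification tasks; `predict_fn` and the `label_ids` column name are assumptions, since the rest of `do_eval_standalone` is not part of this hunk:

```python
import numpy as np

# `predict_fn` stands in for the student network's forward pass (e.g. a
# BertModelCLS wrapper); `label_ids` is an assumed dataset column name.
correct, seen = 0, 0
for step, batch in enumerate(eval_dataset.create_dict_iterator(output_numpy=True)):
    logits = predict_fn(batch)
    preds = np.argmax(logits, axis=-1)
    correct += int((preds == batch['label_ids'].reshape(-1)).sum())
    seen += preds.shape[0]
    # Cumulative accuracy so far, matching the README's log format.
    print('eval step: {}, Accuracy: {}'.format(step, 100.0 * correct / seen))
```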
--- src/cell_wrapper.py
+++ src/cell_wrapper.py
@@ -313,7 +313,7 @@ class BertNetworkWithLoss(nn.Cell):
 class BertTrainWithLossScaleCell(nn.Cell):
     """
-    Especifically defined for finetuning where only four inputs tensor are needed.
+    Specifically defined for finetuning where only four input tensors are needed.
     """
     def __init__(self, network, optimizer, scale_update_cell=None):
         super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
@@ -333,6 +333,8 @@ class BertTrainWithLossScaleCell(nn.Cell):
         if self.reducer_flag:
             self.degree = get_group_size()
             self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
+        self.clip_type = gradient_cfg.clip_type
+        self.clip_value = gradient_cfg.clip_value
         self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
         self.cast = P.Cast()
         self.alloc_status = P.NPUAllocFloatStatus()
@@ -410,7 +412,7 @@
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
             grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
-        grads = self.hyper_map(F.partial(clip_grad, gradient_cfg.clip_type, gradient_cfg.clip_value), grads)
+        grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)
         restore = ()
         for i in range(self.length):
             restore = restore + (F.assign(weights[i], self.saved_params[i]),)
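Caching `gradient_cfg.clip_type` and `gradient_cfg.clip_value` on the cell at construction time means `construct` no longer reads a module-level config object during graph compilation, which keeps the cell self-contained; the same change is applied to `BertTrainCell` below. For reference, a `clip_grad` helper in the style these cells typically rely on across the MindSpore BERT model zoo, where `clip_type` 0 clips by value and 1 clips by norm; this is a sketch of the common pattern, not code quoted from this repository:

```python
from mindspore import nn
from mindspore.ops import composite as C
from mindspore.ops import functional as F

clip_grad = C.MultitypeFuncGraph("clip_grad")

@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """Clip one gradient tensor; any clip_type other than 0 or 1 is a no-op."""
    if clip_type != 0 and clip_type != 1:
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        # Clip each element into [-clip_value, clip_value].
        new_grad = C.clip_by_value(grad,
                                   F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        # Rescale the whole tensor so its L2 norm is at most clip_value.
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
```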
@@ -437,7 +439,7 @@
 class BertTrainCell(nn.Cell):
     """
-    Especifically defined for finetuning where only four inputs tensor are needed.
+    Specifically defined for finetuning where only four input tensors are needed.
     """
     def __init__(self, network, optimizer, sens=1.0):
         super(BertTrainCell, self).__init__(auto_prefix=False)
@@ -448,6 +450,8 @@
         self.sens = sens
         self.grad = C.GradOperation(get_by_list=True,
                                     sens_param=True)
+        self.clip_type = gradient_cfg.clip_type
+        self.clip_value = gradient_cfg.clip_value
         self.reducer_flag = False
         self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
         if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
@@ -514,7 +518,7 @@
         F.control_depend(input_ids, grads)
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
-        grads = self.hyper_map(F.partial(clip_grad, gradient_cfg.clip_type, gradient_cfg.clip_value), grads)
+        grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)
         restore = ()
         for i in range(self.length):
             restore = restore + (F.assign(weights[i], self.saved_params[i]),)
--- src/dataset.py
+++ src/dataset.py
@@ -27,8 +27,8 @@ class DataType(Enum):
     MINDRECORD = 2
 
-def create_tinybert_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None,
-                            data_type='tfrecord', seq_length=128, task_type=mstype.int32, drop_remainder=True):
+def create_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None,
+                   data_type='tfrecord', seq_length=128, task_type=mstype.int32, drop_remainder=True):
     """create tinybert dataset"""
     if isinstance(data_dir, list):
         data_files = data_dir
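The rename keeps the signature intact. For orientation, a sketch of what the tfrecord branch of such a helper usually looks like; the column names, sharding arguments, and helper name below are assumptions modeled on similar model-zoo dataset helpers, not code quoted from this diff:

```python
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C

def create_dataset_sketch(data_files, batch_size=32, device_num=1, rank=0,
                          do_shuffle="true", task_type=mstype.int32,
                          drop_remainder=True):
    """Illustrative tfrecord branch of a create_dataset-style helper."""
    columns = ["input_ids", "input_mask", "segment_ids", "label_ids"]  # assumed names
    dataset = ds.TFRecordDataset(data_files, columns_list=columns,
                                 shuffle=(do_shuffle == "true"),
                                 num_shards=device_num, shard_id=rank,
                                 shard_equal_rows=True)
    # Cast the label column to the task's dtype: int32 for classification
    # (QNLI, MNLI), float32 for regression (STS-B).
    dataset = dataset.map(operations=C.TypeCast(task_type),
                          input_columns=["label_ids"])
    return dataset.batch(batch_size, drop_remainder=drop_remainder)
```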
--- src/utils.py
+++ src/utils.py
@@ -89,12 +89,7 @@ class LossCallBack(Callback):
 class StepCallBack(Callback):
     """
-    Monitor the loss in training.
-    If the loss in NAN or INF terminating training.
-    Note:
-        if per_print_times is 0 do not print loss.
-    Args:
-        per_print_times (int): Print loss every times. Default: 1.
+    Monitor the time in training.
     """
     def __init__(self):
         super(StepCallBack, self).__init__()
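The new one-line docstring matches what the callback does: it times steps rather than watching the loss. A sketch of a per-step timer consistent with that docstring; the actual body of `StepCallBack` is not shown in this diff, so the method bodies here are an illustration:

```python
import time
from mindspore.train.callback import Callback

class StepCallBack(Callback):
    """Monitor the time in training."""
    def __init__(self):
        super(StepCallBack, self).__init__()
        self.step_start = 0.0

    def step_begin(self, run_context):
        # Record wall-clock time when the step starts.
        self.step_start = time.time()

    def step_end(self, run_context):
        # Report elapsed time per step, the basis of figures like "0.19s/step".
        cb_params = run_context.original_args()
        print('step: {}, time: {:.3f}s'.format(cb_params.cur_step_num,
                                               time.time() - self.step_start))
```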
--- train.py
+++ train.py
@@ -21,7 +21,7 @@ from mindspore import context
 from mindspore.train.model import Model
 from mindspore.nn.optim import AdamWeightDecay
 from mindspore import set_seed
-from src.dataset import create_tinybert_dataset
+from src.dataset import create_dataset
 from src.utils import StepCallBack, ModelSaveCkpt, EvalCallBack, BertLearningRate
 from src.config import train_cfg, eval_cfg, teacher_net_cfg, student_net_cfg, task_cfg
 from src.cell_wrapper import BertNetworkWithLoss, BertTrainCell
@@ -86,26 +86,26 @@ def run_task_distill(args_opt):
         rank = 0
         device_num = 1
 
-    train_dataset = create_tinybert_dataset(batch_size=train_cfg.batch_size,
-                                            device_num=device_num,
-                                            rank=rank,
-                                            do_shuffle=args_opt.do_shuffle,
-                                            data_dir=train_data_dir,
-                                            data_type=args_opt.dataset_type,
-                                            seq_length=task.seq_length,
-                                            task_type=task.task_type,
-                                            drop_remainder=True)
+    train_dataset = create_dataset(batch_size=train_cfg.batch_size,
+                                   device_num=device_num,
+                                   rank=rank,
+                                   do_shuffle=args_opt.do_shuffle,
+                                   data_dir=train_data_dir,
+                                   data_type=args_opt.dataset_type,
+                                   seq_length=task.seq_length,
+                                   task_type=task.task_type,
+                                   drop_remainder=True)
     dataset_size = train_dataset.get_dataset_size()
     print('train dataset size:', dataset_size)
 
-    eval_dataset = create_tinybert_dataset(batch_size=eval_cfg.batch_size,
-                                           device_num=device_num,
-                                           rank=rank,
-                                           do_shuffle=args_opt.do_shuffle,
-                                           data_dir=eval_data_dir,
-                                           data_type=args_opt.dataset_type,
-                                           seq_length=task.seq_length,
-                                           task_type=task.task_type,
-                                           drop_remainder=False)
+    eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
+                                  device_num=device_num,
+                                  rank=rank,
+                                  do_shuffle=args_opt.do_shuffle,
+                                  data_dir=eval_data_dir,
+                                  data_type=args_opt.dataset_type,
+                                  seq_length=task.seq_length,
+                                  task_type=task.task_type,
+                                  drop_remainder=False)
     print('eval dataset size:', eval_dataset.get_dataset_size())
     if args_opt.enable_data_sink == 'true':
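The hunk ends at the data-sink switch. When `enable_data_sink` is 'true', scripts in this family typically run `Model.train` in sink mode with `sink_size=data_sink_steps`, scaling the epoch count so the same total number of optimizer steps execute. A sketch under that assumption; `model` and `callbacks` are placeholders for objects assembled elsewhere in `run_task_distill`, which is not shown here:

```python
if args_opt.enable_data_sink == 'true':
    # Each Model.train() "epoch" sinks data_sink_steps steps, so scale the
    # epoch count to keep epoch_size full passes over the data.
    sink_epochs = args_opt.epoch_size * dataset_size // args_opt.data_sink_steps
    model.train(sink_epochs, train_dataset, callbacks=callbacks,
                dataset_sink_mode=True, sink_size=args_opt.data_sink_steps)
else:
    model.train(args_opt.epoch_size, train_dataset, callbacks=callbacks,
                dataset_sink_mode=False)
```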