diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md
index 48423e32f7..f8f7568fb6 100644
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@@ -381,6 +381,11 @@ epoch: 0.0, current epoch percent: 0.000, step: 2, outpus are (Tensor(shape=[1],
 > ```
 > This will extend the timeout limits of hccl from the default 120 seconds to 600 seconds.
+> **Attention** If you are running with a large BERT model, a protobuf error may occur while saving checkpoints; try setting the following environment variable.
+> ```
+> export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+> ```
+
 
 ### Distributed Training
 #### Running on Ascend
 ```
diff --git a/model_zoo/official/nlp/bert/src/utils.py b/model_zoo/official/nlp/bert/src/utils.py
index 77c71d2b88..6d6a6196b1 100644
--- a/model_zoo/official/nlp/bert/src/utils.py
+++ b/model_zoo/official/nlp/bert/src/utils.py
@@ -145,11 +145,17 @@ class LossCallBack(Callback):
         super(LossCallBack, self).__init__()
         self._dataset_size = dataset_size
     def step_end(self, run_context):
+        """
+        Print loss after each step.
+        """
         cb_params = run_context.original_args()
         if self._dataset_size > 0:
             percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
+            if percent == 0:
+                percent = 1
+                epoch_num -= 1
             print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
                   .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
-                  .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+                  .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
         else:
             print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num,
                                                                cb_params.cur_step_num,
                                                                str(cb_params.net_outputs)))
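
A minimal standalone sketch of the arithmetic behind the patched `step_end`, using a hypothetical helper `epoch_progress` (not part of the patch): `math.modf` splits the ratio of global step to steps-per-epoch into a fractional part (progress within the epoch) and an integer part (epoch index). At the last step of an epoch the ratio is a whole number, so `modf` returns a fractional part of 0 and an epoch index one past the epoch that just ended; the added branch reports that step as 100% of the finished epoch instead.

```python
import math

def epoch_progress(cur_step_num, dataset_size):
    # Hypothetical helper mirroring the patched LossCallBack.step_end logic.
    # Split (global step / steps per epoch) into fractional progress within
    # the epoch and a zero-based epoch index.
    percent, epoch_num = math.modf(cur_step_num / dataset_size)
    # At an epoch boundary the ratio is whole, so modf yields percent == 0.0
    # and epoch_num one past the epoch that just ended; report that step as
    # 100% of the finished epoch instead.
    if percent == 0:
        percent = 1
        epoch_num -= 1
    return int(epoch_num), percent

# With 100 steps per epoch:
print(epoch_progress(50, 100))   # (0, 0.5)  halfway through epoch 0
print(epoch_progress(100, 100))  # (0, 1)    end of epoch 0, not (1, 0.0)
print(epoch_progress(150, 100))  # (1, 0.5)  halfway through epoch 1
```

Without the boundary branch, the final step of epoch 0 would be logged as `epoch: 1, current epoch percent: 0.000`; the `int(epoch_num)` change additionally prints the epoch index as an integer rather than a float.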