
change epoch to an integer when printing the loss

tags/v1.0.0
chenhaozhe 5 years ago
commit 1a8dffe4c6
2 changed files with 12 additions and 1 deletion
  1. model_zoo/official/nlp/bert/README.md (+5, -0)
  2. model_zoo/official/nlp/bert/src/utils.py (+7, -1)

model_zoo/official/nlp/bert/README.md (+5, -0)

@@ -381,6 +381,11 @@ epoch: 0.0, current epoch percent: 0.000, step: 2, outputs are (Tensor(shape=[1],
> ```
> This will extend the timeout limits of hccl from the default 120 seconds to 600 seconds.

> **Attention** If you are running with a large BERT model, a protobuf error may occur while saving checkpoints; try setting the following environment variable.
> ```
> export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
> ```

### Distributed Training
#### Running on Ascend
```

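Note that the protobuf workaround only takes effect if the variable is set before protobuf is first imported. For reference, a minimal sketch of applying the same setting from inside a Python launcher instead of the shell:

```python
import os

# Force the pure-Python protobuf implementation, mirroring the README's
# `export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python`; this must run
# before any module that imports protobuf (e.g. checkpoint saving) is loaded.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
```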

model_zoo/official/nlp/bert/src/utils.py (+7, -1)

@@ -145,11 +145,17 @@ class LossCallBack(Callback):
        super(LossCallBack, self).__init__()
        self._dataset_size = dataset_size
    def step_end(self, run_context):
        """
        Print loss after each step
        """
        cb_params = run_context.original_args()
        if self._dataset_size > 0:
            percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
            if percent == 0:
                percent = 1
                epoch_num -= 1
            print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
-                 .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+                 .format(int(epoch_num), "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
        else:
            print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
                                                               str(cb_params.net_outputs)))
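The cast in the new `.format` call matters because `math.modf` returns both the fractional and the integral part as floats, so the epoch was previously logged as e.g. `0.0`. A short sketch of the callback's arithmetic, using hypothetical step counts:

```python
import math

# 250 steps into training with 100 steps per epoch (hypothetical numbers).
percent, epoch_num = math.modf(250 / 100)
print(percent, epoch_num)   # 0.5 2.0 -- modf returns floats for both parts
print(int(epoch_num))       # 2       -- the fix: cast the epoch before logging

# At an exact epoch boundary modf yields a zero fraction; the callback
# rewrites that as 100% of the previous epoch rather than 0% of the next.
percent, epoch_num = math.modf(200 / 100)
if percent == 0:
    percent = 1
    epoch_num -= 1
print("epoch: {}, current epoch percent: {}".format(int(epoch_num), "%.3f" % percent))
# epoch: 1, current epoch percent: 1.000
```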

