From 15d37e5db9d04d16a3ceb5a254540786977794fa Mon Sep 17 00:00:00 2001
From: chenhaozhe
Date: Thu, 18 Mar 2021 10:37:29 +0800
Subject: [PATCH] improve convergence of loss in bert

---
 model_zoo/official/cv/ssd/README.md                      | 5 +++--
 model_zoo/official/nlp/bert/src/bert_for_pre_training.py | 2 +-
 model_zoo/official/nlp/bert/src/bert_model.py            | 3 ++-
 model_zoo/official/nlp/bert/src/config.py                | 4 ++--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/model_zoo/official/cv/ssd/README.md b/model_zoo/official/cv/ssd/README.md
index 2c69d69d35..31525f3214 100644
--- a/model_zoo/official/cv/ssd/README.md
+++ b/model_zoo/official/cv/ssd/README.md
@@ -317,8 +317,9 @@ You can train your own model based on either pretrained classification model or
 
 1. Convert your own dataset to COCO or VOC style. Otherwise you have to add your own data preprocess code.
 2. Change config.py according to your own dataset, especially the `num_classes`.
-3. Set argument `filter_weight` to `True` while calling `train.py`, this will filter the final detection box weight from the pretrained model.
-4. Build your own bash scripts using new config and arguments for further convenient.
+3. Prepare a pretrained checkpoint. You can load the pretrained checkpoint via the `pre_trained` argument. Transfer training starts a new training job, so keep `pre_trained_epoch_size` at its default value of `0`.
+4. Set the argument `filter_weight` to `True` when calling `train.py`; this filters out the final detection box weights from the pretrained model.
+5. Build your own bash scripts with the new config and arguments for convenience.
 
 ### [Evaluation Process](#contents)
 
diff --git a/model_zoo/official/nlp/bert/src/bert_for_pre_training.py b/model_zoo/official/nlp/bert/src/bert_for_pre_training.py
index 1c15dcd0ec..aea0cd5357 100644
--- a/model_zoo/official/nlp/bert/src/bert_for_pre_training.py
+++ b/model_zoo/official/nlp/bert/src/bert_for_pre_training.py
@@ -599,7 +599,7 @@ class BertTrainAccumulationAllReducePostWithLossScaleCell(nn.Cell):
             scaling_sens = sens
         # alloc status and clear should be right before gradoperation
         init = self.alloc_status()
-        init = F.depend(loss, init)
+        init = F.depend(init, loss)
         clear_status = self.clear_status(init)
         scaling_sens = F.depend(scaling_sens, clear_status)
         # update accumulation parameters
diff --git a/model_zoo/official/nlp/bert/src/bert_model.py b/model_zoo/official/nlp/bert/src/bert_model.py
index 05dc6862c8..7fe4f8955d 100644
--- a/model_zoo/official/nlp/bert/src/bert_model.py
+++ b/model_zoo/official/nlp/bert/src/bert_model.py
@@ -804,7 +804,8 @@ class BertModel(nn.Cell):
         self.bert_embedding_lookup = nn.Embedding(
             vocab_size=config.vocab_size,
             embedding_size=self.embedding_size,
-            use_one_hot=use_one_hot_embeddings)
+            use_one_hot=use_one_hot_embeddings,
+            embedding_table=TruncatedNormal(config.initializer_range))
 
         self.bert_embedding_postprocessor = EmbeddingPostprocessor(
             embedding_size=self.embedding_size,
diff --git a/model_zoo/official/nlp/bert/src/config.py b/model_zoo/official/nlp/bert/src/config.py
index b1d757f1f1..68a5f0634a 100644
--- a/model_zoo/official/nlp/bert/src/config.py
+++ b/model_zoo/official/nlp/bert/src/config.py
@@ -36,9 +36,9 @@ cfg = edict({
         'warmup_steps': 10000,
     }),
     'Lamb': edict({
-        'learning_rate': 3e-5,
+        'learning_rate': 3e-4,
         'end_learning_rate': 0.0,
-        'power': 5.0,
+        'power': 2.0,
         'warmup_steps': 10000,
         'weight_decay': 0.01,
         'decay_filter': lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
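
For context on the `filter_weight` step in the README hunk above: when fine-tuning on a dataset with a different `num_classes`, the prediction-head parameters have incompatible shapes and must be dropped before loading. A minimal sketch of that pattern, assuming hypothetical head-parameter name patterns (the real `train.py` has its own filter list):

```python
from mindspore.train.serialization import load_checkpoint, load_param_into_net

def load_pretrained(network, ckpt_path, filter_weight=True):
    """Load a pretrained checkpoint, optionally dropping detection-head weights."""
    param_dict = load_checkpoint(ckpt_path)
    if filter_weight:
        # Hypothetical name patterns for the box/class prediction layers;
        # their shapes depend on num_classes, so they cannot be reused.
        head_patterns = ("multi_loc_layers", "multi_cls_layers")
        param_dict = {name: param for name, param in param_dict.items()
                      if not any(p in name for p in head_patterns)}
    load_param_into_net(network, param_dict)
```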
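
The `bert_for_pre_training.py` change is an argument-order bug fix. `F.depend(value, expr)` returns `value` and only attaches `expr` as a control dependency, so the old `F.depend(loss, init)` replaced `init` with the loss tensor and `clear_status` never received the allocated status buffer. A quick demonstration of the return-value semantics:

```python
import mindspore as ms
from mindspore import Tensor
from mindspore.ops import functional as F

a = Tensor(1.0, ms.float32)
b = Tensor(2.0, ms.float32)

# depend(value, expr) returns `value`; `expr` only becomes an execution-order
# dependency that must run before `value` is consumed downstream.
out = F.depend(a, b)
print(out)  # 1.0 -- the first argument comes back, never the second
```

With the fixed order, `init` still refers to the status buffer, and the alloc/clear pair is guaranteed to execute after the loss computation, i.e. right before the gradient operation as the comment in the code intends.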
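
The `bert_model.py` change passes an explicit initializer for the word-embedding table so it matches BERT's truncated-normal initialization (sigma = `initializer_range`, typically 0.02) instead of `nn.Embedding`'s default. A standalone sketch with illustrative sizes:

```python
from mindspore import nn
from mindspore.common.initializer import TruncatedNormal

initializer_range = 0.02  # the usual BERT config value
embedding = nn.Embedding(
    vocab_size=30522,     # illustrative; comes from config.vocab_size
    embedding_size=768,
    use_one_hot=False,
    embedding_table=TruncatedNormal(initializer_range))

# The table is now drawn from a truncated normal with sigma ~= 0.02,
# consistent with the rest of the model's weight initialization.
print(float(embedding.embedding_table.asnumpy().std()))
```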
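
The `config.py` change raises the peak Lamb learning rate (3e-5 to 3e-4) and softens the polynomial decay (power 5.0 to 2.0), which keeps the learning rate higher for longer after the 10k-step warmup. A sketch of the schedule these fields describe, assuming the standard warmup-then-polynomial-decay formula (the training script's exact implementation may differ):

```python
def lamb_lr(step, total_steps, lr=3e-4, end_lr=0.0, power=2.0, warmup_steps=10000):
    """Linear warmup to `lr`, then polynomial decay toward `end_lr`."""
    if step < warmup_steps:
        return lr * step / warmup_steps
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return (lr - end_lr) * (1.0 - progress) ** power + end_lr

# Peak 3e-4 at step 10k, quadratic decay to 0 by the final step.
for s in (0, 5_000, 10_000, 50_000, 100_000):
    print(s, f"{lamb_lr(s, 100_000):.2e}")
```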