From 9a45c4419cdcd12405ec59a1195a71447a969ba6 Mon Sep 17 00:00:00 2001
From: lilei <lilei120@huawei.com>
Date: Sat, 9 Jan 2021 18:49:37 +0800
Subject: [PATCH] Unify Ascend BatchNorm training path and remove FusedLayerNorm

---
 .../ascend/ascend_backend_optimization.cc     |  12 +-
 .../backend/optimizer/ascend/ascend_helper.cc |   5 +-
 .../backend/optimizer/ascend/ascend_helper.h  |   3 +-
 .../format_type/deal_ref_trans_and_cast.cc    |   7 +-
 .../ascend/format_type/insert_cast.cc         |   8 +-
 .../ascend/ir_fission/bn_grad_split.cc        |   6 +-
 .../optimizer/ascend/ir_fission/bn_split.cc   |  10 +-
 .../backend/session/anf_runtime_algorithm.cc  |   3 +
 mindspore/nn/layer/normalization.py           |  29 +---
 mindspore/nn/layer/quant.py                   |  29 +---
 mindspore/ops/operations/nn_ops.py            |   3 +-
 .../official/nlp/bert_thor/src/bert_model.py  |   7 +-
 .../nlp/bert_thor/src/fused_layer_norm.py     | 127 ------------------
 .../st/networks/models/bert/src/bert_model.py |   7 +-
 .../models/bert/src/fused_layer_norm.py       | 120 -----------------
 .../ascend/ir_fission/bn_grad_split_test.cc   |   6 +-
 .../gtest_input/pre_activate/bn_grad_split.py |   2 +-
 .../gtest_input/pre_activate/bn_split.py      |   2 +-
 .../parallel/test_batchnorm_batch_parallel.py |   2 +-
 .../parallel/test_operator_model_parallel.py  |   2 +-
 20 files changed, 58 insertions(+), 332 deletions(-)
 delete mode 100644 model_zoo/official/nlp/bert_thor/src/fused_layer_norm.py
 delete mode 100644 tests/st/networks/models/bert/src/fused_layer_norm.py
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
index aaa526cf6e..7f690a9310 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
@@ -155,7 +155,6 @@ void AddAscendIRFusionRulesPass(PassManager *ir_fusion_pm) {
 
 void AddAscendIRFusionPass(PassManager *ir_fusion_pm) {
   MS_EXCEPTION_IF_NULL(ir_fusion_pm);
-  ir_fusion_pm->AddPass(std::make_shared<BatchNormBertFission>());
   ir_fusion_pm->AddPass(std::make_shared<SingleBatchNormFission>());
   ir_fusion_pm->AddPass(std::make_shared<BatchNorm2BNInfer>());
   ir_fusion_pm->AddPass(std::make_shared<BatchNormGrad2BNInferGrad>());
@@ -270,15 +269,8 @@ void AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGrap
   }
   auto optimizer = std::make_shared<GraphOptimizer>();
   auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm");
-  if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
-    ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
-    ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
-  } else {
-    ir_fusion_pm->AddPass(std::make_shared<BatchNormGradSplit>());
-    ir_fusion_pm->AddPass(std::make_shared<FusedBatchNormFusion>());
-    ir_fusion_pm->AddPass(std::make_shared<FusedBatchNormMixPrecisionFusion0>());
-    ir_fusion_pm->AddPass(std::make_shared<FusedBatchNormMixPrecisionFusion1>());
-  }
+  ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
+  ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
   ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>());
   ir_fusion_pm->AddPass(std::make_shared<InsertPadForNMSWithMask>());
   ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicGRUV2>());
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
index ab96c6af11..84fc8aa6e0 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
@@ -262,7 +262,8 @@ CNodePtr NewTransOpNode(const FuncGraphPtr &func_graph, const AnfNodePtr &input,
 
 CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &input, const std::string &format,
                               const TypeId &input_type, const TypeId &output_type,
-                              const std::vector<size_t> &origin_shape, const TypeId &origin_type) {
+                              const std::vector<size_t> &origin_shape, const TypeId &origin_type,
+                              const std::vector<Axis> &reshape_type) {
   MS_EXCEPTION_IF_NULL(func_graph);
   std::string input_format = format;
   std::string output_format = format;
@@ -272,6 +273,8 @@ CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &
   kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
   builder.SetInputsFormat({input_format});
   builder.SetOutputsFormat({output_format});
+  builder.SetInputsReshapeType({reshape_type});
+  builder.SetOutputsReshapeType({reshape_type});
   builder.SetInputsDeviceType({input_type});
   builder.SetOutputsDeviceType({output_type});
   builder.SetFusionType(kernel::FusionType::OPAQUE);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.h b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.h
index c4901a7b10..872d024810 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.h
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.h
@@ -96,7 +96,8 @@ CNodePtr NewTransOpNode(const FuncGraphPtr &func_graph, const AnfNodePtr &input,
 
 CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &input, const std::string &format,
                               const TypeId &input_type, const TypeId &output_type,
-                              const std::vector<size_t> &origin_shape, const TypeId &origin_type);
+                              const std::vector<size_t> &origin_shape, const TypeId &origin_type,
+                              const std::vector<Axis> &reshape_type = std::vector<Axis>{});
 
 AnfNodePtr InsertTransOpForInput(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
                                  const KernelSelectPtr &kernel_select);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_trans_and_cast.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_trans_and_cast.cc
index 713fb70274..20b900da45 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_trans_and_cast.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_trans_and_cast.cc
@@ -143,7 +143,12 @@ CNodePtr DealRefTransAndCast::AddAdditionalToRefOutput(const FuncGraphPtr &func_
   }
   // insert depend
   if (origin_format != cur_format || origin_type != cur_type) {
-    std::vector<AnfNodePtr> depend_nodes{NewValueNode(prim::kPrimDepend), cnode, final_node};
+    std::vector<AnfNodePtr> depend_nodes;
+    if (get_item.get() != nullptr) {
+      depend_nodes = std::vector<AnfNodePtr>{NewValueNode(prim::kPrimDepend), get_item, final_node};
+    } else {
+      depend_nodes = std::vector<AnfNodePtr>{NewValueNode(prim::kPrimDepend), cnode, final_node};
+    }
     final_node = func_graph->NewCNode(depend_nodes);
     MS_LOG(INFO) << "DealRefTranshwAndCast add denpend, op debug info is " << final_node->DebugString();
   }
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc
index f30f25d07e..ed2dd6ffcc 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc
@@ -58,8 +58,8 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
       origin_type = origin_type == kTypeUnknown ? infer_type : origin_type;
       const auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, output_idx);
       if (origin_type != device_type) {
-        replace_node =
-          AddCastOpNodeToGraph(func_graph, getitem, dev_fmt, device_type, origin_type, origin_shape, infer_type);
+        replace_node = AddCastOpNodeToGraph(func_graph, getitem, dev_fmt, device_type, origin_type, origin_shape,
+                                            infer_type, AnfAlgo::GetOutputReshapeType(getitem, 0));
         MS_EXCEPTION_IF_NULL(replace_node);
         replace_node->set_scope(cnode->scope());
         AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
@@ -107,8 +107,8 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
     const TypeId device_type = AnfAlgo::GetOutputDeviceDataType(cnode, 0);
     AnfNodePtr replace_node = cnode;
     if (origin_type != device_type) {
-      replace_node =
-        AddCastOpNodeToGraph(func_graph, cnode, dev_fmt, device_type, origin_type, origin_shape, infer_type);
+      replace_node = AddCastOpNodeToGraph(func_graph, cnode, dev_fmt, device_type, origin_type, origin_shape,
+                                          infer_type, AnfAlgo::GetOutputReshapeType(cnode, 0));
       MS_EXCEPTION_IF_NULL(replace_node);
       replace_node->set_scope(cnode->scope());
       AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_grad_split.cc b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_grad_split.cc
index 159e832b3b..8a1d7d3a7c 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_grad_split.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_grad_split.cc
@@ -114,12 +114,16 @@ CNodePtr BNGradSplitForTBE(const FuncGraphPtr &func_graph, const CNodePtr &cnode
 
 const BaseRef BnGradSplit::DefinePattern() const {
   VarPtr Xs = std::make_shared<SeqVar>();
-  return VectorRef({prim::kPrimFusedBatchNormGrad, Xs});
+  return VectorRef({prim::kPrimBatchNormGrad, Xs});
 }
 
 const AnfNodePtr BnGradSplit::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &) const {
   MS_EXCEPTION_IF_NULL(node);
   auto cnode = node->cast<CNodePtr>();
+  if (!GetBoolAttr(cnode, kAttrIsTraining)) {
+    MS_LOG(INFO) << "is training should be true if do fusion";
+    return nullptr;
+  }
   return BNGradSplitForTBE(func_graph, cnode);
 }
 }  // namespace opt
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_split.cc b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_split.cc
index 044b3d925a..2b36a94733 100644
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_split.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_split.cc
@@ -96,7 +96,7 @@ AnfNodePtr CreateOutputsOfBNTrainingUpdate(const FuncGraphPtr &graph, const CNod
   return bn_training_update;
 }
 
-AnfNodePtr SplitFusedBatchNormForTBE(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
+AnfNodePtr SplitBatchNormForTBE(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
   MS_EXCEPTION_IF_NULL(func_graph);
   MS_EXCEPTION_IF_NULL(node);
 
@@ -125,11 +125,15 @@ AnfNodePtr SplitFusedBatchNormForTBE(const FuncGraphPtr &func_graph, const AnfNo
 const BaseRef BnSplit::DefinePattern() const {
   VarPtr Xs = std::make_shared<SeqVar>();
   MS_EXCEPTION_IF_NULL(Xs);
-  return VectorRef({prim::kPrimFusedBatchNorm, Xs});
+  return VectorRef({prim::kPrimBatchNorm, Xs});
 }
 
 const AnfNodePtr BnSplit::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &) const {
-  return SplitFusedBatchNormForTBE(func_graph, node);
+  if (!GetBoolAttr(node, kAttrIsTraining)) {
+    MS_LOG(INFO) << "is training should be true if do fusion";
+    return nullptr;
+  }
+  return SplitBatchNormForTBE(func_graph, node);
 }
 }  // namespace opt
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
index 390331b941..f1962739cf 100644
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
@@ -483,6 +483,9 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod
     MS_LOG(EXCEPTION) << anf_node->DebugString() << "anf_node is not CNode."
                       << " trace: " << trace::DumpSourceLines(anf_node);
   }
+  if (CheckPrimitiveType(anf_node, prim::kPrimTupleGetItem)) {
+    return VisitKernelWithReturnType(anf_node, 0, visit_nop_node);
+  }
   auto input_node = AnfAlgo::GetInputNode(anf_node->cast<CNodePtr>(), input_idx);
   MS_EXCEPTION_IF_NULL(input_node);
   return VisitKernelWithReturnType(input_node, 0, visit_nop_node);
diff --git a/mindspore/nn/layer/normalization.py b/mindspore/nn/layer/normalization.py
index 0d1bae5a19..bd90d0a7ea 100644
--- a/mindspore/nn/layer/normalization.py
+++ b/mindspore/nn/layer/normalization.py
@@ -87,8 +87,7 @@ class _BatchNorm(Cell):
         self.cast = P.Cast()
         self.dtype = P.DType()
         self.reshape = P.Reshape()
-        self.is_ascend = context.get_context("device_target") == "Ascend"
-        self.is_gpu = context.get_context("device_target") == "GPU"
+        self._target = context.get_context("device_target")
         self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
         self.momentum = 1.0 - momentum
         if context.get_context("enable_ge"):
@@ -96,22 +95,22 @@ class _BatchNorm(Cell):
         else:
             self.is_ge_backend = False
 
-        if self.is_graph_mode and (self.is_ge_backend or self.is_ascend):
+        if self._target == "Ascend":
             self.bn_train = P.BatchNorm(is_training=True,
-                                        epsilon=self.eps)
-        elif self.is_gpu:
+                                        epsilon=self.eps,
+                                        momentum=self.momentum)
+        if self._target == "GPU":
             self.bn_train = P.FusedBatchNormEx(mode=1,
                                                epsilon=self.eps,
                                                momentum=self.momentum,
                                                data_format=self.format)
-        else:
+        if self._target == "CPU":
             self.bn_train = P.FusedBatchNorm(mode=1,
                                              epsilon=self.eps,
                                              momentum=self.momentum)
         self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps, data_format=self.format)
-        self.enable_global_sync = self.is_global and (self.is_ge_backend or (self.is_graph_mode and self.is_ascend))
+        self.enable_global_sync = self.is_global and \
+                                  (self.is_ge_backend or (self.is_graph_mode and self._target == "Ascend"))
-        self.enable_default_train = self.is_graph_mode and not self.is_global and \
-                                    (self.is_ge_backend or self.is_ascend)
 
         data_parallel_strategy = ((1,), (1,))
         data_parallel_strategy_one = ((1,), ())
@@ -168,21 +167,6 @@ class _BatchNorm(Cell):
                 axes, re_shape = _shape_infer(F.shape(x), self.num_features)
                 return self._global_sync(x, axes, re_shape)
 
-            if self.enable_default_train:
-                y, batch_mean, batch_var, _, _ = self.bn_train(x,
-                                                               self.gamma,
-                                                               self.beta,
-                                                               None,
-                                                               None)
-
-                mean_sub = self.sub_mean(self.moving_mean, batch_mean)
-                temp_mean = self.mul_mean(mean_sub, self.momentum)
-                mean_sub2 = self.sub_var(self.moving_variance, batch_var)
-                temp_variance = self.mul_var(mean_sub2, self.momentum)
-                y = F.depend(y, self.assign_sub_mean(self.moving_mean, temp_mean))
-                y = F.depend(y, self.assign_sub_var(self.moving_variance, temp_variance))
-                return y
-
             return self.bn_train(x,
                                  self.gamma,
                                  self.beta,
diff --git a/mindspore/nn/layer/quant.py b/mindspore/nn/layer/quant.py
index d0dcd153c5..c18df0aca2 100644
--- a/mindspore/nn/layer/quant.py
+++ b/mindspore/nn/layer/quant.py
@@ -426,15 +426,14 @@ class Conv2dBnFoldQuantOneConv(Cell):
         self.quant_dtype = quant_dtype
         data_format = 'NCHW'
         self.format = Validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.cls_name)
-        self.is_gpu = context.get_context('device_target') == "GPU"
-        self.is_ascend = context.get_context('device_target') == "Ascend"
+        self._target = context.get_context("device_target")
         self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
         if context.get_context("enable_ge"):
             self.is_ge_backend = True
         else:
             self.is_ge_backend = False
         self.enable_default_train = self.is_graph_mode and \
-                                    (self.is_ge_backend or self.is_ascend)
+                                    (self.is_ge_backend or self._target == "Ascend")
 
         # initialize convolution op and Parameter
         self.conv = P.Conv2D(out_channel=out_channels,
@@ -468,15 +467,16 @@ class Conv2dBnFoldQuantOneConv(Cell):
                                                      channel_axis=channel_axis,
                                                      num_channels=out_channels,
                                                      quant_dtype=quant_dtype)
-        if self.is_graph_mode and (self.is_ge_backend or self.is_ascend):
+        if self._target == "Ascend":
             self.bn_train = P.BatchNorm(is_training=True,
-                                        epsilon=self.eps)
-        elif self.is_gpu:
+                                        epsilon=self.eps,
+                                        momentum=self.momentum)
+        if self._target == "GPU":
             self.bn_train = P.FusedBatchNormEx(mode=1,
                                                epsilon=self.eps,
                                                momentum=self.momentum,
                                                data_format=self.format)
-        else:
+        if self._target == "CPU":
             self.bn_train = P.FusedBatchNorm(mode=1,
                                              epsilon=self.eps,
                                              momentum=self.momentum)
@@ -520,21 +520,6 @@ class Conv2dBnFoldQuantOneConv(Cell):
         else:
             conv_orig = conv / scale_factor
         if self.training:
-            if self.enable_default_train:
-                out, batch_mean, batch_var, _, _ = self.bn_train(conv_orig,
-                                                                 self.gamma,
-                                                                 self.beta,
-                                                                 None,
-                                                                 None)
-
-                mean_sub = self.sub_mean(self.moving_mean, batch_mean)
-                temp_mean = self.mul_mean(mean_sub, self.momentum)
-                mean_sub2 = self.sub_var(self.moving_variance, batch_var)
-                temp_variance = self.mul_var(mean_sub2, self.momentum)
-                out = F.depend(out, self.assign_sub_mean(self.moving_mean, temp_mean))
-                out = F.depend(out, self.assign_sub_var(self.moving_variance, temp_variance))
-                return out
-
             return self.bn_train(conv_orig,
                                  self.gamma,
                                  self.beta,
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index 4337eeef01..ab148af807 100644
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -1058,9 +1058,10 @@ class BatchNorm(PrimitiveWithInfer):
     """
 
     @prim_attr_register
-    def __init__(self, is_training=False, epsilon=1e-5, data_format="NCHW"):
+    def __init__(self, is_training=False, epsilon=1e-5, momentum=0.1, data_format="NCHW"):
         validator.check_value_type('is_training', is_training, (bool,), self.name)
         validator.check_float_range(epsilon, 0, 1, Rel.INC_RIGHT, 'epsilon', self.name)
+        validator.check_float_range(momentum, 0, 1, Rel.INC_BOTH, 'momentum', self.name)
         self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.name)
         if context.get_context("device_target") != "GPU" and self.format == "NHWC":
             raise ValueError("NHWC format only support in GPU target.")
diff --git a/model_zoo/official/nlp/bert_thor/src/bert_model.py b/model_zoo/official/nlp/bert_thor/src/bert_model.py
index d6e32e0158..1845c474b1 100644
--- a/model_zoo/official/nlp/bert_thor/src/bert_model.py
+++ b/model_zoo/official/nlp/bert_thor/src/bert_model.py
@@ -28,7 +28,6 @@ from mindspore.common.tensor import Tensor
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
 from .config import cfg
-from .fused_layer_norm import FusedLayerNorm
 from .lr_generator import get_bert_damping
 from .thor_layer import Dense_Thor, Embedding_Thor
 
@@ -277,11 +276,7 @@ class BertOutput(nn.Cell):
         self.dropout = nn.Dropout(1 - dropout_prob)
         self.dropout_prob = dropout_prob
         self.add = P.TensorAdd()
-        if compute_type == mstype.float16:
-            self.layernorm = FusedLayerNorm((out_channels,),
-                                            use_batch_norm=enable_fused_layernorm).to_float(compute_type)
-        else:
-            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
+        self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
         self.cast = P.Cast()
 
     def construct(self, hidden_status, input_tensor):
diff --git a/model_zoo/official/nlp/bert_thor/src/fused_layer_norm.py b/model_zoo/official/nlp/bert_thor/src/fused_layer_norm.py
deleted file mode 100644
index 0932625b6f..0000000000
--- a/model_zoo/official/nlp/bert_thor/src/fused_layer_norm.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""fused layernorm"""
-import numpy as np
-
-import mindspore.common.dtype as mstype
-from mindspore.common.initializer import initializer
-from mindspore.common.parameter import Parameter
-from mindspore.nn.cell import Cell
-from mindspore.ops import functional as F
-from mindspore.ops import operations as P
-from mindspore.ops.primitive import constexpr
-
-__all__ = ['FusedLayerNorm']
-
-
-@constexpr
-def get_shape_for_norm(x_shape, begin_norm_axis):
-    print("input_shape: ", x_shape)
-    norm_shape = x_shape[begin_norm_axis:]
-    output_shape = (1, -1, 1, int(np.prod(norm_shape)))
-    print("output_shape: ", output_shape)
-    return output_shape
-
-
-class FusedLayerNorm(Cell):
-    r"""
-    Applies Layer Normalization over a mini-batch of inputs.
-
-    Layer normalization is widely used in recurrent neural networks. It applies
-    normalization over a mini-batch of inputs for each single training case as described
-    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
-    normalization, layer normalization performs exactly the same computation at training and
-    testing times. It can be described using the following formula. It is applied across all channels
-    and pixel but only one batch size.
-
-    .. math::
-        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
-
-    Args:
-        normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis
-            `begin_norm_axis ... R - 1`.
-        begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions
-            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
-        begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters
-            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
-            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
-        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'ones'.
-        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'zeros'.
-        use_batch_nrom (bool): Whether use batchnorm to preocess.
-
-    Inputs:
-        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
-          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.
-
-    Outputs:
-        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.
-
-    Examples:
-        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
-        >>> shape1 = x.shape[1:]
-        >>> m = nn.LayerNorm(shape1,  begin_norm_axis=1, begin_params_axis=1)
-        >>> m(x)
-    """
-
-    def __init__(self,
-                 normalized_shape,
-                 begin_norm_axis=-1,
-                 begin_params_axis=-1,
-                 gamma_init='ones',
-                 beta_init='zeros',
-                 use_batch_norm=False):
-        super(FusedLayerNorm, self).__init__()
-        if not isinstance(normalized_shape, (tuple, list)):
-            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
-                            .format(normalized_shape, type(normalized_shape)))
-        self.normalized_shape = normalized_shape
-        self.begin_norm_axis = begin_norm_axis
-        self.begin_params_axis = begin_params_axis
-        self.gamma = Parameter(initializer(
-            gamma_init, normalized_shape))
-        self.beta = Parameter(initializer(
-            beta_init, normalized_shape))
-        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
-
-        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
-        self.use_batch_norm = use_batch_norm
-        self.mul = P.Mul()
-        self.add = P.TensorAdd()
-
-    def construct(self, input_x):
-        """construct of FusedLayerNorm"""
-        if self.use_batch_norm and self.training:
-            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
-            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
-            shape_x = F.shape(input_x)
-            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
-            input_x = F.reshape(input_x, norm_shape)
-            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
-            output = F.reshape(output, shape_x)
-            y = self.mul(output, self.gamma)
-            y = self.add(y, self.beta)
-        else:
-            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
-        return y
-
-    def extend_repr(self):
-        """Display instance object as string."""
-        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma{}, beta={}'.format(
-            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
-        return s
diff --git a/tests/st/networks/models/bert/src/bert_model.py b/tests/st/networks/models/bert/src/bert_model.py
index 77bdb51198..085e695e41 100644
--- a/tests/st/networks/models/bert/src/bert_model.py
+++ b/tests/st/networks/models/bert/src/bert_model.py
@@ -25,7 +25,6 @@ from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
-from .fused_layer_norm import FusedLayerNorm
 
 
 class BertConfig:
@@ -251,11 +250,7 @@ class BertOutput(nn.Cell):
         self.dropout = nn.Dropout(1 - dropout_prob)
         self.dropout_prob = dropout_prob
         self.add = P.TensorAdd()
-        if compute_type == mstype.float16:
-            self.layernorm = FusedLayerNorm((out_channels,),
-                                            use_batch_norm=enable_fused_layernorm).to_float(compute_type)
-        else:
-            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
+        self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
         self.cast = P.Cast()
 
     def construct(self, hidden_status, input_tensor):
diff --git a/tests/st/networks/models/bert/src/fused_layer_norm.py b/tests/st/networks/models/bert/src/fused_layer_norm.py
deleted file mode 100644
index 0e5c19f654..0000000000
--- a/tests/st/networks/models/bert/src/fused_layer_norm.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""fused layernorm"""
-import numpy as np
-from mindspore.ops import operations as P
-from mindspore.ops import functional as F
-from mindspore.common.parameter import Parameter
-from mindspore.common.initializer import initializer
-from mindspore.ops.primitive import constexpr
-import mindspore.common.dtype as mstype
-from mindspore.nn.cell import Cell
-
-
-__all__ = ['FusedLayerNorm']
-
-@constexpr
-def get_shape_for_norm(x_shape, begin_norm_axis):
-    print("input_shape: ", x_shape)
-    norm_shape = x_shape[begin_norm_axis:]
-    output_shape = (1, -1, 1, int(np.prod(norm_shape)))
-    print("output_shape: ", output_shape)
-    return output_shape
-
-class FusedLayerNorm(Cell):
-    r"""
-    Applies Layer Normalization over a mini-batch of inputs.
-
-    Layer normalization is widely used in recurrent neural networks. It applies
-    normalization over a mini-batch of inputs for each single training case as described
-    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
-    normalization, layer normalization performs exactly the same computation at training and
-    testing times. It can be described using the following formula. It is applied across all channels
-    and pixel but only one batch size.
-
-    .. math::
-        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
-
-    Args:
-        normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis
-            `begin_norm_axis ... R - 1`.
-        begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions
-            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
-        begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters
-            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
-            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
-        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'ones'.
-        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'zeros'.
-        use_batch_nrom (bool): Whether use batchnorm to preocess.
-
-    Inputs:
-        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
-          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.
-
-    Outputs:
-        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.
-
-    Examples:
-        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
-        >>> shape1 = x.shape[1:]
-        >>> m = nn.LayerNorm(shape1,  begin_norm_axis=1, begin_params_axis=1)
-        >>> m(x)
-    """
-    def __init__(self,
-                 normalized_shape,
-                 begin_norm_axis=-1,
-                 begin_params_axis=-1,
-                 gamma_init='ones',
-                 beta_init='zeros',
-                 use_batch_norm=False):
-        super(FusedLayerNorm, self).__init__()
-        if not isinstance(normalized_shape, (tuple, list)):
-            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
-                            .format(normalized_shape, type(normalized_shape)))
-        self.normalized_shape = normalized_shape
-        self.begin_norm_axis = begin_norm_axis
-        self.begin_params_axis = begin_params_axis
-        self.gamma = Parameter(initializer(
-            gamma_init, normalized_shape), name="gamma")
-        self.beta = Parameter(initializer(
-            beta_init, normalized_shape), name="beta")
-        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
-
-        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
-        self.use_batch_norm = use_batch_norm
-
-    def construct(self, input_x):
-        if self.use_batch_norm and self.training:
-            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
-            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
-            shape_x = F.shape(input_x)
-            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
-            input_x = F.reshape(input_x, norm_shape)
-            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
-            output = F.reshape(output, shape_x)
-            y = output * self.gamma + self.beta
-        else:
-            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
-        return y
-
-    def extend_repr(self):
-        """Display instance object as string."""
-        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma{}, beta={}'.format(
-            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
-        return s
diff --git a/tests/ut/cpp/pre_activate/ascend/ir_fission/bn_grad_split_test.cc b/tests/ut/cpp/pre_activate/ascend/ir_fission/bn_grad_split_test.cc
index dbce1c5f66..455613bc61 100644
--- a/tests/ut/cpp/pre_activate/ascend/ir_fission/bn_grad_split_test.cc
+++ b/tests/ut/cpp/pre_activate/ascend/ir_fission/bn_grad_split_test.cc
@@ -81,10 +81,12 @@ TEST_F(TestHWBnGradSplit, test_bn_grad_split_tbe) {
   kernel::KernelBuildInfo::KernelBuildInfoBuilder builder1;
   builder1.SetInputsFormat(
     {kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0});
-  builder1.SetOutputsFormat({kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0});
+  builder1.SetOutputsFormat(
+    {kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0});
   builder1.SetInputsDeviceType(
     {kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32});
-  builder1.SetOutputsDeviceType({kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32});
+  builder1.SetOutputsDeviceType(
+    {kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32});
   builder1.SetKernelType(TBE_KERNEL);
   AnfAlgo::SetSelectKernelBuildInfo(builder1.Build(), bn_grad.get());
   // do bn_grad_split pass
diff --git a/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_grad_split.py b/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_grad_split.py
index 614e139d3c..e44bb0ab8a 100644
--- a/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_grad_split.py
+++ b/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_grad_split.py
@@ -18,7 +18,7 @@ from mindspore.ops.operations import _grad_ops as G
 
 make_tuple = Primitive('make_tuple')
 tuple_getitem = Primitive('tuple_getitem')
-bn_grad = G.FusedBatchNormGrad()
+bn_grad = G.BatchNormGrad(is_training=True)
 bn_grad1 = Primitive('BNGrad1')
 bn_grad2 = Primitive('BNGrad2')
 bn_grad3 = Primitive('BNGrad3')
diff --git a/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_split.py b/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_split.py
index 22059f9ca7..a801ce6145 100644
--- a/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_split.py
+++ b/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_split.py
@@ -18,7 +18,7 @@ from mindspore.ops import operations as P
 
 make_tuple = Primitive('make_tuple')
 tuple_getitem = Primitive('tuple_getitem')
-bn = P.FusedBatchNorm()
+bn = P.BatchNorm(is_training=True)
 fused_bn1 = Primitive('FusedBN1')
 fused_bn2 = Primitive('FusedBN2')
 fused_bn3 = Primitive('FusedBN3')
diff --git a/tests/ut/python/parallel/test_batchnorm_batch_parallel.py b/tests/ut/python/parallel/test_batchnorm_batch_parallel.py
index 4927c26bbd..21214b3295 100644
--- a/tests/ut/python/parallel/test_batchnorm_batch_parallel.py
+++ b/tests/ut/python/parallel/test_batchnorm_batch_parallel.py
@@ -32,7 +32,7 @@ from tests.dataset_mock import MindData
 
 dev_num = 8
 strategy_weight = ((dev_num, 1, 1, 1), (1, 1, 1, 1))
-strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,))
+strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,), (1,), (1,))
 strategy_fc_weight_bias = ((dev_num, 1), (1, 1), (1,))
 
 
diff --git a/tests/ut/python/parallel/test_operator_model_parallel.py b/tests/ut/python/parallel/test_operator_model_parallel.py
index 340247cff4..45946f7e19 100644
--- a/tests/ut/python/parallel/test_operator_model_parallel.py
+++ b/tests/ut/python/parallel/test_operator_model_parallel.py
@@ -37,7 +37,7 @@ dev_num = 8
 strategy_no_weight = ((dev_num, 1, 1, 1),)
 strategy_weight = ((dev_num, 1, 1, 1), (1, 1, 1, 1))
 strategy_add = ((dev_num, 1, 1, 1), (dev_num, 1, 1, 1))
-strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,))
+strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,), (1,), (1,))
 
 strategy_fc_weight_nobias = ((1, dev_num), (1, dev_num))
 strategy_tensor_add = ((1, dev_num), (dev_num,))