!11132 modify batchnormal and fusedbatchnormal

From: @Somnus2020 Reviewed-by: Signed-off-by:
5 years ago · 151c0e2977
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
@@ -155,7 +155,6 @@ void AddAscendIRFusionRulesPass(PassManager *ir_fusion_pm) {

 void AddAscendIRFusionPass(PassManager *ir_fusion_pm) {
  MS_EXCEPTION_IF_NULL(ir_fusion_pm);
  ir_fusion_pm->AddPass(std::make_shared<BatchNormBertFission>());
  ir_fusion_pm->AddPass(std::make_shared<SingleBatchNormFission>());
  ir_fusion_pm->AddPass(std::make_shared<BatchNorm2BNInfer>());
  ir_fusion_pm->AddPass(std::make_shared<BatchNormGrad2BNInferGrad>());
@@ -270,15 +269,8 @@ void AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGrap
  }
  auto optimizer = std::make_shared<GraphOptimizer>();
  auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm");
  if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
    ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
    ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
  } else {
    ir_fusion_pm->AddPass(std::make_shared<BatchNormGradSplit>());
    ir_fusion_pm->AddPass(std::make_shared<FusedBatchNormFusion>());
    ir_fusion_pm->AddPass(std::make_shared<FusedBatchNormMixPrecisionFusion0>());
    ir_fusion_pm->AddPass(std::make_shared<FusedBatchNormMixPrecisionFusion1>());
  }
  ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
  ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
  ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>());
  ir_fusion_pm->AddPass(std::make_shared<InsertPadForNMSWithMask>());
  ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicGRUV2>());
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
@@ -262,7 +262,8 @@ CNodePtr NewTransOpNode(const FuncGraphPtr &func_graph, const AnfNodePtr &input,

 CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &input, const std::string &format,
                              const TypeId &input_type, const TypeId &output_type,
                              const std::vector<size_t> &origin_shape, const TypeId &origin_type) {
                              const std::vector<size_t> &origin_shape, const TypeId &origin_type,
                              const std::vector<Axis> &reshape_type) {
  MS_EXCEPTION_IF_NULL(func_graph);
  std::string input_format = format;
  std::string output_format = format;
@@ -272,6 +273,8 @@ CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &
  kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
  builder.SetInputsFormat({input_format});
  builder.SetOutputsFormat({output_format});
  builder.SetInputsReshapeType({reshape_type});
  builder.SetOutputsReshapeType({reshape_type});
  builder.SetInputsDeviceType({input_type});
  builder.SetOutputsDeviceType({output_type});
  builder.SetFusionType(kernel::FusionType::OPAQUE);
--- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.h
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.h
@@ -96,7 +96,8 @@ CNodePtr NewTransOpNode(const FuncGraphPtr &func_graph, const AnfNodePtr &input,

 CNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &input, const std::string &format,
                              const TypeId &input_type, const TypeId &output_type,
                              const std::vector<size_t> &origin_shape, const TypeId &origin_type);
                              const std::vector<size_t> &origin_shape, const TypeId &origin_type,
                              const std::vector<Axis> &reshape_type = std::vector<Axis>{});

 AnfNodePtr InsertTransOpForInput(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
                                 const KernelSelectPtr &kernel_select);
--- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_trans_and_cast.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/deal_ref_trans_and_cast.cc
@@ -143,7 +143,12 @@ CNodePtr DealRefTransAndCast::AddAdditionalToRefOutput(const FuncGraphPtr &func_
  }
  // insert depend
  if (origin_format != cur_format || origin_type != cur_type) {
    std::vector<AnfNodePtr> depend_nodes{NewValueNode(prim::kPrimDepend), cnode, final_node};
    std::vector<AnfNodePtr> depend_nodes;
    if (get_item.get() != nullptr) {
      depend_nodes = std::vector<AnfNodePtr>{NewValueNode(prim::kPrimDepend), get_item, final_node};
    } else {
      depend_nodes = std::vector<AnfNodePtr>{NewValueNode(prim::kPrimDepend), cnode, final_node};
    }
    final_node = func_graph->NewCNode(depend_nodes);
    MS_LOG(INFO) << "DealRefTranshwAndCast add denpend, op debug info is " << final_node->DebugString();
  }
--- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc
@@ -58,8 +58,8 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
      origin_type = origin_type == kTypeUnknown ? infer_type : origin_type;
      const auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, output_idx);
      if (origin_type != device_type) {
        replace_node =
          AddCastOpNodeToGraph(func_graph, getitem, dev_fmt, device_type, origin_type, origin_shape, infer_type);
        replace_node = AddCastOpNodeToGraph(func_graph, getitem, dev_fmt, device_type, origin_type, origin_shape,
                                            infer_type, AnfAlgo::GetOutputReshapeType(getitem, 0));
        MS_EXCEPTION_IF_NULL(replace_node);
        replace_node->set_scope(cnode->scope());
        AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
@@ -107,8 +107,8 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
    const TypeId device_type = AnfAlgo::GetOutputDeviceDataType(cnode, 0);
    AnfNodePtr replace_node = cnode;
    if (origin_type != device_type) {
      replace_node =
        AddCastOpNodeToGraph(func_graph, cnode, dev_fmt, device_type, origin_type, origin_shape, infer_type);
      replace_node = AddCastOpNodeToGraph(func_graph, cnode, dev_fmt, device_type, origin_type, origin_shape,
                                          infer_type, AnfAlgo::GetOutputReshapeType(cnode, 0));
      MS_EXCEPTION_IF_NULL(replace_node);
      replace_node->set_scope(cnode->scope());
      AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_grad_split.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_grad_split.cc
@@ -114,12 +114,16 @@ CNodePtr BNGradSplitForTBE(const FuncGraphPtr &func_graph, const CNodePtr &cnode

 const BaseRef BnGradSplit::DefinePattern() const {
  VarPtr Xs = std::make_shared<SeqVar>();
  return VectorRef({prim::kPrimFusedBatchNormGrad, Xs});
  return VectorRef({prim::kPrimBatchNormGrad, Xs});
 }

 // Rewrites a node matched by BnGradSplit::DefinePattern via BNGradSplitForTBE.
 // Returns nullptr without splitting when GetBoolAttr reports kAttrIsTraining as false;
 // otherwise returns whatever BNGradSplitForTBE produces for the node.
 const AnfNodePtr BnGradSplit::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &) const {
  MS_EXCEPTION_IF_NULL(node);
  auto cnode = node->cast<CNodePtr>();
  // The cast result was previously used unchecked; validate it the same way `node` is validated
  // so a non-CNode input is reported here instead of being forwarded as a null pointer.
  MS_EXCEPTION_IF_NULL(cnode);
  if (!GetBoolAttr(cnode, kAttrIsTraining)) {
    MS_LOG(INFO) << "is training should be true if do fusion";
    return nullptr;
  }
  return BNGradSplitForTBE(func_graph, cnode);
 }
 }  // namespace opt
--- a/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_split.cc
+++ b/mindspore/ccsrc/backend/optimizer/ascend/ir_fission/bn_split.cc
@@ -96,7 +96,7 @@ AnfNodePtr CreateOutputsOfBNTrainingUpdate(const FuncGraphPtr &graph, const CNod
  return bn_training_update;
 }

 AnfNodePtr SplitFusedBatchNormForTBE(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
 AnfNodePtr SplitBatchNormForTBE(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(func_graph);
  MS_EXCEPTION_IF_NULL(node);

@@ -125,11 +125,15 @@ AnfNodePtr SplitFusedBatchNormForTBE(const FuncGraphPtr &func_graph, const AnfNo
 const BaseRef BnSplit::DefinePattern() const {
  VarPtr Xs = std::make_shared<SeqVar>();
  MS_EXCEPTION_IF_NULL(Xs);
  return VectorRef({prim::kPrimFusedBatchNorm, Xs});
  return VectorRef({prim::kPrimBatchNorm, Xs});
 }

 const AnfNodePtr BnSplit::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, const EquivPtr &) const {
  return SplitFusedBatchNormForTBE(func_graph, node);
  if (!GetBoolAttr(node, kAttrIsTraining)) {
    MS_LOG(INFO) << "is training should be true if do fusion";
    return nullptr;
  }
  return SplitBatchNormForTBE(func_graph, node);
 }
 }  // namespace opt
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
+++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
@@ -483,6 +483,9 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod
    MS_LOG(EXCEPTION) << anf_node->DebugString() << "anf_node is not CNode."
                      << " trace: " << trace::DumpSourceLines(anf_node);
  }
  if (CheckPrimitiveType(anf_node, prim::kPrimTupleGetItem)) {
    return VisitKernelWithReturnType(anf_node, 0, visit_nop_node);
  }
  auto input_node = AnfAlgo::GetInputNode(anf_node->cast<CNodePtr>(), input_idx);
  MS_EXCEPTION_IF_NULL(input_node);
  return VisitKernelWithReturnType(input_node, 0, visit_nop_node);
--- a/mindspore/nn/layer/normalization.py
+++ b/mindspore/nn/layer/normalization.py
@@ -87,8 +87,7 @@ class _BatchNorm(Cell):
        self.cast = P.Cast()
        self.dtype = P.DType()
        self.reshape = P.Reshape()
        self.is_ascend = context.get_context("device_target") == "Ascend"
        self.is_gpu = context.get_context("device_target") == "GPU"
        self._target = context.get_context("device_target")
        self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
        self.momentum = 1.0 - momentum
        if context.get_context("enable_ge"):
@@ -96,22 +95,21 @@ class _BatchNorm(Cell):
        else:
            self.is_ge_backend = False

        if self.is_graph_mode and (self.is_ge_backend or self.is_ascend):
        if self._target == "Ascend":
            self.bn_train = P.BatchNorm(is_training=True,
                                        epsilon=self.eps)
        elif self.is_gpu:
                                        epsilon=self.eps,
                                        momentum=self.momentum)
        if self._target == "GPU":
            self.bn_train = P.FusedBatchNormEx(mode=1,
                                               epsilon=self.eps,
                                               momentum=self.momentum,
                                               data_format=self.format)
        else:
        if self._target == "CPU":
            self.bn_train = P.FusedBatchNorm(mode=1,
                                             epsilon=self.eps,
                                             momentum=self.momentum)
        self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps, data_format=self.format)
        self.enable_global_sync = self.is_global and (self.is_ge_backend or (self.is_graph_mode and self.is_ascend))
        self.enable_default_train = self.is_graph_mode and not self.is_global and \
                                    (self.is_ge_backend or self.is_ascend)

        data_parallel_strategy = ((1,), (1,))
        data_parallel_strategy_one = ((1,), ())
@@ -168,21 +166,6 @@ class _BatchNorm(Cell):
                axes, re_shape = _shape_infer(F.shape(x), self.num_features)
                return self._global_sync(x, axes, re_shape)

            if self.enable_default_train:
                y, batch_mean, batch_var, _, _ = self.bn_train(x,
                                                               self.gamma,
                                                               self.beta,
                                                               None,
                                                               None)

                mean_sub = self.sub_mean(self.moving_mean, batch_mean)
                temp_mean = self.mul_mean(mean_sub, self.momentum)
                mean_sub2 = self.sub_var(self.moving_variance, batch_var)
                temp_variance = self.mul_var(mean_sub2, self.momentum)
                y = F.depend(y, self.assign_sub_mean(self.moving_mean, temp_mean))
                y = F.depend(y, self.assign_sub_var(self.moving_variance, temp_variance))
                return y

            return self.bn_train(x,
                                 self.gamma,
                                 self.beta,
--- a/mindspore/nn/layer/quant.py
+++ b/mindspore/nn/layer/quant.py
@@ -426,15 +426,14 @@ class Conv2dBnFoldQuantOneConv(Cell):
        self.quant_dtype = quant_dtype
        data_format = 'NCHW'
        self.format = Validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.cls_name)
        self.is_gpu = context.get_context('device_target') == "GPU"
        self.is_ascend = context.get_context('device_target') == "Ascend"
        self._target = context.get_context("device_target")
        self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
        if context.get_context("enable_ge"):
            self.is_ge_backend = True
        else:
            self.is_ge_backend = False
        self.enable_default_train = self.is_graph_mode and \
                                    (self.is_ge_backend or self.is_ascend)
                                    (self.is_ge_backend or self._target == "Ascend")

        # initialize convolution op and Parameter
        self.conv = P.Conv2D(out_channel=out_channels,
@@ -468,15 +467,16 @@ class Conv2dBnFoldQuantOneConv(Cell):
                                                     channel_axis=channel_axis,
                                                     num_channels=out_channels,
                                                     quant_dtype=quant_dtype)
        if self.is_graph_mode and (self.is_ge_backend or self.is_ascend):
        if self._target == "Ascend":
            self.bn_train = P.BatchNorm(is_training=True,
                                        epsilon=self.eps)
        elif self.is_gpu:
                                        epsilon=self.eps,
                                        momentum=self.momentum)
        if self._target == "GPU":
            self.bn_train = P.FusedBatchNormEx(mode=1,
                                               epsilon=self.eps,
                                               momentum=self.momentum,
                                               data_format=self.format)
        else:
        if self._target == "CPU":
            self.bn_train = P.FusedBatchNorm(mode=1,
                                             epsilon=self.eps,
                                             momentum=self.momentum)
@@ -520,21 +520,6 @@ class Conv2dBnFoldQuantOneConv(Cell):
        else:
            conv_orig = conv / scale_factor
        if self.training:
            if self.enable_default_train:
                out, batch_mean, batch_var, _, _ = self.bn_train(conv_orig,
                                                                 self.gamma,
                                                                 self.beta,
                                                                 None,
                                                                 None)

                mean_sub = self.sub_mean(self.moving_mean, batch_mean)
                temp_mean = self.mul_mean(mean_sub, self.momentum)
                mean_sub2 = self.sub_var(self.moving_variance, batch_var)
                temp_variance = self.mul_var(mean_sub2, self.momentum)
                out = F.depend(out, self.assign_sub_mean(self.moving_mean, temp_mean))
                out = F.depend(out, self.assign_sub_var(self.moving_variance, temp_variance))
                return out

            return self.bn_train(conv_orig,
                                 self.gamma,
                                 self.beta,
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -1058,9 +1058,10 @@ class BatchNorm(PrimitiveWithInfer):
    """

    @prim_attr_register
    def __init__(self, is_training=False, epsilon=1e-5, data_format="NCHW"):
    def __init__(self, is_training=False, epsilon=1e-5, momentum=0.1, data_format="NCHW"):
        validator.check_value_type('is_training', is_training, (bool,), self.name)
        validator.check_float_range(epsilon, 0, 1, Rel.INC_RIGHT, 'epsilon', self.name)
        validator.check_float_range(momentum, 0, 1, Rel.INC_BOTH, 'momentum', self.name)
        self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.name)
        if context.get_context("device_target") != "GPU" and self.format == "NHWC":
            raise ValueError("NHWC format only support in GPU target.")
--- a/model_zoo/official/nlp/bert_thor/src/bert_model.py
+++ b/model_zoo/official/nlp/bert_thor/src/bert_model.py
@@ -28,7 +28,6 @@ from mindspore.common.tensor import Tensor
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
 from .config import cfg
 from .fused_layer_norm import FusedLayerNorm
 from .lr_generator import get_bert_damping
 from .thor_layer import Dense_Thor, Embedding_Thor

@@ -277,11 +276,7 @@ class BertOutput(nn.Cell):
        self.dropout = nn.Dropout(1 - dropout_prob)
        self.dropout_prob = dropout_prob
        self.add = P.TensorAdd()
        if compute_type == mstype.float16:
            self.layernorm = FusedLayerNorm((out_channels,),
                                            use_batch_norm=enable_fused_layernorm).to_float(compute_type)
        else:
            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
        self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
        self.cast = P.Cast()

    def construct(self, hidden_status, input_tensor):
--- a/model_zoo/official/nlp/bert_thor/src/fused_layer_norm.py
+++ b/model_zoo/official/nlp/bert_thor/src/fused_layer_norm.py
@@ -1,127 +0,0 @@
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """fused layernorm"""
 import numpy as np

 import mindspore.common.dtype as mstype
 from mindspore.common.initializer import initializer
 from mindspore.common.parameter import Parameter
 from mindspore.nn.cell import Cell
 from mindspore.ops import functional as F
 from mindspore.ops import operations as P
 from mindspore.ops.primitive import constexpr

 __all__ = ['FusedLayerNorm']


@constexpr
 def get_shape_for_norm(x_shape, begin_norm_axis):
    print("input_shape: ", x_shape)
    norm_shape = x_shape[begin_norm_axis:]
    output_shape = (1, -1, 1, int(np.prod(norm_shape)))
    print("output_shape: ", output_shape)
    return output_shape


 class FusedLayerNorm(Cell):
    r"""
    Applies Layer Normalization over a mini-batch of inputs.

    Layer normalization is widely used in recurrent neural networks. It applies
    normalization over a mini-batch of inputs for each single training case as described
    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
    normalization, layer normalization performs exactly the same computation at training and
    testing times. It can be described using the following formula. It is applied across all channels
    and pixel but only one batch size.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Args:
        normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axis
            `begin_norm_axis ... R - 1`.
        begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
        begin_params_axis (int): The first parameter (beta, gamma) dimension: scale and centering parameters
            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        use_batch_norm (bool): Whether to take the BatchNorm-based path in `construct`. That path is
            only used while the cell is in training mode; otherwise `P.LayerNorm` is used. Default: False.

    Inputs:
        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.

    Outputs:
        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.

    Examples:
        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
        >>> shape1 = x.shape[1:]
        >>> m = FusedLayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
        >>> m(x)
    """

    def __init__(self,
                 normalized_shape,
                 begin_norm_axis=-1,
                 begin_params_axis=-1,
                 gamma_init='ones',
                 beta_init='zeros',
                 use_batch_norm=False):
        super(FusedLayerNorm, self).__init__()
        if not isinstance(normalized_shape, (tuple, list)):
            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
                            .format(normalized_shape, type(normalized_shape)))
        self.normalized_shape = normalized_shape
        self.begin_norm_axis = begin_norm_axis
        self.begin_params_axis = begin_params_axis
        self.gamma = Parameter(initializer(
            gamma_init, normalized_shape))
        self.beta = Parameter(initializer(
            beta_init, normalized_shape))
        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)

        # Operators used only by the BatchNorm-based path in construct().
        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
        self.use_batch_norm = use_batch_norm
        self.mul = P.Mul()
        self.add = P.TensorAdd()

    def construct(self, input_x):
        """
        Normalize `input_x`.

        When `use_batch_norm` is set and the cell is training, the input is reshaped with
        `get_shape_for_norm`, run through `P.BatchNorm` with constant ones/zeros as scale and
        offset, reshaped back, and then scaled by gamma and shifted by beta. Otherwise
        `P.LayerNorm` is applied directly with gamma and beta.
        """
        if self.use_batch_norm and self.training:
            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
            shape_x = F.shape(input_x)
            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
            input_x = F.reshape(input_x, norm_shape)
            # NOTE(review): six values are unpacked from P.BatchNorm here -- confirm the
            # operator's output count on the targeted backend.
            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
            output = F.reshape(output, shape_x)
            y = self.mul(output, self.gamma)
            y = self.add(y, self.beta)
        else:
            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        return y

    def extend_repr(self):
        """Display instance object as string."""
        # 'gamma={}' (previously 'gamma{}') so every field is rendered as name=value.
        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma={}, beta={}'.format(
            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
        return s
--- a/tests/st/networks/models/bert/src/bert_model.py
+++ b/tests/st/networks/models/bert/src/bert_model.py
@@ -25,7 +25,6 @@ from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 from mindspore.common.tensor import Tensor
 from mindspore.common.parameter import Parameter
 from .fused_layer_norm import FusedLayerNorm


 class BertConfig:
@@ -251,11 +250,7 @@ class BertOutput(nn.Cell):
        self.dropout = nn.Dropout(1 - dropout_prob)
        self.dropout_prob = dropout_prob
        self.add = P.TensorAdd()
        if compute_type == mstype.float16:
            self.layernorm = FusedLayerNorm((out_channels,),
                                            use_batch_norm=enable_fused_layernorm).to_float(compute_type)
        else:
            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
        self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
        self.cast = P.Cast()

    def construct(self, hidden_status, input_tensor):
--- a/tests/st/networks/models/bert/src/fused_layer_norm.py
+++ b/tests/st/networks/models/bert/src/fused_layer_norm.py
@@ -1,120 +0,0 @@
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """fused layernorm"""
 import numpy as np
 from mindspore.ops import operations as P
 from mindspore.ops import functional as F
 from mindspore.common.parameter import Parameter
 from mindspore.common.initializer import initializer
 from mindspore.ops.primitive import constexpr
 import mindspore.common.dtype as mstype
 from mindspore.nn.cell import Cell


 __all__ = ['FusedLayerNorm']

@constexpr
 def get_shape_for_norm(x_shape, begin_norm_axis):
    print("input_shape: ", x_shape)
    norm_shape = x_shape[begin_norm_axis:]
    output_shape = (1, -1, 1, int(np.prod(norm_shape)))
    print("output_shape: ", output_shape)
    return output_shape

 class FusedLayerNorm(Cell):
    r"""
    Applies Layer Normalization over a mini-batch of inputs.

    Layer normalization is widely used in recurrent neural networks. It applies
    normalization over a mini-batch of inputs for each single training case as described
    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
    normalization, layer normalization performs exactly the same computation at training and
    testing times. It can be described using the following formula. It is applied across all channels
    and pixel but only one batch size.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Args:
        normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axis
            `begin_norm_axis ... R - 1`.
        begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
        begin_params_axis (int): The first parameter (beta, gamma) dimension: scale and centering parameters
            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        use_batch_norm (bool): Whether to take the BatchNorm-based path in `construct`. That path is
            only used while the cell is in training mode; otherwise `P.LayerNorm` is used. Default: False.

    Inputs:
        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.

    Outputs:
        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.

    Examples:
        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
        >>> shape1 = x.shape[1:]
        >>> m = FusedLayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
        >>> m(x)
    """
    def __init__(self,
                 normalized_shape,
                 begin_norm_axis=-1,
                 begin_params_axis=-1,
                 gamma_init='ones',
                 beta_init='zeros',
                 use_batch_norm=False):
        super(FusedLayerNorm, self).__init__()
        if not isinstance(normalized_shape, (tuple, list)):
            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
                            .format(normalized_shape, type(normalized_shape)))
        self.normalized_shape = normalized_shape
        self.begin_norm_axis = begin_norm_axis
        self.begin_params_axis = begin_params_axis
        self.gamma = Parameter(initializer(
            gamma_init, normalized_shape), name="gamma")
        self.beta = Parameter(initializer(
            beta_init, normalized_shape), name="beta")
        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)

        # Operator used only by the BatchNorm-based path in construct().
        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
        self.use_batch_norm = use_batch_norm

    def construct(self, input_x):
        """
        Normalize `input_x`.

        When `use_batch_norm` is set and the cell is training, the input is reshaped with
        `get_shape_for_norm`, run through `P.BatchNorm` with constant ones/zeros as scale and
        offset, reshaped back, and then scaled by gamma and shifted by beta. Otherwise
        `P.LayerNorm` is applied directly with gamma and beta.
        """
        if self.use_batch_norm and self.training:
            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
            shape_x = F.shape(input_x)
            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
            input_x = F.reshape(input_x, norm_shape)
            # NOTE(review): six values are unpacked from P.BatchNorm here -- confirm the
            # operator's output count on the targeted backend.
            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
            output = F.reshape(output, shape_x)
            y = output * self.gamma + self.beta
        else:
            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        return y

    def extend_repr(self):
        """Display instance object as string."""
        # 'gamma={}' (previously 'gamma{}') so every field is rendered as name=value.
        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma={}, beta={}'.format(
            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
        return s
--- a/tests/ut/cpp/pre_activate/ascend/ir_fission/bn_grad_split_test.cc
+++ b/tests/ut/cpp/pre_activate/ascend/ir_fission/bn_grad_split_test.cc
@@ -81,10 +81,12 @@ TEST_F(TestHWBnGradSplit, test_bn_grad_split_tbe) {
  kernel::KernelBuildInfo::KernelBuildInfoBuilder builder1;
  builder1.SetInputsFormat(
    {kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0});
  builder1.SetOutputsFormat({kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0});
  builder1.SetOutputsFormat(
    {kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0, kOpFormat_NC1HWC0});
  builder1.SetInputsDeviceType(
    {kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32});
  builder1.SetOutputsDeviceType({kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32});
  builder1.SetOutputsDeviceType(
    {kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32, kNumberTypeFloat32});
  builder1.SetKernelType(TBE_KERNEL);
  AnfAlgo::SetSelectKernelBuildInfo(builder1.Build(), bn_grad.get());
  // do bn_grad_split pass
--- a/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_grad_split.py
+++ b/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_grad_split.py
@@ -18,7 +18,7 @@ from mindspore.ops.operations import _grad_ops as G

 make_tuple = Primitive('make_tuple')
 tuple_getitem = Primitive('tuple_getitem')
 bn_grad = G.FusedBatchNormGrad()
 bn_grad = G.BatchNormGrad(is_training=True)
 bn_grad1 = Primitive('BNGrad1')
 bn_grad2 = Primitive('BNGrad2')
 bn_grad3 = Primitive('BNGrad3')
--- a/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_split.py
+++ b/tests/ut/cpp/python_input/gtest_input/pre_activate/bn_split.py
@@ -18,7 +18,7 @@ from mindspore.ops import operations as P

 make_tuple = Primitive('make_tuple')
 tuple_getitem = Primitive('tuple_getitem')
 bn = P.FusedBatchNorm()
 bn = P.BatchNorm(is_training=True)
 fused_bn1 = Primitive('FusedBN1')
 fused_bn2 = Primitive('FusedBN2')
 fused_bn3 = Primitive('FusedBN3')
--- a/tests/ut/python/parallel/test_batchnorm_batch_parallel.py
+++ b/tests/ut/python/parallel/test_batchnorm_batch_parallel.py
@@ -32,7 +32,7 @@ from tests.dataset_mock import MindData

 dev_num = 8
 strategy_weight = ((dev_num, 1, 1, 1), (1, 1, 1, 1))
 strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,))
 strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,), (1,), (1,))
 strategy_fc_weight_bias = ((dev_num, 1), (1, 1), (1,))


--- a/tests/ut/python/parallel/test_operator_model_parallel.py
+++ b/tests/ut/python/parallel/test_operator_model_parallel.py
@@ -37,7 +37,7 @@ dev_num = 8
 strategy_no_weight = ((dev_num, 1, 1, 1),)
 strategy_weight = ((dev_num, 1, 1, 1), (1, 1, 1, 1))
 strategy_add = ((dev_num, 1, 1, 1), (dev_num, 1, 1, 1))
 strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,))
 strategy_bn = ((dev_num, 1, 1, 1), (1,), (1,), (1,), (1,))

 strategy_fc_weight_nobias = ((1, dev_num), (1, dev_num))
 strategy_tensor_add = ((1, dev_num), (dev_num,))