diff --git a/mindspore/nn/graph_kernels/__init__.py b/mindspore/nn/_graph_kernels/__init__.py
similarity index 55%
rename from mindspore/nn/graph_kernels/__init__.py
rename to mindspore/nn/_graph_kernels/__init__.py
index 8128f2db60..356d183766 100644
--- a/mindspore/nn/graph_kernels/__init__.py
+++ b/mindspore/nn/_graph_kernels/__init__.py
@@ -18,13 +18,6 @@ GraphKernel.
 GraphKernel provides a unified style to express graph and kernel for user.
 It breaks the boundary between graph and kernel and provides more opportunities to do compile optimization.
 """
-from .graph_kernels import MaximumGrad, MinimumGrad, AbsGrad, ApplyMomentum, BiasAdd, EqualCount,     \
-    ReduceMean, ReLU, SoftmaxCrossEntropyWithLogits, LayerNorm, LayerNormXBackprop,   \
-    LayerNormBetaGammaBackprop, LogSoftmax, Tanh, TanhGrad, Gelu, Softmax, BiasAddGrad,            \
-    LambUpdateWithLR, LambNextMV
+from .graph_kernels import LambUpdateWithLR, LambNextMV
 
-__all__ = ['MaximumGrad', 'MinimumGrad', 'AbsGrad', 'ApplyMomentum', 'BiasAdd', 'EqualCount',
-           'ReduceMean', 'ReLU', 'SoftmaxCrossEntropyWithLogits', 'LayerNorm',
-           'LayerNormXBackprop', 'LayerNormBetaGammaBackprop', 'LogSoftmax', 'Tanh', 'TanhGrad',
-           'Gelu', 'Softmax', 'BiasAddGrad', 'LambUpdateWithLR', 'LambNextMV'
-           ]
+__all__ = ['LambUpdateWithLR', 'LambNextMV']
diff --git a/mindspore/nn/_graph_kernels/graph_kernels.py b/mindspore/nn/_graph_kernels/graph_kernels.py
new file mode 100644
index 0000000000..b314476878
--- /dev/null
+++ b/mindspore/nn/_graph_kernels/graph_kernels.py
@@ -0,0 +1,259 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Graph kernels. They are composites of basic primitives and can be compiled into
+a fused kernel automatically when context.set_context(enable_graph_kernel=True).
+"""
+from ...ops import operations as P
+from ...ops.primitive import PrimitiveWithInfer, prim_attr_register
+from ...ops.composite import multitype_ops as C
+from ..cell import GraphKernel
+
+
+class InplaceAssign(PrimitiveWithInfer):
+    """
+    Inplace assign `Parameter` with a value.
+
+    This primitive can only be used in graph kernel.
+
+    Inputs:
+        - **variable** (Parameter) - The `Parameter`.
+        - **value** (Tensor) - The value to be assigned.
+        - **depend** (Tensor) - The dependent tensor to keep this op connected in graph.
+
+    Outputs:
+        Tensor, has the same type as original `variable`.
+
+    Examples:
+        >>> class MyClass(GraphKernel):
+        ...     def __init__(self):
+        ...         super(MyClass, self).__init__()
+        ...         self.mul = P.Mul()
+        ...         self.fake_output_assign = InplaceAssign()
+        ...         self.fake_output_assign.add_prim_attr("fake_output", True)
+        ...
+        ...     def construct(self, i0, i1):
+        ...         mul_res = self.mul(i0, i1)
+        ...         # mul_res is a fake output and parameter i0 will be updated.
+        ...         mul_res = self.fake_output_assign(i0, mul_res, mul_res)
+        ...         return mul_res
+    """
+
+    @prim_attr_register
+    def __init__(self):
+        super(InplaceAssign, self).__init__("InplaceAssign")
+        self.init_prim_io_names(inputs=['x', 'y', 'z'], outputs=['output'])
+
+    def infer_shape(self, x, y, z):
+        return z
+
+    def infer_dtype(self, x, y, z):
+        return z
+
+    def get_bprop(self):
+        def bprop(x, y, z, out, dout):
+            return (x, C.zeros_like(y), dout)
+
+        return bprop
+
+
+class LambUpdateWithLR(GraphKernel):
+    r"""
+    Part of Lamb optimizer.
+
+    .. math::
+        s_1 = select(i_1 \gt y_g, select(i_0 \gt y_g, \frac{i_1}{i_2}, se), se)
+        i_5 = i_5 - max(min(s_1, y_m), y_g) \times i_3 \times i_4
+
+    Inputs:
+        - **input0** (Tensor) - The first tensor to be computed.
+        - **input1** (Tensor) - The second tensor to be computed.
+        - **input2** (Tensor) - The third tensor to be computed.
+        - **input3** (Tensor) - The fourth tensor to be computed.
+        - **input4** (Tensor) - The fifth tensor to be computed.
+        - **input5** (Tensor) - The sixth tensor to be computed. It will be updated by result.
+        - **greater_y** (Tensor) - The seventh tensor to be computed.
+        - **select_e** (Tensor) - The eighth tensor to be computed.
+        - **minimum_y** (Tensor) - The ninth tensor to be computed.
+
+    Outputs:
+        A fake output tensor.
+
+    Examples:
+        >>> import numpy as np
+        >>> import mindspore.context as context
+        >>> from mindspore.common import dtype as mstype
+        >>> from mindspore.common.tensor import Tensor
+        >>> from mindspore.common.parameter import Parameter
+        >>> from mindspore.nn.cell import Cell
+        >>> class Net(Cell):
+        ...     def __init__(self, i5):
+        ...         super(Net, self).__init__()
+        ...         self.i5 = Parameter(i5, name='i5')
+        ...         self.lamb_update = LambUpdateWithLR()
+        ...
+        ...     def construct(self, i0, i1, i2, i3, i4, i6, i7, i8):
+        ...         return self.lamb_update(i0, i1, i2, i3, i4, self.i5, i6, i7, i8)
+        >>> shape = [1, 16]
+        >>> oshape = [1]
+        >>> i0 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32))
+        >>> i1 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32))
+        >>> i2 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32))
+        >>> i3 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32))
+        >>> i4 = Tensor(np.random.normal(0, 1, shape).astype(np.float32))
+        >>> i5 = Tensor(np.random.normal(0, 1, shape).astype(np.float32))
+        >>> i6 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32))
+        >>> i7 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32))
+        >>> i8 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32))
+        >>> context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True)
+        >>> net = Net(i5)
+        >>> _ = net(i0, i1, i2, i3, i4, i6, i7, i8)
+        >>> output = (net.i5)
+    """
+
+    def __init__(self):
+        super(LambUpdateWithLR, self).__init__()
+        self.greater = P.Greater()
+        self.select = P.Select()
+        self.div = P.RealDiv()
+        self.min = P.Minimum()
+        self.max = P.Maximum()
+        self.mul = P.Mul()
+        self.sub = P.Sub()
+        self.fake_output_assign = InplaceAssign()
+        self.fake_output_assign.add_prim_attr("fake_output", True)
+
+    def construct(self, input0, input1, input2, input3, input4, input5, greater_y, select_e, minimum_y):
+        greater0 = self.greater(input0, greater_y)
+        greater1 = self.greater(input1, greater_y)
+        real_div0 = self.div(input1, input2)
+        select0 = self.select(greater0, real_div0, select_e)
+        select1 = self.select(greater1, select0, select_e)
+        min0 = self.min(select1, minimum_y)
+        max0 = self.max(min0, greater_y)
+        mul0 = self.mul(max0, input3)
+        mul1 = self.mul(mul0, input4)
+        sub0 = self.sub(input5, mul1)
+        sub0 = self.fake_output_assign(input5, sub0, sub0)
+        return sub0
+
+
+class LambNextMV(GraphKernel):
+    r"""
+    Part of Lamb optimizer.
+
+    .. math::
+        rd_0 = \frac{i_8 \times i_5 + i_9 \times i_4}{i6}
+        rd_1 = \frac{x_0 \times i_2 + x_1 \times i_1}{i3}
+        y_2 = \frac{rd_0}{\sqrt{rd_1 + x3}} + x_2 \times i_7
+        y_3 = \frac{rd_0}{\sqrt{rd_1} + x3}
+        i5 = i_8 \times i_5 + i_9 \times i_4
+        i2 = x_0 \times i_2 + x_1 \times i_1
+
+    Inputs:
+        - **inputs1** (Tensor) - The first input tensor to be computed.
+        - **inputs2** (Tensor) - The second input tensor to be computed. It will be updated by result.
+        - **inputs3** (Tensor) - The third input tensor to be computed.
+        - **inputs4** (Tensor) - The fourth input tensor to be computed.
+        - **inputs5** (Tensor) - The fifth input tensor to be computed. It will be updated by result.
+        - **inputs6** (Tensor) - The sixth input tensor to be computed.
+        - **inputs7** (Tensor) - The seventh input tensor to be computed.
+        - **inputs8** (Tensor) - The eighth input tensor to be computed.
+        - **inputs9** (Tensor) - The ninth input tensor to be computed.
+        - **inputsx0** (Tensor) - The tenth input tensor to be computed.
+        - **inputsx1** (Tensor) - The eleventh input tensor to be computed.
+        - **inputsx2** (Tensor) - The twelfth input tensor to be computed.
+        - **inputsx3** (Tensor) - The thirteenth input tensor to be computed.
+
+    Outputs:
+        Tuple of 2 Tensors.
+
+        - **add3** (Tensor) - the shape is the same as the one after broadcasting, and the data type is
+                              the one with higher precision or higher digits among the inputs.
+        - **realdiv4** (Tensor) - the shape is the same as the one after broadcasting, and the data type is
+                                  the one with higher precision or higher digits among the inputs.
+
+    Examples:
+        >>> import numpy as np
+        >>> import mindspore.context as context
+        >>> from mindspore.common import dtype as mstype
+        >>> from mindspore.common.tensor import Tensor
+        >>> from mindspore.common.parameter import Parameter
+        >>> from mindspore.nn.cell import Cell
+        >>> class Net(Cell):
+        ...     def __init__(self, i1, i4):
+        ...         super(Net, self).__init__()
+        ...         self.i1 = Parameter(i1, name='i1')
+        ...         self.i4 = Parameter(i4, name='i4')
+        ...         self.lamb_next = LambNextMV()
+        ...
+        ...     def construct(self, i0, i2, i3, i5, i6, i7, i8, i9, i10, i11, i12):
+        ...         i0_ = i0 + i2
+        ...         return self.lamb_next(i0_, self.i1, i2, i3, self.i4, i5, i6, i7, i8, i9, i10, i11, i12)
+        >>> shape = [1, 16]
+        >>> i0 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32))
+        >>> i1 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32))
+        >>> i2 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32))
+        >>> i3 = Tensor(np.random.normal(0, 1, shape).astype(np.float32))
+        >>> i4 = Tensor(np.random.normal(0, 1, shape).astype(np.float32))
+        >>> i5 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32))
+        >>> i6 = Tensor(np.random.normal(0, 1, shape).astype(np.float32))
+        >>> i7 = Tensor(np.random.normal(0, 1, shape).astype(np.float32))
+        >>> i8 = Tensor(np.random.normal(0, 1, shape).astype(np.float32))
+        >>> i9 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32))
+        >>> i10 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32))
+        >>> i11 = Tensor(np.random.normal(0, 1, shape).astype(np.float32))
+        >>> i12 = Tensor(np.ones(shape).astype(np.float32) * 1e-6)
+        >>> context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True)
+        >>> net = Net(i1, i4)
+        >>> (o0, o1) = net(i0, i2, i3, i5, i6, i7, i8, i9, i10, i11, i12)
+        >>> output = (o0, net.i4, net.i1, o1)
+    """
+
+    def __init__(self):
+        super(LambNextMV, self).__init__()
+        self.mul = P.Mul()
+        self.add = P.TensorAdd()
+        self.div = P.RealDiv()
+        self.sqrt = P.Sqrt()
+        self.rsqrt = P.Rsqrt()
+        self.fake_output_assign_1 = InplaceAssign()
+        self.fake_output_assign_1.add_prim_attr("fake_output", False)
+        self.fake_output_assign_2 = InplaceAssign()
+        self.fake_output_assign_2.add_prim_attr("fake_output", False)
+
+    def construct(self, input1, input2, input3, input4, input5, input6, input7,
+                  input8, input9, inputx0, inputx1, inputx2, inputx3):
+        mul3 = self.mul(inputx1, input1)
+        mul2 = self.mul(inputx0, input2)
+        add1 = self.add(mul2, mul3)
+        realdiv1 = self.div(add1, input3)
+        add2 = self.add(realdiv1, inputx3)
+        sqrt0 = self.rsqrt(add2)
+        sqrt1 = self.sqrt(realdiv1)
+        add4 = self.add(sqrt1, inputx3)
+        mul1 = self.mul(input9, input4)
+        mul0 = self.mul(input8, input5)
+        add0 = self.add(mul0, mul1)
+        realdiv0 = self.div(add0, input6)
+        realdiv2 = self.mul(realdiv0, sqrt0)
+        realdiv4 = self.div(realdiv0, add4)
+        mul4 = self.mul(inputx2, input7)
+        add3 = self.add(realdiv2, mul4)
+
+        add3 = self.fake_output_assign_1(input5, add0, add3)
+        add3 = self.fake_output_assign_2(input2, add1, add3)
+
+        return add3, realdiv4
diff --git a/mindspore/nn/graph_kernels/graph_kernels.py b/mindspore/nn/graph_kernels/graph_kernels.py
deleted file mode 100644
index ca542da989..0000000000
--- a/mindspore/nn/graph_kernels/graph_kernels.py
+++ /dev/null
@@ -1,1208 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Graph kernels. They are composites of basic primitives and can be compiled into
-a fused kernel automaticly when context.set_context(enable_graph_kernel=True).
-"""
-from ...common import dtype as mstype
-from ...ops import operations as P
-from ...ops.primitive import PrimitiveWithInfer, prim_attr_register
-from ...ops.composite import multitype_ops as C
-from ...ops.operations import _grad_ops as G
-from ..._checkparam import Validator as validator
-from ..cell import Cell, GraphKernel
-
-
-class InplaceAssign(PrimitiveWithInfer):
-    """
-    Inplace assign `Parameter` with a value.
-
-    This primitive can only use in graph kernel.
-
-    Inputs:
-        - **variable** (Parameter) - The `Parameter`.
-        - **value** (Tensor) - The value to be assigned.
-        - **depend** (Tensor) - The dependent tensor to keep this op connected in graph.
-
-    Outputs:
-        Tensor, has the same type as original `variable`.
-
-    Examples:
-    >>> def construct(self, x):
-    ...     val = x - 1.0
-    ...     ret = x + 2.0
-    ...     return InplaceAssign()(x, val, ret)
-    >>> x = Tensor([2.0], mindspore.float32)
-    >>> net = Net()
-    >>> net(x)
-   """
-    @prim_attr_register
-    def __init__(self):
-        self.init_prim_io_names(inputs=['x', 'y', 'z'], outputs=['output'])
-
-    def infer_shape(self, x, y, z):
-        return z
-
-    def infer_dtype(self, x, y, z):
-        return z
-
-    def get_bprop(self):
-        def bprop(x, y, z, out, dout):
-            return (x, C.zeros_like(y), dout)
-        return bprop
-
-
-class MaximumGrad(GraphKernel):
-    """
-
-    Backprop function for Maximum operator.
-
-    Inputs:
-        - **x** (Tensor) - The first input tensor of maximum.
-        - **y** (Tensor) - The second input tensor of maximum.
-        - **dout** (Tensor) - has the same shape as x and y, next operator's backprop output.
-
-    Outputs:
-        dx (Tensor): has the same shape as x and y, returns dout element if
-        `x >= y` returns true at the same position, or returns zero at that
-        position
-        dy (Tensor): has the same shape as x and y, dy = dout - dx
-
-    Examples:
-        >>> layer = MaximumGrad()
-        >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6]))
-    """
-
-    def __init__(self, grad_x=True, grad_y=True):
-        super(MaximumGrad, self).__init__()
-        self.grad_x = grad_x
-        self.grad_y = grad_y
-        self.select = P.Select()
-        self.greater_equal = P.GreaterEqual()
-        self.zeros_like = P.ZerosLike()
-        self.sub = P.Sub()
-
-    def construct(self, x, y, dout):
-        cmp_result = self.greater_equal(x, y)
-        dx = self.select(cmp_result, dout, self.zeros_like(dout))
-        dy = dout - dx
-
-        return dx, dy
-
-
-class MinimumGrad(GraphKernel):
-    """
-    Backprop function for Minimum operator.
-
-    Compares x and y elementwise, dout must has the same shape with x and y.
-
-    Inputs:
-        - **x** (Tensor) - The first input
-        - **y** (Tensor) - x and y must have same shape
-        - **dout** (Tensor) - Has the same shape as x and y, next operator's backprop output
-
-    Outputs:
-        - dx (Tensor) - Has the same shape as x and y, returns dout element if
-        `x <= y` returns true at the same position, or returns zero at that
-        position
-        - dy (Tensor) - Has the same shape as x and y, dy = dout - dx
-
-    Examples:
-        >>> layer = MinimumGrad()
-        >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6]))
-    """
-
-    def __init__(self, grad_x=True, grad_y=True):
-        super(MinimumGrad, self).__init__()
-        self.grad_x = grad_x
-        self.grad_y = grad_y
-        self.select = P.Select()
-        self.less_equal = P.LessEqual()
-        self.zeros_like = P.ZerosLike()
-        self.sub = P.Sub()
-
-    def construct(self, x, y, dout):
-        cmp_result = self.less_equal(x, y)
-        dx = self.select(cmp_result, dout, self.zeros_like(dout))
-        dy = dout - dx
-
-        return dx, dy
-
-
-class AbsGrad(GraphKernel):
-    """
-    Abs's backprop function.
-
-    Inputs:
-        **input_x** (Tensor) - input data of this operator.
-        **dout** (Tensor) - output of the next operator's backprop function.
-
-    Outputs:
-        Tensor, has the same shape as input_x.
-
-    Examples:
-        >>> back = AbsGrad()
-        >>> output = back(Tensor([1, 2, 3]), Tensor([4, 5, 6]))
-    """
-
-    def __init__(self):
-        super(AbsGrad, self).__init__()
-        self.mul = P.Mul()
-        self.abs = P.Abs()
-        self.add = P.TensorAdd()
-        self.div = P.RealDiv()
-        self.round = P.Round()
-
-    def construct(self, input_x, dout):
-        NUM_MAX = 32768
-        mul_max = self.mul(input_x, P.Fill()(P.DType()(input_x), (1,), NUM_MAX))
-        res_abs = self.abs(mul_max)
-        res_div = self.div(mul_max, res_abs)
-        res_round = self.round(res_div)
-        res = self.mul(res_round, dout)
-        return res
-
-
-class ApplyMomentum(GraphKernel):
-    """
-    Update parameter according to the ApplyMomentum algorithm.
-
-    Inputs:
-        variable (Tensor): mutable tensor var
-        accumulation (Tensor): mutable tensor accum
-        learning_rate (float32): learning rate
-        gradient (float32): The gradient
-        momentum (float32): Momentum
-
-    Outputs: updated accumulation and variable
-    """
-
-    def __init__(self,
-                 use_nesterov=False,
-                 use_locking=False,
-                 gradient_scale=1.0):
-        super(ApplyMomentum, self).__init__()
-        self.gradient_scale = validator.check_value_type('gradient_scale', gradient_scale, [float], type(self).__name__)
-        self.fake_output_assign_1 = InplaceAssign()
-        self.fake_output_assign_1.add_prim_attr("fake_output", True)
-        self.fake_output_assign_2 = InplaceAssign()
-        self.fake_output_assign_2.add_prim_attr("fake_output", True)
-
-    def construct(self, variable, accumulation, learning_rate, gradient, momentum):
-        gradient = gradient * self.gradient_scale
-        momt_accumulation = accumulation * momentum
-        accumulation_inplace = momt_accumulation + gradient
-
-        sum_gradient = accumulation_inplace * learning_rate
-        variable_inplace = variable - sum_gradient
-
-        accumulation_inplace = self.fake_output_assign_1(accumulation, accumulation_inplace, accumulation_inplace)
-        variable_inplace = self.fake_output_assign_2(variable, variable_inplace, variable_inplace)
-        return accumulation_inplace, variable_inplace
-
-
-class BiasAdd(GraphKernel):
-    """
-    Return the sum of x and bias.
-
-    Inputs:
-        x (Tensor): Tensor of input data.
-        bias (Tensor): The bias tensor.
-
-    Output:
-        Tensor, the sum of x and bias.
-
-    Example:
-    >>> layer = BiasAdd()
-    >>> output = layer(Tensor([1, 2, 3]), Tensor([1,]))
-    """
-
-    def __init__(self):
-        super(BiasAdd, self).__init__()
-
-    def construct(self, x, bias):
-        shape = P.Shape()(x)
-        if len(shape) == 4:
-            bias_shape = (1, P.Shape()(bias)[0], 1, 1)  # NCHW
-        else:
-            bias_shape = (1, P.Shape()(bias)[0])
-        res = x + P.Reshape()(bias, bias_shape)
-        return res
-
-class BiasAddGrad(GraphKernel):
-    """
-    Computes gradients of BiasAdd.
-
-    Inputs:
-        x (Tensor): the gradients of bias add output.
-
-    Output:
-        Tensor, the gradients of bias add input.
-
-    Examples:
-        >>> dout = Tensor(np.ones(shape=[1, 2, 3, 4]), mindspore.float32)
-        >>> bias_add_grad = BiasAddGrad()
-        >>> dx = bias_add_grad(dout)
-    """
-    def __init__(self):
-        super(BiasAddGrad, self).__init__()
-
-    def construct(self, x):
-        shape_x = P.Shape()(x)
-        reduce_axis = [0]
-        for i in range(2, len(shape_x)):
-            reduce_axis.append(i)
-
-        res = P.ReduceSum()(x, reduce_axis)
-        return res
-
-
-class EqualCount(GraphKernel):
-    """
-    Computes the number of the same elements of two tensors.
-
-    The two input tensors must have the same shape and data type.
-
-    Inputs:
-        x (Tensor): the first input tensor.
-        y (Tensor): the second input tensor.
-
-    Outputs:
-        Tensor, the type is same as input tensor and size as (1,).
-
-    Examples:
-        >>> x = Tensor(np.array([1, 2, 3]), mindspore.int32)
-        >>> y = Tensor(np.array([1, 2, 4]), mindspore.int32)
-        >>> equal_count = EqualCount()
-        >>> print(equal_count(x, y))
-        2
-    """
-    def __init__(self):
-        super(EqualCount, self).__init__()
-
-    def construct(self, x, y):
-        equal_bool = P.Equal()(P.Cast()(x, mstype.float32), P.Cast()(y, mstype.float32))
-        equal_count = P.Cast()(equal_bool, mstype.float16)
-
-        axes = (0,)
-        res = P.ReduceSum()(equal_count, axes)
-        res = P.Cast()(res, P.DType()(x))
-        return res
-
-
-class ReduceMean(GraphKernel):
-    """
-    Reduce a dimension of a tensor by averaging all elements in the dimension.
-
-    The dtype of the tensor to be reduced is number.
-
-    Args:
-        keep_dims (bool): If true, keep these reduced dimensions and the length is 1.
-                          If false, don't keep these dimensions. Default: False.
-
-    Inputs:
-        - **input_x** (Tensor[Number]) - The input tensor.
-        - **axis** (Union[int, tuple(int), list(int)]) - The dimensions to reduce. Default: (), reduce all dimensions.
-          Only constant value is allowed.
-
-    Outputs:
-        Tensor, has the same dtype as the `input_x`.
-
-        - If axis is (), and keep_dims is False,
-          the output is a 0-D tensor representing the sum of all elements in the input tensor.
-        - If axis is int, set as 2, and keep_dims is False,
-          the shape of output is :math:`(x_1, x_3, ..., x_R)`.
-        - If axis is tuple(int), set as (2, 3), and keep_dims is False,
-          the shape of output is :math:`(x_1, x_4, ..., x_R)`.
-
-    Examples:
-        >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> op = ReduceMean(keep_dims=True)
-        >>> output = op(input_x, 1)
-    """
-
-    def __init__(self, keep_dims=True):
-        super(ReduceMean, self).__init__()
-        self.keep_dims = validator.check_value_type('keep_dims', keep_dims, [bool], type(self).__name__)
-        self.sum = P.ReduceSum(self.keep_dims)
-
-    def construct(self, x, axis):
-        shape = P.Shape()(x)
-        value_num = 1
-        for i in axis:
-            value_num *= shape[i]
-
-        data_sum = self.sum(x, axis)
-        avg = 1.0 / P.Fill()(P.DType()(x), (1,), value_num)
-        res = data_sum * avg
-        return res
-
-
-class ReLU(GraphKernel):
-    r"""
-    Computes ReLU(Rectified Linear Unit) of input tensor element-wise.
-
-    It returns :math:`\max(x,\  0)` element-wise.
-
-    Inputs:
-        - **input_x** (Tensor) - The input tensor.
-
-    Outputs:
-        Tensor, with the same type and shape as the `input_x`.
-
-    Examples:
-        >>> input_x = Tensor(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]]), mindspore.float32)
-        >>> relu = ReLU()
-        >>> result = relu(input_x)
-        >>> print(result)
-        [[0. 4. 0.]
-         [2. 0. 9.]]
-    """
-    def __init__(self):
-        super(ReLU, self).__init__()
-        self.max = P.Maximum()
-
-    def construct(self, x):
-        return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x)
-
-
-class SoftmaxCrossEntropyWithLogits(GraphKernel):
-    r"""
-    Gets the softmax cross-entropy value between logits and labels which shoule be one-hot encoding.
-
-    Note:
-        Sets input logits as `X`, input label as `Y`, output as `loss`. Then,
-
-        .. math::
-            p_{ij} = softmax(X_{ij}) = \frac{\exp(x_i)}{\sum_{j = 0}^{N-1}\exp(x_j)}
-
-        .. math::
-            loss_{ij} = -\sum_j{Y_{ij} * ln(p_{ij})}
-
-    Inputs:
-        - **logits** (Tensor) - Input logits, with shape :math:`(N, C)`.
-        - **labels** (Tensor) - Ground truth labels, with shape :math:`(N, C)`.
-
-    Outputs:
-        Tuple of 2 Tensors, the loss shape is `(N,)`, and the dlogits with the same shape as `logits`.
-
-    Examples:
-        >>> logits = Tensor([[2, 4, 1, 4, 5], [2, 1, 2, 4, 3]], mindspore.float32)
-        >>> labels = Tensor([[0, 0, 0, 0, 1], [0, 0, 0, 1, 0]], mindspore.float32)
-        >>> softmax_cross = SoftmaxCrossEntropyWithLogits()
-        >>> loss, backprop = softmax_cross(logits, labels)
-    """
-
-    def __init__(self):
-        super(SoftmaxCrossEntropyWithLogits, self).__init__()
-        self.max = P.ReduceMax(keep_dims=True)
-        self.sum_keep_dims = P.ReduceSum(keep_dims=True)
-
-    def construct(self, features, labels):
-        data_max = self.max(features, (1,))
-        data_sub = features - data_max
-        data_exp = P.Exp()(data_sub)
-        data_sum = self.sum_keep_dims(data_exp, (1,))
-        data_div = data_exp / data_sum
-        data_log_tmp = P.Log()(data_sum)
-        data_log = data_sub - data_log_tmp
-        data_mul = labels * data_log
-        data_muls = P.Neg()(data_mul)
-        loss = P.ReduceSum()(data_muls, (1,))
-        backprop = data_div - labels
-        return loss, backprop
-
-    def bprop(self, features, labels, out, dout):
-        grad = out[1]
-        grad = grad * P.ExpandDims()(dout[0], -1)
-        return grad, P.ZerosLike()(labels)
-
-
-class LayerNormForward(GraphKernel):
-    """ Forward function of the LayerNorm operator. """
-    def __init__(self, begin_norm_axis=1, begin_params_axis=1):
-        super(LayerNormForward, self).__init__()
-        self.begin_norm_axis = validator.check_value_type('begin_norm_axis', begin_norm_axis, [int],
-                                                          type(self).__name__)
-        self.begin_params_axis = validator.check_value_type('begin_params_axis', begin_params_axis, [int],
-                                                            type(self).__name__)
-        self.mul = P.Mul()
-        self.sum_keep_dims = P.ReduceSum(keep_dims=True)
-        self.sub = P.Sub()
-        self.add = P.TensorAdd()
-        self.log = P.Log()
-        self.exp = P.Exp()
-        self.eps = P.Eps()
-
-    def construct(self, input_x, input_gamma, input_beta):
-        shape_x = P.Shape()(input_x)
-
-        # Calculate the scaling ratio of the average
-        begin_norm_axis = self.begin_norm_axis
-        if begin_norm_axis < 0:
-            begin_norm_axis += len(shape_x)
-        reduce_axis = ()
-        for i in range(len(shape_x)):
-            if i > begin_norm_axis or i == begin_norm_axis:
-                reduce_axis = reduce_axis + (i,)
-
-        reduce_elts = 1.0
-        for i in reduce_axis:
-            reduce_elts *= shape_x[i]
-        mean_cof = 1.0 / reduce_elts
-
-        # Calculate mean
-        mean_muls = self.mul(input_x, mean_cof)
-        mean = self.sum_keep_dims(mean_muls, reduce_axis)
-
-        # Calculate variance
-        variance_sub = self.sub(input_x, mean)
-        variance_mul = self.mul(variance_sub, variance_sub)
-        variance_muls = self.mul(variance_mul, mean_cof)
-        variance = self.sum_keep_dims(variance_muls, reduce_axis)
-
-        # Calculate normalize
-        normalize_sub = self.sub(input_x, mean)
-        epsilon = self.eps(input_x)
-        normalize_add = self.add(variance, epsilon)
-        normalize_log = self.log(normalize_add)
-        normalize_log_mul = self.mul(normalize_log, -0.5)
-        normalize_exp = self.exp(normalize_log_mul)
-        normalize_mul = self.mul(normalize_sub, normalize_exp)
-
-        # Calculate scale and translate
-        if self.begin_params_axis == 0:
-            scale_mul = self.mul(input_gamma, normalize_mul)
-            res = self.add(scale_mul, input_beta)
-        else:
-            scale_mul = self.mul(input_gamma, normalize_mul)
-            res = self.add(scale_mul, input_beta)
-
-        return res, mean, variance
-
-
-class LayerNormXBackprop(GraphKernel):
-    r"""
-    Together with LayerNormBetaGammaBackprop, to supply the backprop
-    functionality for LayerNorm.
-
-    Note:
-        Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`,
-        input_gamma as :math:`\gamma`. Then,
-        .. math::
-            \begin{array}{ll} \\
-                \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\
-                \frac {\partial L} {\partial x_i} =
-                    \frac{\gamma}{\sqrt{\sigma^2+\epsilon}}
-                    ( \frac{\partial L}{\partial y_i}
-                    - \frac{1}{m} \cdot \frac{\partial L}{\partial \beta}
-                    - \frac{\hat{x_i}}{m} \cdot \frac{\partial L}{\partial \gamma})
-            \end{array}
-
-    Inputs:
-        - **dy**(Tensor) - The first item of the next operator's backprop's output.
-        - **input_x**(Tensor) - The first input of the forward function of LayerNorm.
-        - **variance**(Tensor) - The second input of the forward function of LayerNorm.
-        - **mean**(Tensor) - The third input of the forward function of LayerNorm.
-        - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm.
-
-    Outputs:
-        Tensor, the output of this operator, will be used as the first item of the result of
-            LayerNorm's backprop function, has the same shape and data type as 'input_x'.
-
-    Examples:
-        >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> op = LayerNormXBackprop(keep_dims=False)
-        >>> output = op(dy, input_x, variance, mean, input_gamma)
-    """
-
-    def __init__(self):
-        super(LayerNormXBackprop, self).__init__()
-        self.sum_keep_dims = P.ReduceSum(keep_dims=True)
-        self.log = P.Log()
-        self.exp = P.Exp()
-        self.eps = P.Eps()
-
-    def construct(self, dy, input_x, variance, mean, input_gamma):
-        shape_x = P.Shape()(input_x)
-        shape_mean = P.Shape()(mean)
-        reduce_axis = ()
-        flag = -1
-        min_l = 0
-        if len(shape_x) > len(shape_mean):
-            min_l = len(shape_x)
-        else:
-            min_l = len(shape_mean)
-        for i in range(min_l):
-            if (shape_x[i] != shape_mean[i]) and (flag == -1):
-                flag = i
-        if flag != -1:
-            for i in range(flag, len(shape_x)):
-                reduce_axis = reduce_axis + (i,)
-        else:
-            reduce_axis = reduce_axis + (len(shape_x) - 1,)
-        mean_num = 1.0
-        for i in reduce_axis:
-            mean_num *= shape_x[i]
-        pd_xl = input_gamma * dy
-        epsilon = self.eps(input_x)
-        var_elta = variance + epsilon
-        var_elta_log = self.log(var_elta)
-        var_elta_mul = var_elta_log * -0.5
-        var_elta_2 = P.Exp()(var_elta_mul)
-        pdvar1_mul = var_elta_2 * var_elta_2
-        pd_var_1 = pdvar1_mul * var_elta_2
-        sub_x_mean = input_x - mean
-        pdvar_mul1 = pd_xl * sub_x_mean
-        pdvar_sum = self.sum_keep_dims(pdvar_mul1, reduce_axis)
-        pdvar_mul3 = pdvar_sum * pd_var_1
-        pd_var = pdvar_mul3 * -0.5
-        pdmean1_sum = self.sum_keep_dims(pd_xl, reduce_axis)
-        pdmean1_mul = pdmean1_sum * var_elta_2
-        pd_mean_1 = pdmean1_mul * -1.0
-        pdmean2_mul1 = sub_x_mean * -2.0
-        pdmean2_sum = self.sum_keep_dims(pdmean2_mul1, reduce_axis)
-        pdmean2_mul3 = pdmean2_sum * (1.0 / mean_num)
-        pd_mean_2 = pd_var * pdmean2_mul3
-        pd_mean = pd_mean_2 + pd_mean_1
-        pd_x_1 = var_elta_2 * pd_xl
-        pdx2_mul = pd_var * sub_x_mean
-        pd_x_2 = pdx2_mul * (2.0 * (1.0 / mean_num))
-        pd_x_3 = pd_mean * (1.0 / mean_num)
-        pdx_add = pd_x_1 + pd_x_2
-        pd_x = pdx_add + pd_x_3
-        return pd_x
-
-
-class LayerNormBetaGammaBackprop(GraphKernel):
-    r"""
-    Together with LayerNormXBackprop, to supply the backprop functionality for
-    LayerNorm.
-    Note:
-        Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`,
-        input_gamma as :math:`\gamma`. Then,
-        .. math::
-            \begin{array}{ll} \\
-                \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\
-                \frac {\partial L} {\partial \beta} =
-                    \sum_{i=1}^m \\frac{\\partial L}{\partial y_i} \\
-                \frac {\partial L} {\partial \gamma} =
-                    \sum_{i=1}^m \\frac{\partial L}{\partial y_i} \cdot \hat{x_i}
-            \end{array}
-
-    Inputs:
-        - **dy**(Tensor) - The first item of the next operator's backprop's output.
-        - **input_x**(Tensor) - The first input of the forward function of LayerNorm.
-        - **variance**(Tensor) - The second input of the forward function of LayerNorm.
-        - **mean**(Tensor) - The third input of the forward function of LayerNorm.
-        - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm.
-
-    Outputs:
-        Tuple of 2 Tensors, the backprop outputs.
-
-        - **pd_beta**(Tensor) - The first item of return value of this operator, will be used as
-                    the second item of the LayerNorm's backprop function.
-        - **pd_gamma**(Tensor) - The second item of return value of this operator, will be used as
-                    the third item of the LayerNorm's backprop function.
-
-    Examples:
-        >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-        >>> op = LayerNormBetaGammaBackprop(keep_dims=False)
-        >>> pd_beta, pd_gamma = op(dy, input_x, variance, mean, input_gamma)
-    """
-    def __init__(self):
-        super(LayerNormBetaGammaBackprop, self).__init__()
-        self.sum_not_keep_dims = P.ReduceSum(keep_dims=False)
-        self.log = P.Log()
-        self.exp = P.Exp()
-        self.eps = P.Eps()
-
-    def construct(self, dy, input_x, variance, mean, shape_gamma):
-        shape_x = P.Shape()(input_x)
-        params_axis = ()
-
-        if len(shape_x) != len(shape_gamma):
-            sub = len(shape_x) - len(shape_gamma)
-            for i in range(sub):
-                params_axis = params_axis + (i,)
-
-        pd_beta = self.sum_not_keep_dims(dy, params_axis)
-        epsilon = self.eps(input_x)
-        var_elta = variance + epsilon
-        var_elta_log = self.log(var_elta)
-        var_elta_mul = var_elta_log * -0.5
-        var_elta_2 = P.Exp()(var_elta_mul)
-        sub_x_mean = input_x - mean
-        var_elta_2_cast = var_elta_2
-        xl_mul = var_elta_2_cast * sub_x_mean
-        pdga_mul = dy * xl_mul
-        pd_gamma = self.sum_not_keep_dims(pdga_mul, params_axis)
-        return pd_beta, pd_gamma
-
-
-class LogSoftmax(GraphKernel):
-    r"""
-    Log Softmax activation function.
-
-    Applies the Log Softmax function to the input tensor on the specified axis.
-    Suppose a slice in the given aixs :math:`x` then for each element :math:`x_i`
-    the Log Softmax function is shown as follows:
-
-    .. math::
-        \text{output}(x_i) = \log \left(\frac{\exp(x_i)} {\sum_{j = 0}^{N-1}\exp(x_j)}\right),
-
-    where :math:`N` is the length of the Tensor.
-
-    Args:
-        axis (int): The axis to do the Log softmax operation. Default: -1.
-
-    Inputs:
-        - **logits** (Tensor) - The input of Log Softmax.
-
-    Outputs:
-        Tensor, with the same type and shape as the logits.
-
-    Examples:
-        >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32)
-        >>> log_softmax = LogSoftmax()
-        >>> result = log_softmax(input_x)
-        >>> print(result)
-        [-4.4519143 -3.4519143 -2.4519143 -1.4519144 -0.4519144]
-    """
-
-    def __init__(self, axis=-1):
-        super(LogSoftmax, self).__init__()
-        self.axis = validator.check_value_type('axis', axis, [int], type(self).__name__)
-        self.max_keep_dims = P.ReduceMax(keep_dims=True)
-        self.sub = P.Sub()
-        self.exp = P.Exp()
-        self.sum_keep_dims = P.ReduceSum(keep_dims=True)
-        self.log = P.Log()
-        self.mul = P.Mul()
-
-    def construct(self, input_x):
-        data_max = self.max_keep_dims(input_x, (self.axis,))
-        data_sub = self.sub(input_x, data_max)
-
-        data_exp = self.exp(data_sub)
-        data_sum = self.sum_keep_dims(data_exp, (self.axis,))
-        data_log = self.log(data_sum)
-
-        res = self.sub(data_sub, data_log)
-        return res
-
-    def bprop(self, input_x, out, dout):
-        input_x = out
-        input_dy = dout
-
-        data_exp = self.exp(input_x)
-        data_sum = self.sum_keep_dims(input_dy, (self.axis,))
-        data_softmax = self.mul(data_exp, data_sum)
-
-        res = self.sub(input_dy, data_softmax)
-        return (res,)
-
-
-class Tanh(GraphKernel):
-    r"""
-    Tanh activation function.
-
-    Computes hyperbolic tangent of input element-wise. The Tanh function is defined as:
-
-    .. math::
-        tanh(x_i) = \frac{\exp(x_i) - \exp(-x_i)}{\exp(x_i) + \exp(-x_i)} = \frac{\exp(2x_i) - 1}{\exp(2x_i) + 1},
-
-    where :math:`x_i` is an element of the input Tensor.
-
-    Inputs:
-        - **input_x** (Tensor) - The input of Tanh.
-
-    Outputs:
-        Tensor, with the same type and shape as the input_x.
-
-    Examples:
-        >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32)
-        >>> tanh = Tanh()
-        >>> result = tanh(input_x)
-        >>> print(result)
-        [0.7615941 0.9640276 0.9950548 0.9993293 0.99990916]
-    """
-    def __init__(self):
-        super(Tanh, self).__init__()
-        self.abs = P.Abs()
-        self.add = P.TensorAdd()
-        self.div = P.RealDiv()
-        self.mul = P.Mul()
-        self.mul_fp16 = P.Mul()
-        self.mul_fp16.add_prim_attr("output_precision", "float16")
-        self.exp = P.Exp()
-
-    def construct(self, input_x):
-        input_abs = self.abs(input_x)
-        sign_flag = self.div(input_x, input_abs)
-        sign_flag_neg = self.mul(sign_flag, -1.0)
-
-        power_val = self.mul(input_abs, -2.0)
-        exp_val = self.exp(power_val)
-        up_val = self.add(exp_val, -1.0)
-        down_val = self.add(exp_val, 1.0)
-
-        div_val = self.div(up_val, down_val)
-        res = self.mul(sign_flag_neg, div_val)
-        return res
-
-    def bprop(self, input_x, out, dout):
-        input_y = out
-        input_dy = dout
-
-        data_square = self.mul(input_y, input_y)
-        data_mul = self.mul(data_square, -1.0)
-        anuminate = self.add(data_mul, 1.0)
-        res = self.mul_fp16(anuminate, input_dy)
-
-        return (res,)
-
-class TanhGrad(GraphKernel):
-    """
-    Backprop function of Tanh
-
-    Mathematical calculating:
-        result = Tanh(out)
-        result = 1 - result * result
-        result = result * dout
-    Inputs:
-        out (Tensor): Tanh's output
-        dout (Tensor): next layer's backward function's output, has same shape as out
-
-    Outputs:
-        result (Tensor): result of (1 - tanh(out)^2) * dout
-
-    Examples:
-        >>> x_np = np.random.randn(5, 3, 6).astype(np.float16)
-        >>> dy_np = np.random.randn(5, 3, 6).astype(np.float16)
-        >>> x_ms = Tensor(x_np)
-        >>> dy_ms = Tensor(dy_np)
-        >>> tanh_grad = TanhGrad()
-        >>> out = tanh_grad(x_np, dy_np)
-    """
-    def __init__(self):
-        super(TanhGrad, self).__init__()
-        self.add = P.TensorAdd()
-        self.mul = P.Mul()
-        self.mul_fp16 = P.Mul()
-        self.mul_fp16.add_prim_attr("output_precision", "float16")
-
-    def construct(self, out, dout):
-        input_y = out
-        input_dy = dout
-
-        data_square = self.mul(input_y, input_y)
-        data_mul = self.mul(data_square, -1.0)
-        anuminate = self.add(data_mul, 1.0)
-        res = self.mul_fp16(anuminate, input_dy)
-
-        return res
-
-class Gelu(GraphKernel):
-    r"""
-    Gaussian Error Linear Units activation function.
-
-    GeLU is described in the paper `Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`_.
-    And also please refer to `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
-    <https://arxiv.org/abs/1810.04805>`_.
-
-    Defined as follows:
-
-    .. math::
-        \text{output} = 0.5 * x * (1 + erf(x / \sqrt{2})),
-
-    where :math:`erf` is the "Gauss error function" .
-
-    Inputs:
-        - **input_x** (Tensor) - Input to compute the Gelu.
-
-    Outputs:
-        Tensor, with the same type and shape as input.
-
-    Examples:
-        >>> tensor = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32)
-        >>> gelu = Gelu()
-        >>> result = gelu(tensor)
-    """
-
-    def __init__(self):
-        super(Gelu, self).__init__()
-        self.add = P.TensorAdd()
-        self.abs = P.Abs()
-        self.exp = P.Exp()
-        self.neg = P.Neg()
-        self.minimum = P.Minimum()
-        self.div = P.RealDiv()
-        self.mul = P.Mul()
-        self.CSVALUE = 0.044715
-        self.CSVALUE_A = 1.59576912
-        self.CSVALUE_5 = 0.3989422804
-        self.CSVALUE_3B = 0.2140644488
-
-    def construct(self, input_x):
-        def _tanh_parameter_compute(data_x):
-            """
-            compute the parameter of tanh:
-            return: result equal (x+0.044715*tf.pow(x,3))
-            """
-            mul_0 = self.mul(data_x, data_x)
-            pow_0 = self.mul(mul_0, data_x)
-            mul_1 = self.mul(pow_0, self.CSVALUE)
-            result = self.add(data_x, mul_1)
-
-            return result
-
-        tanh_parameter = _tanh_parameter_compute(input_x)
-        mul_0 = self.mul(tanh_parameter, 1.5957691)
-
-        mul_0_min = self.minimum(mul_0, 0.0)
-        right_mul = self.exp(mul_0_min)
-
-        mul_0_abs = self.abs(mul_0)
-        mul_0_abs_neg = self.mul(mul_0_abs, -1.0)
-        mul_0_abs_neg_exp = self.exp(mul_0_abs_neg)
-
-        mul_0_abs_neg_exp_add = self.add(mul_0_abs_neg_exp, 1.0)
-        left_mul = self.div(input_x, mul_0_abs_neg_exp_add)
-
-        result = self.mul(left_mul, right_mul)
-        return result
-
-    def bprop(self, input_x, out, dout):
-        """ register backprop function for Gelu """
-        data_x = input_x
-        data_gelu = out
-        data_dy = dout
-
-        def _math_four_compute(data_x):
-            """
-            Return:
-                math_four equal 2*(np(sqrt(2 / np.pi)*(x + 0.044715*tf.pow(x, 3)))
-            """
-            datax_pow = data_x * data_x * data_x
-            datax_muls_c = self.mul(datax_pow, self.CSVALUE)
-            datax_addx = self.add(datax_muls_c, data_x)
-            datax_muls_s = self.mul(datax_addx, self.CSVALUE_A)
-
-            return datax_muls_s
-
-        # common part
-        math_four = _math_four_compute(data_x)
-        math_four_abs = self.abs(math_four)
-        math_four_abs_neg = self.mul(math_four_abs, -1.0)
-        math_four_abs_neg_exp = self.exp(math_four_abs_neg)
-        math_four_min = self.minimum(math_four, 0.0)
-
-        # dividend part
-        datax_pow = self.mul(data_x, data_x)
-        datax_pow_mul = self.mul(datax_pow, self.CSVALUE_3B)
-        datax_pow_mul_add = self.add(datax_pow_mul, self.CSVALUE_A)
-        data_gelu_mul = self.mul(data_gelu, datax_pow_mul_add)
-        math_four_min_2 = self.mul(math_four_min, 2.0)
-        div_right = self.mul(data_gelu_mul, math_four_abs_neg_exp)
-        div_left = self.exp(math_four_min_2)
-        dividend = self.add(div_left, div_right)
-
-        # divisor part
-        div_0 = self.add(math_four_abs_neg_exp, 1.0)
-        div_1 = self.exp(math_four_min)
-        divisor = self.mul(div_1, div_0)
-        res_grad = self.div(dividend, divisor)
-
-        result = self.mul(res_grad, data_dy)
-        return (result,)
-
-
-class Softmax(GraphKernel):
-    """
-    Operator Softmax
-    .. math: `exp(x-max(x)) / sum(exp(x-max(x)))`
-
-    Args:
-        axis (int, tuple): Axis along which the softmax normalization is applied
-
-    Inputs:
-        x (Tensor): input data for softmax
-
-    Outputs:
-        output (Tensor): a tensor with the same shape of the input
-
-    Examples:
-        >>> layer = Softmax(1)
-        >>> x = Tensor(np.array([1.2, 2.1], [2.2, 3.2]), mindspore.float32)
-        >>> output = layer(x)
-    """
-
-    def __init__(self, axis):
-        super(Softmax, self).__init__()
-        validator.check_value_type("axis", axis, [int, tuple], type(self).__name__)
-        if isinstance(axis, int):
-            self.axis = (axis,)
-        else:
-            self.axis = axis
-        for item in self.axis:
-            validator.check_value_type("item of axis", item, [int], type(self).__name__)
-        self.max = P.ReduceMax(keep_dims=True)
-        self.sub = P.Sub()
-        self.exp = P.Exp()
-        self.sum = P.ReduceSum(keep_dims=True)
-        self.mul = P.Mul()
-
-    def construct(self, x):
-        max_x = self.max(x, self.axis)
-        data_sub = self.sub(x, max_x)
-        data_exp = self.exp(data_sub)
-        data_expsum = self.sum(data_exp, self.axis)
-        output = data_exp / data_expsum
-        return output
-
-    def bprop(self, x, out, dout):
-        mul_res = self.mul(dout, out)
-        sum_res = self.sum(mul_res, self.axis)
-        sub_res = self.sub(dout, sum_res)
-        res = self.mul(sub_res, out)
-        return (res,)
-
-
-class LayerNorm(Cell):
-    r"""
-    Applies Layer Normalization over a mini-batch of inputs.
-
-    Layer normalization is widely used in recurrent neural networks. It applies
-    normalization on a mini-batch of inputs for each single training case as described
-    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
-    normalization, layer normalization performs exactly the same computation at training and
-    testing time. It can be described using the following formula. It is applied across all channels
-    and pixel but only one batch size.
-
-    .. math::
-        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
-
-    Args:
-        normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis
-            `begin_norm_axis ... R - 1`.
-        begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions
-            `begin_norm_axis: rank(inputs)`, the value must be in [-1, rank(input)). Default: -1.
-        begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters
-            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
-            the normalized inputs accordingly, the value must be in [-1, rank(input)). Default: -1.
-        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'ones'.
-        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
-            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
-            'he_uniform', etc. Default: 'zeros'.
-
-    Inputs:
-        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
-          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.
-
-    Outputs:
-        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.
-
-    Examples:
-        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
-        >>> shape1 = x.shape[1:]
-        >>> m = G.LayerNorm(shape1,  begin_norm_axis=1, begin_params_axis=1)
-        >>> m(x)
-    """
-
-    def __init__(self,
-                 begin_norm_axis=-1,
-                 begin_params_axis=-1
-                 ):
-        super(LayerNorm, self).__init__()
-        self.begin_norm_axis = begin_norm_axis
-        self.begin_params_axis = begin_params_axis
-        self.layer_norm = LayerNormForward(begin_norm_axis, begin_params_axis)
-        self.layer_norm_x_grad = LayerNormXBackprop()
-        self.layer_norm_beta_gamma = LayerNormBetaGammaBackprop()
-        self.layer_norm_grad = G.LayerNormGrad(self.begin_norm_axis, self.begin_params_axis)
-
-    def construct(self, input_x, input_gamma, input_beta):
-        return self.layer_norm(input_x, input_gamma, input_beta)
-
-    # case 1
-    def bprop(self, input_x, input_gamma, input_beta, out, dout):
-        dx, d_gamma, d_beta = self.layer_norm_grad(input_x, dout[0], out[2], dout[1], input_gamma)
-        return dx, d_gamma, d_beta
-
-
-class LambUpdateWithLR(GraphKernel):
-    r"""
-    Part of Lamb optimizer.
-
-    .. math::
-        s_1 = select(i_1 \gt y_g, select(i_0 \gt y_g, \frac{i_1}{i_2}, se), se)
-        i_5 = i_5 - max(min(s_1, y_m), y_g) \times i_3 \times i_4
-
-    Inputs:
-        - **input0** (Tensor) - The first tensor to be computed.
-        - **input1** (Tensor) - The second tensor to be computed.
-        - **input2** (Tensor) - The third tensor to be computed.
-        - **input3** (Tensor) - The fourth tensor to be computed.
-        - **input4** (Tensor) - The fifth tensor to be computed.
-        - **input5** (Tensor) - The sixth tensor to be computed. It will be updated by result.
-        - **greater_y** (Tensor) - The seventh tensor to be computed.
-        - **select_e** (Tensor) - The eighth tensor to be computed.
-        - **minimum_y** (Tensor) - The ninth tensor to be computed.
-
-    Outputs:
-        A fake output tensor.
-
-    Examples:
-        >>> lamb_update = LambUpdateWithLR()
-        >>> i0 = np.random.normal(0, 1, [1, 16]).astype(np.float32)
-        >>> i1 = np.random.normal(0, 1, [1]).astype(np.float32)
-        >>> i2 = np.random.normal(0, 1, [1]).astype(np.float32)
-        >>> i3 = np.random.normal(0, 1, [1]).astype(np.float32)
-        >>> i4 = np.random.normal(0, 1, [1, 16]).astype(np.float32)
-        >>> i5 = np.random.normal(0, 1, [1, 16]).astype(np.float32)
-        >>> yg = np.random.normal(0, 1, [1]).astype(np.float32)
-        >>> se = np.random.normal(0, 1, [1]).astype(np.float32)
-        >>> ym = np.random.normal(0, 1, [1]).astype(np.float32)
-        >>> lamb_update(i0, i1, i2, i3, i4, i5, yg, se, ym)
-
-    """
-
-    def __init__(self):
-        super(LambUpdateWithLR, self).__init__()
-        self.greater = P.Greater()
-        self.select = P.Select()
-        self.div = P.RealDiv()
-        self.min = P.Minimum()
-        self.max = P.Maximum()
-        self.mul = P.Mul()
-        self.sub = P.Sub()
-        self.fake_output_assign = InplaceAssign()
-        self.fake_output_assign.add_prim_attr("fake_output", True)
-
-    def construct(self, input0, input1, input2, input3, input4, input5, greater_y, select_e, minimum_y):
-        greater0 = self.greater(input0, greater_y)
-        greater1 = self.greater(input1, greater_y)
-        real_div0 = self.div(input1, input2)
-        select0 = self.select(greater0, real_div0, select_e)
-        select1 = self.select(greater1, select0, select_e)
-        min0 = self.min(select1, minimum_y)
-        max0 = self.max(min0, greater_y)
-        mul0 = self.mul(max0, input3)
-        mul1 = self.mul(mul0, input4)
-        sub0 = self.sub(input5, mul1)
-        sub0 = self.fake_output_assign(input5, sub0, sub0)
-        return sub0
-
-class LambNextMV(GraphKernel):
-    r"""
-    Part of Lamb optimizer.
-
-    .. math::
-        rd_0 = \frac{i_8 \times i_5 + i_9 \times i_4}{i6}
-        rd_1 = \frac{x_0 \times i_2 + x_1 \times i_1}{i3}
-        y_2 = \frac{rd_0}{\sqrt{rd_1 + x3}} + x_2 \times i_7
-        y_3 = \frac{rd_0}{\sqrt{rd_1} + x3}
-        i5 = i_8 \times i_5 + i_9 \times i_4
-        i2 = x_0 \times i_2 + x_1 \times i_1
-
-    Inputs:
-        - **inputs1** (Tensor) - The first input tensor to be computed.
-        - **inputs2** (Tensor) - The second input tensor to be computed. It will be updated by result.
-        - **inputs3** (Tensor) - The third input tensor to be computed.
-        - **inputs4** (Tensor) - The fourth input tensor to be computed.
-        - **inputs5** (Tensor) - The fifth input tensor to be computed. It will be updated by result.
-        - **inputs6** (Tensor) - The sixth input tensor to be computed.
-        - **inputs7** (Tensor) - The seventh input tensor to be computed.
-        - **inputs8** (Tensor) - The eighth input tensor to be computed.
-        - **inputs9** (Tensor) - The ninth input tensor to be computed.
-        - **inputsx0** (Tensor) - The tenth input tensor to be computed.
-        - **inputsx1** (Tensor) - The eleventh input tensor to be computed.
-        - **inputsx2** (Tensor) - The twelfth input tensor to be computed.
-        - **inputsx3** (Tensor) - The thirteenth input tensor to be computed.
-
-    Outputs:
-        Tuple of 2 Tensors.
-
-        - **add3** (Tensor) - the shape is the same as the one after broadcasting, and the data type is
-                              the one with higher precision or higher digits among the inputs.
-        - **realdiv4** (Tensor) - the shape is the same as the one after broadcasting, and the data type is
-                                  the one with higher precision or higher digits among the inputs.
-
-    Examples:
-        >>> lamb_next_mv = LambNextMV()
-        >>> i1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> i2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> i3 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> i4 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> i5 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> i6 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> i7 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> i8 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> i9 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> x0 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> x1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> x2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
-        >>> x3 = Tensor(np.ones([1, 16]).astype(np.float32) * 1e-6)
-        >>> lamb_next_mv(i1, i2, i3, i4, i5, i6, i7, i8, i9, x0, x1, x2, x3)
-
-    """
-
-    def __init__(self):
-        super(LambNextMV, self).__init__()
-        self.mul = P.Mul()
-        self.add = P.TensorAdd()
-        self.div = P.RealDiv()
-        self.sqrt = P.Sqrt()
-        self.rsqrt = P.Rsqrt()
-        self.fake_output_assign_1 = InplaceAssign()
-        self.fake_output_assign_1.add_prim_attr("fake_output", False)
-        self.fake_output_assign_2 = InplaceAssign()
-        self.fake_output_assign_2.add_prim_attr("fake_output", False)
-
-
-    def construct(self, input1, input2, input3, input4, input5, input6, input7,
-                  input8, input9, inputx0, inputx1, inputx2, inputx3):
-        mul3 = self.mul(inputx1, input1)
-        mul2 = self.mul(inputx0, input2)
-        add1 = self.add(mul2, mul3)
-        realdiv1 = self.div(add1, input3)
-        add2 = self.add(realdiv1, inputx3)
-        sqrt0 = self.rsqrt(add2)
-        sqrt1 = self.sqrt(realdiv1)
-        add4 = self.add(sqrt1, inputx3)
-        mul1 = self.mul(input9, input4)
-        mul0 = self.mul(input8, input5)
-        add0 = self.add(mul0, mul1)
-        realdiv0 = self.div(add0, input6)
-        realdiv2 = self.mul(realdiv0, sqrt0)
-        realdiv4 = self.div(realdiv0, add4)
-        mul4 = self.mul(inputx2, input7)
-        add3 = self.add(realdiv2, mul4)
-
-        add3 = self.fake_output_assign_1(input5, add0, add3)
-        add3 = self.fake_output_assign_2(input2, add1, add3)
-
-        return add3, realdiv4
diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py
index 997c5304d5..2b6b5f9d5b 100755
--- a/mindspore/nn/optim/lamb.py
+++ b/mindspore/nn/optim/lamb.py
@@ -26,7 +26,7 @@ from mindspore._checkparam import Validator as validator
 from mindspore._checkparam import Rel
 from .optimizer import Optimizer
 from .. import layer
-from .. import graph_kernels as G
+from .. import _graph_kernels as G
 
 num_one = Tensor(np.ones([1]), mstype.float32)
 
diff --git a/mindspore/ops/_selected_grad_ops.py b/mindspore/ops/_selected_grad_ops.py
index 5da1d53abf..73a3f4af70 100644
--- a/mindspore/ops/_selected_grad_ops.py
+++ b/mindspore/ops/_selected_grad_ops.py
@@ -17,7 +17,7 @@
 from mindspore.ops.op_selector import new_ops_selector
 
 op_selector = new_ops_selector(
-    "mindspore.ops.operations._grad_ops", "mindspore.nn.graph_kernels")
+    "mindspore.ops.operations._grad_ops", "mindspore.nn._graph_kernels")
 
 
 @op_selector
diff --git a/mindspore/ops/_selected_ops.py b/mindspore/ops/_selected_ops.py
index 2d816f58f8..dd5ee410e7 100644
--- a/mindspore/ops/_selected_ops.py
+++ b/mindspore/ops/_selected_ops.py
@@ -17,11 +17,11 @@
 from mindspore.ops.op_selector import new_ops_selector
 
 op_selector = new_ops_selector(
-    "mindspore.ops.operations", "mindspore.nn.graph_kernels")
+    "mindspore.ops.operations", "mindspore.nn._graph_kernels")
 opt_selector = new_ops_selector(
-    "mindspore.nn.optim", "mindspore.nn.graph_kernels")
+    "mindspore.nn.optim", "mindspore.nn._graph_kernels")
 nn_selector = new_ops_selector(
-    "mindspore.nn", "mindspore.nn.graph_kernels")
+    "mindspore.nn", "mindspore.nn._graph_kernels")
 
 
 @nn_selector
diff --git a/tests/st/ops/graph_kernel/test_fuse.py b/tests/st/ops/graph_kernel/test_fuse.py
index 726e2cc16c..2168ca2546 100644
--- a/tests/st/ops/graph_kernel/test_fuse.py
+++ b/tests/st/ops/graph_kernel/test_fuse.py
@@ -19,7 +19,7 @@ import mindspore.context as context
 from mindspore import Tensor
 from mindspore.nn import Cell
 import mindspore.ops.operations as P
-from mindspore.nn.graph_kernels import ReLU
+from mindspore.ops.operations import _grad_ops as G
 
 
 class Net(Cell):
@@ -28,41 +28,45 @@ class Net(Cell):
         self.add = P.TensorAdd()
         self.sub = P.Sub()
         self.mul = P.Mul()
-        self.relu = ReLU()
+        self.sqrt_grad = G.SqrtGrad()
 
-    def construct(self, x, y):
+    def construct(self, x, y, z):
         sub_res = self.sub(x, y)
         mul_res = self.mul(sub_res, x)
-        relu_res = self.relu(mul_res)
-        square_res = P.Square()(relu_res)
-        add_res = self.add(relu_res, square_res)
+        sqrt_grad_res = self.sqrt_grad(mul_res, z)
+        square_res = P.Square()(sqrt_grad_res)
+        add_res = self.add(sqrt_grad_res, square_res)
         add1_res = self.add(add_res, add_res)
         return self.add(add1_res, add1_res)
 
 
+def get_output(i0, i1, i2, enable_graph_kernel=False):
+    if enable_graph_kernel:
+        context.set_context(enable_graph_kernel=True)
+    net = Net()
+    output = net(i0, i1, i2)
+    return output
+
+
 def test_basic():
-    input_x = np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32)
-    input_y = np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32)
-    sub_res = input_x - input_y
-    mul_res = sub_res * input_x
-    relu_res = np.maximum(mul_res, 0)
-    square_res = np.square(relu_res)
-    add_res = relu_res + square_res
-    add1_res = add_res + add_res
-    expect = add1_res + add1_res
+    i0 = Tensor(np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32))
+    i1 = Tensor(np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32))
+    i2 = Tensor(np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32))
 
-    net = Net()
-    result = net(Tensor(input_x), Tensor(input_y))
+    expect = get_output(i0, i1, i2, False)
+    output = get_output(i0, i1, i2, True)
+
+    expect_np = expect.asnumpy().copy()
+    output_np = output.asnumpy().copy()
 
-    res = np.allclose(expect, result.asnumpy(), rtol=1.e-4, atol=1.e-7, equal_nan=True)
-    assert res
+    assert np.allclose(expect_np, output_np, 1.e-4, 1.e-7)
 
 
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
 def test_basic_gpu():
-    context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU")
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
     test_basic()
 
 
@@ -71,5 +75,5 @@ def test_basic_gpu():
 @pytest.mark.platform_x86_ascend_training
 @pytest.mark.env_onecard
 def test_basic_ascend():
-    context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="Ascend")
+    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     test_basic()
diff --git a/tests/st/ops/graph_kernel/test_lamb.py b/tests/st/ops/graph_kernel/test_lamb.py
index e3b4ef72eb..6d146553c0 100644
--- a/tests/st/ops/graph_kernel/test_lamb.py
+++ b/tests/st/ops/graph_kernel/test_lamb.py
@@ -18,7 +18,7 @@ import numpy as np
 import mindspore.context as context
 from mindspore import Tensor, Parameter
 from mindspore.nn import Cell
-from mindspore.nn.graph_kernels import LambUpdateWithLR, LambNextMV
+from mindspore.nn._graph_kernels import LambUpdateWithLR, LambNextMV
 
 
 class LambNet(Cell):