diff --git a/mindspore/nn/graph_kernels/__init__.py b/mindspore/nn/_graph_kernels/__init__.py similarity index 55% rename from mindspore/nn/graph_kernels/__init__.py rename to mindspore/nn/_graph_kernels/__init__.py index 8128f2db60..356d183766 100644 --- a/mindspore/nn/graph_kernels/__init__.py +++ b/mindspore/nn/_graph_kernels/__init__.py @@ -18,13 +18,6 @@ GraphKernel. GraphKernel provides a unified style to express graph and kernel for user. It breaks the boundary between graph and kernel and provides more opportunities to do compile optimization. """ -from .graph_kernels import MaximumGrad, MinimumGrad, AbsGrad, ApplyMomentum, BiasAdd, EqualCount, \ - ReduceMean, ReLU, SoftmaxCrossEntropyWithLogits, LayerNorm, LayerNormXBackprop, \ - LayerNormBetaGammaBackprop, LogSoftmax, Tanh, TanhGrad, Gelu, Softmax, BiasAddGrad, \ - LambUpdateWithLR, LambNextMV +from .graph_kernels import LambUpdateWithLR, LambNextMV -__all__ = ['MaximumGrad', 'MinimumGrad', 'AbsGrad', 'ApplyMomentum', 'BiasAdd', 'EqualCount', - 'ReduceMean', 'ReLU', 'SoftmaxCrossEntropyWithLogits', 'LayerNorm', - 'LayerNormXBackprop', 'LayerNormBetaGammaBackprop', 'LogSoftmax', 'Tanh', 'TanhGrad', - 'Gelu', 'Softmax', 'BiasAddGrad', 'LambUpdateWithLR', 'LambNextMV' - ] +__all__ = ['LambUpdateWithLR', 'LambNextMV'] diff --git a/mindspore/nn/_graph_kernels/graph_kernels.py b/mindspore/nn/_graph_kernels/graph_kernels.py new file mode 100644 index 0000000000..b314476878 --- /dev/null +++ b/mindspore/nn/_graph_kernels/graph_kernels.py @@ -0,0 +1,259 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Graph kernels. They are composites of basic primitives and can be compiled into +a fused kernel automatically when context.set_context(enable_graph_kernel=True). +""" +from ...ops import operations as P +from ...ops.primitive import PrimitiveWithInfer, prim_attr_register +from ...ops.composite import multitype_ops as C +from ..cell import GraphKernel + + +class InplaceAssign(PrimitiveWithInfer): + """ + Inplace assign `Parameter` with a value. + + This primitive can only be used in graph kernel. + + Inputs: + - **variable** (Parameter) - The `Parameter`. + - **value** (Tensor) - The value to be assigned. + - **depend** (Tensor) - The dependent tensor to keep this op connected in graph. + + Outputs: + Tensor, has the same type as original `variable`. + + Examples: + >>> class MyClass(GraphKernel): + ... def __init__(self): + ... super(MyClass, self).__init__() + ... self.mul = P.Mul() + ... self.fake_output_assign = InplaceAssign() + ... self.fake_output_assign.add_prim_attr("fake_output", True) + ... + ... def construct(self, i0, i1): + ... mul_res = self.mul(i0, i1) + ... # mul_res is a fake output and parameter i0 will be updated. + ... mul_res = self.fake_output_assign(i0, mul_res, mul_res) + ... return mul_res + """ + + @prim_attr_register + def __init__(self): + super(InplaceAssign, self).__init__("InplaceAssign") + self.init_prim_io_names(inputs=['x', 'y', 'z'], outputs=['output']) + + def infer_shape(self, x, y, z): + return z + + def infer_dtype(self, x, y, z): + return z + + def get_bprop(self): + def bprop(x, y, z, out, dout): + return (x, C.zeros_like(y), dout) + + return bprop + + +class LambUpdateWithLR(GraphKernel): + r""" + Part of Lamb optimizer. + + .. math:: + s_1 = select(i_1 \gt y_g, select(i_0 \gt y_g, \frac{i_1}{i_2}, se), se) + i_5 = i_5 - max(min(s_1, y_m), y_g) \times i_3 \times i_4 + + Inputs: + - **input0** (Tensor) - The first tensor to be computed. + - **input1** (Tensor) - The second tensor to be computed. + - **input2** (Tensor) - The third tensor to be computed. + - **input3** (Tensor) - The fourth tensor to be computed. + - **input4** (Tensor) - The fifth tensor to be computed. + - **input5** (Tensor) - The sixth tensor to be computed. It will be updated by result. + - **greater_y** (Tensor) - The seventh tensor to be computed. + - **select_e** (Tensor) - The eighth tensor to be computed. + - **minimum_y** (Tensor) - The ninth tensor to be computed. + + Outputs: + A fake output tensor. + + Examples: + >>> import numpy as np + >>> import mindspore.context as context + >>> from mindspore.common import dtype as mstype + >>> from mindspore.common.tensor import Tensor + >>> from mindspore.common.parameter import Parameter + >>> from mindspore.nn.cell import Cell + >>> class Net(Cell): + ... def __init__(self, i5): + ... super(Net, self).__init__() + ... self.i5 = Parameter(i5, name='i5') + ... self.lamb_update = LambUpdateWithLR() + ... + ... def construct(self, i0, i1, i2, i3, i4, i6, i7, i8): + ... return self.lamb_update(i0, i1, i2, i3, i4, self.i5, i6, i7, i8) + >>> shape = [1, 16] + >>> oshape = [1] + >>> i0 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32)) + >>> i1 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32)) + >>> i2 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32)) + >>> i3 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32)) + >>> i4 = Tensor(np.random.normal(0, 1, shape).astype(np.float32)) + >>> i5 = Tensor(np.random.normal(0, 1, shape).astype(np.float32)) + >>> i6 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32)) + >>> i7 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32)) + >>> i8 = Tensor(np.random.normal(0, 1, oshape).astype(np.float32)) + >>> context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True) + >>> net = Net(i5) + >>> _ = net(i0, i1, i2, i3, i4, i6, i7, i8) + >>> output = (net.i5) + """ + + def __init__(self): + super(LambUpdateWithLR, self).__init__() + self.greater = P.Greater() + self.select = P.Select() + self.div = P.RealDiv() + self.min = P.Minimum() + self.max = P.Maximum() + self.mul = P.Mul() + self.sub = P.Sub() + self.fake_output_assign = InplaceAssign() + self.fake_output_assign.add_prim_attr("fake_output", True) + + def construct(self, input0, input1, input2, input3, input4, input5, greater_y, select_e, minimum_y): + greater0 = self.greater(input0, greater_y) + greater1 = self.greater(input1, greater_y) + real_div0 = self.div(input1, input2) + select0 = self.select(greater0, real_div0, select_e) + select1 = self.select(greater1, select0, select_e) + min0 = self.min(select1, minimum_y) + max0 = self.max(min0, greater_y) + mul0 = self.mul(max0, input3) + mul1 = self.mul(mul0, input4) + sub0 = self.sub(input5, mul1) + sub0 = self.fake_output_assign(input5, sub0, sub0) + return sub0 + + +class LambNextMV(GraphKernel): + r""" + Part of Lamb optimizer. + + .. math:: + rd_0 = \frac{i_8 \times i_5 + i_9 \times i_4}{i6} + rd_1 = \frac{x_0 \times i_2 + x_1 \times i_1}{i3} + y_2 = \frac{rd_0}{\sqrt{rd_1 + x3}} + x_2 \times i_7 + y_3 = \frac{rd_0}{\sqrt{rd_1} + x3} + i5 = i_8 \times i_5 + i_9 \times i_4 + i2 = x_0 \times i_2 + x_1 \times i_1 + + Inputs: + - **inputs1** (Tensor) - The first input tensor to be computed. + - **inputs2** (Tensor) - The second input tensor to be computed. It will be updated by result. + - **inputs3** (Tensor) - The third input tensor to be computed. + - **inputs4** (Tensor) - The fourth input tensor to be computed. + - **inputs5** (Tensor) - The fifth input tensor to be computed. It will be updated by result. + - **inputs6** (Tensor) - The sixth input tensor to be computed. + - **inputs7** (Tensor) - The seventh input tensor to be computed. + - **inputs8** (Tensor) - The eighth input tensor to be computed. + - **inputs9** (Tensor) - The ninth input tensor to be computed. + - **inputsx0** (Tensor) - The tenth input tensor to be computed. + - **inputsx1** (Tensor) - The eleventh input tensor to be computed. + - **inputsx2** (Tensor) - The twelfth input tensor to be computed. + - **inputsx3** (Tensor) - The thirteenth input tensor to be computed. + + Outputs: + Tuple of 2 Tensors. + + - **add3** (Tensor) - the shape is the same as the one after broadcasting, and the data type is + the one with higher precision or higher digits among the inputs. + - **realdiv4** (Tensor) - the shape is the same as the one after broadcasting, and the data type is + the one with higher precision or higher digits among the inputs. + + Examples: + >>> import numpy as np + >>> import mindspore.context as context + >>> from mindspore.common import dtype as mstype + >>> from mindspore.common.tensor import Tensor + >>> from mindspore.common.parameter import Parameter + >>> from mindspore.nn.cell import Cell + >>> class Net(Cell): + ... def __init__(self, i1, i4): + ... super(Net, self).__init__() + ... self.i1 = Parameter(i1, name='i1') + ... self.i4 = Parameter(i4, name='i4') + ... self.lamb_next = LambNextMV() + ... + ... def construct(self, i0, i2, i3, i5, i6, i7, i8, i9, i10, i11, i12): + ... i0_ = i0 + i2 + ... return self.lamb_next(i0_, self.i1, i2, i3, self.i4, i5, i6, i7, i8, i9, i10, i11, i12) + >>> shape = [1, 16] + >>> i0 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32)) + >>> i1 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32)) + >>> i2 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32)) + >>> i3 = Tensor(np.random.normal(0, 1, shape).astype(np.float32)) + >>> i4 = Tensor(np.random.normal(0, 1, shape).astype(np.float32)) + >>> i5 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32)) + >>> i6 = Tensor(np.random.normal(0, 1, shape).astype(np.float32)) + >>> i7 = Tensor(np.random.normal(0, 1, shape).astype(np.float32)) + >>> i8 = Tensor(np.random.normal(0, 1, shape).astype(np.float32)) + >>> i9 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32)) + >>> i10 = Tensor(np.abs(np.random.normal(0, 1, shape)).astype(np.float32)) + >>> i11 = Tensor(np.random.normal(0, 1, shape).astype(np.float32)) + >>> i12 = Tensor(np.ones(shape).astype(np.float32) * 1e-6) + >>> context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True) + >>> net = Net(i1, i4) + >>> (o0, o1) = net(i0, i2, i3, i5, i6, i7, i8, i9, i10, i11, i12) + >>> output = (o0, net.i4, net.i1, o1) + """ + + def __init__(self): + super(LambNextMV, self).__init__() + self.mul = P.Mul() + self.add = P.TensorAdd() + self.div = P.RealDiv() + self.sqrt = P.Sqrt() + self.rsqrt = P.Rsqrt() + self.fake_output_assign_1 = InplaceAssign() + self.fake_output_assign_1.add_prim_attr("fake_output", False) + self.fake_output_assign_2 = InplaceAssign() + self.fake_output_assign_2.add_prim_attr("fake_output", False) + + def construct(self, input1, input2, input3, input4, input5, input6, input7, + input8, input9, inputx0, inputx1, inputx2, inputx3): + mul3 = self.mul(inputx1, input1) + mul2 = self.mul(inputx0, input2) + add1 = self.add(mul2, mul3) + realdiv1 = self.div(add1, input3) + add2 = self.add(realdiv1, inputx3) + sqrt0 = self.rsqrt(add2) + sqrt1 = self.sqrt(realdiv1) + add4 = self.add(sqrt1, inputx3) + mul1 = self.mul(input9, input4) + mul0 = self.mul(input8, input5) + add0 = self.add(mul0, mul1) + realdiv0 = self.div(add0, input6) + realdiv2 = self.mul(realdiv0, sqrt0) + realdiv4 = self.div(realdiv0, add4) + mul4 = self.mul(inputx2, input7) + add3 = self.add(realdiv2, mul4) + + add3 = self.fake_output_assign_1(input5, add0, add3) + add3 = self.fake_output_assign_2(input2, add1, add3) + + return add3, realdiv4 diff --git a/mindspore/nn/graph_kernels/graph_kernels.py b/mindspore/nn/graph_kernels/graph_kernels.py deleted file mode 100644 index ca542da989..0000000000 --- a/mindspore/nn/graph_kernels/graph_kernels.py +++ /dev/null @@ -1,1208 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Graph kernels. They are composites of basic primitives and can be compiled into -a fused kernel automaticly when context.set_context(enable_graph_kernel=True). -""" -from ...common import dtype as mstype -from ...ops import operations as P -from ...ops.primitive import PrimitiveWithInfer, prim_attr_register -from ...ops.composite import multitype_ops as C -from ...ops.operations import _grad_ops as G -from ..._checkparam import Validator as validator -from ..cell import Cell, GraphKernel - - -class InplaceAssign(PrimitiveWithInfer): - """ - Inplace assign `Parameter` with a value. - - This primitive can only use in graph kernel. - - Inputs: - - **variable** (Parameter) - The `Parameter`. - - **value** (Tensor) - The value to be assigned. - - **depend** (Tensor) - The dependent tensor to keep this op connected in graph. - - Outputs: - Tensor, has the same type as original `variable`. - - Examples: - >>> def construct(self, x): - ... val = x - 1.0 - ... ret = x + 2.0 - ... return InplaceAssign()(x, val, ret) - >>> x = Tensor([2.0], mindspore.float32) - >>> net = Net() - >>> net(x) - """ - @prim_attr_register - def __init__(self): - self.init_prim_io_names(inputs=['x', 'y', 'z'], outputs=['output']) - - def infer_shape(self, x, y, z): - return z - - def infer_dtype(self, x, y, z): - return z - - def get_bprop(self): - def bprop(x, y, z, out, dout): - return (x, C.zeros_like(y), dout) - return bprop - - -class MaximumGrad(GraphKernel): - """ - - Backprop function for Maximum operator. - - Inputs: - - **x** (Tensor) - The first input tensor of maximum. - - **y** (Tensor) - The second input tensor of maximum. - - **dout** (Tensor) - has the same shape as x and y, next operator's backprop output. - - Outputs: - dx (Tensor): has the same shape as x and y, returns dout element if - `x >= y` returns true at the same position, or returns zero at that - position - dy (Tensor): has the same shape as x and y, dy = dout - dx - - Examples: - >>> layer = MaximumGrad() - >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6])) - """ - - def __init__(self, grad_x=True, grad_y=True): - super(MaximumGrad, self).__init__() - self.grad_x = grad_x - self.grad_y = grad_y - self.select = P.Select() - self.greater_equal = P.GreaterEqual() - self.zeros_like = P.ZerosLike() - self.sub = P.Sub() - - def construct(self, x, y, dout): - cmp_result = self.greater_equal(x, y) - dx = self.select(cmp_result, dout, self.zeros_like(dout)) - dy = dout - dx - - return dx, dy - - -class MinimumGrad(GraphKernel): - """ - Backprop function for Minimum operator. - - Compares x and y elementwise, dout must has the same shape with x and y. - - Inputs: - - **x** (Tensor) - The first input - - **y** (Tensor) - x and y must have same shape - - **dout** (Tensor) - Has the same shape as x and y, next operator's backprop output - - Outputs: - - dx (Tensor) - Has the same shape as x and y, returns dout element if - `x <= y` returns true at the same position, or returns zero at that - position - - dy (Tensor) - Has the same shape as x and y, dy = dout - dx - - Examples: - >>> layer = MinimumGrad() - >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6])) - """ - - def __init__(self, grad_x=True, grad_y=True): - super(MinimumGrad, self).__init__() - self.grad_x = grad_x - self.grad_y = grad_y - self.select = P.Select() - self.less_equal = P.LessEqual() - self.zeros_like = P.ZerosLike() - self.sub = P.Sub() - - def construct(self, x, y, dout): - cmp_result = self.less_equal(x, y) - dx = self.select(cmp_result, dout, self.zeros_like(dout)) - dy = dout - dx - - return dx, dy - - -class AbsGrad(GraphKernel): - """ - Abs's backprop function. - - Inputs: - **input_x** (Tensor) - input data of this operator. - **dout** (Tensor) - output of the next operator's backprop function. - - Outputs: - Tensor, has the same shape as input_x. - - Examples: - >>> back = AbsGrad() - >>> output = back(Tensor([1, 2, 3]), Tensor([4, 5, 6])) - """ - - def __init__(self): - super(AbsGrad, self).__init__() - self.mul = P.Mul() - self.abs = P.Abs() - self.add = P.TensorAdd() - self.div = P.RealDiv() - self.round = P.Round() - - def construct(self, input_x, dout): - NUM_MAX = 32768 - mul_max = self.mul(input_x, P.Fill()(P.DType()(input_x), (1,), NUM_MAX)) - res_abs = self.abs(mul_max) - res_div = self.div(mul_max, res_abs) - res_round = self.round(res_div) - res = self.mul(res_round, dout) - return res - - -class ApplyMomentum(GraphKernel): - """ - Update parameter according to the ApplyMomentum algorithm. - - Inputs: - variable (Tensor): mutable tensor var - accumulation (Tensor): mutable tensor accum - learning_rate (float32): learning rate - gradient (float32): The gradient - momentum (float32): Momentum - - Outputs: updated accumulation and variable - """ - - def __init__(self, - use_nesterov=False, - use_locking=False, - gradient_scale=1.0): - super(ApplyMomentum, self).__init__() - self.gradient_scale = validator.check_value_type('gradient_scale', gradient_scale, [float], type(self).__name__) - self.fake_output_assign_1 = InplaceAssign() - self.fake_output_assign_1.add_prim_attr("fake_output", True) - self.fake_output_assign_2 = InplaceAssign() - self.fake_output_assign_2.add_prim_attr("fake_output", True) - - def construct(self, variable, accumulation, learning_rate, gradient, momentum): - gradient = gradient * self.gradient_scale - momt_accumulation = accumulation * momentum - accumulation_inplace = momt_accumulation + gradient - - sum_gradient = accumulation_inplace * learning_rate - variable_inplace = variable - sum_gradient - - accumulation_inplace = self.fake_output_assign_1(accumulation, accumulation_inplace, accumulation_inplace) - variable_inplace = self.fake_output_assign_2(variable, variable_inplace, variable_inplace) - return accumulation_inplace, variable_inplace - - -class BiasAdd(GraphKernel): - """ - Return the sum of x and bias. - - Inputs: - x (Tensor): Tensor of input data. - bias (Tensor): The bias tensor. - - Output: - Tensor, the sum of x and bias. - - Example: - >>> layer = BiasAdd() - >>> output = layer(Tensor([1, 2, 3]), Tensor([1,])) - """ - - def __init__(self): - super(BiasAdd, self).__init__() - - def construct(self, x, bias): - shape = P.Shape()(x) - if len(shape) == 4: - bias_shape = (1, P.Shape()(bias)[0], 1, 1) # NCHW - else: - bias_shape = (1, P.Shape()(bias)[0]) - res = x + P.Reshape()(bias, bias_shape) - return res - -class BiasAddGrad(GraphKernel): - """ - Computes gradients of BiasAdd. - - Inputs: - x (Tensor): the gradients of bias add output. - - Output: - Tensor, the gradients of bias add input. - - Examples: - >>> dout = Tensor(np.ones(shape=[1, 2, 3, 4]), mindspore.float32) - >>> bias_add_grad = BiasAddGrad() - >>> dx = bias_add_grad(dout) - """ - def __init__(self): - super(BiasAddGrad, self).__init__() - - def construct(self, x): - shape_x = P.Shape()(x) - reduce_axis = [0] - for i in range(2, len(shape_x)): - reduce_axis.append(i) - - res = P.ReduceSum()(x, reduce_axis) - return res - - -class EqualCount(GraphKernel): - """ - Computes the number of the same elements of two tensors. - - The two input tensors must have the same shape and data type. - - Inputs: - x (Tensor): the first input tensor. - y (Tensor): the second input tensor. - - Outputs: - Tensor, the type is same as input tensor and size as (1,). - - Examples: - >>> x = Tensor(np.array([1, 2, 3]), mindspore.int32) - >>> y = Tensor(np.array([1, 2, 4]), mindspore.int32) - >>> equal_count = EqualCount() - >>> print(equal_count(x, y)) - 2 - """ - def __init__(self): - super(EqualCount, self).__init__() - - def construct(self, x, y): - equal_bool = P.Equal()(P.Cast()(x, mstype.float32), P.Cast()(y, mstype.float32)) - equal_count = P.Cast()(equal_bool, mstype.float16) - - axes = (0,) - res = P.ReduceSum()(equal_count, axes) - res = P.Cast()(res, P.DType()(x)) - return res - - -class ReduceMean(GraphKernel): - """ - Reduce a dimension of a tensor by averaging all elements in the dimension. - - The dtype of the tensor to be reduced is number. - - Args: - keep_dims (bool): If true, keep these reduced dimensions and the length is 1. - If false, don't keep these dimensions. Default: False. - - Inputs: - - **input_x** (Tensor[Number]) - The input tensor. - - **axis** (Union[int, tuple(int), list(int)]) - The dimensions to reduce. Default: (), reduce all dimensions. - Only constant value is allowed. - - Outputs: - Tensor, has the same dtype as the `input_x`. - - - If axis is (), and keep_dims is False, - the output is a 0-D tensor representing the sum of all elements in the input tensor. - - If axis is int, set as 2, and keep_dims is False, - the shape of output is :math:`(x_1, x_3, ..., x_R)`. - - If axis is tuple(int), set as (2, 3), and keep_dims is False, - the shape of output is :math:`(x_1, x_4, ..., x_R)`. - - Examples: - >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> op = ReduceMean(keep_dims=True) - >>> output = op(input_x, 1) - """ - - def __init__(self, keep_dims=True): - super(ReduceMean, self).__init__() - self.keep_dims = validator.check_value_type('keep_dims', keep_dims, [bool], type(self).__name__) - self.sum = P.ReduceSum(self.keep_dims) - - def construct(self, x, axis): - shape = P.Shape()(x) - value_num = 1 - for i in axis: - value_num *= shape[i] - - data_sum = self.sum(x, axis) - avg = 1.0 / P.Fill()(P.DType()(x), (1,), value_num) - res = data_sum * avg - return res - - -class ReLU(GraphKernel): - r""" - Computes ReLU(Rectified Linear Unit) of input tensor element-wise. - - It returns :math:`\max(x,\ 0)` element-wise. - - Inputs: - - **input_x** (Tensor) - The input tensor. - - Outputs: - Tensor, with the same type and shape as the `input_x`. - - Examples: - >>> input_x = Tensor(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]]), mindspore.float32) - >>> relu = ReLU() - >>> result = relu(input_x) - >>> print(result) - [[0. 4. 0.] - [2. 0. 9.]] - """ - def __init__(self): - super(ReLU, self).__init__() - self.max = P.Maximum() - - def construct(self, x): - return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x) - - -class SoftmaxCrossEntropyWithLogits(GraphKernel): - r""" - Gets the softmax cross-entropy value between logits and labels which shoule be one-hot encoding. - - Note: - Sets input logits as `X`, input label as `Y`, output as `loss`. Then, - - .. math:: - p_{ij} = softmax(X_{ij}) = \frac{\exp(x_i)}{\sum_{j = 0}^{N-1}\exp(x_j)} - - .. math:: - loss_{ij} = -\sum_j{Y_{ij} * ln(p_{ij})} - - Inputs: - - **logits** (Tensor) - Input logits, with shape :math:`(N, C)`. - - **labels** (Tensor) - Ground truth labels, with shape :math:`(N, C)`. - - Outputs: - Tuple of 2 Tensors, the loss shape is `(N,)`, and the dlogits with the same shape as `logits`. - - Examples: - >>> logits = Tensor([[2, 4, 1, 4, 5], [2, 1, 2, 4, 3]], mindspore.float32) - >>> labels = Tensor([[0, 0, 0, 0, 1], [0, 0, 0, 1, 0]], mindspore.float32) - >>> softmax_cross = SoftmaxCrossEntropyWithLogits() - >>> loss, backprop = softmax_cross(logits, labels) - """ - - def __init__(self): - super(SoftmaxCrossEntropyWithLogits, self).__init__() - self.max = P.ReduceMax(keep_dims=True) - self.sum_keep_dims = P.ReduceSum(keep_dims=True) - - def construct(self, features, labels): - data_max = self.max(features, (1,)) - data_sub = features - data_max - data_exp = P.Exp()(data_sub) - data_sum = self.sum_keep_dims(data_exp, (1,)) - data_div = data_exp / data_sum - data_log_tmp = P.Log()(data_sum) - data_log = data_sub - data_log_tmp - data_mul = labels * data_log - data_muls = P.Neg()(data_mul) - loss = P.ReduceSum()(data_muls, (1,)) - backprop = data_div - labels - return loss, backprop - - def bprop(self, features, labels, out, dout): - grad = out[1] - grad = grad * P.ExpandDims()(dout[0], -1) - return grad, P.ZerosLike()(labels) - - -class LayerNormForward(GraphKernel): - """ Forward function of the LayerNorm operator. """ - def __init__(self, begin_norm_axis=1, begin_params_axis=1): - super(LayerNormForward, self).__init__() - self.begin_norm_axis = validator.check_value_type('begin_norm_axis', begin_norm_axis, [int], - type(self).__name__) - self.begin_params_axis = validator.check_value_type('begin_params_axis', begin_params_axis, [int], - type(self).__name__) - self.mul = P.Mul() - self.sum_keep_dims = P.ReduceSum(keep_dims=True) - self.sub = P.Sub() - self.add = P.TensorAdd() - self.log = P.Log() - self.exp = P.Exp() - self.eps = P.Eps() - - def construct(self, input_x, input_gamma, input_beta): - shape_x = P.Shape()(input_x) - - # Calculate the scaling ratio of the average - begin_norm_axis = self.begin_norm_axis - if begin_norm_axis < 0: - begin_norm_axis += len(shape_x) - reduce_axis = () - for i in range(len(shape_x)): - if i > begin_norm_axis or i == begin_norm_axis: - reduce_axis = reduce_axis + (i,) - - reduce_elts = 1.0 - for i in reduce_axis: - reduce_elts *= shape_x[i] - mean_cof = 1.0 / reduce_elts - - # Calculate mean - mean_muls = self.mul(input_x, mean_cof) - mean = self.sum_keep_dims(mean_muls, reduce_axis) - - # Calculate variance - variance_sub = self.sub(input_x, mean) - variance_mul = self.mul(variance_sub, variance_sub) - variance_muls = self.mul(variance_mul, mean_cof) - variance = self.sum_keep_dims(variance_muls, reduce_axis) - - # Calculate normalize - normalize_sub = self.sub(input_x, mean) - epsilon = self.eps(input_x) - normalize_add = self.add(variance, epsilon) - normalize_log = self.log(normalize_add) - normalize_log_mul = self.mul(normalize_log, -0.5) - normalize_exp = self.exp(normalize_log_mul) - normalize_mul = self.mul(normalize_sub, normalize_exp) - - # Calculate scale and translate - if self.begin_params_axis == 0: - scale_mul = self.mul(input_gamma, normalize_mul) - res = self.add(scale_mul, input_beta) - else: - scale_mul = self.mul(input_gamma, normalize_mul) - res = self.add(scale_mul, input_beta) - - return res, mean, variance - - -class LayerNormXBackprop(GraphKernel): - r""" - Together with LayerNormBetaGammaBackprop, to supply the backprop - functionality for LayerNorm. - - Note: - Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`, - input_gamma as :math:`\gamma`. Then, - .. math:: - \begin{array}{ll} \\ - \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\ - \frac {\partial L} {\partial x_i} = - \frac{\gamma}{\sqrt{\sigma^2+\epsilon}} - ( \frac{\partial L}{\partial y_i} - - \frac{1}{m} \cdot \frac{\partial L}{\partial \beta} - - \frac{\hat{x_i}}{m} \cdot \frac{\partial L}{\partial \gamma}) - \end{array} - - Inputs: - - **dy**(Tensor) - The first item of the next operator's backprop's output. - - **input_x**(Tensor) - The first input of the forward function of LayerNorm. - - **variance**(Tensor) - The second input of the forward function of LayerNorm. - - **mean**(Tensor) - The third input of the forward function of LayerNorm. - - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm. - - Outputs: - Tensor, the output of this operator, will be used as the first item of the result of - LayerNorm's backprop function, has the same shape and data type as 'input_x'. - - Examples: - >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> op = LayerNormXBackprop(keep_dims=False) - >>> output = op(dy, input_x, variance, mean, input_gamma) - """ - - def __init__(self): - super(LayerNormXBackprop, self).__init__() - self.sum_keep_dims = P.ReduceSum(keep_dims=True) - self.log = P.Log() - self.exp = P.Exp() - self.eps = P.Eps() - - def construct(self, dy, input_x, variance, mean, input_gamma): - shape_x = P.Shape()(input_x) - shape_mean = P.Shape()(mean) - reduce_axis = () - flag = -1 - min_l = 0 - if len(shape_x) > len(shape_mean): - min_l = len(shape_x) - else: - min_l = len(shape_mean) - for i in range(min_l): - if (shape_x[i] != shape_mean[i]) and (flag == -1): - flag = i - if flag != -1: - for i in range(flag, len(shape_x)): - reduce_axis = reduce_axis + (i,) - else: - reduce_axis = reduce_axis + (len(shape_x) - 1,) - mean_num = 1.0 - for i in reduce_axis: - mean_num *= shape_x[i] - pd_xl = input_gamma * dy - epsilon = self.eps(input_x) - var_elta = variance + epsilon - var_elta_log = self.log(var_elta) - var_elta_mul = var_elta_log * -0.5 - var_elta_2 = P.Exp()(var_elta_mul) - pdvar1_mul = var_elta_2 * var_elta_2 - pd_var_1 = pdvar1_mul * var_elta_2 - sub_x_mean = input_x - mean - pdvar_mul1 = pd_xl * sub_x_mean - pdvar_sum = self.sum_keep_dims(pdvar_mul1, reduce_axis) - pdvar_mul3 = pdvar_sum * pd_var_1 - pd_var = pdvar_mul3 * -0.5 - pdmean1_sum = self.sum_keep_dims(pd_xl, reduce_axis) - pdmean1_mul = pdmean1_sum * var_elta_2 - pd_mean_1 = pdmean1_mul * -1.0 - pdmean2_mul1 = sub_x_mean * -2.0 - pdmean2_sum = self.sum_keep_dims(pdmean2_mul1, reduce_axis) - pdmean2_mul3 = pdmean2_sum * (1.0 / mean_num) - pd_mean_2 = pd_var * pdmean2_mul3 - pd_mean = pd_mean_2 + pd_mean_1 - pd_x_1 = var_elta_2 * pd_xl - pdx2_mul = pd_var * sub_x_mean - pd_x_2 = pdx2_mul * (2.0 * (1.0 / mean_num)) - pd_x_3 = pd_mean * (1.0 / mean_num) - pdx_add = pd_x_1 + pd_x_2 - pd_x = pdx_add + pd_x_3 - return pd_x - - -class LayerNormBetaGammaBackprop(GraphKernel): - r""" - Together with LayerNormXBackprop, to supply the backprop functionality for - LayerNorm. - Note: - Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`, - input_gamma as :math:`\gamma`. Then, - .. math:: - \begin{array}{ll} \\ - \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\ - \frac {\partial L} {\partial \beta} = - \sum_{i=1}^m \\frac{\\partial L}{\partial y_i} \\ - \frac {\partial L} {\partial \gamma} = - \sum_{i=1}^m \\frac{\partial L}{\partial y_i} \cdot \hat{x_i} - \end{array} - - Inputs: - - **dy**(Tensor) - The first item of the next operator's backprop's output. - - **input_x**(Tensor) - The first input of the forward function of LayerNorm. - - **variance**(Tensor) - The second input of the forward function of LayerNorm. - - **mean**(Tensor) - The third input of the forward function of LayerNorm. - - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm. - - Outputs: - Tuple of 2 Tensors, the backprop outputs. - - - **pd_beta**(Tensor) - The first item of return value of this operator, will be used as - the second item of the LayerNorm's backprop function. - - **pd_gamma**(Tensor) - The second item of return value of this operator, will be used as - the third item of the LayerNorm's backprop function. - - Examples: - >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) - >>> op = LayerNormBetaGammaBackprop(keep_dims=False) - >>> pd_beta, pd_gamma = op(dy, input_x, variance, mean, input_gamma) - """ - def __init__(self): - super(LayerNormBetaGammaBackprop, self).__init__() - self.sum_not_keep_dims = P.ReduceSum(keep_dims=False) - self.log = P.Log() - self.exp = P.Exp() - self.eps = P.Eps() - - def construct(self, dy, input_x, variance, mean, shape_gamma): - shape_x = P.Shape()(input_x) - params_axis = () - - if len(shape_x) != len(shape_gamma): - sub = len(shape_x) - len(shape_gamma) - for i in range(sub): - params_axis = params_axis + (i,) - - pd_beta = self.sum_not_keep_dims(dy, params_axis) - epsilon = self.eps(input_x) - var_elta = variance + epsilon - var_elta_log = self.log(var_elta) - var_elta_mul = var_elta_log * -0.5 - var_elta_2 = P.Exp()(var_elta_mul) - sub_x_mean = input_x - mean - var_elta_2_cast = var_elta_2 - xl_mul = var_elta_2_cast * sub_x_mean - pdga_mul = dy * xl_mul - pd_gamma = self.sum_not_keep_dims(pdga_mul, params_axis) - return pd_beta, pd_gamma - - -class LogSoftmax(GraphKernel): - r""" - Log Softmax activation function. - - Applies the Log Softmax function to the input tensor on the specified axis. - Suppose a slice in the given aixs :math:`x` then for each element :math:`x_i` - the Log Softmax function is shown as follows: - - .. math:: - \text{output}(x_i) = \log \left(\frac{\exp(x_i)} {\sum_{j = 0}^{N-1}\exp(x_j)}\right), - - where :math:`N` is the length of the Tensor. - - Args: - axis (int): The axis to do the Log softmax operation. Default: -1. - - Inputs: - - **logits** (Tensor) - The input of Log Softmax. - - Outputs: - Tensor, with the same type and shape as the logits. - - Examples: - >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32) - >>> log_softmax = LogSoftmax() - >>> result = log_softmax(input_x) - >>> print(result) - [-4.4519143 -3.4519143 -2.4519143 -1.4519144 -0.4519144] - """ - - def __init__(self, axis=-1): - super(LogSoftmax, self).__init__() - self.axis = validator.check_value_type('axis', axis, [int], type(self).__name__) - self.max_keep_dims = P.ReduceMax(keep_dims=True) - self.sub = P.Sub() - self.exp = P.Exp() - self.sum_keep_dims = P.ReduceSum(keep_dims=True) - self.log = P.Log() - self.mul = P.Mul() - - def construct(self, input_x): - data_max = self.max_keep_dims(input_x, (self.axis,)) - data_sub = self.sub(input_x, data_max) - - data_exp = self.exp(data_sub) - data_sum = self.sum_keep_dims(data_exp, (self.axis,)) - data_log = self.log(data_sum) - - res = self.sub(data_sub, data_log) - return res - - def bprop(self, input_x, out, dout): - input_x = out - input_dy = dout - - data_exp = self.exp(input_x) - data_sum = self.sum_keep_dims(input_dy, (self.axis,)) - data_softmax = self.mul(data_exp, data_sum) - - res = self.sub(input_dy, data_softmax) - return (res,) - - -class Tanh(GraphKernel): - r""" - Tanh activation function. - - Computes hyperbolic tangent of input element-wise. The Tanh function is defined as: - - .. math:: - tanh(x_i) = \frac{\exp(x_i) - \exp(-x_i)}{\exp(x_i) + \exp(-x_i)} = \frac{\exp(2x_i) - 1}{\exp(2x_i) + 1}, - - where :math:`x_i` is an element of the input Tensor. - - Inputs: - - **input_x** (Tensor) - The input of Tanh. - - Outputs: - Tensor, with the same type and shape as the input_x. - - Examples: - >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32) - >>> tanh = Tanh() - >>> result = tanh(input_x) - >>> print(result) - [0.7615941 0.9640276 0.9950548 0.9993293 0.99990916] - """ - def __init__(self): - super(Tanh, self).__init__() - self.abs = P.Abs() - self.add = P.TensorAdd() - self.div = P.RealDiv() - self.mul = P.Mul() - self.mul_fp16 = P.Mul() - self.mul_fp16.add_prim_attr("output_precision", "float16") - self.exp = P.Exp() - - def construct(self, input_x): - input_abs = self.abs(input_x) - sign_flag = self.div(input_x, input_abs) - sign_flag_neg = self.mul(sign_flag, -1.0) - - power_val = self.mul(input_abs, -2.0) - exp_val = self.exp(power_val) - up_val = self.add(exp_val, -1.0) - down_val = self.add(exp_val, 1.0) - - div_val = self.div(up_val, down_val) - res = self.mul(sign_flag_neg, div_val) - return res - - def bprop(self, input_x, out, dout): - input_y = out - input_dy = dout - - data_square = self.mul(input_y, input_y) - data_mul = self.mul(data_square, -1.0) - anuminate = self.add(data_mul, 1.0) - res = self.mul_fp16(anuminate, input_dy) - - return (res,) - -class TanhGrad(GraphKernel): - """ - Backprop function of Tanh - - Mathematical calculating: - result = Tanh(out) - result = 1 - result * result - result = result * dout - Inputs: - out (Tensor): Tanh's output - dout (Tensor): next layer's backward function's output, has same shape as out - - Outputs: - result (Tensor): result of (1 - tanh(out)^2) * dout - - Examples: - >>> x_np = np.random.randn(5, 3, 6).astype(np.float16) - >>> dy_np = np.random.randn(5, 3, 6).astype(np.float16) - >>> x_ms = Tensor(x_np) - >>> dy_ms = Tensor(dy_np) - >>> tanh_grad = TanhGrad() - >>> out = tanh_grad(x_np, dy_np) - """ - def __init__(self): - super(TanhGrad, self).__init__() - self.add = P.TensorAdd() - self.mul = P.Mul() - self.mul_fp16 = P.Mul() - self.mul_fp16.add_prim_attr("output_precision", "float16") - - def construct(self, out, dout): - input_y = out - input_dy = dout - - data_square = self.mul(input_y, input_y) - data_mul = self.mul(data_square, -1.0) - anuminate = self.add(data_mul, 1.0) - res = self.mul_fp16(anuminate, input_dy) - - return res - -class Gelu(GraphKernel): - r""" - Gaussian Error Linear Units activation function. - - GeLU is described in the paper `Gaussian Error Linear Units (GELUs) `_. - And also please refer to `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. - `_. - - Defined as follows: - - .. math:: - \text{output} = 0.5 * x * (1 + erf(x / \sqrt{2})), - - where :math:`erf` is the "Gauss error function" . - - Inputs: - - **input_x** (Tensor) - Input to compute the Gelu. - - Outputs: - Tensor, with the same type and shape as input. - - Examples: - >>> tensor = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32) - >>> gelu = Gelu() - >>> result = gelu(tensor) - """ - - def __init__(self): - super(Gelu, self).__init__() - self.add = P.TensorAdd() - self.abs = P.Abs() - self.exp = P.Exp() - self.neg = P.Neg() - self.minimum = P.Minimum() - self.div = P.RealDiv() - self.mul = P.Mul() - self.CSVALUE = 0.044715 - self.CSVALUE_A = 1.59576912 - self.CSVALUE_5 = 0.3989422804 - self.CSVALUE_3B = 0.2140644488 - - def construct(self, input_x): - def _tanh_parameter_compute(data_x): - """ - compute the parameter of tanh: - return: result equal (x+0.044715*tf.pow(x,3)) - """ - mul_0 = self.mul(data_x, data_x) - pow_0 = self.mul(mul_0, data_x) - mul_1 = self.mul(pow_0, self.CSVALUE) - result = self.add(data_x, mul_1) - - return result - - tanh_parameter = _tanh_parameter_compute(input_x) - mul_0 = self.mul(tanh_parameter, 1.5957691) - - mul_0_min = self.minimum(mul_0, 0.0) - right_mul = self.exp(mul_0_min) - - mul_0_abs = self.abs(mul_0) - mul_0_abs_neg = self.mul(mul_0_abs, -1.0) - mul_0_abs_neg_exp = self.exp(mul_0_abs_neg) - - mul_0_abs_neg_exp_add = self.add(mul_0_abs_neg_exp, 1.0) - left_mul = self.div(input_x, mul_0_abs_neg_exp_add) - - result = self.mul(left_mul, right_mul) - return result - - def bprop(self, input_x, out, dout): - """ register backprop function for Gelu """ - data_x = input_x - data_gelu = out - data_dy = dout - - def _math_four_compute(data_x): - """ - Return: - math_four equal 2*(np(sqrt(2 / np.pi)*(x + 0.044715*tf.pow(x, 3))) - """ - datax_pow = data_x * data_x * data_x - datax_muls_c = self.mul(datax_pow, self.CSVALUE) - datax_addx = self.add(datax_muls_c, data_x) - datax_muls_s = self.mul(datax_addx, self.CSVALUE_A) - - return datax_muls_s - - # common part - math_four = _math_four_compute(data_x) - math_four_abs = self.abs(math_four) - math_four_abs_neg = self.mul(math_four_abs, -1.0) - math_four_abs_neg_exp = self.exp(math_four_abs_neg) - math_four_min = self.minimum(math_four, 0.0) - - # dividend part - datax_pow = self.mul(data_x, data_x) - datax_pow_mul = self.mul(datax_pow, self.CSVALUE_3B) - datax_pow_mul_add = self.add(datax_pow_mul, self.CSVALUE_A) - data_gelu_mul = self.mul(data_gelu, datax_pow_mul_add) - math_four_min_2 = self.mul(math_four_min, 2.0) - div_right = self.mul(data_gelu_mul, math_four_abs_neg_exp) - div_left = self.exp(math_four_min_2) - dividend = self.add(div_left, div_right) - - # divisor part - div_0 = self.add(math_four_abs_neg_exp, 1.0) - div_1 = self.exp(math_four_min) - divisor = self.mul(div_1, div_0) - res_grad = self.div(dividend, divisor) - - result = self.mul(res_grad, data_dy) - return (result,) - - -class Softmax(GraphKernel): - """ - Operator Softmax - .. math: `exp(x-max(x)) / sum(exp(x-max(x)))` - - Args: - axis (int, tuple): Axis along which the softmax normalization is applied - - Inputs: - x (Tensor): input data for softmax - - Outputs: - output (Tensor): a tensor with the same shape of the input - - Examples: - >>> layer = Softmax(1) - >>> x = Tensor(np.array([1.2, 2.1], [2.2, 3.2]), mindspore.float32) - >>> output = layer(x) - """ - - def __init__(self, axis): - super(Softmax, self).__init__() - validator.check_value_type("axis", axis, [int, tuple], type(self).__name__) - if isinstance(axis, int): - self.axis = (axis,) - else: - self.axis = axis - for item in self.axis: - validator.check_value_type("item of axis", item, [int], type(self).__name__) - self.max = P.ReduceMax(keep_dims=True) - self.sub = P.Sub() - self.exp = P.Exp() - self.sum = P.ReduceSum(keep_dims=True) - self.mul = P.Mul() - - def construct(self, x): - max_x = self.max(x, self.axis) - data_sub = self.sub(x, max_x) - data_exp = self.exp(data_sub) - data_expsum = self.sum(data_exp, self.axis) - output = data_exp / data_expsum - return output - - def bprop(self, x, out, dout): - mul_res = self.mul(dout, out) - sum_res = self.sum(mul_res, self.axis) - sub_res = self.sub(dout, sum_res) - res = self.mul(sub_res, out) - return (res,) - - -class LayerNorm(Cell): - r""" - Applies Layer Normalization over a mini-batch of inputs. - - Layer normalization is widely used in recurrent neural networks. It applies - normalization on a mini-batch of inputs for each single training case as described - in the paper `Layer Normalization `_. Unlike batch - normalization, layer normalization performs exactly the same computation at training and - testing time. It can be described using the following formula. It is applied across all channels - and pixel but only one batch size. - - .. math:: - y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta - - Args: - normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis - `begin_norm_axis ... R - 1`. - begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions - `begin_norm_axis: rank(inputs)`, the value must be in [-1, rank(input)). Default: -1. - begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters - will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with - the normalized inputs accordingly, the value must be in [-1, rank(input)). Default: -1. - gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight. - The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', - 'he_uniform', etc. Default: 'ones'. - beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight. - The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', - 'he_uniform', etc. Default: 'zeros'. - - Inputs: - - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`, - and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`. - - Outputs: - Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`. - - Examples: - >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32) - >>> shape1 = x.shape[1:] - >>> m = G.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) - >>> m(x) - """ - - def __init__(self, - begin_norm_axis=-1, - begin_params_axis=-1 - ): - super(LayerNorm, self).__init__() - self.begin_norm_axis = begin_norm_axis - self.begin_params_axis = begin_params_axis - self.layer_norm = LayerNormForward(begin_norm_axis, begin_params_axis) - self.layer_norm_x_grad = LayerNormXBackprop() - self.layer_norm_beta_gamma = LayerNormBetaGammaBackprop() - self.layer_norm_grad = G.LayerNormGrad(self.begin_norm_axis, self.begin_params_axis) - - def construct(self, input_x, input_gamma, input_beta): - return self.layer_norm(input_x, input_gamma, input_beta) - - # case 1 - def bprop(self, input_x, input_gamma, input_beta, out, dout): - dx, d_gamma, d_beta = self.layer_norm_grad(input_x, dout[0], out[2], dout[1], input_gamma) - return dx, d_gamma, d_beta - - -class LambUpdateWithLR(GraphKernel): - r""" - Part of Lamb optimizer. - - .. math:: - s_1 = select(i_1 \gt y_g, select(i_0 \gt y_g, \frac{i_1}{i_2}, se), se) - i_5 = i_5 - max(min(s_1, y_m), y_g) \times i_3 \times i_4 - - Inputs: - - **input0** (Tensor) - The first tensor to be computed. - - **input1** (Tensor) - The second tensor to be computed. - - **input2** (Tensor) - The third tensor to be computed. - - **input3** (Tensor) - The fourth tensor to be computed. - - **input4** (Tensor) - The fifth tensor to be computed. - - **input5** (Tensor) - The sixth tensor to be computed. It will be updated by result. - - **greater_y** (Tensor) - The seventh tensor to be computed. - - **select_e** (Tensor) - The eighth tensor to be computed. - - **minimum_y** (Tensor) - The ninth tensor to be computed. - - Outputs: - A fake output tensor. - - Examples: - >>> lamb_update = LambUpdateWithLR() - >>> i0 = np.random.normal(0, 1, [1, 16]).astype(np.float32) - >>> i1 = np.random.normal(0, 1, [1]).astype(np.float32) - >>> i2 = np.random.normal(0, 1, [1]).astype(np.float32) - >>> i3 = np.random.normal(0, 1, [1]).astype(np.float32) - >>> i4 = np.random.normal(0, 1, [1, 16]).astype(np.float32) - >>> i5 = np.random.normal(0, 1, [1, 16]).astype(np.float32) - >>> yg = np.random.normal(0, 1, [1]).astype(np.float32) - >>> se = np.random.normal(0, 1, [1]).astype(np.float32) - >>> ym = np.random.normal(0, 1, [1]).astype(np.float32) - >>> lamb_update(i0, i1, i2, i3, i4, i5, yg, se, ym) - - """ - - def __init__(self): - super(LambUpdateWithLR, self).__init__() - self.greater = P.Greater() - self.select = P.Select() - self.div = P.RealDiv() - self.min = P.Minimum() - self.max = P.Maximum() - self.mul = P.Mul() - self.sub = P.Sub() - self.fake_output_assign = InplaceAssign() - self.fake_output_assign.add_prim_attr("fake_output", True) - - def construct(self, input0, input1, input2, input3, input4, input5, greater_y, select_e, minimum_y): - greater0 = self.greater(input0, greater_y) - greater1 = self.greater(input1, greater_y) - real_div0 = self.div(input1, input2) - select0 = self.select(greater0, real_div0, select_e) - select1 = self.select(greater1, select0, select_e) - min0 = self.min(select1, minimum_y) - max0 = self.max(min0, greater_y) - mul0 = self.mul(max0, input3) - mul1 = self.mul(mul0, input4) - sub0 = self.sub(input5, mul1) - sub0 = self.fake_output_assign(input5, sub0, sub0) - return sub0 - -class LambNextMV(GraphKernel): - r""" - Part of Lamb optimizer. - - .. math:: - rd_0 = \frac{i_8 \times i_5 + i_9 \times i_4}{i6} - rd_1 = \frac{x_0 \times i_2 + x_1 \times i_1}{i3} - y_2 = \frac{rd_0}{\sqrt{rd_1 + x3}} + x_2 \times i_7 - y_3 = \frac{rd_0}{\sqrt{rd_1} + x3} - i5 = i_8 \times i_5 + i_9 \times i_4 - i2 = x_0 \times i_2 + x_1 \times i_1 - - Inputs: - - **inputs1** (Tensor) - The first input tensor to be computed. - - **inputs2** (Tensor) - The second input tensor to be computed. It will be updated by result. - - **inputs3** (Tensor) - The third input tensor to be computed. - - **inputs4** (Tensor) - The fourth input tensor to be computed. - - **inputs5** (Tensor) - The fifth input tensor to be computed. It will be updated by result. - - **inputs6** (Tensor) - The sixth input tensor to be computed. - - **inputs7** (Tensor) - The seventh input tensor to be computed. - - **inputs8** (Tensor) - The eighth input tensor to be computed. - - **inputs9** (Tensor) - The ninth input tensor to be computed. - - **inputsx0** (Tensor) - The tenth input tensor to be computed. - - **inputsx1** (Tensor) - The eleventh input tensor to be computed. - - **inputsx2** (Tensor) - The twelfth input tensor to be computed. - - **inputsx3** (Tensor) - The thirteenth input tensor to be computed. - - Outputs: - Tuple of 2 Tensors. - - - **add3** (Tensor) - the shape is the same as the one after broadcasting, and the data type is - the one with higher precision or higher digits among the inputs. - - **realdiv4** (Tensor) - the shape is the same as the one after broadcasting, and the data type is - the one with higher precision or higher digits among the inputs. - - Examples: - >>> lamb_next_mv = LambNextMV() - >>> i1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> i2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> i3 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> i4 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> i5 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> i6 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> i7 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> i8 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> i9 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> x0 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> x1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> x2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) - >>> x3 = Tensor(np.ones([1, 16]).astype(np.float32) * 1e-6) - >>> lamb_next_mv(i1, i2, i3, i4, i5, i6, i7, i8, i9, x0, x1, x2, x3) - - """ - - def __init__(self): - super(LambNextMV, self).__init__() - self.mul = P.Mul() - self.add = P.TensorAdd() - self.div = P.RealDiv() - self.sqrt = P.Sqrt() - self.rsqrt = P.Rsqrt() - self.fake_output_assign_1 = InplaceAssign() - self.fake_output_assign_1.add_prim_attr("fake_output", False) - self.fake_output_assign_2 = InplaceAssign() - self.fake_output_assign_2.add_prim_attr("fake_output", False) - - - def construct(self, input1, input2, input3, input4, input5, input6, input7, - input8, input9, inputx0, inputx1, inputx2, inputx3): - mul3 = self.mul(inputx1, input1) - mul2 = self.mul(inputx0, input2) - add1 = self.add(mul2, mul3) - realdiv1 = self.div(add1, input3) - add2 = self.add(realdiv1, inputx3) - sqrt0 = self.rsqrt(add2) - sqrt1 = self.sqrt(realdiv1) - add4 = self.add(sqrt1, inputx3) - mul1 = self.mul(input9, input4) - mul0 = self.mul(input8, input5) - add0 = self.add(mul0, mul1) - realdiv0 = self.div(add0, input6) - realdiv2 = self.mul(realdiv0, sqrt0) - realdiv4 = self.div(realdiv0, add4) - mul4 = self.mul(inputx2, input7) - add3 = self.add(realdiv2, mul4) - - add3 = self.fake_output_assign_1(input5, add0, add3) - add3 = self.fake_output_assign_2(input2, add1, add3) - - return add3, realdiv4 diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index 997c5304d5..2b6b5f9d5b 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -26,7 +26,7 @@ from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from .optimizer import Optimizer from .. import layer -from .. import graph_kernels as G +from .. import _graph_kernels as G num_one = Tensor(np.ones([1]), mstype.float32) diff --git a/mindspore/ops/_selected_grad_ops.py b/mindspore/ops/_selected_grad_ops.py index 5da1d53abf..73a3f4af70 100644 --- a/mindspore/ops/_selected_grad_ops.py +++ b/mindspore/ops/_selected_grad_ops.py @@ -17,7 +17,7 @@ from mindspore.ops.op_selector import new_ops_selector op_selector = new_ops_selector( - "mindspore.ops.operations._grad_ops", "mindspore.nn.graph_kernels") + "mindspore.ops.operations._grad_ops", "mindspore.nn._graph_kernels") @op_selector diff --git a/mindspore/ops/_selected_ops.py b/mindspore/ops/_selected_ops.py index 2d816f58f8..dd5ee410e7 100644 --- a/mindspore/ops/_selected_ops.py +++ b/mindspore/ops/_selected_ops.py @@ -17,11 +17,11 @@ from mindspore.ops.op_selector import new_ops_selector op_selector = new_ops_selector( - "mindspore.ops.operations", "mindspore.nn.graph_kernels") + "mindspore.ops.operations", "mindspore.nn._graph_kernels") opt_selector = new_ops_selector( - "mindspore.nn.optim", "mindspore.nn.graph_kernels") + "mindspore.nn.optim", "mindspore.nn._graph_kernels") nn_selector = new_ops_selector( - "mindspore.nn", "mindspore.nn.graph_kernels") + "mindspore.nn", "mindspore.nn._graph_kernels") @nn_selector diff --git a/tests/st/ops/graph_kernel/test_fuse.py b/tests/st/ops/graph_kernel/test_fuse.py index 726e2cc16c..2168ca2546 100644 --- a/tests/st/ops/graph_kernel/test_fuse.py +++ b/tests/st/ops/graph_kernel/test_fuse.py @@ -19,7 +19,7 @@ import mindspore.context as context from mindspore import Tensor from mindspore.nn import Cell import mindspore.ops.operations as P -from mindspore.nn.graph_kernels import ReLU +from mindspore.ops.operations import _grad_ops as G class Net(Cell): @@ -28,41 +28,45 @@ class Net(Cell): self.add = P.TensorAdd() self.sub = P.Sub() self.mul = P.Mul() - self.relu = ReLU() + self.sqrt_grad = G.SqrtGrad() - def construct(self, x, y): + def construct(self, x, y, z): sub_res = self.sub(x, y) mul_res = self.mul(sub_res, x) - relu_res = self.relu(mul_res) - square_res = P.Square()(relu_res) - add_res = self.add(relu_res, square_res) + sqrt_grad_res = self.sqrt_grad(mul_res, z) + square_res = P.Square()(sqrt_grad_res) + add_res = self.add(sqrt_grad_res, square_res) add1_res = self.add(add_res, add_res) return self.add(add1_res, add1_res) +def get_output(i0, i1, i2, enable_graph_kernel=False): + if enable_graph_kernel: + context.set_context(enable_graph_kernel=True) + net = Net() + output = net(i0, i1, i2) + return output + + def test_basic(): - input_x = np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32) - input_y = np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32) - sub_res = input_x - input_y - mul_res = sub_res * input_x - relu_res = np.maximum(mul_res, 0) - square_res = np.square(relu_res) - add_res = relu_res + square_res - add1_res = add_res + add_res - expect = add1_res + add1_res + i0 = Tensor(np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32)) + i1 = Tensor(np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32)) + i2 = Tensor(np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32)) - net = Net() - result = net(Tensor(input_x), Tensor(input_y)) + expect = get_output(i0, i1, i2, False) + output = get_output(i0, i1, i2, True) + + expect_np = expect.asnumpy().copy() + output_np = output.asnumpy().copy() - res = np.allclose(expect, result.asnumpy(), rtol=1.e-4, atol=1.e-7, equal_nan=True) - assert res + assert np.allclose(expect_np, output_np, 1.e-4, 1.e-7) @pytest.mark.level0 @pytest.mark.platform_x86_gpu_training @pytest.mark.env_onecard def test_basic_gpu(): - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU") + context.set_context(mode=context.GRAPH_MODE, device_target="GPU") test_basic() @@ -71,5 +75,5 @@ def test_basic_gpu(): @pytest.mark.platform_x86_ascend_training @pytest.mark.env_onecard def test_basic_ascend(): - context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="Ascend") + context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") test_basic() diff --git a/tests/st/ops/graph_kernel/test_lamb.py b/tests/st/ops/graph_kernel/test_lamb.py index e3b4ef72eb..6d146553c0 100644 --- a/tests/st/ops/graph_kernel/test_lamb.py +++ b/tests/st/ops/graph_kernel/test_lamb.py @@ -18,7 +18,7 @@ import numpy as np import mindspore.context as context from mindspore import Tensor, Parameter from mindspore.nn import Cell -from mindspore.nn.graph_kernels import LambUpdateWithLR, LambNextMV +from mindspore.nn._graph_kernels import LambUpdateWithLR, LambNextMV class LambNet(Cell):