- # Copyright 2020 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- """
- Graph kernels. They are composites of basic primitives and can be compiled into
- a fused kernel automatically when context.set_context(enable_graph_kernel=True) is set.
- """
- from ...common import dtype as mstype
- from ...ops import operations as P
- from ...ops.primitive import PrimitiveWithInfer, prim_attr_register
- from ...ops.composite import multitype_ops as C
- from ...ops.operations import _grad_ops as G
- from ..._checkparam import ParamValidator as validator
- from ..cell import Cell, GraphKernel
-
-
- class InplaceAssign(PrimitiveWithInfer):
- """
- Inplace assign `Parameter` with a value.
-
-     This primitive can only be used inside a graph kernel.
-
- Inputs:
- - **variable** (Parameter) - The `Parameter`.
- - **value** (Tensor) - The value to be assigned.
-         - **depend** (Tensor) - The dependent tensor that keeps this operator connected in the graph.
-
- Outputs:
- Tensor, has the same type as original `variable`.
-
-     Examples:
-         >>> class Net(nn.Cell):
-         >>>     def construct(self, x):
-         >>>         val = x - 1.0
-         >>>         ret = x + 2.0
-         >>>         return InplaceAssign()(x, val, ret)
-         >>> x = Tensor([2.0], mindspore.float32)
-         >>> net = Net()
-         >>> net(x)
- """
- @prim_attr_register
- def __init__(self):
- self.init_prim_io_names(inputs=['x', 'y', 'z'], outputs=['output'])
-
- def infer_shape(self, x, y, z):
- return z
-
- def infer_dtype(self, x, y, z):
- return z
-
- def get_bprop(self):
- def bprop(x, y, z, out, dout):
- return (x, C.zeros_like(y), dout)
- return bprop
-
-
- class MaximumGrad(GraphKernel):
- """
- Backprop function for Maximum operator.
-
- Inputs:
- - **x** (Tensor) - The first input tensor of maximum.
- - **y** (Tensor) - The second input tensor of maximum.
-         - **dout** (Tensor) - Has the same shape as `x` and `y`; the backprop output of the next operator.
-
- Outputs:
-         - **dx** (Tensor) - Has the same shape as `x` and `y`; takes the value of `dout` where
-           `x >= y` holds and zero elsewhere.
-         - **dy** (Tensor) - Has the same shape as `x` and `y`; `dy = dout - dx`.
-
- Examples:
- >>> layer = MaximumGrad()
-         >>> output = layer(Tensor([1, 2, 3]), Tensor([3, 2, 1]), Tensor([4, 5, 6]))
- """
-
- def __init__(self, grad_x=True, grad_y=True):
- super(MaximumGrad, self).__init__()
- self.grad_x = grad_x
- self.grad_y = grad_y
- self.select = P.Select()
- self.greater_equal = P.GreaterEqual()
- self.zeros_like = P.ZerosLike()
- self.sub = P.Sub()
-
- def construct(self, x, y, dout):
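-         # dout flows to dx where x >= y (ties included); dy takes the remainder, so dx + dy == dout.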
- cmp_result = self.greater_equal(x, y)
- dx = self.select(cmp_result, dout, self.zeros_like(dout))
- dy = dout - dx
-
- return dx, dy
-
-
- class MinimumGrad(GraphKernel):
- """
- Backprop function for Minimum operator.
-
-     Compares `x` and `y` element-wise; `dout` should have the same shape as `x` and `y`.
-
- Inputs:
-         - **x** (Tensor) - The first input tensor.
-         - **y** (Tensor) - The second input tensor; `x` and `y` should have the same shape.
-         - **dout** (Tensor) - Has the same shape as `x` and `y`; the backprop output of the next operator.
-
- Outputs:
-         - **dx** (Tensor) - Has the same shape as `x` and `y`; takes the value of `dout` where
-           `x <= y` holds and zero elsewhere.
-         - **dy** (Tensor) - Has the same shape as `x` and `y`; `dy = dout - dx`.
-
- Examples:
- >>> layer = MinimumGrad()
-         >>> output = layer(Tensor([1, 2, 3]), Tensor([3, 2, 1]), Tensor([4, 5, 6]))
- """
-
- def __init__(self, grad_x=True, grad_y=True):
- super(MinimumGrad, self).__init__()
- self.grad_x = grad_x
- self.grad_y = grad_y
- self.select = P.Select()
- self.less_equal = P.LessEqual()
- self.zeros_like = P.ZerosLike()
- self.sub = P.Sub()
-
- def construct(self, x, y, dout):
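-         # dout flows to dx where x <= y (ties included); dy takes the remainder, so dx + dy == dout.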
- cmp_result = self.less_equal(x, y)
- dx = self.select(cmp_result, dout, self.zeros_like(dout))
- # dy = self.select(cmp_result, self.zeros_like(dout), dout)
- dy = dout - dx
-
- return dx, dy
-
-
- class AbsGrad(GraphKernel):
- """
- Abs's backprop function.
-
- Inputs:
-         - **input_x** (Tensor) - The input data of this operator.
-         - **dout** (Tensor) - The output of the next operator's backprop function.
-
- Outputs:
- Tensor, has the same shape as input_x.
-
- Examples:
- >>> back = AbsGrad()
- >>> output = back(Tensor([1, 2, 3]), Tensor([4, 5, 6]))
- """
-
- def __init__(self):
- super(AbsGrad, self).__init__()
- self.mul = P.Mul()
- self.abs = P.Abs()
- self.add = P.TensorAdd()
- self.div = P.RealDiv()
- self.round = P.Round()
-
- def construct(self, input_x, dout):
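-         # d|x|/dx is sign(x). The sign is recovered as round((x * N) / |x * N|) for a large
-         # constant N, which presumably guards against precision loss for very small inputs.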
- NUM_MAX = 32768
- mul_max = self.mul(input_x, P.Fill()(P.DType()(input_x), (1,), NUM_MAX))
- res_abs = self.abs(mul_max)
- res_div = self.div(mul_max, res_abs)
- res_round = self.round(res_div)
- res = self.mul(res_round, dout)
- return res
-
-
- class ApplyMomentum(GraphKernel):
- """
- Update parameter according to the ApplyMomentum algorithm.
-
-     Inputs:
-         variable (Tensor): the mutable tensor `var` to be updated
-         accumulation (Tensor): the mutable tensor `accum` holding the momentum accumulation
-         learning_rate (float32): the learning rate
-         gradient (Tensor): the gradient
-         momentum (float32): the momentum coefficient
-
-     Outputs:
-         Tuple of 2 Tensors, the updated accumulation and variable.
- """
-
- def __init__(self,
- use_nesterov=False,
- use_locking=False,
- gradient_scale=1.0):
- super(ApplyMomentum, self).__init__()
- self.gradient_scale = validator.check_type('gradient_scale', gradient_scale, [float])
- self.fake_output_assign_1 = InplaceAssign()
- self.fake_output_assign_1.add_prim_attr("fake_output", True)
- self.fake_output_assign_2 = InplaceAssign()
- self.fake_output_assign_2.add_prim_attr("fake_output", True)
-
- def construct(self, variable, accumulation, learning_rate, gradient, momentum):
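-         # Classic momentum update: accum <- accum * momentum + grad, var <- var - lr * accum.
-         # InplaceAssign writes the results back into the input Parameters while keeping the
-         # computation connected in the graph.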
- gradient = gradient * self.gradient_scale
- momt_accumulation = accumulation * momentum
- accumulation_inplace = momt_accumulation + gradient
-
- sum_gradient = accumulation_inplace * learning_rate
- variable_inplace = variable - sum_gradient
-
- accumulation_inplace = self.fake_output_assign_1(accumulation, accumulation_inplace, accumulation_inplace)
- variable_inplace = self.fake_output_assign_2(variable, variable_inplace, variable_inplace)
- return accumulation_inplace, variable_inplace
-
-
- class BiasAdd(GraphKernel):
- """
- Return the sum of x and bias.
-
- Inputs:
- x (Tensor): Tensor of input data.
- bias (Tensor): The bias tensor.
-
- Output:
- Tensor, the sum of x and bias.
-
-     Examples:
-         >>> layer = BiasAdd()
-         >>> output = layer(Tensor([1, 2, 3]), Tensor([1,]))
- """
-
- def __init__(self):
- super(BiasAdd, self).__init__()
-
- def construct(self, x, bias):
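-         # Reshape the 1-D bias so it broadcasts along the channel axis: NCHW layout for 4-D
-         # inputs, NC otherwise.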
- shape = P.Shape()(x)
- if len(shape) == 4:
- bias_shape = (1, P.Shape()(bias)[0], 1, 1) # NCHW
- else:
- bias_shape = (1, P.Shape()(bias)[0])
- res = x + P.Reshape()(bias, bias_shape)
- return res
-
- class BiasAddGrad(GraphKernel):
- """
- Computes gradients of BiasAdd.
-
- Inputs:
- x (Tensor): the gradients of bias add output.
-
- Output:
- Tensor, the gradients of bias add input.
-
- Examples:
- >>> dout = Tensor(np.ones(shape=[1, 2, 3, 4]), mindspore.float32)
- >>> bias_add_grad = BiasAddGrad()
- >>> dx = bias_add_grad(dout)
- """
- def __init__(self):
- super(BiasAddGrad, self).__init__()
-
- def construct(self, x):
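-         # The bias gradient reduces dout over every axis except the channel axis (axis 1).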
- shape_x = P.Shape()(x)
- reduce_axis = [0]
- for i in range(2, len(shape_x)):
- reduce_axis.append(i)
-
- res = P.ReduceSum()(x, reduce_axis)
- return res
-
-
- class EqualCount(GraphKernel):
- """
- Computes the number of the same elements of two tensors.
-
- The two input tensors should have the same shape and data type.
-
- Inputs:
- x (Tensor): the first input tensor.
- y (Tensor): the second input tensor.
-
- Outputs:
-         Tensor, with the same type as the input tensors and shape :math:`(1,)`.
-
- Examples:
- >>> x = Tensor(np.array([1, 2, 3]), mindspore.int32)
- >>> y = Tensor(np.array([1, 2, 4]), mindspore.int32)
- >>> equal_count = EqualCount()
- >>> equal_count(x, y)
- """
- def __init__(self):
- super(EqualCount, self).__init__()
-
- def construct(self, x, y):
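-         # Compare in float32, accumulate the matches in float16, then cast the count back to
-         # the input dtype.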
- equal_bool = P.Equal()(P.Cast()(x, mstype.float32), P.Cast()(y, mstype.float32))
- equal_count = P.Cast()(equal_bool, mstype.float16)
-
- axes = (0,)
- res = P.ReduceSum()(equal_count, axes)
- res = P.Cast()(res, P.DType()(x))
- return res
-
-
- class ReduceMean(GraphKernel):
- """
- Reduce a dimension of a tensor by averaging all elements in the dimension.
-
- The dtype of the tensor to be reduced is number.
-
- Args:
- keep_dims (bool): If True, keep these reduced dimensions and the length is 1.
-             If False, don't keep these dimensions. Default: True.
-
- Inputs:
- - **input_x** (Tensor[Number]) - The input tensor.
-         - **axis** (Union[int, tuple(int), list(int)]) - The dimensions to reduce.
-           Only constant value is allowed.
-
- Outputs:
- Tensor, has the same dtype as the `input_x`.
-
- - If axis is (), and keep_dims is False,
-           the output is a 0-D tensor representing the mean of all elements in the input tensor.
- - If axis is int, set as 2, and keep_dims is False,
- the shape of output is :math:`(x_1, x_3, ..., x_R)`.
- - If axis is tuple(int), set as (2, 3), and keep_dims is False,
- the shape of output is :math:`(x_1, x_4, ..., x_R)`.
-
- Examples:
- >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
- >>> op = ReduceMean(keep_dims=True)
-         >>> output = op(input_x, (1,))
- """
-
- def __init__(self, keep_dims=True):
- super(ReduceMean, self).__init__()
- self.keep_dims = validator.check_type('keep_dims', keep_dims, [bool])
- self.sum = P.ReduceSum(self.keep_dims)
-
- def construct(self, x, axis):
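-         # mean = reduce_sum(x, axis) / prod(shape[i] for i in axis); the divisor is
-         # materialized with Fill so that it matches the dtype of x.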
- shape = P.Shape()(x)
- value_num = 1
- for i in axis:
- value_num *= shape[i]
-
- data_sum = self.sum(x, axis)
- avg = 1.0 / P.Fill()(P.DType()(x), (1,), value_num)
- res = data_sum * avg
- return res
-
-
- class ReLU(GraphKernel):
- r"""
- Computes ReLU(Rectified Linear Unit) of input tensor element-wise.
-
- It returns :math:`\max(x,\ 0)` element-wise.
-
- Inputs:
- - **input_x** (Tensor) - The input tensor.
-
- Outputs:
- Tensor, with the same type and shape as the `input_x`.
-
- Examples:
- >>> input_x = Tensor(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]]), mindspore.float32)
- >>> relu = ReLU()
- >>> result = relu(input_x)
-         [[0.0, 4.0, 0.0], [2.0, 0.0, 9.0]]
- """
- def __init__(self):
- super(ReLU, self).__init__()
- self.max = P.Maximum()
-
- def construct(self, x):
- return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x)
-
-
- class SoftmaxCrossEntropyWithLogits(GraphKernel):
- r"""
-     Gets the softmax cross-entropy value between logits and labels, where the labels should be one-hot encoded.
-
- Note:
- Sets input logits as `X`, input label as `Y`, output as `loss`. Then,
-
- .. math::
-             p_{ij} = softmax(X_{ij}) = \frac{\exp(X_{ij})}{\sum_{j = 0}^{N-1} \exp(X_{ij})}
-
- .. math::
- loss_{ij} = -\sum_j{Y_{ij} * ln(p_{ij})}
-
- Inputs:
- - **logits** (Tensor) - Input logits, with shape :math:`(N, C)`.
- - **labels** (Tensor) - Ground truth labels, with shape :math:`(N, C)`.
-
- Outputs:
-         Tuple of 2 Tensors: the loss with shape :math:`(N,)`, and dlogits with the same shape as `logits`.
-
- Examples:
- >>> logits = Tensor([[2, 4, 1, 4, 5], [2, 1, 2, 4, 3]], mindspore.float32)
- >>> labels = Tensor([[0, 0, 0, 0, 1], [0, 0, 0, 1, 0]], mindspore.float32)
- >>> softmax_cross = SoftmaxCrossEntropyWithLogits()
- >>> loss, backprop = softmax_cross(logits, labels)
- """
-
- def __init__(self):
- super(SoftmaxCrossEntropyWithLogits, self).__init__()
- self.max = P.ReduceMax(keep_dims=True)
- self.sum_keep_dims = P.ReduceSum(keep_dims=True)
-
- def construct(self, features, labels):
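-         # Numerically stable log-softmax via the log-sum-exp trick: subtract the row max
-         # before exponentiating. Then loss = -sum(labels * log_softmax, axis=1) and
-         # backprop = softmax - labels.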
- data_max = self.max(features, (1,))
- data_sub = features - data_max
- data_exp = P.Exp()(data_sub)
- data_sum = self.sum_keep_dims(data_exp, (1,))
- data_div = data_exp / data_sum
- data_log_tmp = P.Log()(data_sum)
- data_log = data_sub - data_log_tmp
- data_mul = labels * data_log
- data_muls = P.Neg()(data_mul)
- loss = P.ReduceSum()(data_muls, (1,))
- backprop = data_div - labels
- return loss, backprop
-
- def bprop(self, features, labels, out, dout):
- grad = out[1]
- grad = grad * P.ExpandDims()(dout[0], -1)
- return grad, P.ZerosLike()(labels)
-
-
- class LayerNormForward(GraphKernel):
- """ Forward function of the LayerNorm operator. """
- def __init__(self, begin_norm_axis=1, begin_params_axis=1):
- super(LayerNormForward, self).__init__()
- self.begin_norm_axis = validator.check_type('begin_norm_axis', begin_norm_axis, [int])
- self.begin_params_axis = validator.check_type('begin_params_axis', begin_params_axis, [int])
- self.mul = P.Mul()
- self.sum_keep_dims = P.ReduceSum(keep_dims=True)
- self.sub = P.Sub()
- self.add = P.TensorAdd()
- self.log = P.Log()
- self.exp = P.Exp()
- self.eps = P.Eps()
-
- def construct(self, input_x, input_gamma, input_beta):
- shape_x = P.Shape()(input_x)
-
- # Calculate the scaling ratio of the average
- begin_norm_axis = self.begin_norm_axis
- if begin_norm_axis < 0:
- begin_norm_axis += len(shape_x)
- reduce_axis = ()
- for i in range(len(shape_x)):
-             if i >= begin_norm_axis:
- reduce_axis = reduce_axis + (i,)
-
- reduce_elts = 1.0
- for i in reduce_axis:
- reduce_elts *= shape_x[i]
- mean_cof = 1.0 / reduce_elts
-
- # Calculate mean
- mean_muls = self.mul(input_x, mean_cof)
- mean = self.sum_keep_dims(mean_muls, reduce_axis)
-
- # Calculate variance
- variance_sub = self.sub(input_x, mean)
- variance_mul = self.mul(variance_sub, variance_sub)
- variance_muls = self.mul(variance_mul, mean_cof)
- variance = self.sum_keep_dims(variance_muls, reduce_axis)
-
- # Calculate normalize
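-         # 1 / sqrt(variance + epsilon) is expressed as exp(-0.5 * log(variance + epsilon)),
-         # which avoids a dedicated Rsqrt primitive inside the fused kernel.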
- normalize_sub = self.sub(input_x, mean)
- epsilon = self.eps(input_x)
- normalize_add = self.add(variance, epsilon)
- normalize_log = self.log(normalize_add)
- normalize_log_mul = self.mul(normalize_log, -0.5)
- normalize_exp = self.exp(normalize_log_mul)
- normalize_mul = self.mul(normalize_sub, normalize_exp)
-
-         # Calculate scale and translate; gamma and beta broadcast against the normalized
-         # tensor in the same way regardless of begin_params_axis.
-         scale_mul = self.mul(input_gamma, normalize_mul)
-         res = self.add(scale_mul, input_beta)
-
- return res, mean, variance
-
-
- class LayerNormXBackprop(GraphKernel):
- r"""
- Together with LayerNormBetaGammaBackprop, to supply the backprop
- functionality for LayerNorm.
-
- Note:
- Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`,
- input_gamma as :math:`\gamma`. Then,
- .. math::
- \begin{array}{ll} \\
- \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\
- \frac {\partial L} {\partial x_i} =
- \frac{\gamma}{\sqrt{\sigma^2+\epsilon}}
- ( \frac{\partial L}{\partial y_i}
- - \frac{1}{m} \cdot \frac{\partial L}{\partial \beta}
- - \frac{\hat{x_i}}{m} \cdot \frac{\partial L}{\partial \gamma})
- \end{array}
-
- Inputs:
-         - **dy** (Tensor) - The first item of the next operator's backprop's output.
-         - **input_x** (Tensor) - The first input of the forward function of LayerNorm.
-         - **variance** (Tensor) - The variance output of the forward function of LayerNorm.
-         - **mean** (Tensor) - The mean output of the forward function of LayerNorm.
-         - **input_gamma** (Tensor) - The gamma (scale) parameter of LayerNorm.
-
- Outputs:
- Tensor, the output of this operator, will be used as the first item of the result of
- LayerNorm's backprop function, has the same shape and data type as 'input_x'.
-
- Examples:
- >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
- >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
- >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
- >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
- >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-         >>> op = LayerNormXBackprop()
- >>> output = op(dy, input_x, variance, mean, input_gamma)
- """
-
- def __init__(self):
- super(LayerNormXBackprop, self).__init__()
- self.sum_keep_dims = P.ReduceSum(keep_dims=True)
- self.log = P.Log()
- self.exp = P.Exp()
- self.eps = P.Eps()
-
- def construct(self, dy, input_x, variance, mean, input_gamma):
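-         # Derive the reduction axes by comparing the shapes of input_x and mean, then evaluate
-         # dL/dx following the formula in the class docstring, again using
-         # 1 / sqrt(var + eps) = exp(-0.5 * log(var + eps)).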
- shape_x = P.Shape()(input_x)
- shape_mean = P.Shape()(mean)
- reduce_axis = ()
- flag = -1
- min_l = 0
- if len(shape_x) > len(shape_mean):
- min_l = len(shape_x)
- else:
- min_l = len(shape_mean)
- for i in range(min_l):
- if (shape_x[i] != shape_mean[i]) and (flag == -1):
- flag = i
- if flag != -1:
- for i in range(flag, len(shape_x)):
- reduce_axis = reduce_axis + (i,)
- else:
- reduce_axis = reduce_axis + (len(shape_x) - 1,)
- mean_num = 1.0
- for i in reduce_axis:
- mean_num *= shape_x[i]
- pd_xl = input_gamma * dy
- epsilon = self.eps(input_x)
- var_elta = variance + epsilon
- var_elta_log = self.log(var_elta)
- var_elta_mul = var_elta_log * -0.5
- var_elta_2 = P.Exp()(var_elta_mul)
- pdvar1_mul = var_elta_2 * var_elta_2
- pd_var_1 = pdvar1_mul * var_elta_2
- sub_x_mean = input_x - mean
- pdvar_mul1 = pd_xl * sub_x_mean
- pdvar_sum = self.sum_keep_dims(pdvar_mul1, reduce_axis)
- pdvar_mul3 = pdvar_sum * pd_var_1
- pd_var = pdvar_mul3 * -0.5
- pdmean1_sum = self.sum_keep_dims(pd_xl, reduce_axis)
- pdmean1_mul = pdmean1_sum * var_elta_2
- pd_mean_1 = pdmean1_mul * -1.0
- pdmean2_mul1 = sub_x_mean * -2.0
- pdmean2_sum = self.sum_keep_dims(pdmean2_mul1, reduce_axis)
- pdmean2_mul3 = pdmean2_sum * (1.0 / mean_num)
- pd_mean_2 = pd_var * pdmean2_mul3
- pd_mean = pd_mean_2 + pd_mean_1
- pd_x_1 = var_elta_2 * pd_xl
- pdx2_mul = pd_var * sub_x_mean
- pd_x_2 = pdx2_mul * (2.0 * (1.0 / mean_num))
- pd_x_3 = pd_mean * (1.0 / mean_num)
- pdx_add = pd_x_1 + pd_x_2
- pd_x = pdx_add + pd_x_3
- return pd_x
-
-
- class LayerNormBetaGammaBackprop(GraphKernel):
- r"""
- Together with LayerNormXBackprop, to supply the backprop functionality for
- LayerNorm.
- Note:
- Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`,
- input_gamma as :math:`\gamma`. Then,
- .. math::
- \begin{array}{ll} \\
- \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\
-                 \frac {\partial L} {\partial \beta} =
-                     \sum_{i=1}^m \frac{\partial L}{\partial y_i} \\
-                 \frac {\partial L} {\partial \gamma} =
-                     \sum_{i=1}^m \frac{\partial L}{\partial y_i} \cdot \hat{x_i}
- \end{array}
-
- Inputs:
-         - **dy** (Tensor) - The first item of the next operator's backprop's output.
-         - **input_x** (Tensor) - The first input of the forward function of LayerNorm.
-         - **variance** (Tensor) - The variance output of the forward function of LayerNorm.
-         - **mean** (Tensor) - The mean output of the forward function of LayerNorm.
-         - **shape_gamma** (tuple) - The shape of the gamma parameter of LayerNorm.
-
- Outputs:
- Tuple of 2 Tensors, the backprop outputs.
-
- - **pd_beta**(Tensor) - The first item of return value of this operator, will be used as
- the second item of the LayerNorm's backprop function.
- - **pd_gamma**(Tensor) - The second item of return value of this operator, will be used as
- the third item of the LayerNorm's backprop function.
-
- Examples:
- >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
- >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
- >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
- >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32))
-         >>> shape_gamma = (4, 5, 6)
-         >>> op = LayerNormBetaGammaBackprop()
-         >>> pd_beta, pd_gamma = op(dy, input_x, variance, mean, shape_gamma)
- """
- def __init__(self):
- super(LayerNormBetaGammaBackprop, self).__init__()
- self.sum_not_keep_dims = P.ReduceSum(keep_dims=False)
- self.log = P.Log()
- self.exp = P.Exp()
- self.eps = P.Eps()
-
- def construct(self, dy, input_x, variance, mean, shape_gamma):
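-         # Beta and gamma gradients reduce over the leading axes that the parameters do not
-         # cover; pd_gamma additionally scales dy by the normalized input (x - mean) / sqrt(var + eps).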
- shape_x = P.Shape()(input_x)
- params_axis = ()
-
- if len(shape_x) != len(shape_gamma):
- sub = len(shape_x) - len(shape_gamma)
- for i in range(sub):
- params_axis = params_axis + (i,)
-
- pd_beta = self.sum_not_keep_dims(dy, params_axis)
- epsilon = self.eps(input_x)
- var_elta = variance + epsilon
- var_elta_log = self.log(var_elta)
- var_elta_mul = var_elta_log * -0.5
- var_elta_2 = P.Exp()(var_elta_mul)
- sub_x_mean = input_x - mean
- var_elta_2_cast = var_elta_2
- xl_mul = var_elta_2_cast * sub_x_mean
- pdga_mul = dy * xl_mul
- pd_gamma = self.sum_not_keep_dims(pdga_mul, params_axis)
- return pd_beta, pd_gamma
-
-
- class LogSoftmax(GraphKernel):
- r"""
- Log Softmax activation function.
-
- Applies the Log Softmax function to the input tensor on the specified axis.
-     Supposing a slice along the given axis is :math:`x`, then for each element :math:`x_i`
- the Log Softmax function is shown as follows:
-
- .. math::
- \text{output}(x_i) = \log \left(\frac{exp(x_i)} {\sum_{j = 0}^{N-1}\exp(x_j)}\right),
-
- where :math:`N` is the length of the Tensor.
-
- Args:
- axis (int): The axis to do the Log softmax operation. Default: -1.
-
- Inputs:
- logits (Tensor): The input of Log Softmax.
-
- Outputs:
- Tensor, with the same type and shape as the logits.
-
- Examples:
- >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32)
- >>> log_softmax = LogSoftmax()
- >>> log_softmax(input_x)
- [-4.4519143, -3.4519143, -2.4519143, -1.4519144, -0.4519144]
- """
-
- def __init__(self, axis=-1):
- super(LogSoftmax, self).__init__()
- self.axis = validator.check_type('axis', axis, [int])
- self.max_keep_dims = P.ReduceMax(keep_dims=True)
- self.sub = P.Sub()
- self.exp = P.Exp()
- self.sum_keep_dims = P.ReduceSum(keep_dims=True)
- self.log = P.Log()
- self.mul = P.Mul()
-
- def construct(self, input_x):
- data_max = self.max_keep_dims(input_x, (self.axis,))
- data_sub = self.sub(input_x, data_max)
-
- data_exp = self.exp(data_sub)
- data_sum = self.sum_keep_dims(data_exp, (self.axis,))
- data_log = self.log(data_sum)
-
- res = self.sub(data_sub, data_log)
- return res
-
- def bprop(self, input_x, out, dout):
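-         # dx = dy - softmax(x) * sum(dy, axis); softmax(x) is recovered as exp(out) because
-         # out is the log-softmax from the forward pass.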
- input_x = out
- input_dy = dout
-
- data_exp = self.exp(input_x)
- data_sum = self.sum_keep_dims(input_dy, (self.axis,))
- data_softmax = self.mul(data_exp, data_sum)
-
- res = self.sub(input_dy, data_softmax)
- return (res,)
-
-
- class Tanh(GraphKernel):
- r"""
- Tanh activation function.
-
- Computes hyperbolic tangent of input element-wise. The Tanh function is defined as:
-
- .. math::
- tanh(x_i) = \frac{\exp(x_i) - \exp(-x_i)}{\exp(x_i) + \exp(-x_i)} = \frac{\exp(2x_i) - 1}{\exp(2x_i) + 1},
-
- where :math:`x_i` is an element of the input Tensor.
-
- Inputs:
- - **input_x** (Tensor) - The input of Tanh.
-
- Outputs:
- Tensor, with the same type and shape as the input_x.
-
- Examples:
- >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32)
- >>> tanh = Tanh()
- >>> tanh(input_x)
- [0.7615941, 0.9640276, 0.9950548, 0.9993293, 0.99990916]
- """
- def __init__(self):
- super(Tanh, self).__init__()
- self.abs = P.Abs()
- self.add = P.TensorAdd()
- self.div = P.RealDiv()
- self.mul = P.Mul()
- self.mul_fp16 = P.Mul()
- self.mul_fp16.add_prim_attr("output_precision", "float16")
- self.exp = P.Exp()
-
- def construct(self, input_x):
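-         # tanh(x) is evaluated as -sign(x) * (exp(-2|x|) - 1) / (exp(-2|x|) + 1); keeping the
-         # exponent non-positive avoids overflow for large |x|.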
- input_abs = self.abs(input_x)
- sign_flag = self.div(input_x, input_abs)
- sign_flag_neg = self.mul(sign_flag, -1.0)
-
- power_val = self.mul(input_abs, -2.0)
- exp_val = self.exp(power_val)
- up_val = self.add(exp_val, -1.0)
- down_val = self.add(exp_val, 1.0)
-
- div_val = self.div(up_val, down_val)
- res = self.mul(sign_flag_neg, div_val)
- return res
-
- def bprop(self, input_x, out, dout):
- input_y = out
- input_dy = dout
-
- data_square = self.mul(input_y, input_y)
- data_mul = self.mul(data_square, -1.0)
- anuminate = self.add(data_mul, 1.0)
- res = self.mul_fp16(anuminate, input_dy)
-
- return (res,)
-
- class TanhGrad(GraphKernel):
- """
- Backprop function of Tanh
-
-     Calculates:
-         result = (1 - out * out) * dout
-
-     Inputs:
-         out (Tensor): Tanh's output
-         dout (Tensor): the backprop output of the next layer, with the same shape as out
-
-     Outputs:
-         result (Tensor): the result of (1 - out ** 2) * dout
-
- Examples:
- >>> x_np = np.random.randn(5, 3, 6).astype(np.float16)
- >>> dy_np = np.random.randn(5, 3, 6).astype(np.float16)
- >>> x_ms = Tensor(x_np)
- >>> dy_ms = Tensor(dy_np)
- >>> tanh_grad = TanhGrad()
-         >>> out = tanh_grad(x_ms, dy_ms)
- """
- def __init__(self):
- super(TanhGrad, self).__init__()
- self.add = P.TensorAdd()
- self.mul = P.Mul()
- self.mul_fp16 = P.Mul()
- self.mul_fp16.add_prim_attr("output_precision", "float16")
-
- def construct(self, out, dout):
- input_y = out
- input_dy = dout
-
- data_square = self.mul(input_y, input_y)
- data_mul = self.mul(data_square, -1.0)
- anuminate = self.add(data_mul, 1.0)
- res = self.mul_fp16(anuminate, input_dy)
-
- return res
-
- class Gelu(GraphKernel):
- r"""
- Gaussian Error Linear Units activation function.
-
- GeLU is described in the paper `Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`_.
- And also please refer to `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
- <https://arxiv.org/abs/1810.04805>`_.
-
- Defined as follows:
-
- .. math::
- \text{output} = 0.5 * x * (1 + erf(x / \sqrt{2})),
-
- where :math:`erf` is the "Gauss error function" .
-
- Inputs:
- - **input_x** (Tensor) - Input to compute the Gelu.
-
- Outputs:
- Tensor, with the same type and shape as input.
-
- Examples:
- >>> tensor = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32)
- >>> gelu = Gelu()
- >>> result = gelu(tensor)
- """
-
- def __init__(self):
- super(Gelu, self).__init__()
- self.add = P.TensorAdd()
- self.abs = P.Abs()
- self.exp = P.Exp()
- self.neg = P.Neg()
- self.minimum = P.Minimum()
- self.div = P.RealDiv()
- self.mul = P.Mul()
- self.CSVALUE = 0.044715
- self.CSVALUE_A = 1.59576912
- self.CSVALUE_5 = 0.3989422804
- self.CSVALUE_3B = 0.2140644488
-
- def construct(self, input_x):
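-         # Tanh-approximated GeLU rewritten through a sigmoid:
-         #   gelu(x) ~= x * sigmoid(1.5957691 * (x + 0.044715 * x^3)),
-         # evaluated as x * exp(min(t, 0)) / (1 + exp(-|t|)) so the exponent never overflows.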
- def _tanh_parameter_compute(data_x):
- """
-             compute the inner argument of the tanh approximation:
-             return: x + 0.044715 * x ** 3
- """
- mul_0 = self.mul(data_x, data_x)
- pow_0 = self.mul(mul_0, data_x)
- mul_1 = self.mul(pow_0, self.CSVALUE)
- result = self.add(data_x, mul_1)
-
- return result
-
- tanh_parameter = _tanh_parameter_compute(input_x)
- mul_0 = self.mul(tanh_parameter, 1.5957691)
-
- mul_0_min = self.minimum(mul_0, 0.0)
- right_mul = self.exp(mul_0_min)
-
- mul_0_abs = self.abs(mul_0)
- mul_0_abs_neg = self.mul(mul_0_abs, -1.0)
- mul_0_abs_neg_exp = self.exp(mul_0_abs_neg)
-
- mul_0_abs_neg_exp_add = self.add(mul_0_abs_neg_exp, 1.0)
- left_mul = self.div(input_x, mul_0_abs_neg_exp_add)
-
- result = self.mul(left_mul, right_mul)
- return result
-
- def bprop(self, input_x, out, dout):
- """ register backprop function for Gelu """
- data_x = input_x
- data_gelu = out
- data_dy = dout
-
- def _math_four_compute(data_x):
- """
-             return: math_four equal to 2 * sqrt(2 / pi) * (x + 0.044715 * x ** 3)
- """
- datax_pow = data_x * data_x * data_x
- datax_muls_c = self.mul(datax_pow, self.CSVALUE)
- datax_addx = self.add(datax_muls_c, data_x)
- datax_muls_s = self.mul(datax_addx, self.CSVALUE_A)
-
- return datax_muls_s
-
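-         # The gradient is d(gelu)/dx * dy, with d(gelu)/dx expanded in the same
-         # overflow-safe exp(min(., 0)) / (1 + exp(-|.|)) form used in construct.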
- # common part
- math_four = _math_four_compute(data_x)
- math_four_abs = self.abs(math_four)
- math_four_abs_neg = self.mul(math_four_abs, -1.0)
- math_four_abs_neg_exp = self.exp(math_four_abs_neg)
- math_four_min = self.minimum(math_four, 0.0)
-
- # dividend part
- datax_pow = self.mul(data_x, data_x)
- datax_pow_mul = self.mul(datax_pow, self.CSVALUE_3B)
- datax_pow_mul_add = self.add(datax_pow_mul, self.CSVALUE_A)
- data_gelu_mul = self.mul(data_gelu, datax_pow_mul_add)
- math_four_min_2 = self.mul(math_four_min, 2.0)
- div_right = self.mul(data_gelu_mul, math_four_abs_neg_exp)
- div_left = self.exp(math_four_min_2)
- dividend = self.add(div_left, div_right)
-
- # divisor part
- div_0 = self.add(math_four_abs_neg_exp, 1.0)
- div_1 = self.exp(math_four_min)
- divisor = self.mul(div_1, div_0)
- res_grad = self.div(dividend, divisor)
-
- result = self.mul(res_grad, data_dy)
- return (result,)
-
-
- class Softmax(GraphKernel):
- """
-     Operator Softmax.
-
-     .. math::
-         \text{output} = \frac{\exp(x - \max(x))}{\sum \exp(x - \max(x))}
-
- Args:
- axis (int, tuple): Axis along which the softmax normalization is applied
-
- Inputs:
- x (Tensor): input data for softmax
-
- Outputs:
-         output (Tensor): a tensor with the same shape as the input
-
- Examples:
- >>> layer = Softmax(1)
-         >>> x = Tensor(np.array([[1.2, 2.1], [2.2, 3.2]]), mindspore.float32)
- >>> output = layer(x)
- """
-
- def __init__(self, axis):
- super(Softmax, self).__init__()
- validator.check_type("axis", axis, [int, tuple])
- if isinstance(axis, int):
- self.axis = (axis,)
- else:
- self.axis = axis
- for item in self.axis:
- validator.check_type("item of axis", item, [int])
- self.max = P.ReduceMax(keep_dims=True)
- self.sub = P.Sub()
- self.exp = P.Exp()
- self.sum = P.ReduceSum(keep_dims=True)
- self.mul = P.Mul()
-
- def construct(self, x):
- max_x = self.max(x, self.axis)
- data_sub = self.sub(x, max_x)
- data_exp = self.exp(data_sub)
- data_expsum = self.sum(data_exp, self.axis)
- output = data_exp / data_expsum
- return output
-
- def bprop(self, x, out, dout):
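-         # Softmax Jacobian-vector product: dx = (dout - sum(dout * out, axis)) * out.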
- mul_res = self.mul(dout, out)
- sum_res = self.sum(mul_res, self.axis)
- sub_res = self.sub(dout, sum_res)
- res = self.mul(sub_res, out)
- return (res,)
-
-
- class LayerNorm(Cell):
- r"""
- Applies Layer Normalization over a mini-batch of inputs.
-
- Layer normalization is widely used in recurrent neural networks. It applies
- normalization on a mini-batch of inputs for each single training case as described
- in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
- normalization, layer normalization performs exactly the same computation at training and
-     testing time. It is applied per sample across all channels and pixels rather than across
-     the batch, and can be described by the following formula.
-
- .. math::
- y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
-
-     Args:
-         begin_norm_axis (int): The first normalization dimension: normalization will be performed along
-             dimensions `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
-         begin_params_axis (int): The first parameter (beta, gamma) dimension: the scale and centering
-             parameters have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with the
-             normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
-
-     Inputs:
-         - **input_x** (Tensor) - The shape of `input_x` is :math:`(x_1, x_2, ..., x_R)`.
-         - **input_gamma** (Tensor) - The scale parameter, with shape `input_x.shape[begin_params_axis:]`.
-         - **input_beta** (Tensor) - The offset parameter, with the same shape as `input_gamma`.
-
- Outputs:
- Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.
-
-     Examples:
-         >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
-         >>> gamma = Tensor(np.ones([5, 10, 10]), mindspore.float32)
-         >>> beta = Tensor(np.zeros([5, 10, 10]), mindspore.float32)
-         >>> m = LayerNorm(begin_norm_axis=1, begin_params_axis=1)
-         >>> m(x, gamma, beta)
- """
-
- def __init__(self,
- begin_norm_axis=-1,
- begin_params_axis=-1
- ):
- super(LayerNorm, self).__init__()
- self.begin_norm_axis = begin_norm_axis
- self.begin_params_axis = begin_params_axis
- self.layer_norm = LayerNormForward(begin_norm_axis, begin_params_axis)
- self.layer_norm_x_grad = LayerNormXBackprop()
- self.layer_norm_beta_gamma = LayerNormBetaGammaBackprop()
- self.layer_norm_grad = G.LayerNormGrad(self.begin_norm_axis, self.begin_params_axis)
-
- def construct(self, input_x, input_gamma, input_beta):
- return self.layer_norm(input_x, input_gamma, input_beta)
-
-     # Backward pass: delegate to the fused LayerNormGrad primitive.
- def bprop(self, input_x, input_gamma, input_beta, out, dout):
- dx, d_gamma, d_beta = self.layer_norm_grad(input_x, dout[0], out[2], dout[1], input_gamma)
- return dx, d_gamma, d_beta
-
-
- class LambUpdateWithLR(GraphKernel):
- r"""
- Part of Lamb optimizer.
-
- .. math::
- s_1 = select(i_1 \gt y_g, select(i_0 \gt y_g, \frac{i_1}{i_2}, se), se)
- i_5 = i_5 - max(min(s_1, y_m), y_g) \times i_3 \times i_4
-
- Inputs:
- - **input0** (Tensor) - The first tensor to be computed.
- - **input1** (Tensor) - The second tensor to be computed.
- - **input2** (Tensor) - The third tensor to be computed.
- - **input3** (Tensor) - The fourth tensor to be computed.
- - **input4** (Tensor) - The fifth tensor to be computed.
- - **input5** (Tensor) - The sixth tensor to be computed. It will be updated by result.
- - **greater_y** (Tensor) - The seventh tensor to be computed.
- - **select_e** (Tensor) - The eighth tensor to be computed.
- - **minimum_y** (Tensor) - The ninth tensor to be computed.
-
- Outputs:
-         Tensor, the updated `input5` (returned through a fake output of `InplaceAssign`).
-
- Examples:
- >>> lamb_update = LambUpdateWithLR()
- >>> i0 = np.random.normal(0, 1, [1, 16]).astype(np.float32)
- >>> i1 = np.random.normal(0, 1, [1]).astype(np.float32)
- >>> i2 = np.random.normal(0, 1, [1]).astype(np.float32)
- >>> i3 = np.random.normal(0, 1, [1]).astype(np.float32)
- >>> i4 = np.random.normal(0, 1, [1, 16]).astype(np.float32)
- >>> i5 = np.random.normal(0, 1, [1, 16]).astype(np.float32)
- >>> yg = np.random.normal(0, 1, [1]).astype(np.float32)
- >>> se = np.random.normal(0, 1, [1]).astype(np.float32)
- >>> ym = np.random.normal(0, 1, [1]).astype(np.float32)
- >>> lamb_update(i0, i1, i2, i3, i4, i5, yg, se, ym)
-
- """
-
- def __init__(self):
- super(LambUpdateWithLR, self).__init__()
- self.greater = P.Greater()
- self.select = P.Select()
- self.div = P.RealDiv()
- self.min = P.Minimum()
- self.max = P.Maximum()
- self.mul = P.Mul()
- self.sub = P.Sub()
- self.fake_output_assign = InplaceAssign()
- self.fake_output_assign.add_prim_attr("fake_output", True)
-
- def construct(self, input0, input1, input2, input3, input4, input5, greater_y, select_e, minimum_y):
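-         # Follows the docstring formulas: build the trust ratio with nested selects, clamp it
-         # to [greater_y, minimum_y], scale by input3 * input4 and subtract from input5, which
-         # is updated in place through InplaceAssign.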
- greater0 = self.greater(input0, greater_y)
- greater1 = self.greater(input1, greater_y)
- real_div0 = self.div(input1, input2)
- select0 = self.select(greater0, real_div0, select_e)
- select1 = self.select(greater1, select0, select_e)
- min0 = self.min(select1, minimum_y)
- max0 = self.max(min0, greater_y)
- mul0 = self.mul(max0, input3)
- mul1 = self.mul(mul0, input4)
- sub0 = self.sub(input5, mul1)
- sub0 = self.fake_output_assign(input5, sub0, sub0)
- return sub0
-
- class LambNextMV(GraphKernel):
- r"""
- Part of Lamb optimizer.
-
- .. math::
-         rd_0 = \frac{i_8 \times i_5 + i_9 \times i_4}{i_6}
-         rd_1 = \frac{x_0 \times i_2 + x_1 \times i_1}{i_3}
-         y_2 = \frac{rd_0}{\sqrt{rd_1 + x_3}} + x_2 \times i_7
-         y_3 = \frac{rd_0}{\sqrt{rd_1} + x_3}
-         i_5 = i_8 \times i_5 + i_9 \times i_4
-         i_2 = x_0 \times i_2 + x_1 \times i_1
-
- Inputs:
- - **inputs1** (Tensor) - The first input tensor to be computed.
- - **inputs2** (Tensor) - The second input tensor to be computed. It will be updated by result.
- - **inputs3** (Tensor) - The third input tensor to be computed.
- - **inputs4** (Tensor) - The fourth input tensor to be computed.
- - **inputs5** (Tensor) - The fifth input tensor to be computed. It will be updated by result.
- - **inputs6** (Tensor) - The sixth input tensor to be computed.
- - **inputs7** (Tensor) - The seventh input tensor to be computed.
- - **inputs8** (Tensor) - The eighth input tensor to be computed.
- - **inputs9** (Tensor) - The ninth input tensor to be computed.
- - **inputsx0** (Tensor) - The tenth input tensor to be computed.
- - **inputsx1** (Tensor) - The eleventh input tensor to be computed.
- - **inputsx2** (Tensor) - The twelfth input tensor to be computed.
- - **inputsx3** (Tensor) - The thirteenth input tensor to be computed.
-
- Outputs:
- Tuple of 2 Tensors.
-
- - **add3** (Tensor) - the shape is the same as the one after broadcasting, and the data type is
- the one with higher precision or higher digits among the inputs.
- - **realdiv4** (Tensor) - the shape is the same as the one after broadcasting, and the data type is
- the one with higher precision or higher digits among the inputs.
-
- Examples:
- >>> lamb_next_mv = LambNextMV()
- >>> i1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> i2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> i3 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> i4 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> i5 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> i6 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> i7 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> i8 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> i9 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> x0 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> x1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> x2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32))
- >>> x3 = Tensor(np.ones([1, 16]).astype(np.float32) * 1e-6)
- >>> lamb_next_mv(i1, i2, i3, i4, i5, i6, i7, i8, i9, x0, x1, x2, x3)
-
- """
-
- def __init__(self):
- super(LambNextMV, self).__init__()
- self.mul = P.Mul()
- self.add = P.TensorAdd()
- self.div = P.RealDiv()
- self.sqrt = P.Sqrt()
- self.rsqrt = P.Rsqrt()
- self.fake_output_assign_1 = InplaceAssign()
- self.fake_output_assign_1.add_prim_attr("fake_output", False)
- self.fake_output_assign_2 = InplaceAssign()
- self.fake_output_assign_2.add_prim_attr("fake_output", False)
-
-
- def construct(self, input1, input2, input3, input4, input5, input6, input7,
- input8, input9, inputx0, inputx1, inputx2, inputx3):
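-         # Follows the docstring formulas: add0 and add1 are the updated moving averages and are
-         # written back into input5 and input2 via InplaceAssign; add3 corresponds to y_2 and
-         # realdiv4 to y_3 (the two outputs differ only in where x_3 is added around the sqrt).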
- mul3 = self.mul(inputx1, input1)
- mul2 = self.mul(inputx0, input2)
- add1 = self.add(mul2, mul3)
- realdiv1 = self.div(add1, input3)
- add2 = self.add(realdiv1, inputx3)
- sqrt0 = self.rsqrt(add2)
- sqrt1 = self.sqrt(realdiv1)
- add4 = self.add(sqrt1, inputx3)
- mul1 = self.mul(input9, input4)
- mul0 = self.mul(input8, input5)
- add0 = self.add(mul0, mul1)
- realdiv0 = self.div(add0, input6)
- realdiv2 = self.mul(realdiv0, sqrt0)
- realdiv4 = self.div(realdiv0, add4)
- mul4 = self.mul(inputx2, input7)
- add3 = self.add(realdiv2, mul4)
-
- add3 = self.fake_output_assign_1(input5, add0, add3)
- add3 = self.fake_output_assign_2(input2, add1, add3)
-
- return add3, realdiv4