# Copyright 2020 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ Graph kernels. They are composites of basic primitives and can be compiled into a fused kernel automaticly when context.set_context(enable_graph_kernel=True). """ from ...common import dtype as mstype from ...ops import operations as P from ...ops.primitive import PrimitiveWithInfer, prim_attr_register from ...ops.composite import multitype_ops as C from ...ops.operations import _grad_ops as G from ..._checkparam import ParamValidator as validator from ..cell import Cell, GraphKernel class InplaceAssign(PrimitiveWithInfer): """ Inplace assign `Parameter` with a value. This primitive can only use in graph kernel. Inputs: - **variable** (Parameter) - The `Parameter`. - **value** (Tensor) - The value to assign. - **depend** (Tensor) - The dependent tensor to keep this op connected in graph. Outputs: Tensor, has the same type as original `variable`. Examples: >>> def construct(self, x): >>> val = x - 1.0 >>> ret = x + 2.0 >>> return InplaceAssign()(x, val, ret) >>> x = Tensor([2.0], mindspore.float32) >>> net = Net() >>> net(x) """ @prim_attr_register def __init__(self): self.init_prim_io_names(inputs=['x', 'y', 'z'], outputs=['output']) def infer_shape(self, x, y, z): return z def infer_dtype(self, x, y, z): return z def get_bprop(self): def bprop(x, y, z, out, dout): return (x, C.zeros_like(y), dout) return bprop class MaximumGrad(GraphKernel): """ Backprop function for Maximum operator. Inputs: - **x** (Tensor) - The first input tensor of maximum. - **y** (Tensor) - The second input tensor of maximum. - **dout** (Tensor) - has the same shape as x and y, next operator's backprop output. Outputs: dx (Tensor): has the same shape as x and y, returns dout element if `x >= y` returns true at the same position, or returns zero at that position dy (Tensor): has the same shape as x and y, dy = dout - dx Examples: >>> layer = MaximumGrad() >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6])) """ def __init__(self, grad_x=True, grad_y=True): super(MaximumGrad, self).__init__() self.grad_x = grad_x self.grad_y = grad_y self.select = P.Select() self.greater_equal = P.GreaterEqual() self.zeros_like = P.ZerosLike() self.sub = P.Sub() def construct(self, x, y, dout): cmp_result = self.greater_equal(x, y) dx = self.select(cmp_result, dout, self.zeros_like(dout)) dy = dout - dx return dx, dy class MinimumGrad(GraphKernel): """ Backprop function for Minimum operator. Compares x and y elementwise, dout should has the same shape with x and y. Inputs: - **x** (Tensor) - The first input - **y** (Tensor) - x and y should have same shape - **dout** (Tensor) - Has the same shape as x and y, next operator's backprop output Outputs: - dx (Tensor) - Has the same shape as x and y, returns dout element if `x <= y` returns true at the same position, or returns zero at that position - dy (Tensor) - Has the same shape as x and y, dy = dout - dx Examples: >>> layer = MinimumGrad() >>> output = layer(Tensor([1,2,3], [3, 2, 1], [4, 5, 6])) """ def __init__(self, grad_x=True, grad_y=True): super(MinimumGrad, self).__init__() self.grad_x = grad_x self.grad_y = grad_y self.select = P.Select() self.less_equal = P.LessEqual() self.zeros_like = P.ZerosLike() self.sub = P.Sub() def construct(self, x, y, dout): cmp_result = self.less_equal(x, y) dx = self.select(cmp_result, dout, self.zeros_like(dout)) # dy = self.select(cmp_result, self.zeros_like(dout), dout) dy = dout - dx return dx, dy class AbsGrad(GraphKernel): """ Abs's backprop function. Inputs: **input_x** (Tensor) - input data of this operator. **dout** (Tensor) - output of the next operator's backprop function. Outputs: Tensor, has the same shape as input_x. Examples: >>> back = AbsGrad() >>> output = back(Tensor([1, 2, 3]), Tensor([4, 5, 6])) """ def __init__(self): super(AbsGrad, self).__init__() self.mul = P.Mul() self.abs = P.Abs() self.add = P.TensorAdd() self.div = P.RealDiv() self.round = P.Round() def construct(self, input_x, dout): NUM_MAX = 32768 mul_max = self.mul(input_x, P.Fill()(P.DType()(input_x), (1,), NUM_MAX)) res_abs = self.abs(mul_max) res_div = self.div(mul_max, res_abs) res_round = self.round(res_div) res = self.mul(res_round, dout) return res class ApplyMomentum(GraphKernel): """ Update parameter according to the ApplyMomentum algorithm. Inputs: variable (Tensor): mutable tensor var accumulation (Tensor): mutable tensor accum learning_rate (float32): learning rate gradient (float32): The gradient momentum (float32): Momentum Outputs: updated accumulation and variable """ def __init__(self, use_nesterov=False, use_locking=False, gradient_scale=1.0): super(ApplyMomentum, self).__init__() self.gradient_scale = validator.check_type('gradient_scale', gradient_scale, [float]) self.fake_output_assign_1 = InplaceAssign() self.fake_output_assign_1.add_prim_attr("fake_output", True) self.fake_output_assign_2 = InplaceAssign() self.fake_output_assign_2.add_prim_attr("fake_output", True) def construct(self, variable, accumulation, learning_rate, gradient, momentum): gradient = gradient * self.gradient_scale momt_accumulation = accumulation * momentum accumulation_inplace = momt_accumulation + gradient sum_gradient = accumulation_inplace * learning_rate variable_inplace = variable - sum_gradient accumulation_inplace = self.fake_output_assign_1(accumulation, accumulation_inplace, accumulation_inplace) variable_inplace = self.fake_output_assign_2(variable, variable_inplace, variable_inplace) return accumulation_inplace, variable_inplace class BiasAdd(GraphKernel): """ Return the sum of x and bias. Inputs: x (Tensor): Tensor of input data. bias (Tensor): The bias tensor. Output: Tensor, the sum of x and bias. Example: >>> layer = BiasGrad() >>> output = BiasAdd(Tensor([1, 2, 3]), Tensor([1,])) """ def __init__(self): super(BiasAdd, self).__init__() def construct(self, x, bias): shape = P.Shape()(x) if len(shape) == 4: bias_shape = (1, P.Shape()(bias)[0], 1, 1) # NCHW else: bias_shape = (1, P.Shape()(bias)[0]) res = x + P.Reshape()(bias, bias_shape) return res class BiasAddGrad(GraphKernel): """ Computes gradients of BiasAdd. Inputs: x (Tensor): the gradients of bias add output. Output: Tensor, the gradients of bias add input. Examples: >>> dout = Tensor(np.ones(shape=[1, 2, 3, 4]), mindspore.float32) >>> bias_add_grad = BiasAddGrad() >>> dx = bias_add_grad(dout) """ def __init__(self): super(BiasAddGrad, self).__init__() def construct(self, x): shape_x = P.Shape()(x) reduce_axis = [0] for i in range(2, len(shape_x)): reduce_axis.append(i) res = P.ReduceSum()(x, reduce_axis) return res class EqualCount(GraphKernel): """ Computes the number of the same elements of two tensors. The two input tensors should have same shape and data type. Inputs: x (Tensor): the first input tensor. y (Tensor): the second input tensor. Outputs: Tensor, the type is same as input tensor and size as (1,). Examples: >>> x = Tensor(np.array([1, 2, 3]), mindspore.int32) >>> y = Tensor(np.array([1, 2, 4]), mindspore.int32) >>> equal_count = EqualCount() >>> equal_count(x, y) """ def __init__(self): super(EqualCount, self).__init__() def construct(self, x, y): equal_bool = P.Equal()(P.Cast()(x, mstype.float32), P.Cast()(y, mstype.float32)) equal_count = P.Cast()(equal_bool, mstype.float16) axes = (0,) res = P.ReduceSum()(equal_count, axes) res = P.Cast()(res, P.DType()(x)) return res class ReduceMean(GraphKernel): """ Reduce a dimension of a tensor by averaging all elements in the dimension. The dtype of the tensor to be reduced is number. Args: keep_dims (bool): If True, keep these reduced dimensions and the length is 1. If False, don't keep these dimensions. Default : False. Inputs: - **input_x** (Tensor[Number]) - The input tensor. - **axis** (Union[int, tuple(int), list(int)]) - The dimensions to reduce. Default: (), reduce all dimensions. Only constant value is allowed. Outputs: Tensor, has the same dtype as the 'input_x'. - If axis is (), and keep_dims is false, the output is a 0-D tensor representing the sum of all elements in the input tensor. - If axis is int, set as 2, and keep_dims is false, the shape of output is :math:`(x_1, x_3, ..., x_R)`. - If axis is tuple(int), set as (2, 3), and keep_dims is false, the shape of output is :math:`(x_1, x_4, ..., x_R)`. Examples: >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> op = ReduceMean(keep_dims=True) >>> output = op(input_x, 1) """ def __init__(self, keep_dims=True): super(ReduceMean, self).__init__() self.keep_dims = validator.check_type('keep_dims', keep_dims, [bool]) self.sum = P.ReduceSum(self.keep_dims) def construct(self, x, axis): shape = P.Shape()(x) value_num = 1 for i in axis: value_num *= shape[i] data_sum = self.sum(x, axis) avg = 1.0 / P.Fill()(P.DType()(x), (1,), value_num) res = data_sum * avg return res class ReLU(GraphKernel): r""" Computes ReLU(Rectified Linear Unit) of input tensor element-wise. It returns :math:`\max(x,\ 0)` element-wise. Inputs: - **input_x** (Tensor) - The input tensor. Outputs: Tensor, with the same type and shape as the `input_x`. Examples: >>> input_x = Tensor(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]]), mindspore.float32) >>> relu = ReLU() >>> result = relu(input_x) [[0, 4.0, 0.0], [2.0, 0.0, 9.0]] """ def __init__(self): super(ReLU, self).__init__() self.max = P.Maximum() def construct(self, x): return self.max(P.Fill()(P.DType()(x), P.Shape()(x), 0.0), x) class SoftmaxCrossEntropyWithLogits(GraphKernel): r""" Gets the softmax cross-entropy value between logits and labels which shoule be one-hot encoding. Note: Sets input logits as `X`, input label as `Y`, output as `loss`. Then, .. math:: p_{ij} = softmax(X_{ij}) = \frac{exp(x_i)}{\sum_{j = 0}^{N-1}\exp(x_j)} .. math:: loss_{ij} = -\sum_j{Y_{ij} * ln(p_{ij})} Inputs: - **logits** (Tensor) - Input logits, with shape :math:`(N, C)`. - **labels** (Tensor) - Ground truth labels, with shape :math:`(N, C)`. Outputs: Tuple of 2 Tensor, the loss shape is `(N,)`, and the dlogits with the same shape as `logits`. Examples: >>> logits = Tensor([[2, 4, 1, 4, 5], [2, 1, 2, 4, 3]], mindspore.float32) >>> labels = Tensor([[0, 0, 0, 0, 1], [0, 0, 0, 1, 0]], mindspore.float32) >>> softmax_cross = SoftmaxCrossEntropyWithLogits() >>> loss, backprop = softmax_cross(logits, labels) """ def __init__(self): super(SoftmaxCrossEntropyWithLogits, self).__init__() self.max = P.ReduceMax(keep_dims=True) self.sum_keep_dims = P.ReduceSum(keep_dims=True) def construct(self, features, labels): data_max = self.max(features, (1,)) data_sub = features - data_max data_exp = P.Exp()(data_sub) data_sum = self.sum_keep_dims(data_exp, (1,)) data_div = data_exp / data_sum data_log_tmp = P.Log()(data_sum) data_log = data_sub - data_log_tmp data_mul = labels * data_log data_muls = P.Neg()(data_mul) loss = P.ReduceSum()(data_muls, (1,)) backprop = data_div - labels return loss, backprop def bprop(self, features, labels, out, dout): grad = out[1] grad = grad * P.ExpandDims()(dout[0], -1) return grad, P.ZerosLike()(labels) class LayerNormForward(GraphKernel): """ Forward function of the LayerNorm operator. """ def __init__(self, begin_norm_axis=1, begin_params_axis=1): super(LayerNormForward, self).__init__() self.begin_norm_axis = validator.check_type('begin_norm_axis', begin_norm_axis, [int]) self.begin_params_axis = validator.check_type('begin_params_axis', begin_params_axis, [int]) self.mul = P.Mul() self.sum_keep_dims = P.ReduceSum(keep_dims=True) self.sub = P.Sub() self.add = P.TensorAdd() self.log = P.Log() self.exp = P.Exp() self.eps = P.Eps() def construct(self, input_x, input_gamma, input_beta): shape_x = P.Shape()(input_x) # Calculate the scaling ratio of the average begin_norm_axis = self.begin_norm_axis if begin_norm_axis < 0: begin_norm_axis += len(shape_x) reduce_axis = () for i in range(len(shape_x)): if i > begin_norm_axis or i == begin_norm_axis: reduce_axis = reduce_axis + (i,) reduce_elts = 1.0 for i in reduce_axis: reduce_elts *= shape_x[i] mean_cof = 1.0 / reduce_elts # Calculate mean mean_muls = self.mul(input_x, mean_cof) mean = self.sum_keep_dims(mean_muls, reduce_axis) # Calculate variance variance_sub = self.sub(input_x, mean) variance_mul = self.mul(variance_sub, variance_sub) variance_muls = self.mul(variance_mul, mean_cof) variance = self.sum_keep_dims(variance_muls, reduce_axis) # Calculate normalize normalize_sub = self.sub(input_x, mean) epsilon = self.eps(input_x) normalize_add = self.add(variance, epsilon) normalize_log = self.log(normalize_add) normalize_log_mul = self.mul(normalize_log, -0.5) normalize_exp = self.exp(normalize_log_mul) normalize_mul = self.mul(normalize_sub, normalize_exp) # Calculate scale and translate if self.begin_params_axis == 0: scale_mul = self.mul(input_gamma, normalize_mul) res = self.add(scale_mul, input_beta) else: scale_mul = self.mul(input_gamma, normalize_mul) res = self.add(scale_mul, input_beta) return res, mean, variance class LayerNormXBackprop(GraphKernel): r""" Together with LayerNormBetaGammaBackprop, to supply the backprop functionality for LayerNorm. Note: Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`, input_gamma as :math:`\gamma`. Then, .. math:: \begin{array}{ll} \\ \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\ \frac {\partial L} {\partial x_i} = \frac{\gamma}{\sqrt{\sigma^2+\epsilon}} ( \frac{\partial L}{\partial y_i} - \frac{1}{m} \cdot \frac{\partial L}{\partial \beta} - \frac{\hat{x_i}}{m} \cdot \frac{\partial L}{\partial \gamma}) \end{array} Inputs: - **dy**(Tensor) - The first item of the next operator's backprop's output. - **input_x**(Tensor) - The first input of the forward function of LayerNorm. - **variance**(Tensor) - The second input of the forward function of LayerNorm. - **mean**(Tensor) - The third input of the forward function of LayerNorm. - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm. Outputs: Tensor, the output of this operator, will be used as the first item of the result of LayerNorm's backprop function, has the same shape and data type as 'input_x'. Examples: >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> op = LayerNormXBackprop(keep_dims=False) >>> output = op(dy, input_x, variance, mean, input_gamma) """ def __init__(self): super(LayerNormXBackprop, self).__init__() self.sum_keep_dims = P.ReduceSum(keep_dims=True) self.log = P.Log() self.exp = P.Exp() self.eps = P.Eps() def construct(self, dy, input_x, variance, mean, input_gamma): shape_x = P.Shape()(input_x) shape_mean = P.Shape()(mean) reduce_axis = () flag = -1 min_l = 0 if len(shape_x) > len(shape_mean): min_l = len(shape_x) else: min_l = len(shape_mean) for i in range(min_l): if (shape_x[i] != shape_mean[i]) and (flag == -1): flag = i if flag != -1: for i in range(flag, len(shape_x)): reduce_axis = reduce_axis + (i,) else: reduce_axis = reduce_axis + (len(shape_x) - 1,) mean_num = 1.0 for i in reduce_axis: mean_num *= shape_x[i] pd_xl = input_gamma * dy epsilon = self.eps(input_x) var_elta = variance + epsilon var_elta_log = self.log(var_elta) var_elta_mul = var_elta_log * -0.5 var_elta_2 = P.Exp()(var_elta_mul) pdvar1_mul = var_elta_2 * var_elta_2 pd_var_1 = pdvar1_mul * var_elta_2 sub_x_mean = input_x - mean pdvar_mul1 = pd_xl * sub_x_mean pdvar_sum = self.sum_keep_dims(pdvar_mul1, reduce_axis) pdvar_mul3 = pdvar_sum * pd_var_1 pd_var = pdvar_mul3 * -0.5 pdmean1_sum = self.sum_keep_dims(pd_xl, reduce_axis) pdmean1_mul = pdmean1_sum * var_elta_2 pd_mean_1 = pdmean1_mul * -1.0 pdmean2_mul1 = sub_x_mean * -2.0 pdmean2_sum = self.sum_keep_dims(pdmean2_mul1, reduce_axis) pdmean2_mul3 = pdmean2_sum * (1.0 / mean_num) pd_mean_2 = pd_var * pdmean2_mul3 pd_mean = pd_mean_2 + pd_mean_1 pd_x_1 = var_elta_2 * pd_xl pdx2_mul = pd_var * sub_x_mean pd_x_2 = pdx2_mul * (2.0 * (1.0 / mean_num)) pd_x_3 = pd_mean * (1.0 / mean_num) pdx_add = pd_x_1 + pd_x_2 pd_x = pdx_add + pd_x_3 return pd_x class LayerNormBetaGammaBackprop(GraphKernel): r""" Together with LayerNormXBackprop, to supply the backprop functionality for LayerNorm. Note: Sets input_x as :math:`x_i`, variance as :math:`\sigma^2`, mean as :math:`\mu`, input_gamma as :math:`\gamma`. Then, .. math:: \begin{array}{ll} \\ \hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\ \frac {\partial L} {\partial \beta} = \sum_{i=1}^m \\frac{\\partial L}{\partial y_i} \\ \frac {\partial L} {\partial \gamma} = \sum_{i=1}^m \\frac{\partial L}{\partial y_i} \cdot \hat{x_i} \end{array} Inputs: - **dy**(Tensor) - The first item of the next operator's backprop's output. - **input_x**(Tensor) - The first input of the forward function of LayerNorm. - **variance**(Tensor) - The second input of the forward function of LayerNorm. - **mean**(Tensor) - The third input of the forward function of LayerNorm. - **input_gamma**(Tensor) - The fourth input of the forward function of LayerNorm. Outputs: Tuple of 2 Tensor, the backprop outputs. - **pd_beta**(Tensor) - The first item of return value of this operator, will be used as the second item of the LayerNorm's backprop function. - **pd_gamma**(Tensor) - The second item of return value of this operator, will be used as the third item of the LayerNorm's backprop function. Examples: >>> dy = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> input_x = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> variance = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> mean = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> input_gamma = Tensor(np.random.randn(3, 4, 5, 6).astype(np.float32)) >>> op = LayerNormBetaGammaBackprop(keep_dims=False) >>> pd_beta, pd_gamma = op(dy, input_x, variance, mean, input_gamma) """ def __init__(self): super(LayerNormBetaGammaBackprop, self).__init__() self.sum_not_keep_dims = P.ReduceSum(keep_dims=False) self.log = P.Log() self.exp = P.Exp() self.eps = P.Eps() def construct(self, dy, input_x, variance, mean, shape_gamma): shape_x = P.Shape()(input_x) params_axis = () if len(shape_x) != len(shape_gamma): sub = len(shape_x) - len(shape_gamma) for i in range(sub): params_axis = params_axis + (i,) pd_beta = self.sum_not_keep_dims(dy, params_axis) epsilon = self.eps(input_x) var_elta = variance + epsilon var_elta_log = self.log(var_elta) var_elta_mul = var_elta_log * -0.5 var_elta_2 = P.Exp()(var_elta_mul) sub_x_mean = input_x - mean var_elta_2_cast = var_elta_2 xl_mul = var_elta_2_cast * sub_x_mean pdga_mul = dy * xl_mul pd_gamma = self.sum_not_keep_dims(pdga_mul, params_axis) return pd_beta, pd_gamma class LogSoftmax(GraphKernel): r""" Log Softmax activation function. Applies the Log Softmax function to the input tensor on the specified axis. Suppose a slice in the given aixs :math:`x` then for each element :math:`x_i` the Log Softmax function is shown as follows: .. math:: \text{output}(x_i) = \log \left(\frac{exp(x_i)} {\sum_{j = 0}^{N-1}\exp(x_j)}\right), where :math:`N` is the length of the Tensor. Args: axis (int): The axis to do the Log softmax operation. Default: -1. Inputs: logits (Tensor): The input of Log Softmax. Outputs: Tensor, with the same type and shape as the logits. Examples: >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32) >>> log_softmax = LogSoftmax() >>> log_softmax(input_x) [-4.4519143, -3.4519143, -2.4519143, -1.4519144, -0.4519144] """ def __init__(self, axis=-1): super(LogSoftmax, self).__init__() self.axis = validator.check_type('axis', axis, [int]) self.max_keep_dims = P.ReduceMax(keep_dims=True) self.sub = P.Sub() self.exp = P.Exp() self.sum_keep_dims = P.ReduceSum(keep_dims=True) self.log = P.Log() self.mul = P.Mul() def construct(self, input_x): data_max = self.max_keep_dims(input_x, (self.axis,)) data_sub = self.sub(input_x, data_max) data_exp = self.exp(data_sub) data_sum = self.sum_keep_dims(data_exp, (self.axis,)) data_log = self.log(data_sum) res = self.sub(data_sub, data_log) return res def bprop(self, input_x, out, dout): input_x = out input_dy = dout data_exp = self.exp(input_x) data_sum = self.sum_keep_dims(input_dy, (self.axis,)) data_softmax = self.mul(data_exp, data_sum) res = self.sub(input_dy, data_softmax) return (res,) class Tanh(GraphKernel): r""" Tanh activation function. Computes hyperbolic tangent of input element-wise. The Tanh function is defined as: .. math:: tanh(x_i) = \frac{\exp(x_i) - \exp(-x_i)}{\exp(x_i) + \exp(-x_i)} = \frac{\exp(2x_i) - 1}{\exp(2x_i) + 1}, where :math:`x_i` is an element of the input Tensor. Inputs: - **input_x** (Tensor) - The input of Tanh. Outputs: Tensor, with the same type and shape as the input_x. Examples: >>> input_x = Tensor(np.array([1, 2, 3, 4, 5]), mindspore.float32) >>> tanh = Tanh() >>> tanh(input_x) [0.7615941, 0.9640276, 0.9950548, 0.9993293, 0.99990916] """ def __init__(self): super(Tanh, self).__init__() self.abs = P.Abs() self.add = P.TensorAdd() self.div = P.RealDiv() self.mul = P.Mul() self.mul_fp16 = P.Mul() self.mul_fp16.add_prim_attr("output_precision", "float16") self.exp = P.Exp() def construct(self, input_x): input_abs = self.abs(input_x) sign_flag = self.div(input_x, input_abs) sign_flag_neg = self.mul(sign_flag, -1.0) power_val = self.mul(input_abs, -2.0) exp_val = self.exp(power_val) up_val = self.add(exp_val, -1.0) down_val = self.add(exp_val, 1.0) div_val = self.div(up_val, down_val) res = self.mul(sign_flag_neg, div_val) return res def bprop(self, input_x, out, dout): input_y = out input_dy = dout data_square = self.mul(input_y, input_y) data_mul = self.mul(data_square, -1.0) anuminate = self.add(data_mul, 1.0) res = self.mul_fp16(anuminate, input_dy) return (res,) class TanhGrad(GraphKernel): """ Backprop function of Tanh Mathematical calculating: result = Tanh(out) result = 1 - result * result result = result * dout Inputs: out (Tensor): Tanh's output dout (Tensor): next layer's backward function's output, has same shape as out Outputs: result (Tensor): result of (1 - tanh(out)^2) * dout Examples: >>> x_np = np.random.randn(5, 3, 6).astype(np.float16) >>> dy_np = np.random.randn(5, 3, 6).astype(np.float16) >>> x_ms = Tensor(x_np) >>> dy_ms = Tensor(dy_np) >>> tanh_grad = TanhGrad() >>> out = tanh_grad(x_np, dy_np) """ def __init__(self): super(TanhGrad, self).__init__() self.add = P.TensorAdd() self.mul = P.Mul() self.mul_fp16 = P.Mul() self.mul_fp16.add_prim_attr("output_precision", "float16") def construct(self, out, dout): input_y = out input_dy = dout data_square = self.mul(input_y, input_y) data_mul = self.mul(data_square, -1.0) anuminate = self.add(data_mul, 1.0) res = self.mul_fp16(anuminate, input_dy) return res class Gelu(GraphKernel): r""" Gaussian Error Linear Units activation function. GeLU is described in the paper `Gaussian Error Linear Units (GELUs) `_. And also please refer to `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. `_. Defined as follows: .. math:: \text{output} = 0.5 * x * (1 + erf(x / \sqrt{2})), where :math:`erf` is the "Gauss error function" . Inputs: - **input_x** (Tensor) - Input to compute the Gelu. Outputs: Tensor, with the same type and shape as input. Examples: >>> tensor = Tensor(np.array([1.0, 2.0, 3.0]), mindspore.float32) >>> gelu = Gelu() >>> result = gelu(tensor) """ def __init__(self): super(Gelu, self).__init__() self.add = P.TensorAdd() self.abs = P.Abs() self.exp = P.Exp() self.neg = P.Neg() self.minimum = P.Minimum() self.div = P.RealDiv() self.mul = P.Mul() self.CSVALUE = 0.044715 self.CSVALUE_A = 1.59576912 self.CSVALUE_5 = 0.3989422804 self.CSVALUE_3B = 0.2140644488 def construct(self, input_x): def _tanh_parameter_compute(data_x): """ compute the parameter of tanh: return: result equal (x+0.044715*tf.pow(x,3)) """ mul_0 = self.mul(data_x, data_x) pow_0 = self.mul(mul_0, data_x) mul_1 = self.mul(pow_0, self.CSVALUE) result = self.add(data_x, mul_1) return result tanh_parameter = _tanh_parameter_compute(input_x) mul_0 = self.mul(tanh_parameter, 1.5957691) mul_0_min = self.minimum(mul_0, 0.0) right_mul = self.exp(mul_0_min) mul_0_abs = self.abs(mul_0) mul_0_abs_neg = self.mul(mul_0_abs, -1.0) mul_0_abs_neg_exp = self.exp(mul_0_abs_neg) mul_0_abs_neg_exp_add = self.add(mul_0_abs_neg_exp, 1.0) left_mul = self.div(input_x, mul_0_abs_neg_exp_add) result = self.mul(left_mul, right_mul) return result def bprop(self, input_x, out, dout): """ register backprop function for Gelu """ data_x = input_x data_gelu = out data_dy = dout def _math_four_compute(data_x): """ return: math_four equal 2*(np(sqrt(2 / np.pi)*(x + 0.044715*tf.pow(x, 3))) """ datax_pow = data_x * data_x * data_x datax_muls_c = self.mul(datax_pow, self.CSVALUE) datax_addx = self.add(datax_muls_c, data_x) datax_muls_s = self.mul(datax_addx, self.CSVALUE_A) return datax_muls_s # common part math_four = _math_four_compute(data_x) math_four_abs = self.abs(math_four) math_four_abs_neg = self.mul(math_four_abs, -1.0) math_four_abs_neg_exp = self.exp(math_four_abs_neg) math_four_min = self.minimum(math_four, 0.0) # dividend part datax_pow = self.mul(data_x, data_x) datax_pow_mul = self.mul(datax_pow, self.CSVALUE_3B) datax_pow_mul_add = self.add(datax_pow_mul, self.CSVALUE_A) data_gelu_mul = self.mul(data_gelu, datax_pow_mul_add) math_four_min_2 = self.mul(math_four_min, 2.0) div_right = self.mul(data_gelu_mul, math_four_abs_neg_exp) div_left = self.exp(math_four_min_2) dividend = self.add(div_left, div_right) # divisor part div_0 = self.add(math_four_abs_neg_exp, 1.0) div_1 = self.exp(math_four_min) divisor = self.mul(div_1, div_0) res_grad = self.div(dividend, divisor) result = self.mul(res_grad, data_dy) return (result,) class Softmax(GraphKernel): """ Operator Softmax .. math: `exp(x-max(x)) / sum(exp(x-max(x)))` Args: axis (int, tuple): Axis along which the softmax normalization is applied Inputs: x (Tensor): input data for softmax Outputs: output (Tensor): a tensor with the same shape of the input Examples: >>> layer = Softmax(1) >>> x = Tensor(np.array([1.2, 2.1], [2.2, 3.2]), mindspore.float32) >>> output = layer(x) """ def __init__(self, axis): super(Softmax, self).__init__() validator.check_type("axis", axis, [int, tuple]) if isinstance(axis, int): self.axis = (axis,) else: self.axis = axis for item in self.axis: validator.check_type("item of axis", item, [int]) self.max = P.ReduceMax(keep_dims=True) self.sub = P.Sub() self.exp = P.Exp() self.sum = P.ReduceSum(keep_dims=True) self.mul = P.Mul() def construct(self, x): max_x = self.max(x, self.axis) data_sub = self.sub(x, max_x) data_exp = self.exp(data_sub) data_expsum = self.sum(data_exp, self.axis) output = data_exp / data_expsum return output def bprop(self, x, out, dout): mul_res = self.mul(dout, out) sum_res = self.sum(mul_res, self.axis) sub_res = self.sub(dout, sum_res) res = self.mul(sub_res, out) return (res,) class LayerNorm(Cell): r""" Applies Layer Normalization over a mini-batch of inputs. Layer normalization is widely used in recurrent neural networks. It applies normalization on a mini-batch of inputs for each single training case as described in the paper `Layer Normalization `_. Unlike batch normalization, layer normalization performs exactly the same computation at training and testing time. It can be described using the following formula. It is applied across all channels and pixel but only one batch size. .. math:: y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta Args: normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis `begin_norm_axis ... R - 1`. begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1. begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1. gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight. The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', 'he_uniform', etc. Default: 'ones'. beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight. The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform', 'he_uniform', etc. Default: 'zeros'. Inputs: - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`, and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`. Outputs: Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`. Examples: >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32) >>> shape1 = x.shape[1:] >>> m = G.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) >>> m(x) """ def __init__(self, begin_norm_axis=-1, begin_params_axis=-1 ): super(LayerNorm, self).__init__() self.begin_norm_axis = begin_norm_axis self.begin_params_axis = begin_params_axis self.layer_norm = LayerNormForward(begin_norm_axis, begin_params_axis) self.layer_norm_x_grad = LayerNormXBackprop() self.layer_norm_beta_gamma = LayerNormBetaGammaBackprop() self.layer_norm_grad = G.LayerNormGrad(self.begin_norm_axis, self.begin_params_axis) def construct(self, input_x, input_gamma, input_beta): return self.layer_norm(input_x, input_gamma, input_beta) # case 1 def bprop(self, input_x, input_gamma, input_beta, out, dout): dx, d_gamma, d_beta = self.layer_norm_grad(input_x, dout[0], out[2], dout[1], input_gamma) return dx, d_gamma, d_beta class LambUpdateWithLR(GraphKernel): r""" Part of Lamb optimizer. .. math:: s_1 = select(i_1 \gt y_g, select(i_0 \gt y_g, \frac{i_1}{i_2}, se), se) i_5 = i_5 - max(min(s_1, y_m), y_g) \times i_3 \times i_4 Inputs: - **input0** (Tensor) - The first tensor to be computed. - **input1** (Tensor) - The second tensor to be computed. - **input2** (Tensor) - The third tensor to be computed. - **input3** (Tensor) - The fourth tensor to be computed. - **input4** (Tensor) - The fifth tensor to be computed. - **input5** (Tensor) - The sixth tensor to be computed. It will be updated by result. - **greater_y** (Tensor) - The seventh tensor to be computed. - **select_e** (Tensor) - The eighth tensor to be computed. - **minimum_y** (Tensor) - The ninth tensor to be computed. Outputs: A fake output tensor. Examples: >>> lamb_update = LambUpdateWithLR() >>> i0 = np.random.normal(0, 1, [1, 16]).astype(np.float32) >>> i1 = np.random.normal(0, 1, [1]).astype(np.float32) >>> i2 = np.random.normal(0, 1, [1]).astype(np.float32) >>> i3 = np.random.normal(0, 1, [1]).astype(np.float32) >>> i4 = np.random.normal(0, 1, [1, 16]).astype(np.float32) >>> i5 = np.random.normal(0, 1, [1, 16]).astype(np.float32) >>> yg = np.random.normal(0, 1, [1]).astype(np.float32) >>> se = np.random.normal(0, 1, [1]).astype(np.float32) >>> ym = np.random.normal(0, 1, [1]).astype(np.float32) >>> lamb_update(i0, i1, i2, i3, i4, i5, yg, se, ym) """ def __init__(self): super(LambUpdateWithLR, self).__init__() self.greater = P.Greater() self.select = P.Select() self.div = P.RealDiv() self.min = P.Minimum() self.max = P.Maximum() self.mul = P.Mul() self.sub = P.Sub() self.fake_output_assign = InplaceAssign() self.fake_output_assign.add_prim_attr("fake_output", True) def construct(self, input0, input1, input2, input3, input4, input5, greater_y, select_e, minimum_y): greater0 = self.greater(input0, greater_y) greater1 = self.greater(input1, greater_y) real_div0 = self.div(input1, input2) select0 = self.select(greater0, real_div0, select_e) select1 = self.select(greater1, select0, select_e) min0 = self.min(select1, minimum_y) max0 = self.max(min0, greater_y) mul0 = self.mul(max0, input3) mul1 = self.mul(mul0, input4) sub0 = self.sub(input5, mul1) sub0 = self.fake_output_assign(input5, sub0, sub0) return sub0 class LambNextMV(GraphKernel): r""" Part of Lamb optimizer. .. math:: rd_0 = \frac{i_8 \times i_5 + i_9 \times i_4}{i6} rd_1 = \frac{x_0 \times i_2 + x_1 \times i_1}{i3} y_2 = \frac{rd_0}{\sqrt{rd_1 + x3}} + x_2 \times i_7 y_3 = \frac{rd_0}{\sqrt{rd_1} + x3} i5 = i_8 \times i_5 + i_9 \times i_4 i2 = x_0 \times i_2 + x_1 \times i_1 Inputs: - **inputs1** (Tensor) - The first input tensor to be computed. - **inputs2** (Tensor) - The second input tensor to be computed. It will be updated by result. - **inputs3** (Tensor) - The third input tensor to be computed. - **inputs4** (Tensor) - The fourth input tensor to be computed. - **inputs5** (Tensor) - The fifth input tensor to be computed. It will be updated by result. - **inputs6** (Tensor) - The sixth input tensor to be computed. - **inputs7** (Tensor) - The seventh input tensor to be computed. - **inputs8** (Tensor) - The eighth input tensor to be computed. - **inputs9** (Tensor) - The ninth input tensor to be computed. - **inputsx0** (Tensor) - The tenth input tensor to be computed. - **inputsx1** (Tensor) - The eleventh input tensor to be computed. - **inputsx2** (Tensor) - The twelfth input tensor to be computed. - **inputsx3** (Tensor) - The thirteenth input tensor to be computed. Outputs: Tuple of 2 Tensor. - **add3** (Tensor) - The shape is the same as the shape after broadcasting, and the data type is the one with high precision or high digits among the inputs. - **realdiv4** (Tensor) - The shape is the same as the shape after broadcasting, and the data type is the one with high precision or high digits among the inputs. Examples: >>> lamb_next_mv = LambNextMV() >>> i1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> i2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> i3 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> i4 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> i5 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> i6 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> i7 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> i8 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> i9 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> x0 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> x1 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> x2 = Tensor(np.random.normal(0, 1, [1, 16]).astype(np.float32)) >>> x3 = Tensor(np.ones([1, 16]).astype(np.float32) * 1e-6) >>> lamb_next_mv(i1, i2, i3, i4, i5, i6, i7, i8, i9, x0, x1, x2, x3) """ def __init__(self): super(LambNextMV, self).__init__() self.mul = P.Mul() self.add = P.TensorAdd() self.div = P.RealDiv() self.sqrt = P.Sqrt() self.rsqrt = P.Rsqrt() self.fake_output_assign_1 = InplaceAssign() self.fake_output_assign_1.add_prim_attr("fake_output", False) self.fake_output_assign_2 = InplaceAssign() self.fake_output_assign_2.add_prim_attr("fake_output", False) def construct(self, input1, input2, input3, input4, input5, input6, input7, input8, input9, inputx0, inputx1, inputx2, inputx3): mul3 = self.mul(inputx1, input1) mul2 = self.mul(inputx0, input2) add1 = self.add(mul2, mul3) realdiv1 = self.div(add1, input3) add2 = self.add(realdiv1, inputx3) sqrt0 = self.rsqrt(add2) sqrt1 = self.sqrt(realdiv1) add4 = self.add(sqrt1, inputx3) mul1 = self.mul(input9, input4) mul0 = self.mul(input8, input5) add0 = self.add(mul0, mul1) realdiv0 = self.div(add0, input6) realdiv2 = self.mul(realdiv0, sqrt0) realdiv4 = self.div(realdiv0, add4) mul4 = self.mul(inputx2, input7) add3 = self.add(realdiv2, mul4) add3 = self.fake_output_assign_1(input5, add0, add3) add3 = self.fake_output_assign_2(input2, add1, add3) return add3, realdiv4