# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""layers for second order optimization"""
import numpy as np
import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor
from mindspore.common.initializer import initializer, Initializer
from mindspore.ops import operations as P
from mindspore.common.parameter import Parameter
from mindspore._checkparam import Validator, Rel, twice
from mindspore import context
from mindspore.nn.cell import Cell
from mindspore.nn.layer.activation import get_activation


__all__ = ['Dense_Thor', 'Conv2d_Thor', 'Embedding_Thor']

class Dense_Thor(Cell):
    r"""
    The dense connected layer.

    Applies dense connected layer for the input. This layer implements the operation as:

    .. math::
        \text{outputs} = \text{activation}(\text{inputs} * \text{kernel} + \text{bias}),

    where :math:`\text{activation}` is the activation function passed as the activation
    argument (if passed in), :math:`\text{kernel}` is a weight matrix with the same
    data type as the inputs created by the layer, and :math:`\text{bias}` is a bias vector
    with the same data type as the inputs created by the layer (only if has_bias is True).

    Args:
        in_channels (int): The number of channels in the input space.
        out_channels (int): The number of channels in the output space.
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable weight_init parameter. The dtype
            is same as input x. The values of str refer to the function `initializer`. Default: 'normal'.
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is
            same as input x. The values of str refer to the function `initializer`. Default: 'zeros'.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
        activation (str): activate function applied to the output of the fully connected layer, eg. 'ReLU'.
            Default: None.

    Raises:
        ValueError: If weight_init or bias_init shape is incorrect.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, in\_channels)`.

    Outputs:
        Tensor of shape :math:`(N, out\_channels)`.

    Examples:
        >>> input = Tensor(np.random.randint(0, 255, [2, 3]), mindspore.float32)
        >>> net = nn.Dense(3, 4)
        >>> net(input)
        [[ 2.5246444   2.2738023   0.5711005  -3.9399147 ]
         [ 1.0739875   4.0155234   0.94188046 -5.459526  ]]
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 has_bias=True,
                 activation=None):
        super(Dense_Thor, self).__init__()
        self.thor = True
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.has_bias = Validator.check_bool(has_bias)
        if isinstance(weight_init, Tensor):
            if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \
                    weight_init.shape[1] != in_channels:
                raise ValueError("Weight init shape error.")
        self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight")

        self.bias = None
        if self.has_bias:
            if isinstance(bias_init, Tensor):
                if bias_init.dim() != 1 or bias_init.shape[0] != out_channels:
                    raise ValueError("Bias init shape error.")
            self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias")
            self.bias_add = P.BiasAdd()

        self.matmul = P.MatMul(transpose_b=True)
        self.activation = get_activation(activation)
        self.activation_flag = self.activation is not None

        self.matrix_A = Parameter(Tensor(np.zeros([in_channels, in_channels]).astype(np.float32)),
                                  name='matrix_A', requires_grad=False)
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.mul = P.Mul()
        self.is_Ascend = True
        if context.get_context("device_target") == "Ascend":
            if out_channels == 1001:
                self.matrix_G = Parameter(Tensor(np.zeros([1024, 1024]).astype(np.float32)),
                                          name='matrix_G', requires_grad=False)
                self.pad = P.Pad(((0, 23), (0, 23)))
                self.pad1 = P.Pad(((0, 7), (0, 7)))
                self.slice = P.Slice()
                self.add = P.TensorAdd()
            else:
                self.matrix_G = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
                                          name="matrix_G", requires_grad=False)
                self.abs = P.Abs()
                self.reduce_max = P.ReduceMax(keep_dims=False)
                self.neg = P.Neg()
                self.reduce_sum = P.ReduceSum()
            self.matmul = P.MatMul(transpose_b=True)
            self.cube_matmul = P.CusMatMulCube(transpose_a=True)
            self.cast = P.Cast()
            self.is_nsp_layer = (out_channels == 2)
        else:
            self.is_Ascend = False
            self.matrix_G = Parameter(Tensor(np.eye(out_channels).astype(np.float32)),
                                      name="matrix_G", requires_grad=False)
            self.cube_matmul = P.MatMul(transpose_a=True)
        self.getG = P.InsertGradientOf(self.save_gradient)


    def save_gradient(self, dout):
        """
           this function only for thor optimizer
           save_gradient
        """
        out = dout
        if self.is_Ascend:
            if not self.is_nsp_layer:
                shape = self.shape(dout)
                normalizer = self.cast(shape[0], mstype.float32)
                matrix_G = self.cube_matmul(dout, dout)
                matrix_G = self.mul(matrix_G, 1.0 / normalizer)
                if self.out_channels == 1001:
                    matrix_G = P.Pad(((0, 23), (0, 23)))(matrix_G)
                self.matrix_G = matrix_G
        else:
            dout_shape = self.shape(dout)
            normalizer = dout_shape[0]
            matrix_G = self.cube_matmul(dout, dout)
            matrix_G = self.mul(matrix_G, 1.0 / normalizer)
            self.matrix_G = matrix_G
        return out

    def construct(self, x):
        if self.thor:
            if self.is_Ascend:
                inputs = self.cube_matmul(x, x)
                shape = self.shape(x)
                normalizer = self.cast(shape[0], mstype.float32)
                matrix_A = self.mul(inputs, 1.0 / normalizer)
                self.matrix_A = matrix_A
            else:
                inputs = self.cube_matmul(x, x)
                inputs_shape = self.shape(inputs)
                normalizer = inputs_shape[0]
                matrix_A = self.mul(inputs, 1.0 / normalizer)
                self.matrix_A = matrix_A
            x = self.matmul(x, self.weight)
            x = self.getG(x)
        else:
            x = self.matmul(x, self.weight)
        if self.has_bias:
            x = self.bias_add(x, self.bias)
        if self.activation_flag:
            x = self.activation(x)
        return x

    def extend_repr(self):
        s = 'input_channels={}, output_channels={}'.format(self.in_channels, self.out_channels)
        if self.has_bias:
            s += ', has_bias={}'.format(self.has_bias)
        # if self.activation_flag:
        #     s += ', activation={}'.format(self.activation)
        return s

class _Conv(Cell):
    """
    Applies a N-D convolution over an input signal composed of several input planes.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 pad_mode,
                 padding,
                 dilation,
                 group,
                 has_bias,
                 weight_init,
                 bias_init,
                 transposed=False):
        super(_Conv, self).__init__()
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.kernel_size = kernel_size
        self.stride = stride
        self.pad_mode = pad_mode
        # self.weight_init = weight_init
        self.bias_init = bias_init
        if isinstance(padding, int):
            Validator.check_non_negative_int(padding, 'padding', self.cls_name)
            self.padding = padding
        elif isinstance(padding, tuple):
            for pad in padding:
                Validator.check_non_negative_int(pad, 'padding item', self.cls_name)
            self.padding = padding
        else:
            raise TypeError("padding type must be int/tuple(int) cannot be {}!".format(type(padding)))

        self.dilation = dilation
        self.group = Validator.check_positive_int(group)
        self.has_bias = has_bias
        if (not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \
                isinstance(kernel_size[0], bool) or isinstance(kernel_size[1], bool) or \
                kernel_size[0] < 1 or kernel_size[1] < 1:
            raise ValueError("Attr 'kernel_size' of 'Conv2D' Op passed "
                             + str(self.kernel_size) + ", should be a int or tuple and equal to or greater than 1.")
        if (not isinstance(stride[0], int)) or (not isinstance(stride[1], int)) or \
                isinstance(stride[0], bool) or isinstance(stride[1], bool) or stride[0] < 1 or stride[1] < 1:
            raise ValueError("Attr 'stride' of 'Conv2D' Op passed "
                             + str(self.stride) + ", should be a int or tuple and equal to or greater than 1.")
        if (not isinstance(dilation[0], int)) or (not isinstance(dilation[1], int)) or \
                isinstance(dilation[0], bool) or isinstance(dilation[1], bool) or dilation[0] < 1 or dilation[1] < 1:
            raise ValueError("Attr 'dilation' of 'Conv2D' Op passed "
                             + str(self.dilation) + ", should be a int or tuple and equal to or greater than 1.")
        if in_channels % group != 0:
            raise ValueError("Attr 'in_channels' of 'Conv2D' Op must be divisible by "
                             "attr 'group' of 'Conv2D' Op.")
        if out_channels % group != 0:
            raise ValueError("Attr 'out_channels' of 'Conv2D' Op must be divisible by "
                             "attr 'group' of 'Conv2D' Op.")
        if transposed:
            shape = [in_channels, out_channels // group, *kernel_size]
        else:
            shape = [out_channels, in_channels // group, *kernel_size]
        self.weight = Parameter(initializer(weight_init, shape), name='weight')

        if Validator.check_bool(has_bias):
            self.bias = Parameter(initializer(self.bias_init, [out_channels]), name='bias')
        else:
            if self.bias_init != 'zeros':
                logger.warning("Value of 'has_bias' is False, value of 'bias_init' will be ignored.")
            self.bias = None

    def construct(self, *inputs):
        """Must be overridden by all subclasses."""
        raise NotImplementedError


class Conv2d_Thor(_Conv):
    r"""
    2D convolution layer.

    Applies a 2D convolution over an input tensor which is typically of shape :math:`(N, C_{in}, H_{in}, W_{in})`,
    where :math:`N` is batch size, :math:`C_{in}` is channel number, and :math:`H_{in}, W_{in})` are height and width.
    For each batch of shape :math:`(C_{in}, H_{in}, W_{in})`, the formula is defined as:

    .. math::

        out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,

    where :math:`ccor` is the cross-correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
    from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
    filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
    of kernel and it has shape :math:`(\text{ks_h}, \text{ks_w})`, where :math:`\text{ks_h}` and
    :math:`\text{ks_w}` are the height and width of the convolution kernel. The full kernel has shape
    :math:`(C_{out}, C_{in} // \text{group}, \text{ks_h}, \text{ks_w})`, where group is the group number
    to split the input in the channel dimension.

    If the 'pad_mode' is set to be "valid", the output height and width will be
    :math:`\left \lfloor{1 + \frac{H_{in} + 2 \times \text{padding} - \text{ks_h} -
    (\text{ks_h} - 1) \times (\text{dilation} - 1) }{\text{stride}}} \right \rfloor` and
    :math:`\left \lfloor{1 + \frac{W_{in} + 2 \times \text{padding} - \text{ks_w} -
    (\text{ks_w} - 1) \times (\text{dilation} - 1) }{\text{stride}}} \right \rfloor` respectively.

    The first introduction can be found in paper `Gradient Based Learning Applied to Document Recognition
    <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
        out_channels (int): The number of output channel :math:`C_{out}`.
        kernel_size (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the height
            and width of the 2D convolution window. Single int means the value is for both the height and the width of
            the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
            width of the kernel.
        stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
            the height and width of movement are both strides, or a tuple of two int numbers that
            represent height and width of movement respectively. Default: 1.
        pad_mode (str): Specifies padding mode. The optional values are
            "same", "valid", "pad". Default: "same".

            - same: Adopts the way of completion. The height and width of the output will be the same as
              the input. The total number of padding will be calculated in horizontal and vertical
              directions and evenly distributed to top and bottom, left and right if possible. Otherwise, the
              last extra padding will be done from the bottom and the right side. If this mode is set, `padding`
              must be 0.

            - valid: Adopts the way of discarding. The possible largest height and width of output will be returned
              without padding. Extra pixels will be discarded. If this mode is set, `padding`
              must be 0.

            - pad: Implicit paddings on both sides of the input. The number of `padding` will be padded to the input
              Tensor borders. `padding` must be greater than or equal to 0.

        padding (Union[int, tuple[int]]): Implicit paddings on both sides of the input. If `padding` is one integer,
                    the paddings of top, bottom, left and right are the same, equal to padding. If `padding` is a tuple
                    with four integers, the paddings of top, bottom, left and right will be equal to padding[0],
                    padding[1], padding[2], and padding[3] accordingly. Default: 0.
        dilation (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the dilation rate
                                      to use for dilated convolution. If set to be :math:`k > 1`, there will
                                      be :math:`k - 1` pixels skipped for each sampling location. Its value must
                                      be greater or equal to 1 and bounded by the height and width of the
                                      input. Default: 1.
        group (int): Splits filter into groups, `in_ channels` and `out_channels` must be
            divisible by the number of groups. If the group is equal to `in_channels` and `out_channels`,
            this 2D convolution layer also can be called 2D depthwise convolution layer. Default: 1.
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
            It can be a Tensor, a string, an Initializer or a number. When a string is specified,
            values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well
            as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones'
            and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of
            Initializer for more details. Default: 'normal'.
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Possible
            Initializer and string are the same as 'weight_init'. Refer to the values of
            Initializer for more details. Default: 'zeros'.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, C_{out}, H_{out}, W_{out})`.

    Examples:
        >>> net = nn.Conv2d(120, 240, 4, has_bias=False, weight_init='normal')
        >>> input = Tensor(np.ones([1, 120, 1024, 640]), mindspore.float32)
        >>> net(input).shape
        (1, 240, 1024, 640)
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 pad_mode='same',
                 padding=0,
                 dilation=1,
                 group=1,
                 has_bias=False,
                 weight_init='normal',
                 bias_init='zeros'):
        kernel_size = twice(kernel_size)
        stride = twice(stride)
        self._dilation = dilation
        dilation = twice(dilation)
        super(Conv2d_Thor, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            pad_mode,
            padding,
            dilation,
            group,
            has_bias,
            weight_init,
            bias_init)
        self.conv2d = P.Conv2D(out_channel=self.out_channels,
                               kernel_size=self.kernel_size,
                               mode=1,
                               pad_mode=self.pad_mode,
                               pad=self.padding,
                               stride=self.stride,
                               dilation=self.dilation,
                               group=self.group)
        self._init_depthwise_conv2d(weight_init)
        self.bias_add = P.BiasAdd()

        self.thor = True
        self.hw = kernel_size[0] * kernel_size[1]
        self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
        self.matrix_G_dim = self.out_channels
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.A_normalizer = Parameter(initializer(0, [1], mstype.float32), name="A_normalizer", requires_grad=False)
        self.G_normalizer = Parameter(initializer(0, [1], mstype.float32), name="G_normalizer", requires_grad=False)
        self.is_Ascend = True
        if context.get_context("device_target") == "Ascend":
            ksizes = (1, kernel_size[0], kernel_size[1], 1)
            strides = (1, stride[0], stride[1], 1)
            self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
            self.cube_matmul = P.CusMatMulCube(transpose_a=True)
            self.transpose02314 = P.CusTranspose02314()
            dampingA_dim = self.matrix_A_dim
            self.diag_block_dim = 128
            if (self.matrix_A_dim % self.diag_block_dim) != 0 and self.matrix_A_dim > self.diag_block_dim:
                dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim
            dampingG_dim = self.matrix_G_dim
            if (self.matrix_G_dim % self.diag_block_dim) != 0 and self.matrix_G_dim > self.diag_block_dim:
                dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim
            self.matrix_A_cov = Parameter(Tensor(np.zeros([dampingA_dim, dampingA_dim]).astype(np.float32)),
                                          name='matrix_A', requires_grad=False)
            self.matrix_G_cov = Parameter(Tensor(np.zeros([dampingG_dim, dampingG_dim]).astype(np.float32)),
                                          name='matrix_G', requires_grad=False)

            self.channels_slice_flag = False
            self.C0 = 16
            if self.in_channels % self.C0 != 0:
                self.channels_slice_flag = True
            self.padA_flag = False
            if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \
                    and self.matrix_A_dim > self.diag_block_dim:
                self.padA_flag = True
                pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim
                self.padA = P.Pad(((0, pad_dim), (0, pad_dim)))
            self.slice = P.Slice()
        else:
            self.is_Ascend = False
            self.img2col = P.Im2Col(kernel_size=kernel_size, stride=stride, pad_mode="same")
            self.matmul = P.MatMul(transpose_b=True)
            self.reduce_mean = P.ReduceMean(keep_dims=False)
            self.matrix_A_cov = Parameter(Tensor(np.zeros([self.matrix_A_dim, self.matrix_A_dim]).astype(np.float32)),
                                          name='matrix_A', requires_grad=False)
            self.matrix_G_cov = Parameter(Tensor(np.zeros([self.matrix_G_dim, self.matrix_G_dim]).astype(np.float32)),
                                          name='matrix_G', requires_grad=False)
        self.getG = P.InsertGradientOf(self.save_gradient)


    def _init_depthwise_conv2d(self, weight_init):
        """Initialize depthwise conv2d op"""
        if context.get_context("device_target") == "Ascend" and self.group > 1:
            self.dilation = self._dilation
            Validator.check_integer('group', self.group, self.in_channels, Rel.EQ)
            Validator.check_integer('group', self.group, self.out_channels, Rel.EQ)
            self.conv2d = P.DepthwiseConv2dNative(channel_multiplier=1,
                                                  kernel_size=self.kernel_size,
                                                  pad_mode=self.pad_mode,
                                                  pad=self.padding,
                                                  stride=self.stride,
                                                  dilation=self.dilation)
            weight_shape = [1, self.in_channels, *self.kernel_size]
            self.weight_init = weight_init
            if isinstance(weight_init, Tensor):
                self.weight_init = Tensor(weight_init.asnumpy().swapaxes(0, 1), weight_init.dtype)
            if isinstance(weight_init, Initializer):
                self.weight_init.shape = weight_shape
            self.weight = Parameter(initializer(self.weight_init, weight_shape), name='weight')


    def save_gradient(self, dout):
        """save_gradient"""
        out = dout
        if self.is_Ascend:
            dout = self.transpose02314(dout)
            dout_shape = self.shape(dout)
            normalizer = dout_shape[0]
            matrix_G = self.cube_matmul(dout, dout)
            normalizer = self.cast(normalizer, mstype.float32)
            matrix_G = self.mul(matrix_G, 1.0 / normalizer)
            self.G_normalizer = normalizer
            self.matrix_G_cov = matrix_G
        else:
            dout = self.reduce_mean(dout, 0)
            dout_shape = self.shape(dout)
            dout = self.reshape(dout, (dout_shape[0], -1))
            dout_shape = self.shape(dout)
            normalizer = dout_shape[1]
            dout = self.cast(dout, mstype.float32)
            matrix_G = self.matmul(dout, dout)
            matrix_G = self.mul(matrix_G, 1.0 / normalizer)
            self.G_normalizer = normalizer
            self.matrix_G_cov = matrix_G
        return out


    def construct(self, x):
        if self.thor:
            matrix_A = self.img2col(x)
            matrix_A_shape = self.shape(matrix_A)
            if self.is_Ascend:
                normalizer = matrix_A_shape[0]
                matrix_A = self.cube_matmul(matrix_A, matrix_A)
                if self.channels_slice_flag:
                    matrix_A = self.reshape(matrix_A, (self.hw, self.C0, self.hw, self.C0))
                    matrix_A = self.slice(matrix_A, (0, 0, 0, 0),
                                          (self.hw, self.in_channels, self.hw, self.in_channels))
                    matrix_A = self.reshape(matrix_A, (self.matrix_A_dim, self.matrix_A_dim))
                normalizer = self.cast(normalizer, mstype.float32)
                matrix_A = self.mul(matrix_A, 1.0 / normalizer)
                if self.padA_flag:
                    matrix_A = self.padA(matrix_A)
                self.A_normalizer = normalizer
                self.matrix_A_cov = matrix_A
            else:
                matrix_A = self.reshape(matrix_A, (matrix_A_shape[0] * matrix_A_shape[1] * matrix_A_shape[2],
                                                   matrix_A_shape[3], -1))
                matrix_A = self.reduce_mean(matrix_A, 1)
                matrix_A_shape = self.shape(matrix_A)
                normalizer = matrix_A_shape[1]
                matrix_A = self.cast(matrix_A, mstype.float32)
                matrix_A = self.matmul(matrix_A, matrix_A)
                matrix_A = self.mul(matrix_A, 1.0 / normalizer)
                self.A_normalizer = normalizer
                self.matrix_A_cov = matrix_A
            output = self.conv2d(x, self.weight)
            output = self.getG(output)
        else:
            output = self.conv2d(x, self.weight)
            if self.has_bias:
                output = self.bias_add(output, self.bias)
        return output

    def extend_repr(self):
        s = 'input_channels={}, output_channels={}, kernel_size={},' \
            'stride={},  pad_mode={}, padding={}, dilation={}, ' \
            'group={}, has_bias={},' \
            'weight_init={}, bias_init={}'.format(
                self.in_channels,
                self.out_channels,
                self.kernel_size,
                self.stride,
                self.pad_mode,
                self.padding,
                self.dilation,
                self.group,
                self.has_bias,
                self.weight_init,
                self.bias_init)
        return s

class Embedding_Thor(Cell):
    r"""
    A simple lookup table that stores embeddings of a fixed dictionary and size.

    This module is often used to store word embeddings and retrieve them using
    indices. The input to the module is a list of indices, and the output is
    the corresponding word embeddings.

    Note:
        When 'use_one_hot' is set to True, the type of the input must be mindspore.int32.

    Args:
        vocab_size (int): Size of the dictionary of embeddings.
        embedding_size (int): The size of each embedding vector.
        use_one_hot (bool): Specifies whether to apply one_hot encoding form. Default: False.
        embedding_table (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the embedding_table.
            Refer to class `initializer` for the values of string when a string
            is specified. Default: 'normal'.
        dtype (:class:`mindspore.dtype`): Data type of input. Default: mindspore.float32.
        padding_idx (int, None): When the padding_idx encounters index, the output embedding vector of this index
                                 will be initialized to zero. Default: None. The feature is inactivated.
    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(\text{batch_size}, \text{input_length})`. The elements of
          the Tensor must be integer and not larger than vocab_size. Otherwise the corresponding embedding vector will
          be zero.

    Outputs:
        Tensor of shape :math:`(\text{batch_size}, \text{input_length}, \text{embedding_size})`.

    Examples:
        >>> net = nn.Embedding(20000, 768,  True)
        >>> input_data = Tensor(np.ones([8, 128]), mindspore.int32)
        >>>
        >>> # Maps the input word IDs to word embedding.
        >>> output = net(input_data)
        >>> output.shape
        (8, 128, 768)
    """

    def __init__(self, vocab_size, embedding_size, use_one_hot=False, embedding_table='normal',
                 dtype=mstype.float32, padding_idx=None):
        super(Embedding_Thor, self).__init__()
        self.vocab_size = Validator.check_value_type('vocab_size', vocab_size, [int], self.cls_name)
        self.embedding_size = Validator.check_value_type('embedding_size', embedding_size, [int], self.cls_name)
        Validator.check_value_type('use_one_hot', use_one_hot, [bool], self.cls_name)
        Validator.check_subclass("dtype", dtype, mstype.number_type, self.cls_name)
        self.use_one_hot = use_one_hot
        self.dtype = dtype
        self.init_tensor = initializer(embedding_table, [vocab_size, embedding_size])
        self.padding_idx = padding_idx
        if padding_idx is not None:
            self.padding_idx = Validator.check_int_range(padding_idx, 0, vocab_size, Rel.INC_BOTH,
                                                         "padding_idx", self.cls_name)
            self.init_tensor = self.init_tensor.to_tensor().asnumpy()
            self.init_tensor[self.padding_idx] = 0
        self.embedding_table = Parameter(self.init_tensor, name='embedding_table')
        self.expand = P.ExpandDims()
        self.reshape_flat = P.Reshape()
        self.shp_flat = (-1,)
        self.gather = P.GatherV2()
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, self.dtype)
        self.off_value = Tensor(0.0, self.dtype)
        self.array_mul = P.MatMul()
        self.reshape = P.Reshape()
        self.get_shp = P.Shape()
        self.thor = True
        self.matrix_A = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float32)),
                                  name='matrix_A', requires_grad=False)
        self.matrix_G = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float32)),
                                  name="matrix_G", requires_grad=False)
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.getG = P.InsertGradientOf(self.save_gradient)
        self.cast = P.Cast()
        if context.get_context("device_target") == "Ascend":
            self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        else:
            self.cube_matmul = P.MatMul(transpose_a=True)
        self.mul = P.Mul()


    def save_gradient(self, dout):
        """
           this function only for thor optimizer
           save_gradient
        """
        out = dout
        shape = self.get_shp(dout)
        normalizer = self.cast(shape[0], mstype.float32)
        matrix_G = self.cube_matmul(dout, dout)
        matrix_G = self.mul(matrix_G, 1.0 / normalizer)
        self.matrix_G = matrix_G
        return out

    def construct(self, ids):
        extended_ids = self.expand(ids, -1)
        out_shape = self.get_shp(ids) + (self.embedding_size,)
        flat_ids = self.reshape_flat(extended_ids, self.shp_flat)

        if self.use_one_hot:
            one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value)
            output_for_reshape = self.array_mul(one_hot_ids, self.embedding_table)
        else:
            if self.thor:
                one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value)
                matrix_A = self.reduce_sum(one_hot_ids, 0)
                self.matrix_A = matrix_A
                output_for_reshape = self.gather(self.embedding_table, flat_ids, 0)
                output_for_reshape = self.getG(output_for_reshape)
            else:
                output_for_reshape = self.gather(self.embedding_table, flat_ids, 0)


        output = self.reshape(output_for_reshape, out_shape)
        return output


    def extend_repr(self):
        s = 'vocab_size={}, embedding_size={}, use_one_hot={}, embedding_table={}, dtype={}, padding_idx={}'.format(
            self.vocab_size, self.embedding_size, self.use_one_hot, self.embedding_table, self.dtype, self.padding_idx)
        return s