Merge pull request !3991 from mamba_ni/master
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/convert_gradient_impl.cu
@@ -0,0 +1,100 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| #include "convert_gradient_impl.cuh" | |||
| template <typename T> | |||
| __global__ void ConvertGradientKernel(const size_t size, const size_t height_h, const size_t height_w, | |||
| const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) { | |||
| for (size_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x; pointIdx < (size); pointIdx += blockDim.x * gridDim.x) { | |||
| size_t dst_batchIdx = pointIdx / (height_h * height_w); | |||
| size_t dst_batchIdxX = dst_batchIdx / batchwidth; | |||
| size_t dst_batchIdxY = dst_batchIdx % batchwidth; | |||
| size_t dst_x = (pointIdx - dst_batchIdx * height_h * height_w) / height_w; | |||
| size_t dst_y = (pointIdx - dst_batchIdx * height_h * height_w) % height_w; | |||
| size_t src_coordinate = dst_batchIdxX * height_h * width + dst_x * width + dst_batchIdxY * height_w + dst_y; | |||
| output_addr[pointIdx] = input_addr[src_coordinate]; | |||
| } | |||
| } | |||

// Inverse of ConvertGradientKernel: scatters contiguous height_h x height_w tiles
// back into the 2-D row-major layout.
template <typename T>
__global__ void ConvertGradientBackKernel(const size_t size, const size_t height_h, const size_t height_w,
                                          const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) {
  for (size_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x; pointIdx < size; pointIdx += blockDim.x * gridDim.x) {
    size_t dst_batchIdx = pointIdx / (height_h * height_w);
    size_t dst_batchIdxX = dst_batchIdx / batchwidth;
    size_t dst_batchIdxY = dst_batchIdx % batchwidth;
    size_t dst_x = (pointIdx - dst_batchIdx * height_h * height_w) / height_w;
    size_t dst_y = (pointIdx - dst_batchIdx * height_h * height_w) % height_w;
    size_t src_coordinate = dst_batchIdxX * height_h * width + dst_x * width + dst_batchIdxY * height_w + dst_y;
    output_addr[src_coordinate] = input_addr[pointIdx];
  }
}

// Overload used when the original matrix was zero-padded: scatters tiles back into the
// unpadded ori_h x ori_w layout, discarding elements that fall inside the padding.
template <typename T>
__global__ void ConvertGradientBackKernel(const size_t size, const size_t height_h, const size_t height_w,
                                          const size_t ori_h, const size_t ori_w, const size_t batchwidth,
                                          const size_t width, T *input_addr, T *output_addr) {
  for (size_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x; pointIdx < size; pointIdx += blockDim.x * gridDim.x) {
    size_t dst_batchIdx = pointIdx / (height_h * height_w);
    size_t dst_batchIdxX = dst_batchIdx / batchwidth;
    size_t dst_batchIdxY = dst_batchIdx % batchwidth;
    size_t dst_x = (pointIdx - dst_batchIdx * height_h * height_w) / height_w;
    size_t dst_y = (pointIdx - dst_batchIdx * height_h * height_w) % height_w;
    size_t src_x = dst_batchIdxX * height_h + dst_x;
    size_t src_y = dst_batchIdxY * height_w + dst_y;
    if (src_x < ori_h && src_y < ori_w) {
      size_t src_coordinate = src_x * ori_w + src_y;
      output_addr[src_coordinate] = input_addr[pointIdx];
    }
  }
}

template <typename T>
void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                     const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
  ConvertGradientKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, height_h, height_w, batchwidth, width,
                                                                           input_addr, output_addr);
}

template <typename T>
void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                         const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
  ConvertGradientBackKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, height_h, height_w, batchwidth,
                                                                               width, input_addr, output_addr);
}

template <typename T>
void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h,
                         const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr,
                         T *output_addr, cudaStream_t cuda_stream) {
  ConvertGradientBackKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
    size, height_h, height_w, ori_h, ori_w, batchwidth, width, input_addr, output_addr);
}

template void ConvertGradient<float>(const size_t size, const size_t height_h, const size_t height_w,
                                     const size_t batchwidth, const size_t width, float *input_addr,
                                     float *output_addr, cudaStream_t cuda_stream);
template void ConvertGradientBack<float>(const size_t size, const size_t height_h, const size_t height_w,
                                         const size_t batchwidth, const size_t width, float *input_addr,
                                         float *output_addr, cudaStream_t cuda_stream);
template void ConvertGradientBack<float>(const size_t size, const size_t height_h, const size_t height_w,
                                         const size_t ori_h, const size_t ori_w, const size_t batchwidth,
                                         const size_t width, float *input_addr, float *output_addr,
                                         cudaStream_t cuda_stream);
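
Reviewer note: the index arithmetic above is a block/tile transpose. A minimal NumPy sketch of the same mapping (not part of the patch; sizes are illustrative):

import numpy as np

batch_h, batch_w, h, w = 2, 3, 4, 5
mat = np.arange(batch_h * h * batch_w * w, dtype=np.float32).reshape(batch_h * h, batch_w * w)

# ConvertGradient: (batch_h*h) x (batch_w*w) row-major matrix -> batch_h*batch_w contiguous h x w tiles.
tiles = mat.reshape(batch_h, h, batch_w, w).transpose(0, 2, 1, 3).reshape(batch_h * batch_w, h, w)

# ConvertGradientBack (unpadded overload) is the inverse scatter.
back = tiles.reshape(batch_h, batch_w, h, w).transpose(0, 2, 1, 3).reshape(batch_h * h, batch_w * w)
assert np.array_equal(mat, back)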

--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/convert_gradient_impl.cuh
@@ -0,0 +1,34 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_

#include "runtime/device/gpu/cuda_common.h"

template <typename T>
void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                     const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream);

template <typename T>
void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                         const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream);

template <typename T>
void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h,
                         const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr,
                         T *output_addr, cudaStream_t cuda_stream);

#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_

--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.cc
@@ -0,0 +1,28 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/gpu/math/update_thor_gradient.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(UpdateThorGradient,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      UpdateThorGradientGpuKernel, float)
}  // namespace kernel
}  // namespace mindspore

--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h
@@ -0,0 +1,241 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_UPDATE_THOR_GRADIENT_GPU_KERNEL_H
#define MINDSPORE_UPDATE_THOR_GRADIENT_GPU_KERNEL_H

#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/kernel_constants.h"
#include "backend/kernel_compiler/gpu/cuda_impl/convert_gradient_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/pad_impl.cuh"
#include "utils/convert_utils.h"

namespace mindspore {
namespace kernel {
// Tiling geometry for one gradient: the (ori_h, ori_w) matrix is split into
// batch_h x batch_w blocks of h x w, zero-padded by (pad_h, pad_w) when needed.
struct GradientSize {
  size_t batch_h;
  size_t batch_w;
  size_t h;
  size_t w;
  size_t ori_h;
  size_t ori_w;
  size_t pad_h;
  size_t pad_w;
  bool need_convert;
  cudaDataType_t dtype;
};

template <typename T>
class UpdateThorGradientGpuKernel : public GpuKernel {
 public:
  UpdateThorGradientGpuKernel() : split_dim(128) {}
  ~UpdateThorGradientGpuKernel() = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto input1_addr = GetDeviceAddress<T>(inputs, 0);
    auto input2_addr = GetDeviceAddress<T>(inputs, 1);
    auto input3_addr = GetDeviceAddress<T>(inputs, 2);
    auto workspace1_addr = GetDeviceAddress<T>(workspace, 0);
    T *workspace2_addr = nullptr;
    T *workspace3_addr = nullptr;
    if (gradient_size.need_convert) {
      workspace2_addr = GetDeviceAddress<T>(workspace, 1);
      workspace3_addr = GetDeviceAddress<T>(workspace, 2);
    }
    T *workspace4_addr = nullptr;
    auto output_addr = GetDeviceAddress<T>(outputs, 0);
    // Pad the gradient with zeros up to multiples of split_dim when needed, and clear the
    // workspace that receives the first batched GEMM result.
    if (gradient_size.pad_h != 0 || gradient_size.pad_w != 0) {
      workspace4_addr = GetDeviceAddress<T>(workspace, 3);
      const size_t size = (gradient_size.ori_h + gradient_size.pad_h) * (gradient_size.ori_w + gradient_size.pad_w);
      CalPad(size, input2_addr, 1, 1, gradient_size.ori_h, gradient_size.ori_w,
             gradient_size.ori_h + gradient_size.pad_h, gradient_size.ori_w + gradient_size.pad_w, 0, 0, 0.0,
             workspace4_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
      cudaMemsetAsync(workspace1_addr, 0,
                      gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * sizeof(T),
                      reinterpret_cast<cudaStream_t>(stream_ptr));
      input2_addr = workspace4_addr;
    }
    const float alpha = 1;
    const float beta = 0;
    const int lda = SizeToInt(gradient_size.h);
    const int ldb = SizeToInt(gradient_size.ori_w + gradient_size.pad_w);
    const int ldc = SizeToInt(gradient_size.ori_w + gradient_size.pad_w);
    auto stride_a = SizeToInt(gradient_size.h * gradient_size.h);
    auto stride_b = SizeToInt(gradient_size.h * (gradient_size.ori_w + gradient_size.pad_w));
    auto stride_c = SizeToInt(gradient_size.h * (gradient_size.ori_w + gradient_size.pad_w));
    // First batched GEMM, one GEMM per row block i: tmp_i = input1_i * grad_rows_i.
    try {
      CHECK_CUBLAS_RET_WITH_EXCEPT(
        cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_N, CUBLAS_OP_N, SizeToInt(gradient_size.ori_w),
                                   SizeToInt(gradient_size.h), SizeToInt(gradient_size.h), &alpha, input2_addr,
                                   gradient_size.dtype, ldb, stride_b, input1_addr, gradient_size.dtype, lda, stride_a,
                                   &beta, workspace1_addr, gradient_size.dtype, ldc, stride_c, gradient_size.batch_h,
                                   CUDA_R_32F, algo_),
        "cublasGemmStridedBatchedEx Call Fail");
    } catch (const std::exception &e) {
      MS_LOG(EXCEPTION) << "Encountered an exception: " << e.what() << " when invoking cublasGemmStridedBatchedEx";
    }
    auto r_input_addr = workspace1_addr;
    if (gradient_size.need_convert) {
      // Rearrange the row-block result into contiguous h x w tiles for the second GEMM.
      size_t size = gradient_size.batch_w * gradient_size.batch_h * gradient_size.w * gradient_size.h;
      ConvertGradient(size, gradient_size.h, gradient_size.w, gradient_size.batch_w,
                      gradient_size.batch_w * gradient_size.w, workspace1_addr, workspace2_addr,
                      reinterpret_cast<cudaStream_t>(stream_ptr));
      r_input_addr = workspace2_addr;
    }
    const int lda_r = SizeToInt(gradient_size.w);
    const int ldb_r = SizeToInt(gradient_size.w);
    const int ldc_r = SizeToInt(gradient_size.w);
    stride_a = SizeToInt(gradient_size.h * gradient_size.w);
    stride_b = SizeToInt(gradient_size.w * gradient_size.w);
    stride_c = SizeToInt(gradient_size.h * gradient_size.w);
    auto r_output_addr = output_addr;
    if (gradient_size.need_convert) {
      r_output_addr = workspace3_addr;
    }
    // Second batched GEMM over all batch_h * batch_w tiles: out_tile = tmp_tile * input3_tile.
    CHECK_CUBLAS_RET_WITH_EXCEPT(
      cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_N, CUBLAS_OP_N, SizeToInt(gradient_size.w),
                                 SizeToInt(gradient_size.h), SizeToInt(gradient_size.w), &alpha, input3_addr,
                                 gradient_size.dtype, ldb_r, stride_b, r_input_addr, gradient_size.dtype, lda_r,
                                 stride_a, &beta, r_output_addr, gradient_size.dtype, ldc_r, stride_c,
                                 gradient_size.batch_h * gradient_size.batch_w, CUDA_R_32F, algo_),
      "cublasGemmStridedBatchedEx Call Fail");
    if (gradient_size.need_convert) {
      // Scatter the tiles back to the 2-D gradient layout, dropping any padding.
      size_t size = gradient_size.batch_w * gradient_size.batch_h * gradient_size.w * gradient_size.h;
      if (gradient_size.pad_h == 0 && gradient_size.pad_w == 0) {
        ConvertGradientBack(size, gradient_size.h, gradient_size.w, gradient_size.batch_w,
                            gradient_size.batch_w * gradient_size.w, r_output_addr, output_addr,
                            reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        ConvertGradientBack(size, gradient_size.h, gradient_size.w, gradient_size.ori_h, gradient_size.ori_w,
                            gradient_size.batch_w, gradient_size.ori_w, r_output_addr, output_addr,
                            reinterpret_cast<cudaStream_t>(stream_ptr));
      }
    }
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
    SetProperty(kernel_node);
    InitSizeLists();
    return true;
  }

 protected:
  void InitSizeLists() override {
    size_t unit_size = sizeof(T);
    size_t input_size_ = gradient_size.h * gradient_size.h * gradient_size.batch_h * unit_size;
    input_size_list_.push_back(input_size_);
    input_size_ = gradient_size.ori_h * gradient_size.ori_w * unit_size;
    input_size_list_.push_back(input_size_);
    input_size_ = gradient_size.w * gradient_size.w * gradient_size.batch_w * unit_size;
    input_size_list_.push_back(input_size_);

    size_t output_size = gradient_size.ori_h * gradient_size.ori_w * unit_size;
    output_size_list_.push_back(output_size);

    size_t workspace_size_ = 0;
    workspace_size_ = gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
    workspace_size_list_.push_back(workspace_size_);
    if (gradient_size.need_convert) {
      workspace_size_ = gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
      workspace_size_list_.push_back(workspace_size_);
      workspace_size_ = gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
      workspace_size_list_.push_back(workspace_size_);
    }
    if (gradient_size.pad_h != 0 || gradient_size.pad_w != 0) {
      workspace_size_ =
        (gradient_size.ori_w + gradient_size.pad_w) * (gradient_size.ori_h + gradient_size.pad_h) * unit_size;
      workspace_size_list_.push_back(workspace_size_);
    }
  }

 private:
  void SetProperty(const CNodePtr &kernel_node) {
    auto matrix_a_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto gradient_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto matrix_g_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    split_dim = size_t(GetAttr<int>(kernel_node, "split_dim"));
    // Split each gradient dimension into split_dim-sized blocks. If a dimension is not a
    // multiple of split_dim, the last block is zero-padded (dimensions larger than split_dim)
    // or the whole dimension is kept as a single block (dimensions smaller than split_dim).
    gradient_size.batch_h = gradient_shape[0] / split_dim;
    gradient_size.batch_w = gradient_shape[1] / split_dim;
    if (gradient_size.batch_h * split_dim != gradient_shape[0]) {
      gradient_size.batch_h += 1;
      if (gradient_shape[0] > split_dim) {
        gradient_size.h = split_dim;
        gradient_size.pad_h = gradient_size.batch_h * split_dim - gradient_shape[0];
      } else {
        gradient_size.h = gradient_shape[0];
        gradient_size.pad_h = 0;
      }
    } else {
      gradient_size.h = split_dim;
      gradient_size.pad_h = 0;
    }

    if (gradient_size.batch_w * split_dim != gradient_shape[1]) {
      gradient_size.batch_w += 1;
      if (gradient_shape[1] > split_dim) {
        gradient_size.w = split_dim;
        gradient_size.pad_w = gradient_size.batch_w * split_dim - gradient_shape[1];
      } else {
        gradient_size.w = gradient_shape[1];
        gradient_size.pad_w = 0;
      }
    } else {
      gradient_size.w = split_dim;
      gradient_size.pad_w = 0;
    }

    // Tile conversion is only needed when there is more than one column block.
    gradient_size.need_convert = (gradient_size.batch_w * gradient_size.w > split_dim);
    gradient_size.ori_w = gradient_shape[1];
    gradient_size.ori_h = gradient_shape[0];
    gradient_size.dtype = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1)));
  }

  size_t split_dim;
  struct GradientSize gradient_size;
  cublasHandle_t handle_;
  cublasGemmAlgo_t algo_ = CUBLAS_GEMM_DEFAULT;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_UPDATE_THOR_GRADIENT_GPU_KERNEL_H
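
Reviewer note: taken together, Launch computes out_ij = A_i * grad_ij * G_ij for every (row block i, column block j) of the gradient. A minimal NumPy sketch of that flow (not part of the patch; it assumes both gradient dimensions exceed split_dim, so every block is split_dim x split_dim, and that g carries one G block per tile, as in the primitive's docstring example below):

import numpy as np

def update_thor_gradient_sketch(a, grad, g, split):
    # a: (batch_h, split, split), grad: (ori_h, ori_w), g: (batch_h, batch_w, split, split)
    ori_h, ori_w = grad.shape
    batch_h, batch_w = -(-ori_h // split), -(-ori_w // split)             # ceiling division
    padded = np.zeros((batch_h * split, batch_w * split), grad.dtype)     # CalPad
    padded[:ori_h, :ori_w] = grad
    out = np.empty_like(padded)
    for i in range(batch_h):
        rows = slice(i * split, (i + 1) * split)
        tmp = a[i] @ padded[rows, :]                                      # first batched GEMM
        for j in range(batch_w):
            cols = slice(j * split, (j + 1) * split)
            out[rows, cols] = tmp[:, cols] @ g[i, j]                      # second batched GEMM
    return out[:ori_h, :ori_w]                                            # ConvertGradientBack drops the padding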

--- a/mindspore/ops/operations/__init__.py
+++ b/mindspore/ops/operations/__init__.py
@@ -83,10 +83,10 @@ from . import _quant_ops
 from ._quant_ops import *
 from .other_ops import (Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, PopulationCount,
                         CheckValid, MakeRefKey, Partial, Depend, CheckBprop, Push, Pull)
-from .thor_ops import (CusBatchMatMul, CusCholeskyTrsm, CusFusedAbsMax1, CusImg2Col, CusMatMulCubeDenseLeft,
-                       CusMatMulCubeFraczRightMul, CusMatMulCube, CusMatrixCombine, CusTranspose02314,
-                       CusMatMulCubeDenseRight,
-                       CusMatMulCubeFraczLeftCast, Im2Col)
+from ._thor_ops import (CusBatchMatMul, CusCholeskyTrsm, CusFusedAbsMax1, CusImg2Col, CusMatMulCubeDenseLeft,
+                        CusMatMulCubeFraczRightMul, CusMatMulCube, CusMatrixCombine, CusTranspose02314,
+                        CusMatMulCubeDenseRight,
+                        CusMatMulCubeFraczLeftCast, Im2Col, UpdateThorGradient)
 from .sparse_ops import SparseToDense

 __all__ = [

--- a/mindspore/ops/operations/_thor_ops.py
+++ b/mindspore/ops/operations/_thor_ops.py
@@ -562,3 +562,48 @@ class Im2Col(PrimitiveWithInfer):
        valid_types = [mstype.float16, mstype.float32]
        validator.check_tensor_type_same(args, valid_types, self.name)
        return x_dtype


class UpdateThorGradient(PrimitiveWithInfer):
    """
    Updates the THOR gradient with the approximate Fisher information matrix (GPU backend only).

    The rank of input_x1 must be `3`, corresponding to the A matrix.
    The rank of input_x2 must be `2`, corresponding to the first-order gradient.
    The rank of input_x3 must be `4`, corresponding to the G matrix.

    Inputs:
        - **input_x1** (Tensor) - The first input, the diagonal part of the covariance matrix of the
          feature map. Supported dtype [float32].
        - **input_x2** (Tensor) - The second input, the corresponding first-order gradient.
          Supported dtype [float32].
        - **input_x3** (Tensor) - The third input, the diagonal part of the covariance matrix of dout.
          Supported dtype [float32].

    Outputs:
        Tensor with the same shape as input_x2; it is used to update the weights.

    Examples:
        >>> input_x1 = Tensor(np.random.rand(16, 128, 128).astype(np.float32))
        >>> input_x2 = Tensor(np.random.rand(2048, 1024).astype(np.float32))
        >>> temp_x3 = np.random.rand(8, 128, 128).astype(np.float32)
        >>> input_x3 = np.zeros((16, 8, 128, 128)).astype(np.float32)
        >>> for i in range(16):
        ...     input_x3[i, :, :, :] = temp_x3
        >>> input_x3 = Tensor(input_x3)
        >>> update_thor_gradient = P.UpdateThorGradient(split_dim=128)
        >>> output = update_thor_gradient(input_x1, input_x2, input_x3)
    """

    @prim_attr_register
    def __init__(self, split_dim=0):
        """Initialize UpdateThorGradient."""
        self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y'])
        self.split_dim = split_dim
        self.add_prim_attr('split_dim', self.split_dim)

    def infer_shape(self, x1_shape, x2_shape, x3_shape):
        return x2_shape

    def infer_dtype(self, x1_dtype, x2_dtype, x3_dtype):
        validator.check_tensor_type_same({'x1_dtype': x1_dtype, 'x2_dtype': x2_dtype, 'x3_dtype': x3_dtype},
                                         [mstype.float32], self.name)
        return x2_dtype
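
Reviewer note: a quick consistency check of the shapes in the docstring example above (not part of the patch):

split_dim = 128
x2_shape = (2048, 1024)
batch_h, batch_w = x2_shape[0] // split_dim, x2_shape[1] // split_dim  # 16, 8
# input_x1 is (batch_h, split_dim, split_dim)          -> (16, 128, 128)
# input_x3 is (batch_h, batch_w, split_dim, split_dim) -> (16, 8, 128, 128)
# the output matches input_x2's shape                  -> (2048, 1024)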