optimizes the kernel error description of GPU about tile,topk,transpose etc.

4 years ago · 0609a235fd
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h
@@ -96,12 +96,10 @@ class ScatterNdGpuFwdKernel : public GpuKernel {
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 2, but got " << input_num;
      return false;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
      return false;
    }

    input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/tile_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/tile_gpu_kernel.h
@@ -59,25 +59,26 @@ class TileGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but Tile needs 1 input.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but Tile has 1 output.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape_) || CHECK_NULL_INPUT(output_shape_);
    is_null_input_ =
      CHECK_SHAPE_NULL(input_shape_, kernel_name, "input") || CHECK_SHAPE_NULL(output_shape_, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'TileGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
    if (output_shape_.size() < 1) {
      MS_LOG(EXCEPTION) << "For 'TileGpuKernel', the rank of output cannot be less than 1, but got "
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be less than 1, but got "
                        << output_shape_.size();
    }
    input_size_ = 1;
@@ -87,8 +88,8 @@ class TileGpuKernel : public GpuKernel {

    output_size_ = 1;
    if (output_shape_.size() > TILE_MAX_DIMENSION) {
      MS_LOG(EXCEPTION) << "Output is " << output_shape_.size() << "-D, but Tile supports up to " << TILE_MAX_DIMENSION
                        << "-D.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be greater than "
                        << TILE_MAX_DIMENSION << ", but got " << output_shape_.size();
    }
    shape_size_ = output_shape_.size();
    for (size_t i = 0; i < output_shape_.size(); i++) {
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h
@@ -83,12 +83,13 @@ class TopKGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shapes) || CHECK_NULL_INPUT(output_shapes);
    is_null_input_ =
      CHECK_SHAPE_NULL(input_shapes, kernel_name, "input") || CHECK_SHAPE_NULL(output_shapes, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'TopkGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h
@@ -74,28 +74,26 @@ class TransposeGpuFwdKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but transpose needs 1 input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'TransposeGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
    shape_size_ = input_shape.size();
    if (shape_size_ > TRANSPOSE_MAX_DIMENSION) {
      MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION
                        << "-D inputs.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be greater than "
                        << TRANSPOSE_MAX_DIMENSION << ", but got " << shape_size_;
    }

    input_size_ = 1;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unique_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unique_gpu_kernel.h
@@ -50,11 +50,11 @@ class UniqueGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    std::vector<size_t> shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(shape);
    is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'UniqueGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unpack_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unpack_gpu_kernel.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_UNPACK_GPU_KERNEL_H

 #include <vector>
 #include <string>
 #include <memory>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
@@ -55,10 +56,9 @@ class UnpackGpuFwdKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    if (!CheckParam(kernel_node)) {
      return false;
    }
    (void)CheckParam(kernel_node);
    axis_ = static_cast<int32_t>(GetAttr<int64_t>(kernel_node, "axis"));
    if (axis_ < 0) {
      auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
@@ -73,9 +73,8 @@ class UnpackGpuFwdKernel : public GpuKernel {
    for (size_t i = 0; i < output_num_; i++) {
      size_t _size = 1;
      auto _shape = AnfAlgo::GetOutputDeviceShape(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(_shape);
      is_null_input_ = CHECK_SHAPE_NULL(_shape, kernel_name_, "output");
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'UnpackGpuKernel', output is null";
        InitSizeLists();
        return true;
      }
@@ -87,9 +86,8 @@ class UnpackGpuFwdKernel : public GpuKernel {
    workspace_size_list_.push_back(sizeof(T *) * output_num_);

    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name_, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'UnpackGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
@@ -108,13 +106,11 @@ class UnpackGpuFwdKernel : public GpuKernel {
  void InitSizeLists() override {}

 private:
  bool CheckParam(const CNodePtr &kernel_node) {
  void CheckParam(const CNodePtr &kernel_node) {
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "input number is " << input_num << ", but UnpackGpuFwdKernel needs 1 input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
    }
    return true;
  }
  int axis_;
  bool is_null_input_;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_max_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_max_gpu_kernel.h
@@ -54,14 +54,15 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    auto segment_ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
    auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ =
      CHECK_NULL_INPUT(input_shapes) || CHECK_NULL_INPUT(segment_ids_shapes) || CHECK_NULL_INPUT(output_shapes);
    is_null_input_ = CHECK_SHAPE_NULL(input_shapes, kernel_name, "input") ||
                     CHECK_SHAPE_NULL(segment_ids_shapes, kernel_name, "segment_ids") ||
                     CHECK_SHAPE_NULL(output_shapes, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'UnsortedSegmentMaxGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
@@ -73,9 +74,8 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel {
      MS_LOG(INFO) << "UnsortedSegmentMax Kernel Input count is 2";
    }
    if (output_shapes.size() < 1) {
      MS_LOG(EXCEPTION)
        << "For UnsortedSegmentMax, output shape incorrect rank. Expect Rank at least rank 1, got Rank: "
        << output_shapes.size() << ".";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be less than 1, but got "
                        << output_shapes.size();
    }
    num_segments_ = output_shapes[0];
    input_size_ = 1;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_min_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_min_gpu_kernel.h
@@ -49,13 +49,14 @@ class UnsortedSegmentMinGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    auto segment_ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
    auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ =
      CHECK_NULL_INPUT(input_shapes) || CHECK_NULL_INPUT(segment_ids_shapes) || CHECK_NULL_INPUT(output_shapes);
    is_null_input_ = CHECK_SHAPE_NULL(input_shapes, kernel_name, "input") ||
                     CHECK_SHAPE_NULL(segment_ids_shapes, kernel_name, "segment_ids") ||
                     CHECK_SHAPE_NULL(output_shapes, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'UnsortedSegmentMinGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
@@ -67,9 +68,8 @@ class UnsortedSegmentMinGpuKernel : public GpuKernel {
      MS_LOG(INFO) << "UnsortedSegmentMin Kernel Input count is 2";
    }
    if (output_shapes.size() < 1) {
      MS_LOG(EXCEPTION)
        << "For UnsortedSegmentMin, output shape incorrect rank. Expect Rank at least rank 1, got Rank: "
        << output_shapes.size() << ".";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be less than 1, but got "
                        << output_shapes.size();
    }
    num_segments_ = output_shapes[0];
    input_size_ = 1;
@@ -96,7 +96,6 @@ class UnsortedSegmentMinGpuKernel : public GpuKernel {
    InitSizeLists();
    return true;
  }

  void ResetResource() noexcept override {
    num_segments_ = 1;
    inner_size_ = 1;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
@@ -52,13 +52,15 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    auto ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
    auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shapes) || CHECK_NULL_INPUT(ids_shapes) || CHECK_NULL_INPUT(output_shapes);
    is_null_input_ = CHECK_SHAPE_NULL(input_shapes, kernel_name, "input") ||
                     CHECK_SHAPE_NULL(ids_shapes, kernel_name, "segment_ids") ||
                     CHECK_SHAPE_NULL(output_shapes, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'UnsortedSegmentSumGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/zeroslike_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/zeroslike_gpu_kernel.h
@@ -51,12 +51,12 @@ class ZerosLikeGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;

    std::vector<size_t> input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'ZeroslikeGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/custom/custom_aot_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/custom/custom_aot_gpu_kernel.h
@@ -55,7 +55,7 @@ class CustomAOTGpuKernel : public GpuKernel {
    if (!handle_) {
      handle_ = dlopen(file_path_.c_str(), RTLD_LAZY | RTLD_LOCAL);
      if (!handle_) {
        MS_LOG(ERROR) << "Open Error: " << dlerror();
        MS_LOG(ERROR) << "For '" << kernel_name_ << "', open should be successful, but error, " << dlerror();
        return false;
      }
    }
@@ -65,7 +65,7 @@ class CustomAOTGpuKernel : public GpuKernel {
        reinterpret_cast<std::add_pointer<int(int, void **, int *, int64_t **, const char **, void *, void *)>::type>(
          dlsym(handle_, func_name_.c_str()));
      if (auto error_info = dlerror(); error_info != nullptr) {
        MS_LOG(ERROR) << error_info;
        MS_LOG(ERROR) << "For '" << kernel_name_ << "', error info: " << error_info;
        return false;
      }
    }
@@ -79,7 +79,8 @@ class CustomAOTGpuKernel : public GpuKernel {
        ret = aot_func_(nparam, &params[0], &ndims_[0], &shapes_[0], &type_pointer_list_[0], stream_ptr, nullptr);
      }
    } catch (const std::exception &e) {
      MS_LOG(ERROR) << "CustomAOT operator failed when running user defined file " << file_path_ << "! "
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', operator failed when running user defined file " << file_path_
                    << "! "
                    << "Error message is " << e.what();
      return false;
    }
@@ -88,14 +89,15 @@ class CustomAOTGpuKernel : public GpuKernel {
      case 0:
        break;
      case 1:
        MS_LOG(ERROR) << "Number of parameters passed to AOT kernel is  " << nparam
        MS_LOG(ERROR) << "For '" << kernel_name_ << "', the number of parameters passed to AOT kernel is " << nparam
                      << ", inconsistent with what the user wants";
        return false;
      case 2:
        MS_LOG(ERROR) << "Type of parameters passed to AOT kernel is inconsistent with what the user wants";
        MS_LOG(ERROR) << "For '" << kernel_name_
                      << "', type of parameters passed to AOT kernel is inconsistent with what the user wants";
        return false;
      default:
        MS_LOG(ERROR) << "Error occurred when running AOT kernel, "
        MS_LOG(ERROR) << "For '" << kernel_name_ << "', error occurred when running AOT kernel, "
                      << "error id is " << ret;
        return false;
    }
@@ -104,27 +106,25 @@ class CustomAOTGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    const auto &exec_info = AnfAlgo::GetNodeAttr<std::string>(kernel_node, "func_name");
    if (auto pos = exec_info.find(":"); pos != std::string::npos) {
      auto path = exec_info.substr(0, pos);
      auto real_path = FileUtils::GetRealPath(path.c_str());
      if (!real_path.has_value()) {
        MS_LOG(ERROR) << "Invalid file path, " << path << " does not exist.";
        return false;
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the file path should be exist, but got " << path;
      }
      file_path_ = real_path.value();
      func_name_ = exec_info.substr(pos + 1);
    } else {
      MS_LOG(ERROR) << "Wrong execute info:" << exec_info;
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', Wrong execute info:" << exec_info;
    }

    num_input_ = AnfAlgo::GetInputTensorNum(kernel_node);
    auto input_type_list = AnfAlgo::GetAllInputDeviceTypes(kernel_node);
    if (num_input_ != input_type_list.size()) {
      MS_LOG(ERROR) << "Input shapes'size is " << num_input_ << ", while input types' size is "
                    << input_type_list.size();
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be " << input_type_list.size()
                        << ", but got " << num_input_;
    }

    for (size_t i = 0; i < num_input_; i++) {
@@ -141,9 +141,8 @@ class CustomAOTGpuKernel : public GpuKernel {
    auto output_type_list = AnfAlgo::GetAllOutputDeviceTypes(kernel_node);

    if (num_output_ != output_type_list.size()) {
      MS_LOG(ERROR) << "Output shapes'size is " << num_output_ << ", while output types' size is "
                    << output_type_list.size();
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be " << output_type_list.size()
                        << ", but got " << num_output_;
    }

    for (size_t i = 0; i < num_output_; i++) {
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_init_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_init_kernel.cc
@@ -61,12 +61,13 @@ bool DatasetInitKernel::Launch(const std::vector<AddressPtr> &, const std::vecto
  size_t len = total_bytes_ * buffer_q_capacity_;

  if (!device::gpu::GPUMemoryAllocator::GetInstance().AllocBufferQueueMem(len, &addr)) {
    MS_LOG(EXCEPTION) << "Memory not enough: failed to allocate GPU buffer queue memory[" << len << "].";
    MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', memory not enough: failed to allocate GPU buffer queue memory["
                      << len << "].";
  }

  auto status = GpuBufferMgr::GetInstance().Create(0, queue_name_, addr, shapes_, buffer_q_capacity_);
  if (status) {
    MS_LOG(EXCEPTION) << "Init Dataset Failed. len: " << len << ", status:" << status;
    MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', init Dataset Failed. len: " << len << ", status:" << status;
  }

  return true;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
@@ -121,7 +121,7 @@ bool DatasetIteratorKernel::ReadDevice(void **addr, size_t *len) {
 #ifdef ENABLE_DUMP_IR
        mindspore::RDR::TriggerAll();
 #endif
        MS_LOG(EXCEPTION) << "Get data timeout";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', get data timeout";
      }
    }
 #ifndef ENABLE_SECURITY
@@ -130,7 +130,7 @@ bool DatasetIteratorKernel::ReadDevice(void **addr, size_t *len) {
      profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp);
    }
 #endif
    MS_LOG(ERROR) << "Get data failed, errcode " << ret;
    MS_LOG(ERROR) << "For '" << kernel_name_ << "', get data failed, errcode " << ret;
    return false;
  }
  return true;
@@ -141,7 +141,7 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
  if (handle_ == HandleMgr::INVALID_HANDLE) {
    handle_ = GpuBufferMgr::GetInstance().Open(0, queue_name_, output_size_list_);
    if (handle_ == HandleMgr::INVALID_HANDLE) {
      MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', gpu Queue(" << queue_name_ << ") Open Failed";
    }
  }

@@ -151,7 +151,7 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
    return false;
  }
  if (total_bytes_ != len) {
    MS_LOG(ERROR) << "Dataset front error. read: " << len << ", expect: " << total_bytes_ << ", ";
    MS_LOG(ERROR) << "For '" << kernel_name_ << "', dataset front error, read: " << len << ", expect: " << total_bytes_;
    return false;
  }

--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.cc
@@ -26,7 +26,7 @@

 namespace mindspore {
 namespace kernel {
 GetNextProfiling::GetNextProfiling(const std::string &path) : profiling_path_(path) {}
 GetNextProfiling::GetNextProfiling(const std::string &path) : profiling_path_(path), kernel_name_("GetNextProfiling") {}

 void GetNextProfiling::GetDeviceId() {
  auto context_ptr = MsContext::GetInstance();
@@ -44,13 +44,13 @@ void GetNextProfiling::Init() {
 void GetNextProfiling::SaveProfilingData() {
  std::ofstream handle(file_name_, std::ios::trunc);
  if (!handle.is_open()) {
    MS_LOG(ERROR) << "Open get-next profiling file failed.";
    MS_LOG(ERROR) << "For '" << kernel_name_ << "', open get-next profiling file failed.";
    return;
  }
  for (uint32_t index = 0; index < queue_size_.size(); index++) {
    if (index > time_stamp_.size() - 1) {
      handle.close();
      MS_LOG(EXCEPTION) << "index exceeds time_stamp_ size.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', index exceeds time_stamp_ size.";
    }
    handle << Name() << " " << time_stamp_[index].first << " " << time_stamp_[index].second << " " << queue_size_[index]
           << std::endl;
@@ -62,7 +62,7 @@ void GetNextProfiling::SaveProfilingData() {

 void GetNextProfiling::ChangeFileMode() {
  if (chmod(common::SafeCStr(file_name_), S_IRUSR | S_IWUSR) == -1) {
    MS_LOG(ERROR) << "Modify file:" << file_name_ << " to rw fail.";
    MS_LOG(ERROR) << "For '" << kernel_name_ << "', modify file:" << file_name_ << " to rw fail.";
    return;
  }
 }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_profiling.h
@@ -44,6 +44,7 @@ class GetNextProfiling : public ProfilingOp {
  std::vector<std::pair<uint64_t, uint64_t>> time_stamp_;  // First value of std::pair is the start time stamp,
                                                           // Second value of std::pair is the stop time stamp
  std::string device_id_;
  std::string kernel_name_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/debug/print_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/debug/print_gpu_kernel.h
@@ -90,6 +90,7 @@ class PrintGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    kernel_node_ = kernel_node;
    if (AnfAlgo::HasNodeAttr("string_pos", kernel_node)) {
@@ -105,9 +106,8 @@ class PrintGpuKernel : public GpuKernel {
    input_flag_ = SetInputFlag(&string_pos_, input_tensor_num);
    for (size_t i = 0; i < input_tensor_num; i++) {
      auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(input_shape);
      is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name_, "input");
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'PrintGpuKernel', input is null";
        InitSizeLists();
        return true;
      }
@@ -183,7 +183,7 @@ class PrintGpuKernel : public GpuKernel {
          input_device_data->push_back(GetDeviceAddress<double>(inputs, i));
          break;
        default:
          MS_LOG(EXCEPTION) << "TypeId: " << type_id << " is not supported in Print.";
          MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the typeid cannot be " << type_id;
      }
    }
  }
@@ -200,7 +200,7 @@ class PrintGpuKernel : public GpuKernel {
    }
    for (size_t i = 0; i < string_pos->size(); i++) {
      if ((*string_pos)[i] < 0) {
        MS_LOG(EXCEPTION) << "string_pos cannot be a negative value";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', string_pos cannot be a negative value";
      }
      auto index = IntToSize((*string_pos)[i]);
      res[index] = -1;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
@@ -65,22 +65,21 @@ class AddNGpuFwdKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    num_input_ = GetAttr<int64_t>(kernel_node, "n");
    if (num_input_ != input_num) {
      MS_LOG(ERROR) << "Input number is " << num_input_ << " in attr, but got " << input_num << "input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be  " << num_input_ << ", but got "
                        << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "AddNGpuFwdKernel input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/assign_add_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/assign_add_gpu_kernel.h
@@ -49,20 +49,18 @@ class AssignAddGpuFwdKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but cudnnAddTensor needs 2 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 2, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "AssignAddGpuFwdKernel input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_complex_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_complex_gpu_kernel.h
@@ -57,13 +57,15 @@ class BroadcastComplexOpGpuKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    GetOpType(kernel_node);
    auto shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    auto shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
    auto shape3 = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
    need_broadcast_ = AnfAlgo::IsTensorBroadcast(shape1, shape2);
    if (need_broadcast_ && shape1.size() > MAX_DIMS) {
      MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than " << MAX_DIMS;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than " << MAX_DIMS
                        << ", but got " << shape1.size();
    }

    lhs_shape_.resize(MAX_DIMS, 1);
@@ -128,7 +130,9 @@ class BroadcastComplexOpGpuKernel : public GpuKernel {
      return;
    }

    MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
    MS_LOG(EXCEPTION) << "For '" << kernel_name_
                      << ", only support these types: RealDiv, Mul, Sub, Add, Div or Complex currently, but got "
                      << kernel_name;
  }

  BroadcastOpType op_type_;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
@@ -70,20 +70,22 @@ class BroadcastOpGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    GetOpType(kernel_node);
    auto shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    auto shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
    auto shape3 = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(shape1) || CHECK_NULL_INPUT(shape2) || CHECK_NULL_INPUT(shape3);
    is_null_input_ = CHECK_SHAPE_NULL(shape1, kernel_name_, "input") ||
                     CHECK_SHAPE_NULL(shape2, kernel_name_, "input") ||
                     CHECK_SHAPE_NULL(shape3, kernel_name_, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'BroadcastGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
    need_broadcast_ = AnfAlgo::IsTensorBroadcast(shape1, shape2);
    if (need_broadcast_ && shape1.size() > MAX_DIMS) {
      MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than: " << MAX_DIMS << ", actual size is "
                        << shape1.size();
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than " << MAX_DIMS
                        << ", but got " << shape1.size();
    }

    lhs_shape_.resize(MAX_DIMS, 1);
@@ -94,7 +96,8 @@ class BroadcastOpGpuKernel : public GpuKernel {
        if (i < MAX_DIMS) {
          output_shape_[i] = shape3[i];
        } else {
          MS_LOG(EXCEPTION) << "Output index: " << i << " should be less than " << MAX_DIMS;
          MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the index of output should be less than " << MAX_DIMS
                            << ", but got " << i;
        }
      }
      output_num_ *= shape3[i];
@@ -106,7 +109,8 @@ class BroadcastOpGpuKernel : public GpuKernel {
          lhs_shape_[j + lhs_offset] = shape1[j];
        } else {
          auto index = j + lhs_offset;
          MS_LOG(EXCEPTION) << "Invalid input1 index: " << index;
          MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the index of input cannot be " << index << ", but got "
                            << index;
        }
      }
      input1_num_ *= shape1[j];
@@ -118,7 +122,8 @@ class BroadcastOpGpuKernel : public GpuKernel {
          rhs_shape_[k + rhs_offset] = shape2[k];
        } else {
          auto index = k + rhs_offset;
          MS_LOG(EXCEPTION) << "Invalid input2 index: " << index;
          MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the index of input cannot be " << index << ", but got "
                            << index;
        }
      }
      input2_num_ *= shape2[k];
@@ -201,7 +206,10 @@ class BroadcastOpGpuKernel : public GpuKernel {
      return;
    }

    MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
    MS_LOG(EXCEPTION) << "For '" << kernel_name_
                      << ", only support these types: Maximum, Minimum, Pow, RealDiv, Mul, Sub, Add, Div, DivNoNan, "
                         "Mod, FloorDiv, AbsGrad, FloorMod, Atan2, TruncateDiv or TruncateMod currently, but got "
                      << kernel_name;
  }

  BroadcastOpType op_type_;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h
@@ -69,20 +69,23 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    GetOpType(kernel_node);
    auto shape1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto shape2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto shape3 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    is_null_input_ = CHECK_NULL_INPUT(shape1) || CHECK_NULL_INPUT(shape2) || CHECK_NULL_INPUT(shape3);
    is_null_input_ = CHECK_SHAPE_NULL(shape1, kernel_name_, "input_1") ||
                     CHECK_SHAPE_NULL(shape2, kernel_name_, "input_2") ||
                     CHECK_SHAPE_NULL(shape3, kernel_name_, "input_3");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'BroadcastGradGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
    need_broadcast_ = AnfAlgo::IsTensorBroadcast(shape1, shape2);
    if (need_broadcast_ && shape1.size() > kMaxShapeSize) {
      MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than " << kMaxShapeSize;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than "
                        << kMaxShapeSize << ", but got " << shape1.size();
    }

    for (size_t i = 0; i < shape3.size(); i++) {
@@ -98,8 +101,8 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
          x1_shape_[i + x1_offset] = shape1[i];
        } else {
          auto index = i + x1_offset;
          MS_LOG(EXCEPTION) << "For 'BroadcastOpGrad', the dimension of input cannot be greater than " << kMaxShapeSize
                            << ", but got " << (index + 1);
          MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than "
                            << kMaxShapeSize << ", but got " << (index + 1);
        }
      }
      input1_num_ *= shape1[i];
@@ -111,7 +114,8 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
          x2_shape_[i + x2_offset] = shape2[i];
        } else {
          auto index = i + x2_offset;
          MS_LOG(EXCEPTION) << "Invalid input2 index: " << index;
          MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than "
                            << kMaxShapeSize << ", but got " << (index + 1);
        }
      }
      input2_num_ *= shape2[i];
@@ -162,7 +166,8 @@ class BroadcastOpGradGpuKernel : public GpuKernel {

    auto iter = kBroadcastTypeMap.find(kernel_name);
    if (iter == kBroadcastTypeMap.end()) {
      MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_
                        << ", only support these types: MaximumGrad or MinimumGrad currently, but got " << kernel_name;
    } else {
      op_type_ = iter->second;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h
@@ -66,14 +66,14 @@ class CastAllGpuFwdKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    num_input_ = GetAttr<size_t>(kernel_node, "n");
    size_ = std::make_unique<size_t[]>(num_input_);
    for (size_t i = 0; i < num_input_; i++) {
      auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(shape);
      is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name, "input");
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'CastAllGpuKernel', input is null";
        InitSizeLists();
        return true;
      }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_gpu_kernel.h
@@ -19,6 +19,7 @@
 #include <cublas_v2.h>
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <algorithm>
 #include "backend/kernel_compiler/gpu/cuda_impl/eye_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/matrix_split_impl.cuh"
@@ -62,6 +63,7 @@ class CholeskyGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    lower_ = static_cast<bool>(GetAttr<bool>(kernel_node, kLower));
    split_dim_ = static_cast<int>(GetAttr<int64_t>(kernel_node, kSplitDim));
@@ -77,11 +79,11 @@ class CholeskyGpuKernel : public GpuKernel {

    auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kInputIndex);

    is_null_input_ = CHECK_NULL_INPUT(in_shape);
    is_null_input_ = CHECK_SHAPE_NULL(in_shape, kernel_name_, "input");
    if (is_null_input_) {
      MS_LOG(EXCEPTION) << "For 'CholeskyGpuKernel', input is null";
      InitSizeLists();
      return true;
    }

    if (split_dim_ == 0) {
      return InitNoSplitDim(in_shape);
    }
@@ -103,11 +105,11 @@ class CholeskyGpuKernel : public GpuKernel {
      cho_row_ = in_shape.at(kDim1);
      cho_col_ = in_shape.at(kDim2);
    } else {
      MS_LOG(ERROR) << "Input Only support Rank 2 OR 3";
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input only should be 2 or 3";
      return false;
    }
    if (cho_row_ != cho_col_) {
      MS_LOG(ERROR) << "Cholesky need square matrix as input.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
      return false;
    }
    // set matrix row or col to be lead dimension
@@ -121,13 +123,14 @@ class CholeskyGpuKernel : public GpuKernel {

  bool InitSplitDim(const std::vector<size_t> &in_shape) {
    if (in_shape.size() != kCholeskyNormalShape) {
      MS_LOG(ERROR) << "Cholesky Split Matrix Need Input Rank as 2.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input should be " << kCholeskyNormalShape
                    << ", but got " << in_shape.size();
      return false;
    }
    cho_row_ = in_shape.at(kDim0);
    cho_col_ = in_shape.at(kDim1);
    if (cho_row_ != cho_col_) {
      MS_LOG(ERROR) << "Cholesky Split Matrix Need Square Matrix as Input.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
      return false;
    }

@@ -206,7 +209,7 @@ class CholeskyGpuKernel : public GpuKernel {
        kernel_node_, cusolverDnDpotrfBatched(handle_, uplo_, m_, d_array_addr, lda_, d_info_array_addr, batch_),
        "cusolver cholesky batched Fail");
    } else {
      MS_LOG(EXCEPTION) << "cholesky factorization do not support other data type but only float or double, right now.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
    }
    size_t output_elements = outputs.at(kDim0)->size / unit_size_;
    // copy results from written input's matrix to output's matrix by up or lower flag.
@@ -243,7 +246,7 @@ class CholeskyGpuKernel : public GpuKernel {
        kernel_node_, cusolverDnDpotrfBatched(handle_, uplo_, m_, d_array_addr, lda_, d_info_array_addr, batch_),
        "cusolver cholesky batched Fail");
    } else {
      MS_LOG(EXCEPTION) << "cholesky factorization do not support other data type but only float or double, right now.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
    }

    TriangleMatrixCopy(d_batch_input_addr, output_addr, uplo_, outputs[0]->size / sizeof(T), ldb_, m_,
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_solve_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_solve_gpu_kernel.h
@@ -19,6 +19,7 @@
 #include <cublas_v2.h>
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <algorithm>
 #include "backend/kernel_compiler/gpu/cuda_impl/triangle_matrix_copy_impl.cuh"
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
@@ -41,7 +42,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
 public:
  using pointer = T *;

  CholeskySolveGpuKernel() = default;
  CholeskySolveGpuKernel() : is_null_input_(false) {}
  ~CholeskySolveGpuKernel() = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -49,6 +50,9 @@ class CholeskySolveGpuKernel : public GpuKernel {

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (is_null_input_) {
      return true;
    }
    CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cusolverDnSetStream failed");
    auto input_a_addr = GetDeviceAddress<T>(inputs, kDim0);
@@ -81,7 +85,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
                                                             d_b_array_addr, ldb_, d_info_array_addr, batch_),
                                     "cusolver cholesky solve batched Fail");
    } else {
      MS_LOG(EXCEPTION) << "cholesky solve do not support other data type but only float or double, right now.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
    }
    size_t output_elements = outputs.at(kDim0)->size / unit_size_;
    // copy results from written input's matrix to output's matrix.
@@ -90,6 +94,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    lower_ = static_cast<bool>(GetAttr<bool>(kernel_node, kLower));
    // gpu input is col major default, so need to change row major.
@@ -103,10 +108,14 @@ class CholeskySolveGpuKernel : public GpuKernel {
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
    auto in_a_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kDim0);
    auto in_b_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kDim1);
    if (CHECK_NULL_INPUT(in_a_shape) || CHECK_NULL_INPUT(in_b_shape)) {
      MS_LOG(EXCEPTION) << "For 'CholeskySolveGpuKernel', input is null";
    is_null_input_ =
      CHECK_SHAPE_NULL(in_a_shape, kernel_name_, "input_a") || CHECK_SHAPE_NULL(in_b_shape, kernel_name_, "input_b");
    if (is_null_input_) {
      InitSizeLists();
      return true;
    }
    return InitDim(in_a_shape, in_b_shape);
    (void)InitDim(in_a_shape, in_b_shape);
    return true;
  }

 protected:
@@ -127,7 +136,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
  }

 private:
  bool InitDim(const std::vector<size_t> &in_a_shape, const std::vector<size_t> &in_b_shape) {
  void InitDim(const std::vector<size_t> &in_a_shape, const std::vector<size_t> &in_b_shape) {
    if (in_a_shape.size() == kCholeskyDefaultShape) {
      batch_ = 1;
      cho_row_ = in_a_shape.at(kDim0);
@@ -141,17 +150,14 @@ class CholeskySolveGpuKernel : public GpuKernel {
      cho_row_ = in_a_shape.at(kDim1);
      cho_col_ = in_a_shape.at(kDim2);
    } else {
      MS_LOG(ERROR) << "Input Only support Rank 2 OR 3";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input only should be 2 or 3";
    }
    if (cho_row_ != cho_col_) {
      MS_LOG(ERROR) << "Cholesky need square matrix as input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
    }
    size_t b_row = in_b_shape.size() == kCholeskyBatchedShape ? in_b_shape.at(kDim1) : in_b_shape.at(kDim0);
    if (cho_row_ != b_row) {
      MS_LOG(ERROR) << "Cholesky right hand matrix is not equal to left matrix.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', right hand matrix should be equal to left matrix";
    }
    m_ = SizeToInt(in_a_shape.at(kDim1));
    lda_ = m_;
@@ -159,7 +165,6 @@ class CholeskySolveGpuKernel : public GpuKernel {
    h_a_array_.resize(batch_);
    h_b_array_.resize(batch_);
    InitSizeLists();
    return true;
  }
  size_t cho_row_{0};
  size_t cho_col_{0};
@@ -177,6 +182,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  bool is_null_input_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_trsm_solve_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cholesky_trsm_solve_gpu_kernel.h
@@ -19,6 +19,7 @@
 #include <cublas_v2.h>
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <algorithm>
 #include "backend/kernel_compiler/gpu/cuda_impl/eye_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/matrix_split_impl.cuh"
@@ -67,13 +68,13 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
    blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
    auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(in_shape);
    is_null_input_ = CHECK_SHAPE_NULL(in_shape, kernel_name_, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'CholeskyTrsmSolveGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
@@ -84,12 +85,10 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
      }
    } else {
      if (in_shape.size() != 2) {
        MS_LOG(ERROR) << "CholeskyTrsm Split Matrix Need Input Rank as 2.";
        return false;
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input only should be 2";
      }
      if (in_shape[0] != in_shape[1]) {
        MS_LOG(ERROR) << "CholeskyTrsm Split Matrix Need Square Matrix as Input.";
        return false;
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
      }
      InitDimOthers(kernel_node, in_shape);
    }
@@ -201,17 +200,19 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
    if (in_shape.size() == 2) {
      batch_ = 1;
      if (in_shape[0] != in_shape[1]) {
        MS_LOG(ERROR) << "CholeskyTrsm shape0: " << in_shape[0] << ", is not equal to shape1: " << in_shape[1];
        MS_LOG(ERROR) << "For '" << kernel_name_ << "', shape0 should be equal to " << in_shape[1] << ", but got "
                      << in_shape[0];
        return false;
      }
    } else if (in_shape.size() == 3) {
      batch_ = SizeToInt(in_shape[0]);
      if (in_shape[1] != in_shape[2]) {
        MS_LOG(ERROR) << "CholeskyTrsm shape1: " << in_shape[1] << ", is not equal to shape2: " << in_shape[2];
        MS_LOG(ERROR) << "For '" << kernel_name_ << "', shape1 should be equal to " << in_shape[2] << ", but got "
                      << in_shape[1];
        return false;
      }
    } else {
      MS_LOG(ERROR) << "Input Only support Rank 2 OR 3";
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input only should be 2 or 3";
      return false;
    }

--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cumprod_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cumprod_gpu_kernel.h
@@ -55,15 +55,15 @@ class CumProdGpuKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but CumProdGpuKernel needs 1.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }
    input_size_0_ = sizeof(T);
    shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(shape_);
    is_null_input_ = CHECK_SHAPE_NULL(shape_, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'CumProdGpuKernel', input is null.";
      InitSizeLists();
      return true;
    }
@@ -72,7 +72,8 @@ class CumProdGpuKernel : public GpuKernel {
    reverse_ = GetAttr<bool>(kernel_node, "reverse");
    int input_dim_length = SizeToInt(shape_.size());
    if (axis_ >= input_dim_length) {
      MS_LOG(EXCEPTION) << "Axis is: " << axis_ << " out of bounds.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of 'axis' should be less than " << input_dim_length
                        << ", but got " << axis_;
    }
    while (axis_ < 0) {
      axis_ += input_dim_length;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cumsum_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/cumsum_gpu_kernel.h
@@ -55,15 +55,15 @@ class CumSumGpuKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but CumSumGpuKernel needs 1.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }
    input_size_0_ = sizeof(T);
    shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(shape_);
    is_null_input_ = CHECK_SHAPE_NULL(shape_, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'CumSumGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
@@ -72,7 +72,8 @@ class CumSumGpuKernel : public GpuKernel {
    reverse_ = GetAttr<bool>(kernel_node, "reverse");
    int input_dim_length = SizeToInt(shape_.size());
    if (axis_ >= input_dim_length) {
      MS_LOG(EXCEPTION) << "Axis is: " << axis_ << " out of bounds.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of 'axis' should be less than " << input_dim_length
                        << ", but got " << axis_;
    }
    while (axis_ < 0) {
      axis_ += input_dim_length;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/determinant_triangle_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/determinant_triangle_gpu_kernel.h
@@ -47,7 +47,8 @@ class DetTriangleGpuKernel : public GpuKernel {

    if (!CheckTriangle(input_addr, fill_mode_, matrix_n_, outputs[0]->size / sizeof(T),
                       reinterpret_cast<cudaStream_t>(stream_ptr))) {
      MS_LOG(ERROR) << "The elements in the upper half of the matrix should be all 0, fill mode is: " << fill_mode_;
      MS_LOG(ERROR) << "For '" << kernel_name_
                    << "', the elements in the upper half of the matrix should be all 0, fill mode is: " << fill_mode_;
      return false;
    }
    DetTriangle(input_addr, output_addr, matrix_n_, outputs[0]->size / sizeof(T),
@@ -56,21 +57,20 @@ class DetTriangleGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but DetTriangle needs 1 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but DetTriangle needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape) || CHECK_NULL_INPUT(output_shape);
    is_null_input_ =
      CHECK_SHAPE_NULL(input_shape, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'DeterminantTriangleGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
@@ -79,8 +79,8 @@ class DetTriangleGpuKernel : public GpuKernel {
    }

    if (input_shape.size() < 2) {
      MS_LOG(ERROR) << "The input should have rank at least 2.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be less than 2, but got "
                        << input_shape.size();
    }

    matrix_n_ = input_shape[input_shape.size() - 1];
@@ -89,12 +89,11 @@ class DetTriangleGpuKernel : public GpuKernel {
      output_size_ *= output_shape[i];
    }
    if (matrix_n_ == 0 || output_size_ != input_size_ / matrix_n_ / matrix_n_) {
      MS_LOG(ERROR) << "The output shape is wrong.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of output should be "
                        << (input_size_ / matrix_n_ / matrix_n_) << ", but got " << output_size_;
    }
    if (input_shape[input_shape.size() - 2] != input_shape[input_shape.size() - 1]) {
      MS_LOG(ERROR) << "The matrix should be in shape of square.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
    }
    auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
    MS_EXCEPTION_IF_NULL(prim);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_c_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_c_gpu_kernel.h
@@ -21,6 +21,7 @@
 #include <cusolverDn.h>
 #include <cuda_runtime.h>
 #include <vector>
 #include <string>
 #include <complex>
 #include <algorithm>
 #include <type_traits>
@@ -52,13 +53,14 @@ struct Complex_traits<Complex<T>> {
 template <typename T>
 class EighcGpuKernel : public GpuKernel {
 public:
  EighcGpuKernel() = default;
  EighcGpuKernel() : is_null_input_(false) {}
  ~EighcGpuKernel() = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
    dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
    auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
@@ -70,13 +72,14 @@ class EighcGpuKernel : public GpuKernel {
      jobz_ = CUSOLVER_EIG_MODE_NOVECTOR;
    }
    cusolver_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
    bool is_null_input = CHECK_NULL_INPUT(A_shape);
    if (is_null_input) {
      MS_LOG(EXCEPTION) << "For 'EighValue GpuKernel', input is null";
    is_null_input_ = CHECK_SHAPE_NULL(A_shape, kernel_name_, "input");
    if (is_null_input_) {
      InitSizeLists();
      return true;
    }
    if (A_shape.size() != kShape2dDims || A_shape[1] != A_shape[1]) {
      MS_LOG(EXCEPTION) << "wrong array shape, A should be a square matrix, but got [" << A_shape[0] << " X "
                        << A_shape[1] << "]";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of input should be square matrix, but got ["
                        << A_shape[0] << " X " << A_shape[1] << "]";
    }
    m_ = A_shape[0];
    InitSizeLists();
@@ -85,6 +88,9 @@ class EighcGpuKernel : public GpuKernel {

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (is_null_input_) {
      return true;
    }
    CHECK_CUBLAS_RET_WITH_ERROR(cublasSetStream(blas_handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "cublasSetStream failed");
    CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(cusolver_handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
@@ -136,7 +142,7 @@ class EighcGpuKernel : public GpuKernel {
    }
    d_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(sizeof(T) * lwork);
    if (!d_work) {
      MS_LOG(EXCEPTION) << "GPU memory alloca failed.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', GPU memory alloca failed.";
    }
    if constexpr (std::is_same_v<T, Complex<float>>) {
      cusolverDnCheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(w_v_addr), lda_, w_w_addr,
@@ -198,6 +204,7 @@ class EighcGpuKernel : public GpuKernel {
  std::vector<size_t> output_size_list_{};
  std::vector<size_t> workspace_size_list_{};
  using D = typename Complex_traits<T>::value_type;
  bool is_null_input_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_gpu_kernel.h
@@ -39,13 +39,14 @@ constexpr char LOWER[] = "lower";
 template <typename T>
 class EighGpuKernel : public GpuKernel {
 public:
  EighGpuKernel() = default;
  EighGpuKernel() : is_null_input_(false) {}
  ~EighGpuKernel() = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
    auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    compute_eigen_vectors_ = static_cast<bool>(GetAttr<bool>(kernel_node, C_EIEH_VECTOR));
@@ -56,13 +57,14 @@ class EighGpuKernel : public GpuKernel {
      jobz_ = CUSOLVER_EIG_MODE_NOVECTOR;
    }
    cusolver_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
    bool is_null_input = CHECK_NULL_INPUT(A_shape);
    if (is_null_input) {
      MS_LOG(EXCEPTION) << "For 'EighValue GpuKernel', input is null";
    is_null_input_ = CHECK_SHAPE_NULL(A_shape, kernel_name, "input");
    if (is_null_input_) {
      InitSizeLists();
      return true;
    }
    if (A_shape.size() != kShape2dDims || A_shape[0] != A_shape[1]) {
      MS_LOG(EXCEPTION) << "wrong array shape, A should be a square matrix, but got [" << A_shape[0] << " X "
                        << A_shape[1] << "]";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the shape of input should be square matrix, but got ["
                        << A_shape[0] << " X " << A_shape[1] << "]";
    }
    m_ = A_shape[0];
    InitSizeLists();
@@ -71,6 +73,9 @@ class EighGpuKernel : public GpuKernel {

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (is_null_input_) {
      return true;
    }
    CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(cusolver_handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cusolverDnSetStream failed");
    // matrix A, input or output(eigenvector)
@@ -152,6 +157,7 @@ class EighGpuKernel : public GpuKernel {
  cusolverEigMode_t jobz_ = CUSOLVER_EIG_MODE_NOVECTOR;
  bool compute_eigen_vectors_{false};
  bool lower_{true};
  bool is_null_input_;
  std::vector<T *> h_array_{};
  std::vector<size_t> input_size_list_{};
  std::vector<size_t> output_size_list_{};
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/equalcount_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/equalcount_gpu_kernel.h
@@ -49,23 +49,21 @@ class EqualCountGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but equalcount needs 2 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 2, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but equalcount needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }

    output_size_ = sizeof(T);
    input_size_ = sizeof(T);
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'EqualcountGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/float_status_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/float_status_gpu_kernel.h
@@ -77,13 +77,11 @@ class FloatStatusGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    if (!CheckParam(kernel_node)) {
      return false;
    }
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    (void)CheckParam(kernel_node);
    auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(shape);
    is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'FloatStatusGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
@@ -91,10 +89,10 @@ class FloatStatusGpuKernel : public GpuKernel {
    for (size_t x : shape) {
      input_size_ = input_size_ * x;
    }
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kOpTypeMap.find(kernel_name);
    if (iter == kOpTypeMap.end()) {
      MS_LOG(EXCEPTION) << "FloatStatus kernel " << kernel_name << " is not supported.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: FloatStatus, IsInf, IsNan, IsFinite "
                        << "currently, but got " << kernel_name;
    }
    kernel_name_ = iter->second;

@@ -114,18 +112,16 @@ class FloatStatusGpuKernel : public GpuKernel {
  }

 private:
  bool CheckParam(const CNodePtr &kernel_node) {
  void CheckParam(const CNodePtr &kernel_node) {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but FloatStatusGpuKernel needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but FloatStatusGpuKernel needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    return true;
  }

  std::vector<size_t> input_size_list_;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/identity_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/identity_gpu_kernel.h
@@ -52,20 +52,18 @@ class IdentityGpuKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but identity needs 1 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but identity needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "IdentityGpuKernel input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/index_add_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/index_add_gpu_kernel.h
@@ -62,18 +62,20 @@ class IndexAddGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 3) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but index add needs 3 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
    }
    std::vector<size_t> dst_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    std::vector<size_t> index_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    std::vector<size_t> src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    is_null_input_ = CHECK_NULL_INPUT(dst_shape) || CHECK_NULL_INPUT(index_shape) || CHECK_NULL_INPUT(src_shape);
    is_null_input_ = CHECK_SHAPE_NULL(dst_shape, kernel_name, "x") ||
                     CHECK_SHAPE_NULL(index_shape, kernel_name, "indices") ||
                     CHECK_SHAPE_NULL(src_shape, kernel_name, "y");

    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'IndexAddGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
@@ -91,8 +93,8 @@ class IndexAddGpuKernel : public GpuKernel {
      inner_size_ *= src_shape[i];
    }
    if (axis < 0 || axis >= SizeToInt(src_shape.size()) || axis >= SizeToInt(dst_shape.size())) {
      MS_LOG(EXCEPTION) << "Init axis size failed, actual src axis size is " << src_axis_size_
                        << ", actual dst axis size is " << dst_axis_size_;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the size of 'axis' cannot be greater than or equal to "
                        << SizeToInt(src_shape.size()) << " or " << SizeToInt(dst_shape.size()) << ", but got " << axis;
    }
    src_axis_size_ = src_shape[axis];
    dst_axis_size_ = dst_shape[axis];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/linspace.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/linspace.h
@@ -49,37 +49,34 @@ class LinSpaceGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 3) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but DynamicLinSpace needs 3 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but DynamicLinSpace needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    auto input_2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
    auto value_count = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_1) || CHECK_NULL_INPUT(input_2) || CHECK_NULL_INPUT(value_count);
    is_null_input_ = CHECK_SHAPE_NULL(input_1, kernel_name, "start") ||
                     CHECK_SHAPE_NULL(input_2, kernel_name, "stop") ||
                     CHECK_SHAPE_NULL(value_count, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'LinspaceGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
    // error checking input data
    if ((input_1.size() != 0) || (input_2.size() != 0)) {
      MS_LOG(ERROR) << "For LinShape "
                    << "both start and end must be 0-D Tensors. Got " << input_1.size() << " and " << input_2.size()
                    << ".";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', both start and end should be 0-D Tensors, but got dimension "
                        << "of start: " << input_1.size() << " and dimension of end: " << input_2.size();
    }

    if (value_count.size() != 1) {
      MS_LOG(ERROR) << "For LinShape, output shape incorrect rank. Expect Rank: 1, got Rank: " << value_count.size()
                    << ".";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output should be 1, but got "
                        << value_count.size();
    }
    value_count_ = value_count[0];
    InitSizeLists();
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/logical_not_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/logical_not_gpu_kernel.h
@@ -49,11 +49,11 @@ class LogicalNotGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'LogicalGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/lu_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/lu_gpu_kernel.h
@@ -19,6 +19,7 @@
 #include <cublas_v2.h>
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <algorithm>
 #include <type_traits>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
@@ -39,7 +40,7 @@ constexpr size_t kLuNormalShape = 2;
 template <typename T>
 class LUGpuKernel : public GpuKernel {
 public:
  LUGpuKernel() = default;
  LUGpuKernel() : is_null_input_(false) {}
  ~LUGpuKernel() = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -47,6 +48,9 @@ class LUGpuKernel : public GpuKernel {

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (is_null_input_) {
      return true;
    }
    CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cusolverDnSetStream failed");
    auto input_addr = GetDeviceAddress<T>(inputs, kDim0);
@@ -65,7 +69,7 @@ class LUGpuKernel : public GpuKernel {
                                     "cusolver query lu work size fail");

      if (cudaMalloc(reinterpret_cast<void **>(&d_work_), unit_size_ * lwork_) != cudaSuccess) {
        MS_LOG(EXCEPTION) << "cusolver malloc work size fail";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', cusolver malloc work size fail";
      }

      CHECK_CUSOLVER_RET_WITH_EXCEPT(
@@ -79,7 +83,7 @@ class LUGpuKernel : public GpuKernel {
      // 5. malloc device working space of getrf

      if (cudaMalloc(reinterpret_cast<void **>(&d_work_), unit_size_ * lwork_) != cudaSuccess) {
        MS_LOG(EXCEPTION) << "cusolver malloc work size fail";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', cusolver malloc work size fail";
      }

      // 6. solve to lu factorization according to cuSolver api, outputs have been written to input's matrix.
@@ -87,7 +91,7 @@ class LUGpuKernel : public GpuKernel {
        kernel_node_, cusolverDnDgetrf(handle_, m_, m_, input_addr, lda_, d_work_, piv_output_addr, info_output_addr),
        "cusolver lu fail");
    } else {
      MS_LOG(EXCEPTION) << "cholesky factorization do not support other data type but only float or double, right now.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
    }
    // 7. copy results from written input's matrix to output's matrix.
    //    if (cudaMemcpy(output_addr, input_addr, lda_ * m_ * unit_size_, cudaMemcpyDeviceToDevice) != cudaSuccess) {
@@ -101,14 +105,16 @@ class LUGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    // 1. get CuSolver Dense matrix handler
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
    auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    // 2. check input shape not null
    bool is_null_input = CHECK_NULL_INPUT(in_shape);
    if (is_null_input) {
      MS_LOG(EXCEPTION) << "For 'PureCholeskyGpuKernel', input is null";
    is_null_input_ = CHECK_SHAPE_NULL(in_shape, kernel_name_, "input");
    if (is_null_input_) {
      InitSizeLists();
      return true;
    }
    // 3. calculate input size
    if (!InitInputSize(in_shape)) {
@@ -126,11 +132,11 @@ class LUGpuKernel : public GpuKernel {
      lu_row_ = in_shape.at(kDim0);
      lu_col_ = in_shape.at(kDim1);
    } else {
      MS_LOG(ERROR) << "Input Only support Rank 1 OR 2";
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input only should be 1 or 2";
      return false;
    }
    if (lu_row_ != lu_col_) {
      MS_LOG(ERROR) << "Cholesky need square matrix as input.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
      return false;
    }
    // set matrix row or col to be lead dimension
@@ -170,6 +176,7 @@ class LUGpuKernel : public GpuKernel {
  std::vector<size_t> input_size_list_{};
  std::vector<size_t> output_size_list_{};
  std::vector<size_t> workspace_size_list_{};
  bool is_null_input_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/matmul_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/matmul_gpu_kernel.h
@@ -91,13 +91,14 @@ class MatMulGpuKernel : public GpuKernel {
          "cublasGemmStridedBatchedEx failed. Possible reasons: the GPU is occupied by other processes.");
      }
    } catch (const std::exception &e) {
      MS_LOG(EXCEPTION) << "Encountered an exception: " << e.what() << " when invoke cublas "
                        << (batch_ == 1 ? "cublasGemmEx" : "cublasGemmStridedBatchedEx");
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', encountered an exception: " << e.what() << " when invoke "
                        << "cublas " << (batch_ == 1 ? "cublasGemmEx" : "cublasGemmStridedBatchedEx");
    }
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
    dtype_a_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
@@ -105,7 +106,7 @@ class MatMulGpuKernel : public GpuKernel {
    dtype_c_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0)));
    auto node_name = AnfAlgo::GetCNodeName(kernel_node);
    if (dtype_a_ != dtype_b_ || dtype_a_ != dtype_c_) {
      MS_LOG(EXCEPTION) << "input and output types are not the same in " << node_name;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', input and output types are not the same in " << node_name;
    }
    if (dtype_a_ == CUDA_R_16F && dtype_b_ == CUDA_R_16F && dtype_c_ == CUDA_R_16F) {
      MS_LOG(INFO) << "input and output type is float16, allow to use Tensor Core operations if possible";
@@ -113,15 +114,16 @@ class MatMulGpuKernel : public GpuKernel {
    }
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    auto input1_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(output_shape) || CHECK_NULL_INPUT(input1_shape);
    is_null_input_ =
      CHECK_SHAPE_NULL(input1_shape, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'MatmulGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
    auto dims = output_shape.size();
    if (dims < 2) {
      MS_LOG(EXCEPTION) << "Output dims " << dims << " not support.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of output cannot be less than 2, but got "
                        << dims;
    }

    m_ = output_shape[dims - 2];
@@ -139,7 +141,7 @@ class MatMulGpuKernel : public GpuKernel {
    } else if (!transpose && input1_shape.size() > (dims - 1)) {
      k_ = input1_shape[dims - 1];
    } else {
      MS_LOG(EXCEPTION) << "Init k_ via input1_shape failed.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', init k_ via input1_shape failed.";
    }

    transpose = GetAttr<bool>(kernel_node, "transpose_x2");
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/matrix_inverse_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/matrix_inverse_gpu_kernel.h
@@ -19,6 +19,7 @@
 #include <cublas_v2.h>
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <type_traits>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
@@ -90,28 +91,30 @@ class MatrixInverseGpuKernel : public GpuKernel {
                            reinterpret_cast<double **>(inv_batch_addr), len, info_addr, batchsize),
        "cublas trsm batched Fail");
    } else {
      MS_LOG(EXCEPTION) << "The data type entered must be float or double.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type entered must be float or double.";
    }

    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name_, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'MatrixInverseGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
    if (input_shape.size() < 2) {
      MS_LOG(EXCEPTION) << "The dim entered needs to be greater than 2, but " << input_shape.size() << " was taken";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be less than 2, but got "
                        << input_shape.size();
    }
    size_t last_index = input_shape.size() - 1;
    if (input_shape[last_index] != input_shape[last_index - 1]) {
      MS_LOG(EXCEPTION) << "The last two dimensions of the input matrix should be equal!";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the last two dimensions of the input matrix should be equal, "
                        << "but got one: " << input_shape[last_index] << ", another: " << input_shape[last_index - 1];
    }
    size_ = input_shape[last_index];
    for (size_t i = 0; i < last_index - 1; i++) {
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/multinomial_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/multinomial_gpu_kernel.h
@@ -59,7 +59,7 @@ class MultinomialGpuKernel : public GpuKernel {
    T *probs_addr = GetDeviceAddress<T>(inputs, 0);
    int64_t *num_sample_addr = GetDeviceAddress<int64_t>(inputs, 1);
    if (distributions_ == 0) {
      MS_LOG(ERROR) << "Divide by zero. the distributions_ is 0.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << "', divide by zero. the distributions_ is 0.";
      return false;
    }

@@ -83,22 +83,20 @@ class MultinomialGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but multinomial needs 2 input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but multinomial needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape_0 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape_0) || CHECK_NULL_INPUT(output_shape);
    is_null_input_ =
      CHECK_SHAPE_NULL(input_shape_0, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'MultinomialGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h
@@ -65,24 +65,22 @@ class NMSWithMaskGpuFwdKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    iou_value_ = GetAttr<float>(kernel_node, "iou_threshold");

    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but NMSWithMask needs 1 input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }

    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 3) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but NMSWithMask needs 3 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 3, but got " << output_num;
    }

    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'NMSWithMaskGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/random_op_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/random_op_gpu_kernel.h
@@ -120,7 +120,7 @@ class RandomOpGpuKernel : public GpuKernel {
                              inputs[2]->size / sizeof(T), output_addr, outputs[0]->size / sizeof(T),
                              reinterpret_cast<cudaStream_t>(stream_ptr));
        if (!ret) {
          MS_LOG(ERROR) << "For UniformInt op, `minval` should be strictly less than `maxval`";
          MS_LOG(ERROR) << "For '" << kernel_name_ << "', `minval` should be strictly less than `maxval`";
          return false;
        }
        break;
@@ -148,7 +148,8 @@ class RandomOpGpuKernel : public GpuKernel {
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Random operation " << random_op_type_ << " is not supported.";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: StandardNormal, CudnnUniformReal, "
                          << "UniformInt, UniformReal currently, but got " << random_op_type_;
      }
    }
    return true;
@@ -158,29 +159,27 @@ class RandomOpGpuKernel : public GpuKernel {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kRandomOpTypeMap.find(kernel_name);
    if (iter == kRandomOpTypeMap.end()) {
      MS_LOG(EXCEPTION) << "Random operation " << kernel_name << " is not supported.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: StandardNormal, CudnnUniformReal, "
                        << "UniformInt, UniformReal currently, but got " << kernel_name;
    } else {
      random_op_type_ = iter->second;
    }
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if ((random_op_type_ == RANDOM_OP_NORMAL || random_op_type_ == RANDOM_OP_UNIFORM_REAL) && input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 1 input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }
    if (random_op_type_ == RANDOM_OP_UNIFORM_INT && input_num != 3) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 3 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but random op needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape_0 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape_0) || CHECK_NULL_INPUT(output_shape);
    is_null_input_ =
      CHECK_SHAPE_NULL(input_shape_0, kernel_name, "input") || CHECK_SHAPE_NULL(output_shape, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'RandomOpGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/square_sum_all_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/square_sum_all_gpu_kernel.h
@@ -51,11 +51,12 @@ class SquareSumAllGpuFwdKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(ERROR) << "SquareSumAllGpuFwdKernel input is null";
      return false;
      InitSizeLists();
      return true;
    }
    for (size_t i = 0; i < input_shape.size(); i++) {
      input_size_ *= input_shape[i];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/squared_difference_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/squared_difference_kernel.h
@@ -56,18 +56,21 @@ class SquaredDifferenceOpGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto input_shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    auto input_shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
    auto output_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape1) || CHECK_NULL_INPUT(input_shape2) || CHECK_NULL_INPUT(output_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape1, kernel_name, "input") ||
                     CHECK_SHAPE_NULL(input_shape2, kernel_name, "input") ||
                     CHECK_SHAPE_NULL(output_shape, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'SquaredDifferenceGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
    need_broadcast_ = IsBroadcast(input_shape1, input_shape2);
    if (need_broadcast_ && output_shape.size() > MAX_DIMS) {
      MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than " << MAX_DIMS;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be greater than " << MAX_DIMS
                        << ", but got " << output_shape.size();
    }

    lhs_shape_.resize(MAX_DIMS, 1);
@@ -86,7 +89,8 @@ class SquaredDifferenceOpGpuKernel : public GpuKernel {
          lhs_shape_[j + lhs_offset] = input_shape1[j];
        } else {
          auto index = j + lhs_offset;
          MS_LOG(EXCEPTION) << "Invalid input1 index: " << index;
          MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the index of input cannot be " << index << ", but got "
                            << index;
        }
      }
      input1_num_ *= input_shape1[j];
@@ -98,7 +102,8 @@ class SquaredDifferenceOpGpuKernel : public GpuKernel {
          rhs_shape_[k + rhs_offset] = input_shape2[k];
        } else {
          auto index = k + rhs_offset;
          MS_LOG(EXCEPTION) << "Invalid input2 index: " << index;
          MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the index of input cannot be " << index << ", but got "
                            << index;
        }
      }
      input2_num_ *= input_shape2[k];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/trsm_solve_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/trsm_solve_gpu_kernel.h
@@ -118,28 +118,30 @@ class TrsmGpuKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
    auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto b_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    is_null_input_ = CHECK_NULL_INPUT(A_shape) || CHECK_NULL_INPUT(b_shape);
    is_null_input_ =
      CHECK_SHAPE_NULL(A_shape, kernel_name, "input_A") || CHECK_SHAPE_NULL(b_shape, kernel_name, "input_b");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'TrsmGpuKernel', input is null";
      InitSizeLists();
      return true;
    }

    if (A_shape[kDim0] != A_shape[kDim1]) {
      MS_LOG(EXCEPTION) << "wrong array shape, A should be a squre matrix, but got [" << A_shape[kDim0] << " X "
                        << A_shape[kDim1] << "]";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the shape of input should be square matrix, but got ["
                        << A_shape[kDim0] << " X " << A_shape[kDim1] << "]";
    }
    m_ = A_shape[kDim0];

    if (b_shape.size() != kAVectorxDimNum && b_shape.size() != kAMatrixDimNum) {
      MS_LOG(EXCEPTION) << "wrong array shape, b should be 1D or 2D, but got [" << b_shape.size() << "] dimensions";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 1 or 2, but got "
                        << b_shape.size();
    }
    if (b_shape[kDim0] != m_) {
      MS_LOG(EXCEPTION) << "wrong array shape, b should match the shape of A, excepted [" << m_ << "] but got ["
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the shape of input should be [" << m_ << "], but got ["
                        << b_shape[kDim0] << "]";
    }
    if (b_shape.size() == kAVectorxDimNum || (b_shape.size() == kAMatrixDimNum && b_shape[kDim1] == 1)) {
@@ -158,7 +160,7 @@ class TrsmGpuKernel : public GpuKernel {
    } else if (trans == "T") {
      trans_ = CUBLAS_OP_N;
    } else {
      MS_LOG(EXCEPTION) << "trans should be in [N, T], but got [" << trans << "]";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', trans should be in [N, T], but got [" << trans << "]";
    }

    bool lower = AnfAlgo::GetNodeAttr<bool>(kernel_node, "lower");
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_complex_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_complex_gpu_kernel.h
@@ -64,29 +64,28 @@ class UnaryOpComplexGpuKernel : public GpuKernel {
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported.";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: Real, Imag, Conj currently, "
                          << "but got " << unary_op_type_;
      }
    }

    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    GetOpType(kernel_node);
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but unary op needs 1 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 3, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name_, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "UnaryOpGpuKernel input is null";
      InitSizeLists();
      return true;
    }
@@ -123,8 +122,8 @@ class UnaryOpComplexGpuKernel : public GpuKernel {
      unary_op_type_ = iter->second;
      return;
    }

    MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
    MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: Real, Imag, Conj currently, but got "
                      << kernel_name;
  }

 private:
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h
@@ -184,7 +184,9 @@ class UnaryOpGpuKernel : public GpuKernel {
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported.";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
                          << " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
                          << "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << unary_op_type_;
      }
    }
    return true;
@@ -193,23 +195,22 @@ class UnaryOpGpuKernel : public GpuKernel {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kUnaryOpTypeMap.find(kernel_name);
    if (iter == kUnaryOpTypeMap.end()) {
      MS_LOG(EXCEPTION) << "Unary operation " << kernel_name << " is not supported.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
                        << " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
                        << "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << kernel_name;
    }
    unary_op_type_ = iter->second;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but unary op needs 1 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'UnaryOpGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_grad_gpu_kernel.h
@@ -112,7 +112,9 @@ class UnaryGradOpGpuKernel : public GpuKernel {
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Unary grad operation " << unary_grad_op_type_ << " is not supported.";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: SqrtGrad, RsqrtGrad, AsinGrad, "
                          << "ACosGrad, AtanGrad, AsinhGrad, AcoshGrad, ReciprocalGrad currently, but got "
                          << unary_grad_op_type_;
      }
    }
    return true;
@@ -121,23 +123,22 @@ class UnaryGradOpGpuKernel : public GpuKernel {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kUnaryGradOpTypeMap.find(kernel_name);
    if (iter == kUnaryGradOpTypeMap.end()) {
      MS_LOG(EXCEPTION) << "Unary grad operation " << kernel_name << " is not supported.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: SqrtGrad, RsqrtGrad, AsinGrad, "
                        << "ACosGrad, AtanGrad, AsinhGrad, AcoshGrad, ReciprocalGrad currently, but got "
                        << kernel_name;
    }
    unary_grad_op_type_ = iter->second;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but unary grad op needs 2 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 2, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but unary grad op needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'UnaryOpGradGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
@@ -145,9 +146,8 @@ class UnaryGradOpGpuKernel : public GpuKernel {
      input_size_ *= input_shape[i];
    }
    auto dx_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    is_null_input_ = CHECK_NULL_INPUT(dx_shape);
    is_null_input_ = CHECK_SHAPE_NULL(dx_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "UnaryGradOpGpuKernel input 1 is null";
      InitSizeLists();
      return true;
    }
@@ -155,7 +155,8 @@ class UnaryGradOpGpuKernel : public GpuKernel {
      dx_size_ *= dx_shape[i];
    }
    if (input_size_ != dx_size_) {
      MS_LOG(WARNING) << "UnaryGradOpGpuKernel inputs should be same, but got " << input_size_ << " and " << dx_size_;
      MS_LOG(WARNING) << "For '" << kernel_name << "', both inputs should be equal, but got " << input_size_ << " and "
                      << dx_size_;
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h
@@ -19,6 +19,7 @@
 #include <cublas_v2.h>
 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
@@ -98,7 +99,8 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
                                   CUDA_R_32F, algo_),
        "cublasSgemm Call Fail");
    } catch (const std::exception &e) {
      MS_LOG(EXCEPTION) << "Encountered an exception: " << e.what() << "when invoke cubals cublasGemmStridedBatchedEx";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", encountered an exception: " << e.what()
                        << " when invoke cubals cublasGemmStridedBatchedEx";
    }

    auto r_input_addr = workspace1_addr;
@@ -147,9 +149,7 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
    if (!SetProperty(kernel_node)) {
      return false;
    }
    (void)SetProperty(kernel_node);
    InitSizeLists();
    return true;
  }
@@ -188,22 +188,20 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
  }

 private:
  bool SetProperty(const CNodePtr &kernel_node) {
  void SetProperty(const CNodePtr &kernel_node) {
    auto matrix_a_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto gradient_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto matrix_g_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    is_null_input_ =
      CHECK_NULL_INPUT(matrix_a_shape) || CHECK_NULL_INPUT(gradient_shape) || CHECK_NULL_INPUT(matrix_g_shape);
    is_null_input_ = CHECK_SHAPE_NULL(matrix_a_shape, kernel_name_, "matrix_a") ||
                     CHECK_SHAPE_NULL(gradient_shape, kernel_name_, "gradient") ||
                     CHECK_SHAPE_NULL(matrix_g_shape, kernel_name_, "matrix_g");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'UpdateThorGradientGpuKernel', input is null";
      InitSizeLists();
      return true;
      return;
    }

    split_dim = LongToSize(GetAttr<int64_t>(kernel_node, "split_dim"));
    if (split_dim == 0) {
      MS_LOG(ERROR) << "Divide by zero, split_dim can not be zero.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", divide by zero, split_dim cannot be 0, but got " << split_dim;
    }
    gradient_size.batch_h = gradient_shape[0] / split_dim;
    gradient_size.batch_w = gradient_shape[1] / split_dim;
@@ -244,7 +242,6 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
    gradient_size.ori_w = gradient_shape[1];
    gradient_size.ori_h = gradient_shape[0];
    gradient_size.dtype = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1)));
    return true;
  }

  size_t split_dim;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_collective_gpu_kernel.h
@@ -71,13 +71,15 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported.";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: AllReduce, AllGather, Broadcast, "
                          << "ReduceScatter currently, but got " << nccl_kernel_type_;
      }
    }
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    kernel_node_ = kernel_node;
    nccl_data_type_ = nccl_dtype(AnfAlgo::GetInputDeviceDataType(kernel_node, 0));
@@ -87,9 +89,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    for (size_t i = 0; i < input_num; ++i) {
      auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(shape);
      is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'NcclCollectiveGpuKernel', input is null";
        InitSizeLists();
        return true;
      }
@@ -103,9 +104,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
    }
    for (size_t i = 0; i < output_num; ++i) {
      auto shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(shape);
      is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "output");
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'NcclCollectiveGpuKernel', output is null";
        InitSizeLists();
        return true;
      }
@@ -201,7 +201,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kNcclTypeMap.find(kernel_name);
    if (iter == kNcclTypeMap.end()) {
      MS_LOG(EXCEPTION) << "Kernel " << kernel_name << " is not supported.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: AllReduce, AllGather, Broadcast, "
                        << "ReduceScatter currently, but got " << kernel_name;
    } else {
      nccl_kernel_type_ = iter->second;
    }
@@ -220,7 +221,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
      } else if (type == "prod") {
        nccl_reduce_type_ = ncclProd;
      } else {
        MS_LOG(EXCEPTION) << "Nccl reduce type " << type << " is not supported.";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: sum, max, min, prod currently, "
                          << "but got " << type;
      }
    }

--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_p2p_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_p2p_gpu_kernel.h
@@ -52,13 +52,15 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported.";
        MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: AllToAllv, NeighborExchange "
                          << "currently, but got " << nccl_kernel_type_;
      }
    }
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    kernel_node_ = kernel_node;
    InferCommType(kernel_node);
@@ -73,9 +75,8 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
    }
    for (size_t i = 0; i < input_num; ++i) {
      auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(shape);
      is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'NcclP2PGpuKernel', input shape is null ";
        InitSizeLists();
        return true;
      }
@@ -88,9 +89,8 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
    }
    for (size_t i = 0; i < output_num; ++i) {
      auto shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i);
      is_null_input_ = CHECK_NULL_INPUT(shape);
      is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "output");
      if (is_null_input_) {
        MS_LOG(WARNING) << "For 'NcclP2PGpuKernel', output shape is null";
        InitSizeLists();
        return true;
      }
@@ -157,10 +157,12 @@ class NcclP2PGpuKernel : public NcclGpuKernel {

    // send_rank_id and recv rank_id size needs to be equal to input_list size
    if (send_rank_ids.size() != input_size_list_.size()) {
      MS_LOG(ERROR) << "Trying to use AlltoAllv, but send_rank_ids vector size not equals to input_list size.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", trying to use AlltoAllv, the size of  send_rank_ids vector "
                        << "should be " << input_size_list_.size() << ", but got " << send_rank_ids.size();
    }
    if (recv_rank_ids.size() != output_size_list_.size()) {
      MS_LOG(ERROR) << "Trying to use AlltoAllv, but recv_rank_ids vector size not equals to output_list size.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", trying to use AlltoAllv, the size of  recv_rank_ids vector "
                        << "should be " << output_size_list_.size() << ", but got " << recv_rank_ids.size();
    }

    // This implementation refers to NVIDIA NCCL 2.11 doc.
@@ -182,7 +184,8 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kNcclTypeMap.find(kernel_name);
    if (iter == kNcclTypeMap.end()) {
      MS_LOG(EXCEPTION) << "Kernel " << kernel_name << " is not supported.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: AllToAllv, NeighborExchange "
                        << "currently, but got " << kernel_name;
    } else {
      nccl_kernel_type_ = iter->second;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_recv_gpu_kernel.h
@@ -46,21 +46,20 @@ class NcclRecvGpuKernel : public NcclGpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    kernel_node_ = kernel_node;
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but NCCL receive needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }
    src_rank_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "src_rank"));
    group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
    nccl_data_type_ = nccl_dtype(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0));

    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(output_shape);
    is_null_input_ = CHECK_SHAPE_NULL(output_shape, kernel_name, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'NcclRecvGpuKernel', output is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/nccl_send_gpu_kernel.h
@@ -46,12 +46,12 @@ class NcclSendGpuKernel : public NcclGpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    kernel_node_ = kernel_node;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but NCCL send needs 1 input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
    }

    dest_rank_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "dest_rank"));
@@ -60,9 +60,8 @@ class NcclSendGpuKernel : public NcclGpuKernel {
    MS_LOG(INFO) << "NcclSend dest rank is " << dest_rank_ << ", group name is " << group_name_;

    auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'NcclSendGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/sync_batch_norm_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/sync_batch_norm_gpu_kernel.h
@@ -41,6 +41,9 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (is_null_input_) {
      return true;
    }
    T *x = GetDeviceAddress<T>(inputs, 0);
    S *scale = GetDeviceAddress<S>(inputs, 1);
    S *bias = GetDeviceAddress<S>(inputs, 2);
@@ -78,6 +81,7 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto root_rank = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kAttrRootRank);
    if (root_rank) {
      root_ = static_cast<int>(GetValue<int64_t>(root_rank));
@@ -86,24 +90,22 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
    group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 5) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but SyncBatchNorm needs 5 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 5, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 5) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but SyncBatchNorm needs 5 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 5, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    if (CHECK_NULL_INPUT(input_shape)) {
      MS_LOG(WARNING) << "SyncBatchNorm input is null";
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      InitSizeLists();
      return true;
    }
    auto input_shape_dims = input_shape.size();
    if (input_shape_dims != 4 && input_shape_dims != 2) {
      MS_LOG(EXCEPTION) << "Tensor shape is " << input_shape.size()
                        << ", SyncBatchNormGpuKernel input should be 2D or 4D";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input only should be 2 or 4, but got "
                        << input_shape_dims;
    }
    input_size_ = 1;
    for (auto dim : input_shape) {
@@ -175,6 +177,7 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
    input_size_list_.clear();
    output_size_list_.clear();
    workspace_size_list_.clear();
    is_null_input_ = false;
  }

 protected:
@@ -233,6 +236,7 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
  string group_name_;
  int root_;
  cudaStream_t comm_stream_;
  bool is_null_input_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/sync_batch_norm_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nccl/sync_batch_norm_grad_gpu_kernel.h
@@ -41,6 +41,9 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (is_null_input_) {
      return true;
    }
    T *dy = GetDeviceAddress<T>(inputs, 0);
    T *x_input = GetDeviceAddress<T>(inputs, 1);
    S *scale = GetDeviceAddress<S>(inputs, 2);
@@ -65,6 +68,7 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto root_rank = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kAttrRootRank);
    if (root_rank) {
      root_ = static_cast<int>(GetValue<int64_t>(root_rank));
@@ -73,24 +77,22 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
    group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 5) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but SyncBatchNormGrad needs 5 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 5, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 3) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but SyncBatchNormGrad needs 5 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 3, but got " << output_num;
    }
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    if (CHECK_NULL_INPUT(input_shape)) {
      MS_LOG(WARNING) << "SyncBatchNormGrad input is null";
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      InitSizeLists();
      return true;
    }
    auto input_shape_dims = input_shape.size();
    if (input_shape_dims != 4 && input_shape_dims != 2) {
      MS_LOG(EXCEPTION) << "Tensor shape is " << input_shape.size()
                        << ", SyncBatchNormGpuGrad input should be 2D or 4D";
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input only should be 2 or 4, but got "
                        << input_shape_dims;
    }
    input_size_ = 1;
    for (auto dim : input_shape) {
@@ -155,6 +157,7 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
    input_size_list_.clear();
    output_size_list_.clear();
    workspace_size_list_.clear();
    is_null_input_ = false;
  }

 protected:
@@ -201,6 +204,7 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
  string group_name_;
  int root_;
  cudaStream_t comm_stream_;
  bool is_null_input_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/assign_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/assign_gpu_kernel.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_OTHER_ASSIGN_GPU_KERNEL_H_

 #include <vector>
 #include <string>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"

@@ -52,15 +53,13 @@ class AssignGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    kernel_node_ = kernel_node;
    if (!CheckParam(kernel_node)) {
      return false;
    }
    (void)CheckParam(kernel_node);
    auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(shape);
    is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'AssignGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
@@ -80,19 +79,16 @@ class AssignGpuKernel : public GpuKernel {
  }

 private:
  bool CheckParam(const CNodePtr &kernel_node) {
  void CheckParam(const CNodePtr &kernel_node) {
    MS_EXCEPTION_IF_NULL(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but AssignGpuKernel needs 2 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but AssignGpuKernel needs 1 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
    }
    return true;
  }

  std::vector<size_t> input_size_list_;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_decode_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_decode_gpu_kernel.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_DECODE_GPU_KERNEL_H

 #include <vector>
 #include <string>
 #include <algorithm>
 #include "backend/kernel_compiler/gpu/cuda_impl/boundingbox_decode_impl.cuh"
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
@@ -46,7 +47,8 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
    T *bboxes_addr = GetDeviceAddress<T>(outputs, 0);

    if (inputs[0]->size != inputs[1]->size) {
      MS_LOG(ERROR) << "Rois box size must equal with deltas box size -" << inputs[1]->size << ", but got"
      MS_LOG(ERROR) << "For '" << kernel_name_
                    << "', rois box size must equal with deltas box size: " << inputs[1]->size << ", but got "
                    << inputs[0]->size;
      return false;
    }
@@ -54,7 +56,7 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
    const size_t coordinate = 4;
    const size_t block_size = inputs[0]->size / sizeof(T);
    if ((block_size % coordinate) != 0) {
      MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << ", the size of the box should be a multiple of 4.";
      return false;
    }

@@ -65,11 +67,11 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but BoundingBoxDecode needs 2 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
    }
    rois_size_ = sizeof(T);
    deltas_size_ = sizeof(T);
@@ -78,9 +80,10 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
    auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto labels_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(logits_shape) || CHECK_NULL_INPUT(labels_shape) || CHECK_NULL_INPUT(output_shape);
    is_null_input_ = CHECK_SHAPE_NULL(logits_shape, kernel_name_, "anchor_box") ||
                     CHECK_SHAPE_NULL(labels_shape, kernel_name_, "deltas") ||
                     CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'BoundingBoxDecodeGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
@@ -111,7 +114,7 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
        means_.emplace_back(mean);
      }
    } else {
      MS_LOG(EXCEPTION) << "Attribute means type is invalid.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute means type is invalid.";
    }

    auto stds = prim->GetAttr("stds");
@@ -124,7 +127,7 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
        stds_.emplace_back(std);
      }
    } else {
      MS_LOG(EXCEPTION) << "Attribute stds type is invalid.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute stds type is invalid.";
    }

    std::vector<int64_t> max_shape_me = GetAttr<std::vector<int64_t>>(kernel_node, "max_shape");
@@ -133,11 +136,13 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
    wh_ratio_clip_ = GetAttr<float>(kernel_node, "wh_ratio_clip");

    if (means_.size() < coordinate_size || stds_.size() < coordinate_size) {
      MS_LOG(EXCEPTION) << "The size of means or stds is less than 4.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the both size of means or stds cannot be less than 4, but got"
                        << " the size of means: " << means_.size() << ", the size of stds: " << stds_.size();
    }

    if (max_shape_.size() < 2) {
      MS_LOG(EXCEPTION) << "The size of max_shape is less than 2.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the size of max_shape cannot be less than 2, but got "
                        << max_shape_.size();
    }

    return true;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_encode_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/boundingbox_encode_gpu_kernel.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_ENCODE_GPU_KERNEL_H

 #include <vector>
 #include <string>
 #include "backend/kernel_compiler/gpu/cuda_impl/boundingbox_encode_impl.cuh"
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
@@ -28,7 +29,6 @@ template <typename T>
 class BoundingBoxEncodeGpuKernel : public GpuKernel {
 public:
  BoundingBoxEncodeGpuKernel() : anchor_size_(0), groundtruth_size_(0), deltas_size_(0), is_null_input_(false) {}

  ~BoundingBoxEncodeGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -45,7 +45,8 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
    T *deltas_addr = GetDeviceAddress<T>(outputs, 0);

    if (inputs[0]->size != inputs[1]->size) {
      MS_LOG(ERROR) << "Anchor box size must equal with groundtruth box size -" << inputs[1]->size << ", but got"
      MS_LOG(ERROR) << "For '" << kernel_name_
                    << "', anchor box size must equal with groundtruth box size: " << inputs[1]->size << ", but got "
                    << inputs[0]->size;
      return false;
    }
@@ -53,7 +54,7 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
    const size_t coordinate = 4;
    const size_t block_size = inputs[0]->size / sizeof(T);
    if ((block_size % coordinate) != 0) {
      MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << ", the size of the box should be a multiple of 4.";
      return false;
    }

@@ -64,11 +65,11 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but BoundingBoxEncode needs 2 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
    }
    anchor_size_ = sizeof(T);
    groundtruth_size_ = sizeof(T);
@@ -77,9 +78,10 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
    auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto labels_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(logits_shape) || CHECK_NULL_INPUT(labels_shape) || CHECK_NULL_INPUT(output_shape);
    is_null_input_ = CHECK_SHAPE_NULL(logits_shape, kernel_name_, "anchor_box") ||
                     CHECK_SHAPE_NULL(labels_shape, kernel_name_, "groundtruth_box") ||
                     CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'BoundingBoxEncodeGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
@@ -110,7 +112,7 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
        means_.emplace_back(mean);
      }
    } else {
      MS_LOG(EXCEPTION) << "Attribute means type is invalid.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute means type is invalid.";
    }
    auto stds = prim->GetAttr("stds");
    MS_EXCEPTION_IF_NULL(stds);
@@ -122,11 +124,12 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
        stds_.emplace_back(std);
      }
    } else {
      MS_LOG(EXCEPTION) << "Attribute stds type is invalid.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute stds type is invalid.";
    }

    if (means_.size() < coordinate_size || stds_.size() < coordinate_size) {
      MS_LOG(EXCEPTION) << "The size of means or stds is less than 4.";
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the both size of means or stds cannot be less than 4, but got"
                        << " the size of means: " << means_.size() << ", the size of stds: " << stds_.size();
    }

    return true;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/check_valid_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/check_valid_gpu_kernel.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_OTHER_CHECK_VALID_GPU_KERNEL_H

 #include <vector>
 #include <string>
 #include "backend/kernel_compiler/gpu/cuda_impl/check_valid_impl.cuh"
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
@@ -28,7 +29,6 @@ template <typename T, typename S>
 class CheckValidGpuKernel : public GpuKernel {
 public:
  CheckValidGpuKernel() : anchor_boxes_size_(0), img_metas_size_(0), valid_size_(0), is_null_input_(false) {}

  ~CheckValidGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -48,7 +48,7 @@ class CheckValidGpuKernel : public GpuKernel {
    const size_t coordinate = 4;
    const size_t block_size = inputs[0]->size / sizeof(T);
    if ((block_size % coordinate) != 0) {
      MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << ", the size of the box should be a multiple of 4.";
      return false;
    }

@@ -58,11 +58,11 @@ class CheckValidGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but CheckValid needs 2 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
    }
    anchor_boxes_size_ = sizeof(T);
    img_metas_size_ = sizeof(T);
@@ -71,10 +71,10 @@ class CheckValidGpuKernel : public GpuKernel {
    auto anchor_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto img_metas_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto valid_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ =
      CHECK_NULL_INPUT(anchor_boxes_shape) || CHECK_NULL_INPUT(img_metas_shape) || CHECK_NULL_INPUT(valid_shape);
    is_null_input_ = CHECK_SHAPE_NULL(anchor_boxes_shape, kernel_name_, "bboxes") ||
                     CHECK_SHAPE_NULL(img_metas_shape, kernel_name_, "img_metas") ||
                     CHECK_SHAPE_NULL(valid_shape, kernel_name_, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'CheckValidGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/gpu_convert_to_dynamic_shape_gpu_kernel.h
@@ -62,18 +62,17 @@ class GpuConvertToDynamicShapeGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    MS_EXCEPTION_IF_NULL(kernel_node);
    kernel_node_ = kernel_node;
    size_t input_count = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_count != 1) {
      MS_LOG(ERROR) << input_count << "inputs were provided, but GpuConvertToDynamicShapeGpuKernel expects 1.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_count;
    }

    input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape_);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape_, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'GpuConvertToDynamicShapeGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/iou_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/iou_gpu_kernel.h
@@ -29,7 +29,6 @@ template <typename T>
 class IOUGpuKernel : public GpuKernel {
 public:
  IOUGpuKernel() : gt_boxes_size_(0), anchor_boxes_size_(0), iou_size_(0), mode_(0), is_null_input_(false) {}

  ~IOUGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -50,7 +49,7 @@ class IOUGpuKernel : public GpuKernel {
    const size_t block_size_0 = inputs[0]->size / sizeof(T);
    const size_t block_size_1 = inputs[1]->size / sizeof(T);
    if ((block_size_0 % coordinate) != 0 || (block_size_1 % coordinate) != 0) {
      MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
      MS_LOG(ERROR) << "For '" << kernel_name_ << ", the size of the box should be a multiple of 4.";
      return false;
    }

@@ -65,8 +64,7 @@ class IOUGpuKernel : public GpuKernel {
    MS_EXCEPTION_IF_NULL(kernel_node);
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but IOU needs 2 inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
    }
    gt_boxes_size_ = sizeof(T);
    anchor_boxes_size_ = sizeof(T);
@@ -75,10 +73,10 @@ class IOUGpuKernel : public GpuKernel {
    auto gt_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto anchor_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto iou_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    is_null_input_ =
      CHECK_NULL_INPUT(gt_boxes_shape) || CHECK_NULL_INPUT(anchor_boxes_shape) || CHECK_NULL_INPUT(iou_shape);
    is_null_input_ = CHECK_SHAPE_NULL(gt_boxes_shape, kernel_name_, "anchor_boxes") ||
                     CHECK_SHAPE_NULL(anchor_boxes_shape, kernel_name_, "gt_boxes") ||
                     CHECK_SHAPE_NULL(iou_shape, kernel_name_, "output");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'IOUGpuKernel', input or output is null";
      InitSizeLists();
      return true;
    }
@@ -103,8 +101,7 @@ class IOUGpuKernel : public GpuKernel {
    } else if (mode == "iof") {
      mode_ = 1;
    } else {
      MS_LOG(ERROR) << "Mode only support 'iou' or 'iof'.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', mode only support 'iou' or 'iof'.";
    }

    return true;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h
@@ -66,26 +66,25 @@ class BatchNormFold2GpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    InitResource();

    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != INPUT_NUM) {
      MS_LOG(ERROR) << "Argument number is " << input_num << ", but BatchNormFold2GpuKernel needs " << INPUT_NUM
                    << " inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be " << INPUT_NUM << ", but got "
                        << input_num;
    }

    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "BatchNormFold2GpuKernel input is null";
      InitSizeLists();
      return true;
    }

    if (input_shape.size() != 4) {
      MS_LOG(ERROR) << "BatchNormFold2GpuKernel input shape needs (N,C,H,W).";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
                        << input_shape.size();
    }
    batch_size_ = input_shape[0];
    channel_ = input_shape[1];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_grad_gpu_kernel.h
@@ -94,26 +94,25 @@ class BatchNormFold2GradGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    InitResource();
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != INPUT_NUM) {
      MS_LOG(ERROR) << "Argument number is " << input_num << ", but BatchNormFold2GradGpuKernel needs " << INPUT_NUM
                    << " inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be " << INPUT_NUM << ", but got "
                        << input_num;
    }

    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "BatchNormFold2GradGpuKernel input is null";
      InitSizeLists();
      return true;
    }

    if (input_shape.size() != 4) {
      MS_LOG(ERROR) << "BatchNormFold2GradGpuKernel input shape needs (N,C,H,W).";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
                        << input_shape.size();
    }
    batch_size_ = input_shape[0];
    channel_ = input_shape[1];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h
@@ -99,18 +99,17 @@ class BatchNormFoldGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    InitResource();
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 4) {
      MS_LOG(ERROR) << "Input number is " << input_num << " but BatchNormFold GpuKernel OP needs 4 input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
    }

    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 4) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but BatchNormFold GpuKernel OP needs 4 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 4, but got " << output_num;
    }

    auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
@@ -122,16 +121,14 @@ class BatchNormFoldGpuKernel : public GpuKernel {
    freeze_bn_ = static_cast<int>(GetValue<int64_t>(prim->GetAttr("freeze_bn")));

    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'BatchNormFoldGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
    if (input_shape.size() != 4) {
      MS_LOG(ERROR) << "Input shape is " << input_shape.size()
                    << ", but BatchNormFold GpuKernel OP needs 4DTensor input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
                        << input_shape.size();
    }
    CheckTensorSize({input_shape});
    batch_ = input_shape[0];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_grad_gpu_kernel.h
@@ -78,18 +78,17 @@ class BatchNormFoldGradGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    kernel_node_ = kernel_node;
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != INPUT_NUM) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but BatchNormFoldGrad GpuKernel OP needs " << INPUT_NUM
                    << " inputs.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be " << INPUT_NUM << ", but got "
                        << input_num;
    }

    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but BatchNormFoldGrad GpuKernel OP needs 4 output.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
    }

    auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
@@ -99,16 +98,14 @@ class BatchNormFoldGradGpuKernel : public GpuKernel {
    freeze_bn_ = static_cast<int>(GetValue<int64_t>(prim->GetAttr("freeze_bn")));

    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'BatchNormFoldGradGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
    if (input_shape.size() != 4) {
      MS_LOG(ERROR) << "Input shape is " << input_shape.size()
                    << ", but BatchNormFoldGrad GpuKernel OP needs 4DTensor input.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
                        << input_shape.size();
    }
    batch_ = input_shape[0];
    channel_ = input_shape[1];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h
@@ -49,24 +49,23 @@ class CorrectionMulGpuKernel : public GpuKernel {
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    InitResource();

    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 3) {
      MS_LOG(ERROR) << "Argument number is " << input_num << ", but CorrectionMulGpuKernel needs 3.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
    }

    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'CorrectionMulGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
    if (input_shape.size() != 4) {
      MS_LOG(ERROR) << "CorrectionMulGpuKernel input shape needs (N,C,H,W).";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
                        << input_shape.size();
    }
    batch_size_ = input_shape[0];
    channel_ = input_shape[1];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h
@@ -55,24 +55,23 @@ class CorrectionMulGradGpuKernel : public GpuKernel {
  }

  bool Init(const CNodePtr &kernel_node) override {
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    InitResource();

    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 4) {
      MS_LOG(ERROR) << "Argument number is " << input_num << ", but CorrectionMulGradGpuKernel needs 4.";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
    }

    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    is_null_input_ = CHECK_NULL_INPUT(input_shape);
    is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
    if (is_null_input_) {
      MS_LOG(WARNING) << "For 'CorrectionMulGradGpuKernel', input is null";
      InitSizeLists();
      return true;
    }
    if (input_shape.size() != 4) {
      MS_LOG(ERROR) << "CorrectionMulGradGpuKernel input shape needs (N,C,H,W).";
      return false;
      MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
                        << input_shape.size();
    }
    batch_size_ = input_shape[0];
    channel_ = input_shape[1];
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/quant/fake_learned_scale_quant_perchannel_gpu_kernel.cc
@@ -45,17 +45,16 @@ const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGpuKernel::GetWorkspac
 }

 bool FakeLearnedScaleQuantPerChannelGpuKernel::Init(const CNodePtr &kernel_node) {
  auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  kernel_node_ = kernel_node;
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 3) {
    MS_LOG(EXCEPTION) << "Input number is " << input_num
                      << ", but FakeLearnedScaleQuantPerChannel GpuKernel OP needs 3 Input.";
    MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
  }

  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num
                      << ", but FakeLearnedScaleQuantPerChannel GpuKernel OP needs 1 output.";
    MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
  }

  quant_delay_ = static_cast<int>(GetValue<int64_t>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")));