| @@ -41,13 +41,17 @@ class CropAndResizeGpuKernel : public GpuKernel { | |||
| input_width_(0), | |||
| final_height_(0), | |||
| final_width_(0), | |||
| channel_(0) {} | |||
| channel_(0), | |||
| is_null_input_(false) {} | |||
| ~CropAndResizeGpuKernel() override = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| VARIABLE_NOT_USED(workspace); | |||
| T *input_image = GetDeviceAddress<T>(inputs, 0); | |||
| float *input_boxes = GetDeviceAddress<float>(inputs, 1); | |||
| @@ -72,6 +76,18 @@ class CropAndResizeGpuKernel : public GpuKernel { | |||
| } | |||
| // input image | |||
| auto input_image_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto input_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| auto input_box_index_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| auto input_crop_size_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); | |||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_image_shape) || CHECK_NULL_INPUT(input_boxes_shape) || | |||
| CHECK_NULL_INPUT(input_box_index_shape) || CHECK_NULL_INPUT(input_crop_size_shape) || | |||
| CHECK_NULL_INPUT(output_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'CropAndResizeGpuKernel', input or output is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| size_t input_image_shape_len = input_image_shape.size(); | |||
| if (input_image_shape_len != 4) { | |||
| MS_LOG(ERROR) << " image tensor is " << input_image_shape_len << "-D, but CropAndResize supports only " << 4 | |||
| @@ -86,7 +102,6 @@ class CropAndResizeGpuKernel : public GpuKernel { | |||
| input_height_ = input_image_shape[1]; | |||
| input_width_ = input_image_shape[2]; | |||
| // input boxes | |||
| auto input_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| size_t input_boxes_shape_len = input_boxes_shape.size(); | |||
| if (input_boxes_shape_len != 2) { | |||
| MS_LOG(ERROR) << "Boxes is rank" << input_boxes_shape_len << " but CropAndResize supports only rank " << 2 | |||
| @@ -99,7 +114,6 @@ class CropAndResizeGpuKernel : public GpuKernel { | |||
| } | |||
| input_boxes_size_ *= sizeof(float); | |||
| // input box_index | |||
| auto input_box_index_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| size_t input_box_index_shape_len = input_box_index_shape.size(); | |||
| if (input_box_index_shape_len != 1) { | |||
| MS_LOG(ERROR) << "Box_index is rank " << input_box_index_shape_len << " but CropAndResize supports only rank " | |||
| @@ -110,7 +124,6 @@ class CropAndResizeGpuKernel : public GpuKernel { | |||
| input_box_ind_size_ *= input_box_index_shape[0]; // single dim required | |||
| input_box_ind_size_ *= sizeof(int); | |||
| // input crop_size | |||
| auto input_crop_size_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); | |||
| size_t input_crop_size_shape_len = input_crop_size_shape.size(); | |||
| if (input_crop_size_shape_len != 1) { | |||
| MS_LOG(ERROR) << "Crop_size is rank " << input_crop_size_shape_len << "-D, but CropAndResize supports only rank " | |||
| @@ -126,8 +139,11 @@ class CropAndResizeGpuKernel : public GpuKernel { | |||
| input_crop_size_ *= input_crop_size_shape[0]; | |||
| input_crop_size_ *= sizeof(int); | |||
| // output | |||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| auto output_shape_len = output_shape.size(); | |||
| if (output_shape_len != 4) { | |||
| MS_LOG(ERROR) << "For 'CropAndResize', the rank of output should be 4, but got " << output_shape_len; | |||
| return false; | |||
| } | |||
| output_size_ = 1; | |||
| for (size_t i = 0; i < output_shape_len; i++) { | |||
| output_size_ *= output_shape[i]; | |||
| @@ -175,6 +191,7 @@ class CropAndResizeGpuKernel : public GpuKernel { | |||
| int final_height_; | |||
| int final_width_; | |||
| int channel_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| @@ -1,4 +1,3 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| @@ -37,6 +36,9 @@ class DepthToSpaceFwdKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| // get device buffer ptr | |||
| T *input = GetDeviceAddress<T>(inputs, 0); | |||
| T *output = GetDeviceAddress<T>(outputs, 0); | |||
| @@ -71,6 +73,12 @@ class DepthToSpaceFwdKernel : public GpuKernel { | |||
| } | |||
| // check input_shape | |||
| auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'DepthToSpaceGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| shape_size_ = input_shape.size(); | |||
| if (shape_size_ != DEPTHTOSPACE_BUFFER_DIMENSION) { | |||
| MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but DepthToSpace supports 4-D tensor."; | |||
| @@ -111,6 +119,7 @@ class DepthToSpaceFwdKernel : public GpuKernel { | |||
| oc_ = 0; | |||
| oh_ = 0; | |||
| ow_ = 0; | |||
| is_null_input_ = false; | |||
| input_size_list_.clear(); | |||
| output_size_list_.clear(); | |||
| @@ -140,6 +149,7 @@ class DepthToSpaceFwdKernel : public GpuKernel { | |||
| size_t oc_; | |||
| size_t oh_; | |||
| size_t ow_; | |||
| bool is_null_input_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -37,6 +37,9 @@ class EmbeddingLookupKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| VARIABLE_NOT_USED(workspace); | |||
| T *input_addr = GetDeviceAddress<T>(inputs, 0); | |||
| S *indices_addr = GetDeviceAddress<S>(inputs, 1); | |||
| @@ -69,6 +72,16 @@ class EmbeddingLookupKernel : public GpuKernel { | |||
| input_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); | |||
| indices_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1); | |||
| output_shapes_ = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0); | |||
| is_null_input_ = | |||
| CHECK_NULL_INPUT(input_shapes_) || CHECK_NULL_INPUT(indices_shapes_) || CHECK_NULL_INPUT(output_shapes_); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'EmbeddingLookupGpuKernel', input or output is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (input_shapes_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'EmbeddingLookupGpuKernel', the rank of input cannot be less than 1."; | |||
| } | |||
| if (!is_dynamic_shape_) { | |||
| offset_ = GetAttr<int64_t>(kernel_node, "offset"); | |||
| } | |||
| @@ -78,6 +91,7 @@ class EmbeddingLookupKernel : public GpuKernel { | |||
| } | |||
| void ResetResource() noexcept override { | |||
| is_dynamic_shape_ = false; | |||
| is_null_input_ = false; | |||
| input_shapes_.clear(); | |||
| indices_shapes_.clear(); | |||
| output_shapes_.clear(); | |||
| @@ -138,6 +152,7 @@ class EmbeddingLookupKernel : public GpuKernel { | |||
| size_t dims_[3] = {}; | |||
| int64_t offset_; | |||
| bool is_dynamic_shape_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| @@ -40,6 +40,9 @@ class ExtractImagePatchesKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input = GetDeviceAddress<T>(inputs, 0); | |||
| T *output = GetDeviceAddress<T>(outputs, 0); | |||
| T *t_input = GetDeviceAddress<T>(workspace, 0); | |||
| @@ -92,16 +95,28 @@ class ExtractImagePatchesKernel : public GpuKernel { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but ExtractImagePatches has 1 output."; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape) || CHECK_NULL_INPUT(output_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'ExtractImagePatchesGpuKernel', input or output is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| input_size_ = 1; | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| input_shape_.push_back(input_shape[i]); | |||
| } | |||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| output_size_ = 1; | |||
| for (size_t i = 0; i < output_shape.size(); i++) { | |||
| output_size_ *= output_shape[i]; | |||
| } | |||
| if (input_shape.size() != 4 || output_shape.size() != 4) { | |||
| MS_LOG(EXCEPTION) << "For 'ExtractImagePatchesGpuKernel', the rank of input and output should be 4, " | |||
| << "but got the rank of input: " << input_shape.size() | |||
| << ", the rank of output: " << output_shape.size(); | |||
| } | |||
| // transposed NHWC shape | |||
| t_output_shape_ = {output_shape[0], output_shape[2], output_shape[3], output_shape[1]}; | |||
| @@ -109,6 +124,11 @@ class ExtractImagePatchesKernel : public GpuKernel { | |||
| auto ksizes = GetAttr<std::vector<int64_t>>(kernel_node, "ksizes"); | |||
| auto strides = GetAttr<std::vector<int64_t>>(kernel_node, "strides"); | |||
| auto rates = GetAttr<std::vector<int64_t>>(kernel_node, "rates"); | |||
| if (ksizes.size() != 4 || strides.size() != 4 || rates.size() != 4) { | |||
| MS_LOG(EXCEPTION) << "For 'ExtractImagePatchesGpuKernel', the rank of ksizes, strides and rates should be 4, " | |||
| << "but got the rank of ksizes: " << ksizes.size() | |||
| << ", the rank of strides: " << strides.size() << ", the rank of rates: " << rates.size(); | |||
| } | |||
| ksize_row_ = ksizes[2]; | |||
| ksize_col_ = ksizes[3]; | |||
| @@ -127,6 +147,9 @@ class ExtractImagePatchesKernel : public GpuKernel { | |||
| int64_t patch_rows_eff = ksize_row_ + (ksize_row_ - 1) * (rate_row_ - 1); | |||
| int64_t patch_cols_eff = ksize_col_ + (ksize_col_ - 1) * (rate_col_ - 1); | |||
| MS_EXCEPTION_IF_ZERO("stride row", stride_row_); | |||
| MS_EXCEPTION_IF_ZERO("stride col", stride_col_); | |||
| if (padding == "VALID") { | |||
| output_rows_ = std::ceil((input_row_size_ - patch_rows_eff + 1.f) / static_cast<float>(stride_row_)); | |||
| output_cols_ = std::ceil((input_col_size_ - patch_cols_eff + 1.f) / static_cast<float>(stride_col_)); | |||
| @@ -148,6 +171,7 @@ class ExtractImagePatchesKernel : public GpuKernel { | |||
| row_input_stride_ = input_depth * input_col_size_; | |||
| patch_input_stride_ = input_depth * input_col_size_ * input_row_size_; | |||
| output_depth_ = input_depth; | |||
| MS_EXCEPTION_IF_ZERO("other stride", other_stride_); | |||
| need_batch_ = (output_size_ - 1) / other_stride_; | |||
| InitSizeLists(); | |||
| @@ -177,6 +201,7 @@ class ExtractImagePatchesKernel : public GpuKernel { | |||
| row_input_stride_ = 1; | |||
| patch_input_stride_ = 1; | |||
| output_depth_ = 1; | |||
| is_null_input_ = false; | |||
| input_shape_.clear(); | |||
| t_output_shape_.clear(); | |||
| input_size_list_.clear(); | |||
| @@ -208,6 +233,7 @@ class ExtractImagePatchesKernel : public GpuKernel { | |||
| int64_t output_rows_; | |||
| int64_t output_cols_; | |||
| bool need_batch_; | |||
| bool is_null_input_; | |||
| int64_t row_stride_; | |||
| int64_t patch_stride_; | |||
| int64_t other_stride_; | |||
| @@ -40,6 +40,9 @@ class InTopKGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *predictions_device = GetDeviceAddress<T>(inputs, 0); | |||
| int32_t *targets_device = GetDeviceAddress<int32_t>(inputs, 1); | |||
| @@ -106,6 +109,16 @@ class InTopKGpuKernel : public GpuKernel { | |||
| } | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (input_shape_.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "For 'InTopKGpuKernel', the rank of input cannot be less than 2, but got " | |||
| << input_shape_.size(); | |||
| } | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape_); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'InTopKGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| input_rank_ = input_shape_.size(); | |||
| input_size_ = 1; | |||
| for (size_t i = 0; i < input_rank_; i++) { | |||
| @@ -136,6 +149,7 @@ class InTopKGpuKernel : public GpuKernel { | |||
| input_rank_ = 0; | |||
| outer_size_ = 0; | |||
| inner_size_ = 0; | |||
| is_null_input_ = false; | |||
| top_k_init_ = static_cast<T>(0.); | |||
| input_size_list_.clear(); | |||
| output_size_list_.clear(); | |||
| @@ -171,6 +185,7 @@ class InTopKGpuKernel : public GpuKernel { | |||
| // for topk | |||
| size_t outer_size_; | |||
| size_t inner_size_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -41,6 +41,9 @@ class MeshgridGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *ones_device = GetDeviceAddress<T>(workspace, 0); | |||
| CalOnesLike(output_size_, static_cast<T *>(nullptr), ones_device, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| @@ -79,9 +82,14 @@ class MeshgridGpuKernel : public GpuKernel { | |||
| input_size_ = 1; | |||
| input_count_ = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| for (size_t i = 0; i < input_count_; i++) { | |||
| size_t input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i)[0]; | |||
| input_shapes_.push_back(input_shape); | |||
| input_size_ *= input_shape; | |||
| auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i); | |||
| if (input_shape.size() < 1) { | |||
| MS_LOG(ERROR) << "For 'MeshGridGpuKernel', the rank of input " << i << " cannot be less than 1."; | |||
| return false; | |||
| } | |||
| size_t input_size = input_shape[0]; | |||
| input_shapes_.push_back(input_size); | |||
| input_size_ *= input_size; | |||
| } | |||
| output_size_ = 1; | |||
| @@ -89,6 +97,12 @@ class MeshgridGpuKernel : public GpuKernel { | |||
| // inferred shape swaps output shape for us if needed | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(output_shape_); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'MeshGridGpuKernel', output is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (output_count_ != input_count_) { | |||
| MS_LOG(ERROR) << "output count is " << output_count_ << ", but MeshgridGpuKernel needs " << input_count_ | |||
| @@ -101,7 +115,7 @@ class MeshgridGpuKernel : public GpuKernel { | |||
| } | |||
| // need to pad output shape with ones for broadcast kernel | |||
| for (size_t i = 0; i < output_shape_.size() - MAX_DIMS; i++) { | |||
| for (size_t i = 0; i < MAX_DIMS - output_shape_.size(); i++) { | |||
| output_shape_.push_back(1); | |||
| } | |||
| @@ -118,6 +132,7 @@ class MeshgridGpuKernel : public GpuKernel { | |||
| output_size_ = 0; | |||
| output_count_ = 0; | |||
| swap_indexing_ = true; | |||
| is_null_input_ = false; | |||
| input_size_list_.clear(); | |||
| output_size_list_.clear(); | |||
| @@ -145,6 +160,7 @@ class MeshgridGpuKernel : public GpuKernel { | |||
| size_t output_size_; | |||
| size_t output_count_; | |||
| bool swap_indexing_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -36,6 +36,7 @@ class ReverseSequenceGpuFwdKernel : public GpuKernel { | |||
| input_size_(0), | |||
| batch_dim_(0), | |||
| seq_dim_(0), | |||
| is_null_input_(false), | |||
| seq_len_size_(0), | |||
| total_index_dim_(0), | |||
| output_size_(0), | |||
| @@ -46,6 +47,9 @@ class ReverseSequenceGpuFwdKernel : public GpuKernel { | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input = GetDeviceAddress<T>(inputs, 0); | |||
| S *seq_len = GetDeviceAddress<S>(inputs, 1); | |||
| size_t *input_shape_ptr = GetDeviceAddress<size_t>(workspace, 0); | |||
| @@ -75,23 +79,23 @@ class ReverseSequenceGpuFwdKernel : public GpuKernel { | |||
| return false; | |||
| } | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (CHECK_NULL_INPUT(input_shape_)) { | |||
| MS_LOG(WARNING) << "ReverseSequence input is null"; | |||
| auto seq_len_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape_) || CHECK_NULL_INPUT(seq_len_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'ReverseSequenceGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (input_shape_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'ReverseSequenceGpuKernel', the rank of input cannot be less than 1, but got " | |||
| << input_shape_.size(); | |||
| } | |||
| input_size_ = 1; | |||
| shape_size_ = input_shape_.size(); // required for calls | |||
| for (size_t i = 0; i < shape_size_; i++) { | |||
| input_size_ *= input_shape_[i]; | |||
| } | |||
| // get seq len shape | |||
| auto seq_len_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| if (CHECK_NULL_INPUT(seq_len_shape)) { | |||
| MS_LOG(WARNING) << "ReverseSequence seq lengths input is null"; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| seq_len_size_ = seq_len_shape.size(); | |||
| output_size_ = input_size_; // size does not change | |||
| // Allocate workspace memory to use for storing indices for each thread to compute with | |||
| @@ -116,6 +120,7 @@ class ReverseSequenceGpuFwdKernel : public GpuKernel { | |||
| size_t input_size_; | |||
| int64_t batch_dim_; | |||
| int64_t seq_dim_; | |||
| bool is_null_input_; | |||
| size_t seq_len_size_; | |||
| size_t total_index_dim_; | |||
| size_t output_size_; | |||
| @@ -38,6 +38,9 @@ class ReverseV2GpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input_device = GetDeviceAddress<T>(inputs, 0); | |||
| T *output_device = GetDeviceAddress<T>(outputs, 0); | |||
| size_t *input_shape_device = GetDeviceAddress<size_t>(workspace, 0); | |||
| @@ -79,7 +82,16 @@ class ReverseV2GpuKernel : public GpuKernel { | |||
| } | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape_); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'ReverseV2GpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| input_rank_ = input_shape_.size(); | |||
| if (input_rank_ < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'ReverseV2GpuKernel', the rank of input cannot be less than 1, but got " << input_rank_; | |||
| } | |||
| input_size_ = 1; | |||
| for (size_t i = 0; i < input_rank_; i++) { | |||
| input_size_ *= input_shape_[i]; | |||
| @@ -92,6 +104,9 @@ class ReverseV2GpuKernel : public GpuKernel { | |||
| } | |||
| axis_ = GetAttr<std::vector<int64_t>>(kernel_node, "axis"); | |||
| if (axis_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'ReverseV2GpuKernel', the rank of axis cannot be less than 1, but got " << axis_.size(); | |||
| } | |||
| for (int64_t &dimension : axis_) { | |||
| if (dimension < 0) { | |||
| dimension += input_rank_; | |||
| @@ -106,6 +121,7 @@ class ReverseV2GpuKernel : public GpuKernel { | |||
| void ResetResource() noexcept override { | |||
| input_size_ = 0; | |||
| input_rank_ = 0; | |||
| is_null_input_ = false; | |||
| input_shape_.clear(); | |||
| strides_.clear(); | |||
| axis_.clear(); | |||
| @@ -131,6 +147,7 @@ class ReverseV2GpuKernel : public GpuKernel { | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<int64_t> strides_; | |||
| std::vector<int64_t> axis_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -42,6 +42,9 @@ class SortGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input_device = GetDeviceAddress<T>(inputs, 0); | |||
| T *output_device = GetDeviceAddress<T>(outputs, 0); | |||
| @@ -127,10 +130,17 @@ class SortGpuKernel : public GpuKernel { | |||
| } | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape_); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'SortGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| input_rank_ = input_shape_.size(); | |||
| if (input_rank_ > TRANSPOSE_MAX_DIMENSION) { | |||
| MS_LOG(ERROR) << "Sort cannot support input that has more than " << TRANSPOSE_MAX_DIMENSION << " dimensions."; | |||
| if (input_rank_ > TRANSPOSE_MAX_DIMENSION || input_rank_ < 1) { | |||
| MS_LOG(ERROR) << "For 'SortGpuKernel', the rank of input cannot be more than " << TRANSPOSE_MAX_DIMENSION | |||
| << " dimensions or less than 1 dimension."; | |||
| return false; | |||
| } | |||
| @@ -145,6 +155,11 @@ class SortGpuKernel : public GpuKernel { | |||
| if (axis_ < 0) { | |||
| axis_ += input_rank_; | |||
| } | |||
| if ((size_t)axis_ >= input_rank_) { | |||
| MS_LOG(ERROR) << "For 'SortGpuKernel', axis should be less than the rank of input, but got axis: " << axis_ | |||
| << " the rank of input: " << input_rank_; | |||
| return false; | |||
| } | |||
| perm_.resize(input_rank_); | |||
| std::iota(perm_.begin(), perm_.end(), 0); | |||
| @@ -172,6 +187,7 @@ class SortGpuKernel : public GpuKernel { | |||
| input_size_ = 0; | |||
| axis_ = 0; | |||
| descending_ = false; | |||
| is_null_input_ = false; | |||
| input_shape_.clear(); | |||
| input_rank_ = 0; | |||
| transposed_shape_.clear(); | |||
| @@ -206,6 +222,7 @@ class SortGpuKernel : public GpuKernel { | |||
| size_t input_size_; | |||
| int64_t axis_; | |||
| bool descending_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_shape_; | |||
| size_t input_rank_; | |||
| @@ -37,6 +37,9 @@ class SpaceToDepthFwdKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| // get device buffer ptr | |||
| T *input = GetDeviceAddress<T>(inputs, 0); | |||
| T *output = GetDeviceAddress<T>(outputs, 0); | |||
| @@ -71,6 +74,12 @@ class SpaceToDepthFwdKernel : public GpuKernel { | |||
| } | |||
| // check input_shape | |||
| auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'SpaceToDepthGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| shape_size_ = input_shape.size(); | |||
| if (shape_size_ != SPACETODEPTH_BUFFER_DIMENSION) { | |||
| MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but SpaceToDepth supports 4-D tensor."; | |||
| @@ -102,6 +111,7 @@ class SpaceToDepthFwdKernel : public GpuKernel { | |||
| input_size_ = 0; | |||
| output_size_ = 0; | |||
| block_size_ = 0; | |||
| is_null_input_ = false; | |||
| in_ = 0; | |||
| ic_ = 0; | |||
| ih_ = 0; | |||
| @@ -131,6 +141,7 @@ class SpaceToDepthFwdKernel : public GpuKernel { | |||
| size_t input_size_; | |||
| size_t output_size_; | |||
| size_t block_size_; | |||
| bool is_null_input_; | |||
| size_t in_; | |||
| size_t ic_; | |||
| size_t ih_; | |||
| @@ -31,11 +31,14 @@ namespace kernel { | |||
| template <typename T> | |||
| class TensorCopySlicesGpuKernel : public GpuKernel { | |||
| public: | |||
| TensorCopySlicesGpuKernel() : input_size_(0), update_size_(0), output_size_(0) {} | |||
| TensorCopySlicesGpuKernel() : input_size_(0), update_size_(0), output_size_(0), is_null_input_(false) {} | |||
| ~TensorCopySlicesGpuKernel() {} | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input_addr = GetDeviceAddress<T>(inputs, 0); | |||
| T *update_addr = GetDeviceAddress<T>(inputs, 1); | |||
| T *output_addr = GetDeviceAddress<T>(outputs, 0); | |||
| @@ -69,7 +72,13 @@ class TensorCopySlicesGpuKernel : public GpuKernel { | |||
| } | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto update_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape_) || CHECK_NULL_INPUT(update_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'TensorCopySlicesGpuKernel', input or output is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (input_shape_.size() > kMaxDims) { | |||
| MS_LOG(ERROR) << "StridedSlice support dims no more than " << kMaxDims << ", but the input shape is " | |||
| << input_shape_.size(); | |||
| @@ -80,6 +89,13 @@ class TensorCopySlicesGpuKernel : public GpuKernel { | |||
| end_ = GetAttr<std::vector<int64_t>>(kernel_node, kAttrEnd); | |||
| strides_ = GetAttr<std::vector<int64_t>>(kernel_node, kAttrStrides); | |||
| if (begin_.size() > input_shape_.size()) { | |||
| MS_LOG(ERROR) << "For 'TensorCopySlicesGpuKernel', the rank of begin attr cannot be more than the rank of input, " | |||
| << "but got the rank of begin attr: " << begin_.size() | |||
| << ", the rank of input: " << input_shape_.size(); | |||
| return false; | |||
| } | |||
| FillEmptyDims(kernel_node); | |||
| output_shape_ = input_shape_; | |||
| FillUpdateDim(); | |||
| @@ -100,6 +116,7 @@ class TensorCopySlicesGpuKernel : public GpuKernel { | |||
| auto len = begin_.size(); | |||
| size_t total_input_num = 1; | |||
| for (size_t i = 0; i < len; ++i) { | |||
| MS_EXCEPTION_IF_ZERO("strides_[i]", strides_[i]); | |||
| total_input_num *= ((end_[i] - begin_[i]) / strides_[i]); | |||
| } | |||
| if (total_input_num != total_update_num) { | |||
| @@ -161,6 +178,7 @@ class TensorCopySlicesGpuKernel : public GpuKernel { | |||
| if (begin_[i] <= end_[i] && strides_[i] > 0) { | |||
| update_shape_.push_back((end_[i] - 1 - begin_[i]) / strides_[i] + 1); | |||
| } else if (begin_[i] > end_[i] && strides_[i] < 0) { | |||
| MS_EXCEPTION_IF_ZERO("strides_[i] + 1", strides_[i] + 1); | |||
| update_shape_.push_back((end_[i] - begin_[i] + 1) / strides_[i] + 1); | |||
| } else { | |||
| update_shape_.push_back(0); | |||
| @@ -185,6 +203,7 @@ class TensorCopySlicesGpuKernel : public GpuKernel { | |||
| size_t update_size_; | |||
| size_t output_size_; | |||
| inline static size_t kMaxDims = 8; | |||
| bool is_null_input_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -38,7 +38,8 @@ class TensorScatterAddGpuFwdKernel : public GpuKernel { | |||
| work_shape_(nullptr), | |||
| indices_dim_0_(0), | |||
| indices_dim_1_(0), | |||
| memcpy_flag_(false) {} | |||
| memcpy_flag_(false), | |||
| is_null_input_(false) {} | |||
| ~TensorScatterAddGpuFwdKernel() { | |||
| if (indices_stride_ != nullptr) { | |||
| device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(static_cast<void *>(indices_stride_)); | |||
| @@ -54,6 +55,9 @@ class TensorScatterAddGpuFwdKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| VARIABLE_NOT_USED(workspace); | |||
| T *input = GetDeviceAddress<T>(inputs, 0); | |||
| S *indices = GetDeviceAddress<S>(inputs, 1); | |||
| @@ -77,10 +81,10 @@ class TensorScatterAddGpuFwdKernel : public GpuKernel { | |||
| const size_t update_size = update_size_ / sizeof(T); | |||
| const size_t output_size = output_size_ / sizeof(T); | |||
| CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, | |||
| cudaMemcpyAsync(&output[0], &input[0], input_size_, cudaMemcpyDeviceToDevice, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)), | |||
| "cudaMemcpyAsync output failed"); | |||
| CHECK_CUDA_RET_WITH_EXCEPT( | |||
| kernel_node_, | |||
| cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), | |||
| "cudaMemcpyAsync output failed"); | |||
| TensorScatterAdd(input, indices, update, output, block_size_, update_size, output_size, indices_dim_0_, | |||
| indices_dim_1_, indices_stride_, work_shape_, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| @@ -106,6 +110,13 @@ class TensorScatterAddGpuFwdKernel : public GpuKernel { | |||
| indices_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| output_shapes_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(update_shapes_) || CHECK_NULL_INPUT(indices_shapes_) || | |||
| CHECK_NULL_INPUT(input_shapes_) || CHECK_NULL_INPUT(output_shapes_); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'TensorScatterAddGpuKernel', input or output is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| std::vector<size_t> shape_me = input_shapes_; | |||
| (void)std::transform(shape_me.begin(), shape_me.end(), std::back_inserter(vec_work_shape_), | |||
| @@ -126,6 +137,10 @@ class TensorScatterAddGpuFwdKernel : public GpuKernel { | |||
| MS_LOG(EXCEPTION) << "Failed to alloc work_shape_work, size: " << vec_work_len; | |||
| } | |||
| work_shape_ = static_cast<S *>(work_shape_work); | |||
| if (vec_work_shape_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'TensorScatterAddGpuKernel', the rank of vec work cannot be less than 1, but got " | |||
| << vec_work_shape_.size(); | |||
| } | |||
| InitSizeLists(); | |||
| @@ -160,6 +175,10 @@ class TensorScatterAddGpuFwdKernel : public GpuKernel { | |||
| output_size_ *= output_shapes_[i]; | |||
| } | |||
| if (indices_shapes_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'TensorScatterAddGpuKernel', the rank of indices cannot be less than 1, but got " | |||
| << indices_shapes_.size(); | |||
| } | |||
| // calculate indices dim 0/1 | |||
| indices_dim_0_ = indices_shapes_[0]; | |||
| indices_dim_1_ = indices_shapes_[indices_shapes_.size() - 1]; | |||
| @@ -169,6 +188,11 @@ class TensorScatterAddGpuFwdKernel : public GpuKernel { | |||
| block_size_ *= output_shapes_[i]; | |||
| } | |||
| if (indices_dim_1_ < 1 || indices_dim_1_ > output_shapes_.size()) { | |||
| MS_LOG(EXCEPTION) << "For 'TensorScatterAddGpuKernel', indices_shapes[-1] cannot be less than 1 and greater than " | |||
| << "the rank of output_shapes, but got indices_shapes[-1]: " << indices_dim_1_ | |||
| << ", rank of output_shapes: " << output_shapes_.size(); | |||
| } | |||
| // calculate indices_stride | |||
| vec_indices_stride_.resize(indices_dim_1_, 0); | |||
| vec_indices_stride_[indices_dim_1_ - 1] = block_size_; | |||
| @@ -201,6 +225,7 @@ class TensorScatterAddGpuFwdKernel : public GpuKernel { | |||
| size_t indices_dim_0_; | |||
| size_t indices_dim_1_; | |||
| bool memcpy_flag_; | |||
| bool is_null_input_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -63,12 +63,10 @@ class TileGpuKernel : public GpuKernel { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but Tile needs 1 input."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but Tile has 1 output."; | |||
| return false; | |||
| } | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| @@ -78,6 +76,10 @@ class TileGpuKernel : public GpuKernel { | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (output_shape_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'TileGpuKernel', the rank of output cannot be less than 1, but got " | |||
| << output_shape_.size(); | |||
| } | |||
| input_size_ = 1; | |||
| for (size_t i = 0; i < input_shape_.size(); i++) { | |||
| input_size_ *= input_shape_[i]; | |||
| @@ -140,6 +140,9 @@ std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string & | |||
| } | |||
| bool flag = true; | |||
| auto attr_size = (&(iter->second))->at(attr_index).first.GetInputSize(); | |||
| if (kernel_info->GetInputNum() > 0) { | |||
| MS_EXCEPTION_IF_ZERO("attr size", attr_size); | |||
| } | |||
| // data type matching check of all input parameters of kernel | |||
| for (size_t input_index = 0; input_index < kernel_info->GetInputNum(); input_index++) { | |||
| GpuKernelFactory::CheckSM(kernel_info, input_index); | |||
| @@ -153,6 +156,9 @@ std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string & | |||
| continue; | |||
| } | |||
| attr_size = (&(iter->second))->at(attr_index).first.GetOutputSize(); | |||
| if (kernel_info->GetOutputNum() > 0) { | |||
| MS_EXCEPTION_IF_ZERO("attr size", attr_size); | |||
| } | |||
| // data type matching check of all output parameters of kernel | |||
| for (size_t output_index = 0; output_index < kernel_info->GetOutputNum(); output_index++) { | |||
| if (kernel_info->GetOutputDeviceType(output_index) != | |||
| @@ -28,7 +28,14 @@ constexpr int kMaxDimsSize = 3; | |||
| template <typename T> | |||
| class CumProdGpuKernel : public GpuKernel { | |||
| public: | |||
| CumProdGpuKernel() : exclusive_(false), reverse_(false), axis_(0), input_size_0_(0), stride_(0), stride2_(0) {} | |||
| CumProdGpuKernel() | |||
| : exclusive_(false), | |||
| reverse_(false), | |||
| is_null_input_(false), | |||
| axis_(0), | |||
| input_size_0_(0), | |||
| stride_(0), | |||
| stride2_(0) {} | |||
| ~CumProdGpuKernel() = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| @@ -37,6 +44,9 @@ class CumProdGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input_addr = GetDeviceAddress<T>(inputs, 0); | |||
| T *output_addr = GetDeviceAddress<T>(outputs, 0); | |||
| T *ws_addr = GetDeviceAddress<T>(workspace, 0); | |||
| @@ -51,6 +61,12 @@ class CumProdGpuKernel : public GpuKernel { | |||
| } | |||
| input_size_0_ = sizeof(T); | |||
| shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(shape_); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'CumProdGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| axis_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis")); | |||
| exclusive_ = GetAttr<bool>(kernel_node, "exclusive"); | |||
| reverse_ = GetAttr<bool>(kernel_node, "reverse"); | |||
| @@ -93,6 +109,7 @@ class CumProdGpuKernel : public GpuKernel { | |||
| } | |||
| bool exclusive_; | |||
| bool reverse_; | |||
| bool is_null_input_; | |||
| int axis_; | |||
| size_t input_size_0_; | |||
| size_t stride_; | |||
| @@ -39,6 +39,9 @@ class IdentityGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input_addr = GetDeviceAddress<T>(inputs, 0); | |||
| T *output_addr = GetDeviceAddress<T>(outputs, 0); | |||
| @@ -36,6 +36,9 @@ class ApplyGradientDescentKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| VARIABLE_NOT_USED(workspace); | |||
| T *var = GetDeviceAddress<T>(inputs, 0); | |||
| T *alpha = GetDeviceAddress<T>(inputs, 1); | |||
| @@ -50,14 +53,18 @@ class ApplyGradientDescentKernel : public GpuKernel { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 3) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but ApplyGradientDescent needs 3 inputs."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but ApplyGradientDescent has 1 output."; | |||
| return false; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'ApplyGradientDescentGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| input_size_ = 1; | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| @@ -68,6 +75,7 @@ class ApplyGradientDescentKernel : public GpuKernel { | |||
| void ResetResource() noexcept override { | |||
| input_size_ = 1; | |||
| is_null_input_ = false; | |||
| input_size_list_.clear(); | |||
| output_size_list_.clear(); | |||
| workspace_size_list_.clear(); | |||
| @@ -83,6 +91,7 @@ class ApplyGradientDescentKernel : public GpuKernel { | |||
| private: | |||
| size_t input_size_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| @@ -36,6 +36,9 @@ class BCEWithLogitsLossKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *predict = GetDeviceAddress<T>(inputs, 0); | |||
| T *target = GetDeviceAddress<T>(inputs, 1); | |||
| T *weight = GetDeviceAddress<T>(inputs, 2); | |||
| @@ -69,25 +72,42 @@ class BCEWithLogitsLossKernel : public GpuKernel { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 4) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but BCEWithLogitsLoss needs 4 inputs."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but BCEWithLogitsLoss has 1 output."; | |||
| return false; | |||
| } | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| weight_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| pos_weight_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); | |||
| is_null_input_ = | |||
| CHECK_NULL_INPUT(input_shape_) || CHECK_NULL_INPUT(weight_shape_) || CHECK_NULL_INPUT(pos_weight_shape_); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'BCEWithLogitsLossGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (input_shape_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'BCEWithLogitsLossGpuKernel', the rank of input cannot be less than 1, but got " | |||
| << input_shape_.size(); | |||
| } | |||
| if (weight_shape_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'BCEWithLogitsLossGpuKernel', the rank of weight cannot be less than 1, but got " | |||
| << weight_shape_.size(); | |||
| } | |||
| if (pos_weight_shape_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'BCEWithLogitsLossGpuKernel', the rank of pos_weight cannot be less than 1, but got " | |||
| << pos_weight_shape_.size(); | |||
| } | |||
| input_size_ = 1; | |||
| if (input_shape_.size() > MAX_LOGITS_DIMENSION) { | |||
| MS_LOG(EXCEPTION) << "Input dimension is " << input_shape_.size() | |||
| << ", but BCEWithLogitsLoss can only support up to " << MAX_LOGITS_DIMENSION << "-D."; | |||
| return false; | |||
| } | |||
| for (size_t i = 0; i < input_shape_.size(); i++) { | |||
| input_size_ *= input_shape_[i]; | |||
| } | |||
| // weight shape | |||
| weight_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| weight_size_ = 1; | |||
| for (size_t i = 0; i < weight_shape_.size(); i++) { | |||
| weight_size_ *= weight_shape_[i]; | |||
| @@ -95,7 +115,6 @@ class BCEWithLogitsLossKernel : public GpuKernel { | |||
| weight_need_broadcast_ = NeedBroadcast(&weight_shape_, input_shape_); | |||
| // pos_weight shape | |||
| pos_weight_size_ = 1; | |||
| pos_weight_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); | |||
| for (size_t i = 0; i < pos_weight_shape_.size(); i++) { | |||
| pos_weight_size_ *= pos_weight_shape_[i]; | |||
| } | |||
| @@ -110,6 +129,7 @@ class BCEWithLogitsLossKernel : public GpuKernel { | |||
| pos_weight_size_ = 1; | |||
| weight_need_broadcast_ = false; | |||
| pos_weight_need_broadcast_ = false; | |||
| is_null_input_ = false; | |||
| input_shape_.clear(); | |||
| weight_shape_.clear(); | |||
| pos_weight_shape_.clear(); | |||
| @@ -137,7 +157,7 @@ class BCEWithLogitsLossKernel : public GpuKernel { | |||
| bool NeedBroadcast(std::vector<size_t> *shape, const std::vector<size_t> &result_shape) { | |||
| // result_shape is larger that shape | |||
| // and shape is able to broadcasted to result_shape | |||
| if (shape->size() != result_shape.size()) { | |||
| if (shape->size() < result_shape.size()) { | |||
| size_t fill_size = result_shape.size() - shape->size(); | |||
| (void)shape->insert(shape->begin(), fill_size, 1); | |||
| return true; | |||
| @@ -155,6 +175,7 @@ class BCEWithLogitsLossKernel : public GpuKernel { | |||
| size_t pos_weight_size_; | |||
| bool weight_need_broadcast_; | |||
| bool pos_weight_need_broadcast_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> weight_shape_; | |||
| std::vector<size_t> pos_weight_shape_; | |||
| @@ -77,7 +77,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { | |||
| const float beta = 0; | |||
| if (use_pad_) { | |||
| T *padded = GetPossiblyNullDeviceAddress<T>(workspace, 1); | |||
| T *padded = GetDeviceAddress<T>(workspace, 1); | |||
| if (data_format_ == kOpFormat_NHWC) { | |||
| CalPadNHWC(padded_size_ / sizeof(T), x, n_, old_height_, old_width_, c_, old_height_ + pad_height_, | |||
| old_width_ + pad_width_, pad_top_, pad_left_, pad_value_, padded, | |||
| @@ -50,7 +50,7 @@ class Conv3dGpuKernel : public GpuKernel { | |||
| const float alpha = 1; | |||
| const float beta = 0; | |||
| if (use_pad_) { | |||
| T *padded_addr = GetPossiblyNullDeviceAddress<T>(workspace, 1); | |||
| T *padded_addr = GetDeviceAddress<T>(workspace, 1); | |||
| CalPad3d(padded_size_ / sizeof(T), input_addr, n_, c_, old_depth_, old_height_, old_width_, | |||
| old_depth_ + pad_depth_, old_height_ + pad_height_, old_width_ + pad_width_, pad_head_, pad_top_, | |||
| pad_left_, pad_value_, padded_addr, reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| @@ -61,7 +61,7 @@ class Conv3dGradFilterGpuKernel : public GpuKernel { | |||
| const float alpha = 1; | |||
| const float beta = 0; | |||
| if (use_pad_) { | |||
| T *padded = GetPossiblyNullDeviceAddress<T>(workspace, 1); | |||
| T *padded = GetDeviceAddress<T>(workspace, 1); | |||
| CalPad3d(padded_size_ / sizeof(T), x, n_, c_, old_depth_, old_height_, old_width_, old_depth_ + pad_depth_, | |||
| old_height_ + pad_height_, old_width_ + pad_width_, pad_head_, pad_top_, pad_left_, pad_value_, padded, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| @@ -72,7 +72,7 @@ class Conv3dTransposeGpuFwdKernel : public GpuKernel { | |||
| } | |||
| } else { | |||
| if (greater_stride_) { | |||
| T *stride_padded = GetPossiblyNullDeviceAddress<T>(workspace, 1); | |||
| T *stride_padded = GetDeviceAddress<T>(workspace, 1); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||
| kernel_node_, | |||
| cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, | |||
| @@ -93,6 +93,23 @@ class Conv3dTransposeGpuFwdKernel : public GpuKernel { | |||
| return true; | |||
| } | |||
| bool CheckNull(const std::vector<size_t> filter_shape, const std::vector<size_t> input_shape) { | |||
| is_null_input_ = CHECK_NULL_INPUT(filter_shape) || CHECK_NULL_INPUT(input_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'Conv3dTransposeGpuKernel', input is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| return false; | |||
| } | |||
| void CheckSize(const size_t value, const size_t expect_value, const string arg_name) { | |||
| if (value != expect_value) { | |||
| MS_LOG(EXCEPTION) << "For 'Conv3dTransposeGpuKernel', the length of " << arg_name << " must be " << expect_value | |||
| << ", but got " << value; | |||
| } | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| kernel_node_ = kernel_node; | |||
| InitResource(); | |||
| @@ -105,10 +122,7 @@ class Conv3dTransposeGpuFwdKernel : public GpuKernel { | |||
| } | |||
| auto filter_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "Conv3dTransposeGpuBkwKernel input is null."; | |||
| InitSizeLists(); | |||
| if (CheckNull(filter_shape, input_shape)) { | |||
| return true; | |||
| } | |||
| std::vector<size_t> output_shape; | |||
| @@ -133,6 +147,8 @@ class Conv3dTransposeGpuFwdKernel : public GpuKernel { | |||
| pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode"); | |||
| SetStrideAndDilation(kernel_node); | |||
| std::vector<int> stride_pad_list(6, 0); | |||
| (void)CheckSize(filter_shape.size(), 5, "filter_shape"); | |||
| (void)CheckSize(pad_list.size(), 6, "pad_list"); | |||
| if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) { // pad_mode_ = same | |||
| UpdatePaddingAndDilation(input_shape, filter_shape, pad_list.data(), stride_pad_list.data()); | |||
| } | |||
| @@ -101,6 +101,7 @@ class Dropout3DGpuFwdKernel : public GpuKernel { | |||
| } | |||
| num_chan_ = n_ * c_; | |||
| MS_EXCEPTION_IF_ZERO("num channel", num_chan_); | |||
| num_per_chan_ = num_count_ / num_chan_; // number of elements per channel | |||
| keep_prob_ = GetAttr<float>(kernel_node, "keep_prob"); | |||
| @@ -51,12 +51,10 @@ class HSigmoidKernel : public GpuKernel { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but HSigmoid needs 1 inputs."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but HSigmoid has 1 output."; | |||
| return false; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | |||
| @@ -52,12 +52,10 @@ class HSigmoidGradKernel : public GpuKernel { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 2) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but HSigmoidGrad needs 2 inputs."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but HSigmoidGrad has 1 output."; | |||
| return false; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | |||
| @@ -37,6 +37,9 @@ class NLLLossGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input_device = GetDeviceAddress<T>(inputs, 0); | |||
| int32_t *target_device = GetDeviceAddress<int32_t>(inputs, 1); // nll_loss only supports int32 target | |||
| S *weight_device = GetDeviceAddress<S>(inputs, 2); | |||
| @@ -44,7 +47,9 @@ class NLLLossGpuKernel : public GpuKernel { | |||
| T *loss_device = GetDeviceAddress<T>(outputs, 0); | |||
| S *total_weight_device = GetDeviceAddress<S>(outputs, 1); | |||
| T *tmp_loss_device = GetPossiblyNullDeviceAddress<T>(workspace, 0); | |||
| T *tmp_loss_device = | |||
| reduction_ != 0 ? GetDeviceAddress<T>(workspace, 0) : GetPossiblyNullDeviceAddress<T>(workspace, 0); | |||
| S *tmp_target_weight_device = GetDeviceAddress<S>(workspace, 1); | |||
| NLLLoss(n_, c_, reduction_, input_device, target_device, weight_device, loss_device, total_weight_device, | |||
| @@ -54,6 +59,16 @@ class NLLLossGpuKernel : public GpuKernel { | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'NllLossGpuKernel', input is null"; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (input_shape.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "For 'NllLossGpuKernel', the rank of input cannot less than 2, but got " | |||
| << input_shape.size(); | |||
| } | |||
| n_ = static_cast<int>(input_shape[0]); | |||
| c_ = static_cast<int>(input_shape[1]); | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| @@ -83,6 +98,7 @@ class NLLLossGpuKernel : public GpuKernel { | |||
| input_size_ = 1; | |||
| n_ = 0; | |||
| c_ = 0; | |||
| is_null_input_ = false; | |||
| reduction_ = 1; // default value | |||
| tmp_loss_size_ = 0; | |||
| tmp_target_weight_size_ = 0; // tmp_target_weight (N,) array | |||
| @@ -114,6 +130,7 @@ class NLLLossGpuKernel : public GpuKernel { | |||
| size_t tmp_target_weight_size_; | |||
| int n_; | |||
| int c_; | |||
| bool is_null_input_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| @@ -37,6 +37,9 @@ class NLLLossGradGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input_device = GetDeviceAddress<T>(inputs, 0); | |||
| T *dloss_device = GetDeviceAddress<T>(inputs, 1); | |||
| int32_t *target_device = GetDeviceAddress<int32_t>(inputs, 2); // nll_loss_grad only supports int32 target | |||
| @@ -53,6 +56,16 @@ class NLLLossGradGpuKernel : public GpuKernel { | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'NllLossGradGpuKernel', input is null"; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (input_shape.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "For 'NllLossGradGpuKernel', the rank of input cannot less than 2, but got " | |||
| << input_shape.size(); | |||
| } | |||
| n_ = static_cast<int>(input_shape[0]); | |||
| c_ = static_cast<int>(input_shape[1]); | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| @@ -79,6 +92,7 @@ class NLLLossGradGpuKernel : public GpuKernel { | |||
| input_size_ = 1; | |||
| n_ = 0; | |||
| c_ = 0; | |||
| is_null_input_ = false; | |||
| reduction_ = 1; // default value | |||
| num_dloss_ = 1; // default size (scalar) | |||
| input_size_list_.clear(); | |||
| @@ -102,6 +116,7 @@ class NLLLossGradGpuKernel : public GpuKernel { | |||
| int reduction_; | |||
| int n_; | |||
| int c_; | |||
| bool is_null_input_; | |||
| int num_dloss_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -36,6 +36,9 @@ class ResizeBilinearGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *input = GetDeviceAddress<T>(inputs, 0); | |||
| T *output = GetDeviceAddress<T>(outputs, 0); | |||
| float h_scale = Scaling(input_h_, output_h_, align_corners_); | |||
| @@ -58,8 +61,15 @@ class ResizeBilinearGpuKernel : public GpuKernel { | |||
| } | |||
| std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| std::vector<size_t> output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| if (input_shape.size() != 4) { | |||
| MS_LOG(ERROR) << "Input is " << input_shape.size() << "-D, but ResizeBilinear supports only 4-D inputs."; | |||
| is_null_input_ = CHECK_NULL_INPUT(input_shape) || CHECK_NULL_INPUT(output_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'ResizeBilinearGpuKernel', input or output is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (input_shape.size() != 4 || output_shape.size() != 4) { | |||
| MS_LOG(ERROR) << "For 'ResizeBilinear', the rank of input and output must be 4, but got the rank of input: " | |||
| << input_shape.size() << ", the rank of output: " << output_shape.size(); | |||
| return false; | |||
| } | |||
| n_ = SizeToInt(input_shape[0]); | |||
| @@ -83,6 +93,7 @@ class ResizeBilinearGpuKernel : public GpuKernel { | |||
| void ResetResource() noexcept override { | |||
| align_corners_ = false; | |||
| is_null_input_ = false; | |||
| n_ = 0; | |||
| c_ = 0; | |||
| input_h_ = 0; | |||
| @@ -110,6 +121,7 @@ class ResizeBilinearGpuKernel : public GpuKernel { | |||
| } | |||
| bool align_corners_; | |||
| bool is_null_input_; | |||
| int n_; | |||
| int c_; | |||
| int input_h_; | |||
| @@ -36,6 +36,9 @@ class ResizeBilinearGradGpuKernel : public GpuKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (is_null_input_) { | |||
| return true; | |||
| } | |||
| T *dy = GetDeviceAddress<T>(inputs, 0); | |||
| float *interim = GetDeviceAddress<float>(workspace, 0); | |||
| T *dx = GetDeviceAddress<T>(outputs, 0); | |||
| @@ -67,6 +70,12 @@ class ResizeBilinearGradGpuKernel : public GpuKernel { | |||
| std::vector<size_t> dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| std::vector<size_t> dx_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| is_null_input_ = CHECK_NULL_INPUT(dy_shape) || CHECK_NULL_INPUT(x_shape) || CHECK_NULL_INPUT(dx_shape); | |||
| if (is_null_input_) { | |||
| MS_LOG(WARNING) << "For 'ResizeBilinearGradGpuKernel', input or output is null."; | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| if (dy_shape.size() != 4) { | |||
| MS_LOG(ERROR) << "Input is " << dy_shape.size() << "-D, but ResizeBilinearGrad supports only 4-D inputs."; | |||
| return false; | |||
| @@ -75,6 +84,10 @@ class ResizeBilinearGradGpuKernel : public GpuKernel { | |||
| MS_LOG(ERROR) << "Input is " << x_shape.size() << "-D, but ResizeBilinearGrad supports only 4-D inputs."; | |||
| return false; | |||
| } | |||
| if (dx_shape.size() != 4) { | |||
| MS_LOG(ERROR) << "For 'ResizeBilinearGradGpuKernel', the rank of output must be 4, but got " << dx_shape.size(); | |||
| return false; | |||
| } | |||
| n_ = SizeToInt(dy_shape[0]); | |||
| c_ = SizeToInt(dy_shape[1]); | |||
| dy_h_ = SizeToInt(dy_shape[2]); | |||
| @@ -97,6 +110,7 @@ class ResizeBilinearGradGpuKernel : public GpuKernel { | |||
| void ResetResource() noexcept override { | |||
| align_corners_ = false; | |||
| is_null_input_ = false; | |||
| n_ = 0; | |||
| c_ = 0; | |||
| dy_h_ = 0; | |||
| @@ -125,6 +139,7 @@ class ResizeBilinearGradGpuKernel : public GpuKernel { | |||
| } | |||
| bool align_corners_; | |||
| bool is_null_input_; | |||
| int n_; | |||
| int c_; | |||
| int dy_h_; | |||
| @@ -84,6 +84,10 @@ class RandpermGpuKernel : public GpuKernel { | |||
| } | |||
| max_length_ = static_cast<size_t>(GetAttr<int64_t>(kernel_node, "max_length")); | |||
| if (max_length_ < 1) { | |||
| MS_LOG(ERROR) << "For 'RandpermGpuKernel', the max_length cannot be less than 1, but got " << max_length_; | |||
| return false; | |||
| } | |||
| pad_ = static_cast<T>(GetAttr<int64_t>(kernel_node, "pad")); | |||
| InitSizeLists(); | |||