Browse Source

!8162 gpu support dynamic shape

From: @wilfchen
Reviewed-by: @limingqi107,@cristoval
Signed-off-by: @cristoval
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
36e69f6ef9
52 changed files with 525 additions and 319 deletions
  1. +33
    -26
      mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
  2. +9
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc
  3. +14
    -6
      mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h
  4. +17
    -11
      mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h
  5. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h
  6. +15
    -5
      mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
  7. +36
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.cc
  8. +23
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
  9. +4
    -3
      mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h
  10. +6
    -6
      mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h
  11. +18
    -10
      mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
  12. +17
    -8
      mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h
  13. +13
    -9
      mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h
  14. +23
    -17
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h
  15. +20
    -14
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h
  16. +7
    -7
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h
  17. +7
    -7
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h
  18. +9
    -8
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h
  19. +9
    -8
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h
  20. +9
    -8
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h
  21. +9
    -6
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h
  22. +10
    -6
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h
  23. +14
    -14
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h
  24. +6
    -6
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h
  25. +15
    -16
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h
  26. +7
    -7
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h
  27. +9
    -8
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h
  28. +15
    -15
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h
  29. +12
    -12
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h
  30. +7
    -7
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h
  31. +7
    -6
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h
  32. +8
    -8
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h
  33. +7
    -6
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h
  34. +5
    -5
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h
  35. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h
  36. +7
    -6
      mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h
  37. +0
    -2
      mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h
  38. +5
    -5
      mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h
  39. +0
    -2
      mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h
  40. +0
    -2
      mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h
  41. +4
    -0
      mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc
  42. +28
    -0
      mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc
  43. +2
    -0
      mindspore/ccsrc/backend/session/anf_runtime_algorithm.h
  44. +3
    -4
      mindspore/ccsrc/backend/session/gpu_session.cc
  45. +1
    -1
      mindspore/ccsrc/runtime/device/CMakeLists.txt
  46. +1
    -1
      mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
  47. +1
    -1
      mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc
  48. +1
    -3
      mindspore/ccsrc/runtime/device/executor/executor_callback.cc
  49. +3
    -5
      mindspore/ccsrc/runtime/device/executor/executor_callback.h
  50. +2
    -0
      mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc
  51. +30
    -0
      mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
  52. +12
    -7
      tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc

+ 33
- 26
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h View File

@@ -35,21 +35,7 @@ const std::map<std::string, cudnnReduceTensorOp_t> kReduceTypeMap = {
template <typename T> template <typename T>
class ArrayReduceGpuKernel : public GpuKernel { class ArrayReduceGpuKernel : public GpuKernel {
public: public:
ArrayReduceGpuKernel()
: cudnn_handle_(nullptr),
reduce_tensor_op_(CUDNN_REDUCE_TENSOR_ADD),
data_type_(CUDNN_DATA_FLOAT),
nan_prop_(CUDNN_NOT_PROPAGATE_NAN),
reduce_indices_(CUDNN_REDUCE_TENSOR_NO_INDICES),
reduce_tensor_descriptor_(nullptr),
inputA_descriptor_(nullptr),
outputC_descriptor_(nullptr),
keep_dims_(false),
all_match_(false),
is_null_input_(false),
input_size_(0),
output_size_(0),
workspace_size_(0) {}
ArrayReduceGpuKernel() { ResetResource(); }
~ArrayReduceGpuKernel() override { DestroyResource(); } ~ArrayReduceGpuKernel() override { DestroyResource(); }


const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -94,7 +80,7 @@ class ArrayReduceGpuKernel : public GpuKernel {
MS_LOG(ERROR) << "Output number is " << output_num << ", but reduce op needs 1 output."; MS_LOG(ERROR) << "Output number is " << output_num << ", but reduce op needs 1 output.";
return false; return false;
} }
int input_dim_length = SizeToInt(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0).size());
int input_dim_length = SizeToInt(AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0).size());


if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueTuple>() || if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueTuple>() ||
AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueList>()) { AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueList>()) {
@@ -117,8 +103,8 @@ class ArrayReduceGpuKernel : public GpuKernel {
} }
keep_dims_ = GetAttr<bool>(kernel_node, "keep_dims"); keep_dims_ = GetAttr<bool>(kernel_node, "keep_dims");


auto inputA_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto outputC_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
auto inputA_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
auto outputC_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(inputA_shape); is_null_input_ = CHECK_NULL_INPUT(inputA_shape);
if (is_null_input_) { if (is_null_input_) {
MS_LOG(WARNING) << "ArrayReduceGpuKernel input is null"; MS_LOG(WARNING) << "ArrayReduceGpuKernel input is null";
@@ -132,6 +118,35 @@ class ArrayReduceGpuKernel : public GpuKernel {
return true; return true;
} }


// Restores every member to its default-constructed value so the kernel can be
// re-initialized through Init() when input shapes change at runtime (called
// from the constructor and from GpuDynamicKernel::UpdateArgs).
void ResetResource() noexcept override {
// cudnn handle and descriptors are re-acquired/re-created by InitResource().
cudnn_handle_ = nullptr;
reduce_tensor_op_ = CUDNN_REDUCE_TENSOR_ADD;
data_type_ = CUDNN_DATA_FLOAT;
nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
reduce_indices_ = CUDNN_REDUCE_TENSOR_NO_INDICES;
reduce_tensor_descriptor_ = nullptr;
inputA_descriptor_ = nullptr;
outputC_descriptor_ = nullptr;
keep_dims_ = false;
all_match_ = false;
is_null_input_ = false;
input_size_ = 0;
output_size_ = 0;
workspace_size_ = 0;
// Size lists must be emptied too; Init() -> InitSizeLists() repopulates them
// for the new shapes.
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}

// Releases the cudnn descriptors owned by this kernel. Promoted from a private
// helper to a public override so the dynamic-shape path can tear descriptors
// down before ResetResource()/Init(); still invoked by the destructor.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
"cudnnDestroyReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -160,14 +175,6 @@ class ArrayReduceGpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
"cudnnDestroyReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}
void InferArrayReduceType(const CNodePtr &kernel_node) { void InferArrayReduceType(const CNodePtr &kernel_node) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
auto iter = kReduceTypeMap.find(kernel_name); auto iter = kReduceTypeMap.find(kernel_name);


+ 9
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.cc View File

@@ -26,5 +26,14 @@ MS_REG_GPU_KERNEL_TWO(
GatherV2, GatherV2,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16), KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
GatherV2GpuFwdKernel, half, int) GatherV2GpuFwdKernel, half, int)

// SparseGatherV2 reuses the GatherV2 GPU implementation: register the same
// (float32, int32) and (float16, int32) attribute combinations under the
// SparseGatherV2 op name.
MS_REG_GPU_KERNEL_TWO(
SparseGatherV2,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
GatherV2GpuFwdKernel, float, int)
MS_REG_GPU_KERNEL_TWO(
SparseGatherV2,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
GatherV2GpuFwdKernel, half, int)
} // namespace kernel } // namespace kernel
} // namespace mindspore } // namespace mindspore

+ 14
- 6
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gatherv2_gpu_kernel.h View File

@@ -27,7 +27,7 @@ namespace kernel {
template <typename T, typename S> template <typename T, typename S>
class GatherV2GpuFwdKernel : public GpuKernel { class GatherV2GpuFwdKernel : public GpuKernel {
public: public:
GatherV2GpuFwdKernel() : axis_(0), handle_(nullptr) {}
GatherV2GpuFwdKernel() { ResetResource(); }
~GatherV2GpuFwdKernel() = default; ~GatherV2GpuFwdKernel() = default;


const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -52,9 +52,9 @@ class GatherV2GpuFwdKernel : public GpuKernel {
if (input_num != 2) { if (input_num != 2) {
MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but GatherGpuV2FwdKernel needs 2."; MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but GatherGpuV2FwdKernel needs 2.";
} }
input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
indices_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
output_shapes_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
input_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
indices_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
output_shapes_ = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);


axis_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis")); axis_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
if (axis_ < 0) { if (axis_ < 0) {
@@ -65,9 +65,18 @@ class GatherV2GpuFwdKernel : public GpuKernel {
InitSizeLists(); InitSizeLists();
return true; return true;
} }
// Clears all cached shape/size state so Init() can be safely re-run after a
// dynamic-shape update (also used by the constructor for initial state).
void ResetResource() noexcept override {
input_shapes_.clear();
indices_shapes_.clear();
output_shapes_.clear();
// dims_ holds the 3 collapsed dimensions computed by Reshape(); zero them out.
std::fill(dims_, dims_ + 3, 0);
axis_ = 0;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}


protected: protected:
void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); }
void InitSizeLists() override { void InitSizeLists() override {
size_t size = GetSize(input_shapes_); size_t size = GetSize(input_shapes_);
input_size_list_.push_back(size); input_size_list_.push_back(size);
@@ -118,7 +127,6 @@ class GatherV2GpuFwdKernel : public GpuKernel {


size_t dims_[3] = {}; size_t dims_[3] = {};
int axis_; int axis_;
cudnnHandle_t handle_;


std::vector<size_t> input_size_list_; std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_; std::vector<size_t> output_size_list_;


+ 17
- 11
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h View File

@@ -28,14 +28,7 @@ namespace kernel {
template <typename T> template <typename T>
class SplitGpuFwdKernel : public GpuKernel { class SplitGpuFwdKernel : public GpuKernel {
public: public:
SplitGpuFwdKernel()
: axis_(0),
output_num_(1),
input_size_(1),
axis_step_(1),
all_size_before_axis_(1),
all_size_axis_(1),
outputs_host_(nullptr) {}
SplitGpuFwdKernel() { ResetResource(); }
~SplitGpuFwdKernel() override = default; ~SplitGpuFwdKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -59,7 +52,7 @@ class SplitGpuFwdKernel : public GpuKernel {
bool Init(const CNodePtr &kernel_node) override { bool Init(const CNodePtr &kernel_node) override {
axis_ = static_cast<int64_t>(GetAttr<int64_t>(kernel_node, "axis")); axis_ = static_cast<int64_t>(GetAttr<int64_t>(kernel_node, "axis"));
if (axis_ < 0) { if (axis_ < 0) {
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
axis_ += SizeToInt(input_shape.size()); axis_ += SizeToInt(input_shape.size());
} }
output_num_ = static_cast<int64_t>(GetAttr<int64_t>(kernel_node, "output_num")); output_num_ = static_cast<int64_t>(GetAttr<int64_t>(kernel_node, "output_num"));
@@ -68,7 +61,7 @@ class SplitGpuFwdKernel : public GpuKernel {
return false; return false;
} }


auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
input_size_ = 1; input_size_ = 1;
all_size_before_axis_ = 1; all_size_before_axis_ = 1;
all_size_axis_ = 1; all_size_axis_ = 1;
@@ -88,7 +81,7 @@ class SplitGpuFwdKernel : public GpuKernel {


for (int i = 0; i < output_num_; i++) { for (int i = 0; i < output_num_; i++) {
size_t output_size = 1; size_t output_size = 1;
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i);
auto output_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i);
for (size_t j = 0; j < output_shape.size(); j++) { for (size_t j = 0; j < output_shape.size(); j++) {
output_size *= output_shape[j]; output_size *= output_shape[j];
} }
@@ -100,6 +93,19 @@ class SplitGpuFwdKernel : public GpuKernel {
return true; return true;
} }


// Resets all members to the values the old member-initializer list used, plus
// clears the size lists, so the kernel can be re-initialized for new shapes.
void ResetResource() noexcept override {
axis_ = 0;
output_num_ = 1;
input_size_ = 1;
axis_step_ = 1;
all_size_before_axis_ = 1;
all_size_axis_ = 1;
// Host-side output pointer array; re-allocated on the next Launch/Init path.
outputs_host_ = nullptr;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}

protected: protected:
void InitSizeLists() override {} void InitSizeLists() override {}




+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h View File

@@ -62,7 +62,7 @@ class TransposeGpuFwdKernel : public GpuKernel {
MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output."; MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output.";
return false; return false;
} }
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
shape_size_ = input_shape.size(); shape_size_ = input_shape.size();
if (shape_size_ > TRANSPOSE_MAX_DIMENSION) { if (shape_size_ > TRANSPOSE_MAX_DIMENSION) {
MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION


+ 15
- 5
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/unsorted_segment_sum_gpu_kernel.h View File

@@ -27,8 +27,7 @@ namespace kernel {
template <typename T, typename S> template <typename T, typename S>
class UnsortedSegmentSumGpuKernel : public GpuKernel { class UnsortedSegmentSumGpuKernel : public GpuKernel {
public: public:
UnsortedSegmentSumGpuKernel()
: input_dim0_(1), input_dim1_(1), output_dim0_(1), output_dim1_(1), is_null_input_(false) {}
UnsortedSegmentSumGpuKernel() { ResetResource(); }
~UnsortedSegmentSumGpuKernel() override = default; ~UnsortedSegmentSumGpuKernel() override = default;


const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -53,15 +52,15 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
} }


bool Init(const CNodePtr &kernel_node) override { bool Init(const CNodePtr &kernel_node) override {
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(input_shapes); is_null_input_ = CHECK_NULL_INPUT(input_shapes);
if (is_null_input_) { if (is_null_input_) {
MS_LOG(WARNING) << "UnsortedSegmentSum input is null"; MS_LOG(WARNING) << "UnsortedSegmentSum input is null";
InitSizeLists(); InitSizeLists();
return true; return true;
} }
auto ids_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
auto ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);


auto axis = ids_shapes.size(); auto axis = ids_shapes.size();
for (size_t i = 0; i < input_shapes.size(); i++) { for (size_t i = 0; i < input_shapes.size(); i++) {
@@ -81,6 +80,17 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
return true; return true;
} }


// Restores the collapsed 2-D input/output dimensions and flags to their
// defaults so Init() can recompute them when shapes change dynamically.
void ResetResource() noexcept override {
input_dim0_ = 1;
input_dim1_ = 1;
output_dim0_ = 1;
output_dim1_ = 1;
is_null_input_ = false;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}

protected: protected:
void InitSizeLists() override { void InitSizeLists() override {
input_size_list_.push_back(input_dim0_ * input_dim1_ * sizeof(T)); input_size_list_.push_back(input_dim0_ * input_dim1_ * sizeof(T));


+ 36
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.cc View File

@@ -0,0 +1,36 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/gpu/gpu_kernel.h"

namespace mindspore {
namespace kernel {
// Refreshes a GPU kernel's cached shapes/sizes for dynamic shape: tears down
// its resources, resets its members, and re-runs Init() against the CNode.
void GpuDynamicKernel::UpdateArgs() {
// Skip the rebuild when inputs are static and there are no value-depend
// inputs. NOTE(review): the middle term checks is_output_dynamic_shape_ is
// TRUE — presumably "only the output is dynamic, cached args stay valid";
// confirm this condition is intended and not a missing negation.
if (!is_input_dynamic_shape_ && is_output_dynamic_shape_ && !have_depends()) {
return;
}

MS_LOG(INFO) << "Update Args: " << cnode_ptr_->fullname_with_scope();
auto kernel_mod = AnfAlgo::GetKernelMod(cnode_ptr_);
MS_EXCEPTION_IF_NULL(kernel_mod);
// Only GpuKernel-derived mods support the Destroy/Reset/Init cycle.
auto gpu_kernel_mod = dynamic_cast<GpuKernel *>(kernel_mod);
MS_EXCEPTION_IF_NULL(gpu_kernel_mod);
// Order matters: free cudnn descriptors first, then zero members, then run
// the normal Init() path to pick up the updated shapes.
gpu_kernel_mod->DestroyResource();
gpu_kernel_mod->ResetResource();
gpu_kernel_mod->Init(cnode_ptr_);
}
} // namespace kernel
} // namespace mindspore

+ 23
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h View File

@@ -23,11 +23,13 @@
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <map> #include <map>
#include <memory>
#include "backend/kernel_compiler/kernel.h" #include "backend/kernel_compiler/kernel.h"
#include "backend/kernel_compiler/gpu/kernel_constants.h" #include "backend/kernel_compiler/gpu/kernel_constants.h"
#include "runtime/device/gpu/gpu_device_manager.h" #include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_common.h" #include "runtime/device/gpu/gpu_common.h"
#include "backend/session/anf_runtime_algorithm.h" #include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/executor/dynamic_kernel.h"
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm; using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
namespace mindspore { namespace mindspore {
@@ -45,10 +47,28 @@ static std::map<int, int> kNHWCToNCHWAxisMap = {
{3, 1}, {3, 1},
}; };
// Dynamic-shape hook for GPU kernels. Unlike the Ascend DynamicKernels, the
// GPU backend never launches through this object: UpdateArgs() re-runs the
// owning GpuKernel's Init() (see gpu_kernel.cc), and the execute entry points
// below are hard errors.
class GpuDynamicKernel : public device::DynamicKernel {
 public:
  // Stream is unused on GPU (nothing is launched from here), hence nullptr.
  explicit GpuDynamicKernel(const CNodePtr &cnode_ptr) : DynamicKernel(nullptr, cnode_ptr) {}
  ~GpuDynamicKernel() = default;

  void UpdateArgs() override;
  // Fixed wording of the exception messages ("should not invoked" ->
  // "should not be invoked") and dropped the stray ';' after the body.
  void PostExecute() final { MS_LOG(EXCEPTION) << "`PostExecute()` should not be invoked with gpu backend"; }
  void Execute() final { MS_LOG(EXCEPTION) << "`Execute()` should not be invoked with gpu backend"; }
};
class GpuKernel : public KernelMod { class GpuKernel : public KernelMod {
public: public:
virtual ~GpuKernel() = default; virtual ~GpuKernel() = default;
virtual bool Init(const CNodePtr &kernel_node) = 0; virtual bool Init(const CNodePtr &kernel_node) = 0;
// Default ResetResource: kernels that support dynamic shape must override it.
// NOTE(review): MS_LOG(EXCEPTION) throws, but this function is declared
// noexcept — hitting this default will std::terminate() instead of raising a
// catchable error; confirm the hard abort is intended.
virtual void ResetResource() noexcept {
MS_LOG(EXCEPTION) << "kernel must override the `ResetResource()` method when dynamic shape";
}
// No-op by default; kernels owning cudnn descriptors override this so the
// dynamic-shape path can free them before ResetResource()/Init().
virtual void DestroyResource() noexcept {}
// Optional post-launch hook; unused by most kernels.
virtual void PostExecute() {}
// Creates the per-node dynamic-kernel wrapper used when shapes are dynamic.
void InitDynamicKernel(const CNodePtr &cnode_ptr) { dynamic_kernel_ = std::make_shared<GpuDynamicKernel>(cnode_ptr); }
device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }
protected: protected:
virtual void InitResource() {} virtual void InitResource() {}
@@ -228,7 +248,10 @@ class GpuKernel : public KernelMod {
} }
return type->second; return type->second;
} }
device::DynamicKernelPtr dynamic_kernel_;
}; };
} // namespace kernel } // namespace kernel
} // namespace mindspore } // namespace mindspore


+ 4
- 3
mindspore/ccsrc/backend/kernel_compiler/gpu/math/addn_gpu_kernel.h View File

@@ -123,6 +123,10 @@ class AddNGpuFwdKernel : public GpuKernel {
return true; return true;
} }


// Frees the cudnn tensor descriptor; public override so the dynamic-shape
// path can release it before re-running Init() (also called by the dtor).
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -141,9 +145,6 @@ class AddNGpuFwdKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
}
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cudnnTensorDescriptor_t input_descriptor_; cudnnTensorDescriptor_t input_descriptor_;
cudnnDataType_t cudnn_data_type_; cudnnDataType_t cudnn_data_type_;


+ 6
- 6
mindspore/ccsrc/backend/kernel_compiler/gpu/math/bias_add_gpu_kernel.h View File

@@ -112,6 +112,12 @@ class BiasAddGpuKernel : public GpuKernel {
return true; return true;
} }


// Frees the cudnn op/tensor descriptors owned by this kernel; public override
// so the dynamic-shape path can release them before re-running Init().
void DestroyResource() noexcept override {
  // Fixed swapped error strings: the op_desc_ check previously reported
  // "cudnnDestroyTensorDescriptor failed" and the x_desc_ check reported
  // "cudnnDestroyOpTensorDescriptor failed". Each message now names the
  // cudnn call actually being checked.
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyOpTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -129,12 +135,6 @@ class BiasAddGpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed");
}

cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cudnnDataType_t cudnn_data_type_; cudnnDataType_t cudnn_data_type_;
cudnnTensorDescriptor_t x_desc_; cudnnTensorDescriptor_t x_desc_;


+ 18
- 10
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h View File

@@ -31,13 +31,7 @@ constexpr int MAX_DIMS = 7;
template <typename T> template <typename T>
class BroadcastOpGpuKernel : public GpuKernel { class BroadcastOpGpuKernel : public GpuKernel {
public: public:
BroadcastOpGpuKernel()
: op_type_(BROADCAST_TYPE_INVALID),
need_broadcast_(false),
is_comp_op_(false),
input1_num_(1),
input2_num_(1),
output_num_(1) {}
BroadcastOpGpuKernel() { ResetResource(); }
~BroadcastOpGpuKernel() override = default; ~BroadcastOpGpuKernel() override = default;


const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -71,9 +65,9 @@ class BroadcastOpGpuKernel : public GpuKernel {
} }
bool Init(const CNodePtr &kernel_node) override { bool Init(const CNodePtr &kernel_node) override {
GetOpType(kernel_node); GetOpType(kernel_node);
auto shape1 = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto shape2 = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
auto shape3 = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
auto shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
auto shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
auto shape3 = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
need_broadcast_ = IsBroadcast(shape1, shape2); need_broadcast_ = IsBroadcast(shape1, shape2);
if (need_broadcast_ && shape1.size() > 7) { if (need_broadcast_ && shape1.size() > 7) {
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7"; MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
@@ -106,6 +100,20 @@ class BroadcastOpGpuKernel : public GpuKernel {
InitSizeLists(); InitSizeLists();
return true; return true;
} }
// Resets op type, broadcast flags, element counts and cached shapes to their
// defaults so Init() can be re-run when input shapes change dynamically.
void ResetResource() noexcept override {
op_type_ = BROADCAST_TYPE_INVALID;
need_broadcast_ = false;
is_comp_op_ = false;
input1_num_ = 1;
input2_num_ = 1;
output_num_ = 1;
lhs_shape_.clear();
rhs_shape_.clear();
output_shape_.clear();
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}


protected: protected:
void InitResource() override { return; } void InitResource() override { return; }


+ 17
- 8
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_grad_gpu_kernel.h View File

@@ -30,14 +30,7 @@ namespace kernel {
template <typename T> template <typename T>
class BroadcastOpGradGpuKernel : public GpuKernel { class BroadcastOpGradGpuKernel : public GpuKernel {
public: public:
BroadcastOpGradGpuKernel()
: op_type_(BROADCAST_GRAD_TYPE_INVALID),
need_broadcast_(false),
input1_num_(1),
input2_num_(1),
output_num_(1),
grad_x_(false),
grad_y_(false) {}
BroadcastOpGradGpuKernel() { ResetResource(); }
~BroadcastOpGradGpuKernel() override = default; ~BroadcastOpGradGpuKernel() override = default;


const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -105,6 +98,22 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
return true; return true;
} }


// Resets grad-broadcast state for dynamic shape: op type, flags, counts, and
// the fixed 4-element shape arrays (refilled with 1, the neutral dimension).
void ResetResource() noexcept override {
op_type_ = BROADCAST_GRAD_TYPE_INVALID;
need_broadcast_ = false;
input1_num_ = 1;
input2_num_ = 1;
output_num_ = 1;
std::fill(x1_shape_, x1_shape_ + 4, 1);
std::fill(x2_shape_, x2_shape_ + 4, 1);
std::fill(dy_shape_, dy_shape_ + 4, 1);
grad_x_ = false;
grad_y_ = false;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}

protected: protected:
void InitResource() override { return; } void InitResource() override { return; }
void InitSizeLists() override { void InitSizeLists() override {


+ 13
- 9
mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h View File

@@ -69,21 +69,15 @@ static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY
template <typename T> template <typename T>
class UnaryOpGpuKernel : public GpuKernel { class UnaryOpGpuKernel : public GpuKernel {
public: public:
UnaryOpGpuKernel()
: unary_op_type_(UNARY_OP_INVALID_TYPE),
input_size_(sizeof(T)),
output_size_(sizeof(T)),
workspace_size_(0),
is_null_input_(false) {}
UnaryOpGpuKernel() { ResetResource(); }
~UnaryOpGpuKernel() override = default; ~UnaryOpGpuKernel() override = default;


const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }


bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override { const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
VARIABLE_NOT_USED(workspace);
T *input_addr = GetDeviceAddress<T>(inputs, 0); T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0); T *output_addr = GetDeviceAddress<T>(outputs, 0);


@@ -184,7 +178,7 @@ class UnaryOpGpuKernel : public GpuKernel {
MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output."; MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output.";
return false; return false;
} }
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(input_shape); is_null_input_ = CHECK_NULL_INPUT(input_shape);
if (is_null_input_) { if (is_null_input_) {
MS_LOG(WARNING) << "UnaryOpGpuKernel input is null"; MS_LOG(WARNING) << "UnaryOpGpuKernel input is null";
@@ -198,6 +192,16 @@ class UnaryOpGpuKernel : public GpuKernel {
InitSizeLists(); InitSizeLists();
return true; return true;
} }
// Returns the kernel to its freshly-constructed state so Init() can be run
// again (the default constructor delegates here); needed when shapes change
// dynamically between launches.
void ResetResource() noexcept override {
  unary_op_type_ = UNARY_OP_INVALID_TYPE;
  // One element of T until Init() multiplies in the real shape.
  input_size_ = sizeof(T);
  output_size_ = sizeof(T);
  workspace_size_ = 0;
  is_null_input_ = false;
  // Clear the lists exposed by Get{Input,Output,Workspace}SizeList().
  input_size_list_.clear();
  output_size_list_.clear();
  workspace_size_list_.clear();
}


protected: protected:
void InitSizeLists() override { void InitSizeLists() override {


+ 23
- 17
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h View File

@@ -29,16 +29,7 @@ namespace kernel {
template <typename T> template <typename T>
class ActivationGpuFwdKernel : public GpuKernel { class ActivationGpuFwdKernel : public GpuKernel {
public: public:
ActivationGpuFwdKernel()
: cudnn_handle_(nullptr),
activation_desc_(nullptr),
mode_(CUDNN_ACTIVATION_RELU),
data_descriptor_(nullptr),
is_null_input_(false),
cudnn_data_type_(CUDNN_DATA_FLOAT),
input_size_(0),
output_size_(0),
workspace_size_(0) {}
ActivationGpuFwdKernel() { ResetResource(); }
~ActivationGpuFwdKernel() override { DestroyResource(); } ~ActivationGpuFwdKernel() override { DestroyResource(); }
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -75,7 +66,7 @@ class ActivationGpuFwdKernel : public GpuKernel {
MS_LOG(ERROR) << "Argument number is " << input_num << ", but ActivationGpuFwdKernel needs 1."; MS_LOG(ERROR) << "Argument number is " << input_num << ", but ActivationGpuFwdKernel needs 1.";
return false; return false;
} }
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(input_shape); is_null_input_ = CHECK_NULL_INPUT(input_shape);
if (is_null_input_) { if (is_null_input_) {
MS_LOG(WARNING) << "ActivationGpuFwdKernel input is null."; MS_LOG(WARNING) << "ActivationGpuFwdKernel input is null.";
@@ -113,6 +104,27 @@ class ActivationGpuFwdKernel : public GpuKernel {
return true; return true;
} }


// Destroys the cuDNN activation and tensor descriptors created for this
// kernel.  noexcept override: it is also invoked from the destructor, so
// failures are logged via CHECK_CUDNN_RET_WITH_ERROR instead of thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
                             "cudnnDestroyActivationDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
}

// Resets every member to its default so the kernel can be re-initialized
// (the default constructor delegates here).  Note: this only drops the
// descriptor pointers; DestroyResource() is responsible for freeing them.
void ResetResource() noexcept override {
  cudnn_handle_ = nullptr;
  activation_desc_ = nullptr;
  mode_ = CUDNN_ACTIVATION_RELU;
  data_descriptor_ = nullptr;
  is_null_input_ = false;
  // Clear the lists exposed by Get{Input,Output,Workspace}SizeList().
  input_size_list_.clear();
  output_size_list_.clear();
  workspace_size_list_.clear();
  cudnn_data_type_ = CUDNN_DATA_FLOAT;
  input_size_ = 0;
  output_size_ = 0;
  workspace_size_ = 0;
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -132,12 +144,6 @@ class ActivationGpuFwdKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
"cudnnDestroyActivationDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
}

std::map<std::string, cudnnActivationMode_t> kernel_map = {{"ReLU", CUDNN_ACTIVATION_RELU}, std::map<std::string, cudnnActivationMode_t> kernel_map = {{"ReLU", CUDNN_ACTIVATION_RELU},
{"ReLU6", CUDNN_ACTIVATION_CLIPPED_RELU}, {"ReLU6", CUDNN_ACTIVATION_CLIPPED_RELU},
{"Tanh", CUDNN_ACTIVATION_TANH}, {"Tanh", CUDNN_ACTIVATION_TANH},


+ 20
- 14
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h View File

@@ -29,14 +29,7 @@ namespace kernel {
template <typename T> template <typename T>
class ActivationGradGpuKernel : public GpuKernel { class ActivationGradGpuKernel : public GpuKernel {
public: public:
ActivationGradGpuKernel()
: cudnn_handle_(nullptr),
activation_desc_(nullptr),
mode_(CUDNN_ACTIVATION_RELU),
data_descriptor_(nullptr),
is_null_input_(false),
cudnn_data_type_(CUDNN_DATA_FLOAT),
input_size_(0) {}
ActivationGradGpuKernel() { ResetResource(); }
~ActivationGradGpuKernel() override { DestroyResource(); } ~ActivationGradGpuKernel() override { DestroyResource(); }
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -117,6 +110,25 @@ class ActivationGradGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the cuDNN activation and tensor descriptors.  noexcept override:
// also called from the destructor, so errors are logged rather than thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
                             "cudnnDestroyActivationDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
}

// Restores the freshly-constructed state so Init() can run again (the
// default constructor delegates here).  Descriptor pointers are only
// nulled; freeing them is DestroyResource()'s job.
void ResetResource() noexcept override {
  cudnn_handle_ = nullptr;
  activation_desc_ = nullptr;
  mode_ = CUDNN_ACTIVATION_RELU;
  data_descriptor_ = nullptr;
  is_null_input_ = false;
  // Clear the lists exposed by Get{Input,Output,Workspace}SizeList().
  input_size_list_.clear();
  output_size_list_.clear();
  workspace_size_list_.clear();
  cudnn_data_type_ = CUDNN_DATA_FLOAT;
  input_size_ = 0;
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -135,12 +147,6 @@ class ActivationGradGpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
"cudnnDestroyActivationDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
}

std::map<std::string, cudnnActivationMode_t> kernel_map = {{"ReluGrad", CUDNN_ACTIVATION_RELU}, std::map<std::string, cudnnActivationMode_t> kernel_map = {{"ReluGrad", CUDNN_ACTIVATION_RELU},
{"ReLU6Grad", CUDNN_ACTIVATION_CLIPPED_RELU}, {"ReLU6Grad", CUDNN_ACTIVATION_CLIPPED_RELU},
{"TanhGrad", CUDNN_ACTIVATION_TANH}, {"TanhGrad", CUDNN_ACTIVATION_TANH},


+ 7
- 7
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/batchnorm_grad_gpu_kernel.h View File

@@ -121,6 +121,13 @@ class BatchNormGradGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the four cuDNN tensor descriptors used by batch-norm grad.
// noexcept override so it is safe to call from the destructor; failures
// are logged via CHECK_CUDNN_RET_WITH_ERROR.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -152,13 +159,6 @@ class BatchNormGradGpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
}

int batch_; int batch_;
int channel_; int channel_;
int height_; int height_;


+ 7
- 7
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/bias_add_grad_gpu_kenel.h View File

@@ -111,6 +111,13 @@ class BiasAddGradGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the cuDNN reduce-tensor and tensor descriptors.
//
// Fixes vs. previous revision:
//  - The op_desc_ destroy used CHECK_CUDNN_RET_WITH_EXCEPT, which throws on
//    failure.  Throwing inside a noexcept function calls std::terminate, and
//    this method is also invoked from the destructor; use the logging macro
//    like every other DestroyResource() in these kernels.
//  - The dy_desc_ failure message named the wrong API
//    ("cudnnDestroyOpTensorDescriptor" for a cudnnDestroyTensorDescriptor call).
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(op_desc_),
                             "cudnnDestroyReduceTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -137,13 +144,6 @@ class BiasAddGradGpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_),
"cudnnDestroyReduceTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed");
}

bool same_dims_; bool same_dims_;
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cudnnDataType_t cudnn_data_type_; cudnnDataType_t cudnn_data_type_;


+ 9
- 8
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_gpu_kernel.h View File

@@ -198,6 +198,15 @@ class Conv2dGpuFwdKernel : public GpuKernel {
return true; return true;
} }


// Destroys the convolution, filter, and tensor descriptors created for the
// forward conv kernel.  noexcept override: also invoked from the destructor,
// so failures are logged via CHECK_CUDNN_RET_WITH_ERROR rather than thrown.
//
// Fix: the filter-descriptor failure message previously said
// "cudnnDestroyTensorDescriptor failed", which names the wrong API.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
                             "cudnnDestroyConvolutionDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyFilterDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -243,14 +252,6 @@ class Conv2dGpuFwdKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
"cudnnDestroyConvolutionDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed");
}
bool CheckParam(const CNodePtr &kernel_node) { bool CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) { if (input_num != 2) {


+ 9
- 8
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_filter_gpu_kernel.h View File

@@ -199,6 +199,15 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
return true; return true;
} }


// Destroys the convolution, filter, and tensor descriptors of the
// filter-gradient kernel.  noexcept override: also called from the
// destructor, so failures are logged rather than thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
                             "cudnnDestroyConvolutionDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -243,14 +252,6 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
"cudnnDestroyConvolutionDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed");
}
bool CheckParam(const CNodePtr &kernel_node) { bool CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) { if (input_num != 2) {


+ 9
- 8
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/conv2d_grad_input_gpu_kernel.h View File

@@ -203,6 +203,15 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
return true; return true;
} }


// Destroys the convolution, filter, and tensor descriptors of the
// input-gradient kernel.  noexcept override: also called from the
// destructor, so failures are logged rather than thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
                             "cudnnDestroyConvolutionDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -244,14 +253,6 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
"cudnnDestroyConvolutionDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed");
}
bool CheckParam(const CNodePtr &kernel_node) { bool CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) { if (input_num != 2) {


+ 9
- 6
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_gpu_kernel.h View File

@@ -27,7 +27,7 @@ namespace kernel {
template <typename T> template <typename T>
class FlattenGpuFwdKernel : public GpuKernel { class FlattenGpuFwdKernel : public GpuKernel {
public: public:
FlattenGpuFwdKernel() : input_size_(0), output_size_(0), workspace_size_(0) {}
FlattenGpuFwdKernel() : input_size_(0) {}
~FlattenGpuFwdKernel() override = default; ~FlattenGpuFwdKernel() override = default;


const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -47,7 +47,7 @@ class FlattenGpuFwdKernel : public GpuKernel {
return true; return true;
} }
bool Init(const CNodePtr &kernel_node) override { bool Init(const CNodePtr &kernel_node) override {
auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
input_size_ = sizeof(T); input_size_ = sizeof(T);
for (size_t i = 0; i < shape.size(); ++i) { for (size_t i = 0; i < shape.size(); ++i) {
input_size_ *= shape[i]; input_size_ *= shape[i];
@@ -55,12 +55,17 @@ class FlattenGpuFwdKernel : public GpuKernel {
InitSizeLists(); InitSizeLists();
return true; return true;
} }
// Resets the cached input size and the size lists so Init() can be run
// again when shapes change dynamically.
void ResetResource() noexcept override {
  input_size_ = 0;
  input_size_list_.clear();
  output_size_list_.clear();
  workspace_size_list_.clear();
}


protected: protected:
void InitSizeLists() override { void InitSizeLists() override {
input_size_list_.push_back(input_size_); input_size_list_.push_back(input_size_);
output_size_ = input_size_;
output_size_list_.push_back(output_size_);
output_size_list_.push_back(input_size_);
} }


private: private:
@@ -69,8 +74,6 @@ class FlattenGpuFwdKernel : public GpuKernel {
std::vector<size_t> workspace_size_list_; std::vector<size_t> workspace_size_list_;


size_t input_size_; size_t input_size_;
size_t output_size_;
size_t workspace_size_;
}; };
} // namespace kernel } // namespace kernel
} // namespace mindspore } // namespace mindspore


+ 10
- 6
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/flatten_grad_gpu_kernel.h View File

@@ -27,7 +27,7 @@ namespace kernel {
template <typename T> template <typename T>
class FlattenGardGpuBkwKernel : public GpuKernel { class FlattenGardGpuBkwKernel : public GpuKernel {
public: public:
FlattenGardGpuBkwKernel() : input_size_(0), output_size_(0), workspace_size_(0) {}
FlattenGardGpuBkwKernel() { ResetResource(); }
~FlattenGardGpuBkwKernel() override = default; ~FlattenGardGpuBkwKernel() override = default;


const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -54,7 +54,7 @@ class FlattenGardGpuBkwKernel : public GpuKernel {
return false; return false;
} }


auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
for (size_t i = 0; i < shape.size(); ++i) { for (size_t i = 0; i < shape.size(); ++i) {
if (input_size_ == 0) { if (input_size_ == 0) {
input_size_ = 1; input_size_ = 1;
@@ -67,11 +67,17 @@ class FlattenGardGpuBkwKernel : public GpuKernel {
return true; return true;
} }


// Returns the kernel to its freshly-constructed state (the default
// constructor delegates here) so Init() can run again on a new shape.
void ResetResource() noexcept override {
  input_size_ = 0;
  input_size_list_.clear();
  output_size_list_.clear();
  workspace_size_list_.clear();
}

protected: protected:
void InitSizeLists() override { void InitSizeLists() override {
input_size_list_.push_back(input_size_); input_size_list_.push_back(input_size_);
output_size_ = input_size_;
output_size_list_.push_back(output_size_);
output_size_list_.push_back(input_size_);
} }


private: private:
@@ -80,8 +86,6 @@ class FlattenGardGpuBkwKernel : public GpuKernel {
std::vector<size_t> workspace_size_list_; std::vector<size_t> workspace_size_list_;


size_t input_size_; size_t input_size_;
size_t output_size_;
size_t workspace_size_;
}; };
} // namespace kernel } // namespace kernel
} // namespace mindspore } // namespace mindspore


+ 14
- 14
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_ex_gpu_kernel.h View File

@@ -140,6 +140,20 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the cuDNN descriptors.  The z descriptor and the activation
// descriptor are conditional on bn_ops_, mirroring the modes in which they
// are used (z only for BN+Add+Activation; activation for any fused-
// activation mode).  noexcept override: also called from the destructor,
// so failures are logged rather than thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
  // z_desc_ exists only in the BN+Add+Activation fusion.
  if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed");
  }

  // Any mode other than plain BN carries a fused activation descriptor.
  if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
                               "Destroy activation descriptor failed");
  }
}

protected: protected:
void InitResource() override { void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -238,20 +252,6 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
} }
} }


void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed");
}

if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
"Destroy activation descriptor failed");
}
}

size_t input_x_size_; size_t input_x_size_;
size_t input_z_size_; size_t input_z_size_;
size_t para_size_; size_t para_size_;


+ 6
- 6
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h View File

@@ -133,6 +133,12 @@ class FusedBatchNormGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the three cuDNN tensor descriptors used by fused batch-norm.
// noexcept override: also called from the destructor, so failures are
// logged rather than thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -165,12 +171,6 @@ class FusedBatchNormGpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
}

int batch_; int batch_;
int channel_; int channel_;
int height_; int height_;


+ 15
- 16
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_grad_ex_gpu_kernel.h View File

@@ -201,6 +201,21 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel {


workspace_size_list_.push_back(workspace_size_); workspace_size_list_.push_back(workspace_size_);
} }
// Destroys the cuDNN descriptors of the fused batch-norm backward kernel.
// y/activation descriptors exist only for fused-activation modes, and
// dz only for the BN+Add+Activation fusion — the conditions mirror the
// modes in which each descriptor is used.  noexcept override: also called
// from the destructor, so failures are logged rather than thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
  if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
                               "Destroy activation descriptor failed");
  }
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");

  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
  if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed");
  }
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed");
}


private: private:
void SetTensorDescriptor(const std::string &format, const std::vector<size_t> &shape) { void SetTensorDescriptor(const std::string &format, const std::vector<size_t> &shape) {
@@ -255,22 +270,6 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel {
} }
} }


void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
"Destroy activation descriptor failed");
}
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");

CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed");
}
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed");
}

size_t x_size_; size_t x_size_;
size_t para_size_; size_t para_size_;
size_t workspace_size_; size_t workspace_size_;


+ 7
- 7
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h View File

@@ -117,6 +117,13 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the four cuDNN tensor descriptors of the fused batch-norm
// gradient kernel.  noexcept override: also called from the destructor,
// so failures are logged rather than thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -146,13 +153,6 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
}

int batch_; int batch_;
int channel_; int channel_;
int height_; int height_;


+ 9
- 8
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/im2col_gpu_kernel.h View File

@@ -123,6 +123,15 @@ class Im2ColGpuFwdKernel : public GpuKernel {
return true; return true;
} }


// Destroys the convolution, filter, and tensor descriptors of the im2col
// kernel.  noexcept override: also invoked from the destructor, so failures
// are logged via CHECK_CUDNN_RET_WITH_ERROR rather than thrown.
//
// Fix: the filter-descriptor failure message previously said
// "cudnnDestroyTensorDescriptor failed", which names the wrong API.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
                             "cudnnDestroyConvolutionDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyFilterDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -152,14 +161,6 @@ class Im2ColGpuFwdKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
"cudnnDestroyConvolutionDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed");
}
bool CheckParam(const CNodePtr &kernel_node) { bool CheckParam(const CNodePtr &kernel_node) {
cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);


+ 15
- 15
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_gpu_kernel.h View File

@@ -157,6 +157,21 @@ class LstmGpuKernel : public GpuKernel {
} }
} }


// Destroys the cuDNN RNN/dropout/tensor/filter descriptors, then the
// per-timestep x/y descriptors (one pair per element of the seq_len_-long
// arrays).  noexcept override: also called from the destructor, so
// failures are logged rather than thrown.
void DestroyResource() noexcept override {
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed");
  CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed");

  // One descriptor per timestep was created; destroy each.
  for (size_t i = 0; i < IntToSize(seq_len_); ++i) {
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed");
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed");
  }
}

protected: protected:
void InitResource() override { void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -195,21 +210,6 @@ class LstmGpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed");

for (size_t i = 0; i < IntToSize(seq_len_); ++i) {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed");
}
}

int batch_size_; int batch_size_;
int seq_len_; int seq_len_;
int input_size_; int input_size_;


+ 12
- 12
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_data_gpu_kernel.h View File

@@ -150,6 +150,18 @@ class LstmGradDataGpuKernel : public GpuKernel {
InitSizeLists(); InitSizeLists();
return true; return true;
} }
// Releases all cuDNN descriptors owned by this LSTM-grad-data kernel.
// Declared public and `override` (virtual in the GpuKernel base) so callers
// outside the kernel can free descriptors explicitly; noexcept because it may
// run during destruction.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed");
// Per-time-step x/y descriptors are destroyed by the dedicated helper.
DestroyTensorDescGrp();
}


protected: protected:
void InitResource() override { void InitResource() override {
@@ -195,18 +207,6 @@ class LstmGradDataGpuKernel : public GpuKernel {
} }


private: private:
// Releases all cuDNN descriptors owned by this LSTM-grad-data kernel.
// noexcept: cuDNN failures are logged via the macro, never thrown.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed");
// Per-time-step x/y descriptors are destroyed by the dedicated helper.
DestroyTensorDescGrp();
}
void CreateTensorDescGrp() { void CreateTensorDescGrp() {
int x_dims[3]{batch_size_, input_size_, 1}; int x_dims[3]{batch_size_, input_size_, 1};
int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1}; int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1};


+ 7
- 7
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/lstm_grad_weight_gpu_kernel.h View File

@@ -162,6 +162,13 @@ class LstmGradWeightGpuKernel : public GpuKernel {
"get workspace size failed"); "get workspace size failed");
workspace_size_list_.push_back(workspace_size); workspace_size_list_.push_back(workspace_size);
} }
// Releases the cuDNN descriptors owned by this LSTM-grad-weight kernel.
// Public `override` of the virtual in the GpuKernel base, so resources can be
// freed explicitly from outside the kernel; noexcept for destructor safety.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed");
// Per-time-step x/y descriptors are destroyed by the dedicated helper.
DestroyTensorDescGrp();
}


private: private:
void CreateTensorDescGrp() { void CreateTensorDescGrp() {
@@ -187,13 +194,6 @@ class LstmGradWeightGpuKernel : public GpuKernel {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed");
} }
} }
// Releases the cuDNN descriptors owned by this LSTM-grad-weight kernel.
// noexcept: cuDNN failures are logged via the macro, never thrown.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed");
// Per-time-step x/y descriptors are destroyed by the dedicated helper.
DestroyTensorDescGrp();
}


int batch_size_; int batch_size_;
int seq_len_; int seq_len_;


+ 7
- 6
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_gpu_kernel.h View File

@@ -113,6 +113,13 @@ class PoolingGpuFwdKernel : public GpuKernel {
return true; return true;
} }


// Destroys the pooling descriptor and the input/output tensor descriptors.
// Public `override` of the virtual in the GpuKernel base; noexcept so it is
// safe during destruction.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
"cudnnDestroyPoolingDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
}

protected: protected:
void InitResource() { void InitResource() {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -196,12 +203,6 @@ class PoolingGpuFwdKernel : public GpuKernel {
2, windowDimA, paddingA, strideA), 2, windowDimA, paddingA, strideA),
"cudnnSetPoolingNdDescriptor failed"); "cudnnSetPoolingNdDescriptor failed");
} }
// Destroys the pooling descriptor and the input/output tensor descriptors.
// noexcept: cuDNN failures are logged via the macro, never thrown.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
"cudnnDestroyPoolingDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
}


cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cudnnTensorDescriptor_t input_descriptor_; cudnnTensorDescriptor_t input_descriptor_;


+ 8
- 8
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pooling_grad_gpu_kernel.h View File

@@ -129,6 +129,14 @@ class PoolingGradGpuKernel : public GpuKernel {
InitSizeLists(); InitSizeLists();
return true; return true;
} }
// Destroys the pooling descriptor plus the forward (x, y) and gradient
// (dx, dy) tensor descriptors. Public `override` of the virtual in the
// GpuKernel base; noexcept so it is safe during destruction.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
"cudnnDestroyPoolingDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed");
}


protected: protected:
void InitResource() override { void InitResource() override {
@@ -230,14 +238,6 @@ class PoolingGradGpuKernel : public GpuKernel {
pad_value_ = kSignedMinFloat; pad_value_ = kSignedMinFloat;
} }
} }
// Destroys the pooling descriptor plus the forward (x, y) and gradient
// (dx, dy) tensor descriptors; noexcept for destructor safety.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
"cudnnDestroyPoolingDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed");
}


cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cudnnPoolingDescriptor_t pooling_descriptor_; cudnnPoolingDescriptor_t pooling_descriptor_;


+ 7
- 6
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_cross_entropy_with_logits_gpu_kernel.h View File

@@ -101,6 +101,13 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the softmax-output and logits tensor descriptors. Public
// `override` of the virtual in the GpuKernel base; noexcept so it is safe
// during destruction.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -118,12 +125,6 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
} }


private: private:
// Destroys the softmax-output and logits tensor descriptors; noexcept so
// cuDNN failures are logged via the macro rather than thrown.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}
void InferInputOutputSize(const CNodePtr &kernel_node) { void InferInputOutputSize(const CNodePtr &kernel_node) {
auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(logits_shape); is_null_input_ = CHECK_NULL_INPUT(logits_shape);


+ 5
- 5
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_gpu_kernel.h View File

@@ -140,6 +140,11 @@ class SoftmaxGpuKernel : public GpuKernel {
return true; return true;
} }
// Destroys the input/output tensor descriptors. Public `override` of the
// virtual in the GpuKernel base; noexcept for destructor safety.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed");
}
protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -159,11 +164,6 @@ class SoftmaxGpuKernel : public GpuKernel {
} }
private: private:
// Destroys the input/output tensor descriptors; noexcept so cuDNN failures
// are logged via the macro rather than thrown.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed");
}
void InitSizeByAxis(const std::vector<size_t> &input_shape, const int &axis) { void InitSizeByAxis(const std::vector<size_t> &input_shape, const int &axis) {
if (input_shape.size() == 2) { if (input_shape.size() == 2) {
InitSizeByAxis2D(input_shape, axis); InitSizeByAxis2D(input_shape, axis);


+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/softmax_grad_gpu_kernel.h View File

@@ -142,6 +142,10 @@ class SoftmaxGradGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the single shared tensor descriptor (y_desc_) this kernel owns.
// Public `override` of the virtual in the GpuKernel base; noexcept for
// destructor safety.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -161,10 +165,6 @@ class SoftmaxGradGpuKernel : public GpuKernel {
} }


private: private:
// Destroys the single shared tensor descriptor (y_desc_) this kernel owns;
// noexcept so cuDNN failures are logged via the macro rather than thrown.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed");
}

void InitSizeByAxis(const std::vector<size_t> input_shape, const int axis) { void InitSizeByAxis(const std::vector<size_t> input_shape, const int axis) {
axis_ = axis; axis_ = axis;
if (axis_ < 0) { if (axis_ < 0) {


+ 7
- 6
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_softmax_cross_entropy_with_logits_gpu_kernel.h View File

@@ -103,6 +103,13 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the softmax-output and logits tensor descriptors. Public
// `override` of the virtual in the GpuKernel base; noexcept so it is safe
// during destruction.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}

protected: protected:
void InitResource() override { void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -120,12 +127,6 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel {
} }


private: private:
// Destroys the softmax-output and logits tensor descriptors; noexcept so
// cuDNN failures are logged via the macro rather than thrown.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}
void InferInputOutputSize(const CNodePtr &kernel_node) { void InferInputOutputSize(const CNodePtr &kernel_node) {
auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(logits_shape); is_null_input_ = CHECK_NULL_INPUT(logits_shape);


+ 0
- 2
mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold2_gpu_kernel.h View File

@@ -113,8 +113,6 @@ class BatchNormFold2GpuKernel : public GpuKernel {
} }


private: private:
void DestroyResource() noexcept {}

cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
bool is_null_input_; bool is_null_input_;
size_t batch_size_; size_t batch_size_;


+ 5
- 5
mindspore/ccsrc/backend/kernel_compiler/gpu/quant/batchnorm_fold_gpu_kernel.h View File

@@ -152,6 +152,11 @@ class BatchNormFoldGpuKernel : public GpuKernel {
return true; return true;
} }


// Destroys the x and scale/bias/mean/var tensor descriptors. Public
// `override` of the virtual in the GpuKernel base; noexcept for destructor
// safety.
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
}

protected: protected:
void InitSizeLists() override { void InitSizeLists() override {
// x, mean, variance, current_step // x, mean, variance, current_step
@@ -177,11 +182,6 @@ class BatchNormFoldGpuKernel : public GpuKernel {
} }


private: private:
// Destroys the x and scale/bias/mean/var tensor descriptors; noexcept so
// cuDNN failures are logged via the macro rather than thrown.
void DestroyResource() noexcept {
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
}

size_t input_size_; size_t input_size_;
size_t output_size_; size_t output_size_;
std::vector<size_t> input_size_list_; std::vector<size_t> input_size_list_;


+ 0
- 2
mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_gpu_kernel.h View File

@@ -81,8 +81,6 @@ class CorrectionMulGpuKernel : public GpuKernel {
void InitResource() override {} void InitResource() override {}


private: private:
void DestroyResource() noexcept {}

size_t batch_size_; size_t batch_size_;
size_t channel_; size_t channel_;
size_t height_; size_t height_;


+ 0
- 2
mindspore/ccsrc/backend/kernel_compiler/gpu/quant/correction_mul_grad_gpu_kernel.h View File

@@ -89,8 +89,6 @@ class CorrectionMulGradGpuKernel : public GpuKernel {
void InitResource() override {} void InitResource() override {}


private: private:
void DestroyResource() noexcept {}

size_t batch_size_; size_t batch_size_;
size_t channel_; size_t channel_;
size_t height_; size_t height_;


+ 4
- 0
mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc View File

@@ -237,6 +237,10 @@ void MemSwapManager::SaveUserKernelTopoOrder() {
continue; continue;
} }


if (opt::IsNopNode(user_kernel)) {
continue;
}

size_t user_kernel_topo_sort = SearchKernelExecutionInfo(user_kernel).topo_order_; size_t user_kernel_topo_sort = SearchKernelExecutionInfo(user_kernel).topo_order_;
auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(user_kernel, node_pair.second - 1); auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(user_kernel, node_pair.second - 1);
auto &output_idx = kernel_with_index.second; auto &output_idx = kernel_with_index.second;


+ 28
- 0
mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc View File

@@ -50,6 +50,10 @@ bool IsShapeDynamic(const abstract::ShapePtr &shape) {
return std::any_of(shape->shape().begin(), shape->shape().end(), [](int s) { return s < 0; }); return std::any_of(shape->shape().begin(), shape->shape().end(), [](int s) { return s < 0; });
} }


// Returns true when any dimension of `shape` encodes a dynamic (unknown)
// size. Dynamic dimensions originate as negative ints (e.g. -1) and become
// huge values once stored as size_t, so they are detected by reinterpreting
// each element as a signed value.
//
// Fix: the previous lambda took its parameter as `int`, truncating 64-bit
// size_t elements to 32 bits. That dropped bits 32..62 entirely and
// misclassified legitimate positive sizes such as 0xFFFFFFFF as dynamic on
// LP64 platforms. Casting to int64_t preserves the full width.
bool IsShapeDynamic(const std::vector<size_t> &shape) {
  return std::any_of(shape.begin(), shape.end(),
                     [](size_t s) { return static_cast<int64_t>(s) < 0; });
}

std::vector<size_t> TransShapeToSizet(const abstract::ShapePtr &shape) { std::vector<size_t> TransShapeToSizet(const abstract::ShapePtr &shape) {
MS_EXCEPTION_IF_NULL(shape); MS_EXCEPTION_IF_NULL(shape);
std::vector<size_t> shape_size_t; std::vector<size_t> shape_size_t;
@@ -1389,5 +1393,29 @@ bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) {
} }
return false; return false;
} }

// Returns the device shape of input `index` of `anf_node`. When that shape
// contains a dynamic dimension, the node's registered max shape is
// substituted so the kernel can be sized for the worst case.
std::vector<size_t> AnfRuntimeAlgorithm::GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) {
auto device_shape = GetInputDeviceShape(anf_node, index);
// Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse.
if (IsShapeDynamic(device_shape)) {
auto max_shape = GetInputMaxShape(anf_node, index);
// NOTE(review): assumes max_shape.size() == device_shape.size(); transform
// writes through device_shape.begin() without resizing — confirm callers
// guarantee matching ranks.
std::transform(max_shape.begin(), max_shape.end(), device_shape.begin(), IntToSize);
auto format = GetInputFormat(anf_node, index);
// NOTE(review): the result of TransShapeToDevice is discarded here — verify
// it mutates device_shape in place, otherwise the returned shape should be
// assigned back to device_shape.
trans::TransShapeToDevice(device_shape, format);
}
return device_shape;
}

// Returns the device shape of output `index` of `anf_node`. When that shape
// contains a dynamic dimension, the node's registered max shape is
// substituted so the kernel can be sized for the worst case.
std::vector<size_t> AnfRuntimeAlgorithm::GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) {
auto device_shape = GetOutputDeviceShape(anf_node, index);
// Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse.
if (IsShapeDynamic(device_shape)) {
auto max_shape = GetOutputMaxShape(anf_node, index);
// NOTE(review): assumes max_shape.size() == device_shape.size(); transform
// writes through device_shape.begin() without resizing — confirm callers
// guarantee matching ranks.
std::transform(max_shape.begin(), max_shape.end(), device_shape.begin(), IntToSize);
auto format = GetOutputFormat(anf_node, index);
// NOTE(review): the result of TransShapeToDevice is discarded here — verify
// it mutates device_shape in place, otherwise the returned shape should be
// assigned back to device_shape.
trans::TransShapeToDevice(device_shape, format);
}
return device_shape;
}
} // namespace session } // namespace session
} // namespace mindspore } // namespace mindspore

+ 2
- 0
mindspore/ccsrc/backend/session/anf_runtime_algorithm.h View File

@@ -230,6 +230,8 @@ class AnfRuntimeAlgorithm {
static std::vector<int64_t> GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index); static std::vector<int64_t> GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index);
static std::vector<int64_t> GetOutputMinShape(const AnfNodePtr &anf_node, size_t index); static std::vector<int64_t> GetOutputMinShape(const AnfNodePtr &anf_node, size_t index);
static bool IsNodeDynamicShape(const AnfNodePtr &node); static bool IsNodeDynamicShape(const AnfNodePtr &node);
static std::vector<size_t> GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index);
static std::vector<size_t> GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index);
}; };
} // namespace session } // namespace session
using AnfAlgo = session::AnfRuntimeAlgorithm; using AnfAlgo = session::AnfRuntimeAlgorithm;


+ 3
- 4
mindspore/ccsrc/backend/session/gpu_session.cc View File

@@ -306,7 +306,9 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
if (save_graphs) { if (save_graphs) {
DumpIRProto(graph, "before_removeNop_" + std::to_string(graph_id)); DumpIRProto(graph, "before_removeNop_" + std::to_string(graph_id));
} }

// Update Graph Dynamic Shape Attr.
UpdateGraphDynamicShapeAttr(NOT_NULL(graph));
graph->UpdateGraphDynamicAttr();
// Hide NopOp from execution graph // Hide NopOp from execution graph
opt::HideNopNode(graph.get()); opt::HideNopNode(graph.get());
// Build kernel if node is cnode // Build kernel if node is cnode
@@ -317,13 +319,10 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
graph->set_execution_order(execution_order); graph->set_execution_order(execution_order);
// Get summary nodes. // Get summary nodes.
SetSummaryNodes(graph.get()); SetSummaryNodes(graph.get());
// Remove NopOp from execution graph
opt::RemoveNopNode(graph.get());
// Dump .pb graph after graph optimization // Dump .pb graph after graph optimization
if (save_graphs) { if (save_graphs) {
DumpIRProto(graph, "after_opt_" + std::to_string(graph_id)); DumpIRProto(graph, "after_opt_" + std::to_string(graph_id));
} }

// Set graph manager. // Set graph manager.
MS_EXCEPTION_IF_NULL(context_); MS_EXCEPTION_IF_NULL(context_);
FuncGraphManagerPtr manager = MakeManager({graph}); FuncGraphManagerPtr manager = MakeManager({graph});


+ 1
- 1
mindspore/ccsrc/runtime/device/CMakeLists.txt View File

@@ -1,5 +1,5 @@
file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc"
"kernel_info.cc" "executor/dynamic_kernel.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc"
"kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc"
) )


if (ENABLE_GPU) if (ENABLE_GPU)


+ 1
- 1
mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc View File

@@ -48,7 +48,7 @@
#include "backend/optimizer/mem_reuse/mem_reuse_checker.h" #include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
#endif #endif
#include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h"
#include "runtime/device/ascend/executor/executor_callback.h"
#include "runtime/device/executor/executor_callback.h"
#include "runtime/device/ascend/executor/hccl_dynamic_kernel.h" #include "runtime/device/ascend/executor/hccl_dynamic_kernel.h"
#include "profiler/device/ascend/ascend_profiling.h" #include "profiler/device/ascend/ascend_profiling.h"
#include "profiler/device/ascend/profiling_context.h" #include "profiler/device/ascend/profiling_context.h"


+ 1
- 1
mindspore/ccsrc/runtime/device/ascend/executor/ai_cpu_dynamic_kernel.cc View File

@@ -22,7 +22,7 @@
#include "runtime/kernel.h" #include "runtime/kernel.h"
#include "backend/session/anf_runtime_algorithm.h" #include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/aicpu/aicpu_util.h" #include "backend/kernel_compiler/aicpu/aicpu_util.h"
#include "runtime/device/ascend/executor/executor_callback.h"
#include "runtime/device/executor/executor_callback.h"


namespace mindspore { namespace mindspore {
namespace device { namespace device {


mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.cc → mindspore/ccsrc/runtime/device/executor/executor_callback.cc View File

@@ -14,12 +14,11 @@
* limitations under the License. * limitations under the License.
*/ */


#include "runtime/device/ascend/executor/executor_callback.h"
#include "runtime/device/executor/executor_callback.h"
#include "utils/log_adapter.h" #include "utils/log_adapter.h"


namespace mindspore { namespace mindspore {
namespace device { namespace device {
namespace ascend {
void ExecutorCallback::RegistCallback(const std::function<void()> &callback) { void ExecutorCallback::RegistCallback(const std::function<void()> &callback) {
std::lock_guard<std::mutex> guard(lock_); std::lock_guard<std::mutex> guard(lock_);
callback_queue_.push(callback); callback_queue_.push(callback);
@@ -36,6 +35,5 @@ void ExecutorCallback::Consume() {
callback_func(); callback_func();
} }
} }
} // namespace ascend
} // namespace device } // namespace device
} // namespace mindspore } // namespace mindspore

mindspore/ccsrc/runtime/device/ascend/executor/executor_callback.h → mindspore/ccsrc/runtime/device/executor/executor_callback.h View File

@@ -14,8 +14,8 @@
* limitations under the License. * limitations under the License.
*/ */


#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_
#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_


#include <queue> #include <queue>
#include <mutex> #include <mutex>
@@ -24,7 +24,6 @@


namespace mindspore { namespace mindspore {
namespace device { namespace device {
namespace ascend {
class ExecutorCallback { class ExecutorCallback {
public: public:
static ExecutorCallback &GetInstance() { static ExecutorCallback &GetInstance() {
@@ -43,7 +42,6 @@ class ExecutorCallback {
std::queue<std::function<void()>> callback_queue_; std::queue<std::function<void()>> callback_queue_;
std::mutex lock_; std::mutex lock_;
}; };
} // namespace ascend
} // namespace device } // namespace device
} // namespace mindspore } // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_

+ 2
- 0
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc View File

@@ -67,6 +67,8 @@ void GpuBuild(const KernelGraphPtr &kernel_graph) {
if (!gpu_kernel_ptr->Init(kernel)) { if (!gpu_kernel_ptr->Init(kernel)) {
MS_LOG(EXCEPTION) << "Initialize gpu kernel op[" << kernel->fullname_with_scope() << "] failed."; MS_LOG(EXCEPTION) << "Initialize gpu kernel op[" << kernel->fullname_with_scope() << "] failed.";
} }
gpu_kernel_ptr->InitDynamicKernel(kernel);
gpu_kernel_ptr->DynamicKernel()->Initialize();
session::AnfRuntimeAlgorithm::SetKernelMod((kernel::KernelModPtr)gpu_kernel_ptr, kernel.get()); session::AnfRuntimeAlgorithm::SetKernelMod((kernel::KernelModPtr)gpu_kernel_ptr, kernel.get());
} }
} }


+ 30
- 0
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc View File

@@ -36,6 +36,8 @@
#include "profiler/device/gpu/gpu_profiling.h" #include "profiler/device/gpu/gpu_profiling.h"
#include "utils/shape_utils.h" #include "utils/shape_utils.h"
#include "debug/data_dump/dump_json_parser.h" #include "debug/data_dump/dump_json_parser.h"
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "runtime/device/executor/executor_callback.h"
#ifdef ENABLE_DEBUGGER #ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h" #include "debug/debug_services.h"
#endif #endif
@@ -588,6 +590,29 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
MS_LOG(INFO) << "[inplace optimizer] skip node: " << kernel->DebugString(); MS_LOG(INFO) << "[inplace optimizer] skip node: " << kernel->DebugString();
continue; continue;
} }

// akg kernel do not support dynamic shape by now.
device::DynamicKernelPtr dynamic_kernel = nullptr;
kernel::GpuKernel *gpu_kernel = nullptr;
if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) != KernelType::AKG_KERNEL) {
gpu_kernel = dynamic_cast<kernel::GpuKernel *>(kernel_mod);
dynamic_kernel = gpu_kernel->DynamicKernel();
}

if (dynamic_kernel && dynamic_kernel->have_depends()) {
MS_LOG(INFO) << "Match Dynamic Kernel, Start SyncStream";
if (!SyncStream()) {
MS_LOG(ERROR) << "SyncStream failed";
return false;
}
}

if (dynamic_kernel && dynamic_kernel->is_dynamic_shape()) {
ExecutorCallback::GetInstance().Consume();
dynamic_kernel->InferShape();
dynamic_kernel->UpdateArgs();
}

AddressPtrList kernel_inputs; AddressPtrList kernel_inputs;
AddressPtrList kernel_workspaces; AddressPtrList kernel_workspaces;
AddressPtrList kernel_outputs; AddressPtrList kernel_outputs;
@@ -615,6 +640,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
} else { } else {
LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs);
} }

ExecutorCallback::GetInstance().RegistCallback([&gpu_kernel] {
if (gpu_kernel) gpu_kernel->PostExecute();
});
// called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
dump_enabled); dump_enabled);
@@ -633,6 +662,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
// collect weights and bias for dump mode // collect weights and bias for dump mode
debugger_->LoadParametersAndConst(); debugger_->LoadParametersAndConst();
CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");
ExecutorCallback::GetInstance().Consume();
} }
ClearSwapInfo(mock); ClearSwapInfo(mock);
return true; return true;


+ 12
- 7
tests/ut/cpp/stub/dynamic_shape/dynamic_shape_stub.cc View File

@@ -19,12 +19,19 @@
#include "runtime/device/ascend/executor/rts/profiling_rts_dynamic_kernel.h" #include "runtime/device/ascend/executor/rts/profiling_rts_dynamic_kernel.h"
#include "runtime/device/ascend/executor/ai_core_dynamic_kernel.h" #include "runtime/device/ascend/executor/ai_core_dynamic_kernel.h"
#include "profiler/device/ascend/rt_callback_manager.h" #include "profiler/device/ascend/rt_callback_manager.h"
#include "runtime/device/ascend/executor/executor_callback.h"
#include "runtime/device/executor/executor_callback.h"
#include "profiler/device/ascend/ascend_profiling.h" #include "profiler/device/ascend/ascend_profiling.h"
#include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h"
#include "backend/kernel_compiler/host/host_kernel_metadata.h" #include "backend/kernel_compiler/host/host_kernel_metadata.h"
#include "backend/kernel_compiler/host/host_kernel_build.h" #include "backend/kernel_compiler/host/host_kernel_build.h"


// Unit-test stubs for ExecutorCallback, which now lives directly in
// mindspore::device (moved out of the device::ascend namespace). Both
// methods are deliberately no-ops in the UT build.
namespace mindspore {
namespace device {
void ExecutorCallback::RegistCallback(const std::function<void()> &callback) {}  // no-op stub
void ExecutorCallback::Consume() {}  // no-op stub
} // namespace device
} // namespace mindspore

namespace mindspore { namespace mindspore {
namespace device { namespace device {
namespace ascend { namespace ascend {
@@ -45,13 +52,11 @@ void AiCoreDynamicKernel::PostExecute() {}
bool HcclExecutorManager::Initialize() { return true; } bool HcclExecutorManager::Initialize() { return true; }
bool HcclExecutorManager::Finalize() { return true; } bool HcclExecutorManager::Finalize() { return true; }


void ExecutorCallback::RegistCallback(const std::function<void()> &callback) {}
void ExecutorCallback::Consume() {}

void OpTilingCalculater::Init() {} void OpTilingCalculater::Init() {}
void OpTilingCalculater::CalculateTiling(const NotNull<CNodePtr> &cnode, const NotNull<std::shared_ptr<nlohmann::json>> &compile_info_json,
const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
NotNull<optiling::OpRunInfo *> op_run_info) {}
void OpTilingCalculater::CalculateTiling(const NotNull<CNodePtr> &cnode,
const NotNull<std::shared_ptr<nlohmann::json>> &compile_info_json,
const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map,
NotNull<optiling::OpRunInfo *> op_run_info) {}
} // namespace ascend } // namespace ascend
} // namespace device } // namespace device
} // namespace mindspore } // namespace mindspore


Loading…
Cancel
Save