From: @wilfchen
Reviewed-by: @limingqi107, @cristoval
Signed-off-by: @cristoval
tags/v1.1.0
@@ -35,21 +35,7 @@ const std::map<std::string, cudnnReduceTensorOp_t> kReduceTypeMap = {
 template <typename T>
 class ArrayReduceGpuKernel : public GpuKernel {
  public:
-  ArrayReduceGpuKernel()
-      : cudnn_handle_(nullptr),
-        reduce_tensor_op_(CUDNN_REDUCE_TENSOR_ADD),
-        data_type_(CUDNN_DATA_FLOAT),
-        nan_prop_(CUDNN_NOT_PROPAGATE_NAN),
-        reduce_indices_(CUDNN_REDUCE_TENSOR_NO_INDICES),
-        reduce_tensor_descriptor_(nullptr),
-        inputA_descriptor_(nullptr),
-        outputC_descriptor_(nullptr),
-        keep_dims_(false),
-        all_match_(false),
-        is_null_input_(false),
-        input_size_(0),
-        output_size_(0),
-        workspace_size_(0) {}
+  ArrayReduceGpuKernel() { ResetResource(); }
   ~ArrayReduceGpuKernel() override { DestroyResource(); }
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -94,7 +80,7 @@ class ArrayReduceGpuKernel : public GpuKernel {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but reduce op needs 1 output.";
       return false;
     }
-    int input_dim_length = SizeToInt(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0).size());
+    int input_dim_length = SizeToInt(AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0).size());
     if (AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueTuple>() ||
         AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("axis")->isa<ValueList>()) {
@@ -117,8 +103,8 @@ class ArrayReduceGpuKernel : public GpuKernel {
     }
     keep_dims_ = GetAttr<bool>(kernel_node, "keep_dims");
-    auto inputA_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-    auto outputC_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto inputA_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
+    auto outputC_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(inputA_shape);
     if (is_null_input_) {
       MS_LOG(WARNING) << "ArrayReduceGpuKernel input is null";
@@ -132,6 +118,35 @@ class ArrayReduceGpuKernel : public GpuKernel {
     return true;
   }
+  void ResetResource() noexcept override {
+    cudnn_handle_ = nullptr;
+    reduce_tensor_op_ = CUDNN_REDUCE_TENSOR_ADD;
+    data_type_ = CUDNN_DATA_FLOAT;
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+    reduce_indices_ = CUDNN_REDUCE_TENSOR_NO_INDICES;
+    reduce_tensor_descriptor_ = nullptr;
+    inputA_descriptor_ = nullptr;
+    outputC_descriptor_ = nullptr;
+    keep_dims_ = false;
+    all_match_ = false;
+    is_null_input_ = false;
+    input_size_ = 0;
+    output_size_ = 0;
+    workspace_size_ = 0;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
+                               "cudnnDestroyReduceTensorDescriptor failed.");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
+                               "cudnnDestroyTensorDescriptor failed.");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
+                               "cudnnDestroyTensorDescriptor failed.");
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -160,14 +175,6 @@ class ArrayReduceGpuKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
-                               "cudnnDestroyReduceTensorDescriptor failed.");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
-                               "cudnnDestroyTensorDescriptor failed.");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(outputC_descriptor_),
-                               "cudnnDestroyTensorDescriptor failed.");
-  }
   void InferArrayReduceType(const CNodePtr &kernel_node) {
     std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
     auto iter = kReduceTypeMap.find(kernel_name);
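Note (reviewer summary, not part of the patch): every kernel touched here moves its default values out of the constructor initializer list and into ResetResource(), so a resized node can be re-initialized to exactly the freshly constructed state before Init() runs again. A minimal, self-contained sketch of the idiom follows; FooKernel is a hypothetical stand-in, not a MindSpore class.

    // Sketch of the "constructor delegates to ResetResource()" idiom (assumption-based example).
    #include <cstddef>
    #include <vector>

    class FooKernel {
     public:
      FooKernel() { ResetResource(); }     // construction == reset to defaults
      void Init(size_t elem_count) {       // analogous to GpuKernel::Init(kernel_node)
        input_size_ = elem_count * sizeof(float);
        input_size_list_.push_back(input_size_);
      }
      void ResetResource() noexcept {      // called again before every re-Init
        input_size_ = 0;
        input_size_list_.clear();          // size lists are rebuilt, not appended to twice
      }

     private:
      size_t input_size_;
      std::vector<size_t> input_size_list_;
    };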
@@ -26,5 +26,14 @@ MS_REG_GPU_KERNEL_TWO(
   GatherV2,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
   GatherV2GpuFwdKernel, half, int)
+MS_REG_GPU_KERNEL_TWO(
+  SparseGatherV2,
+  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
+  GatherV2GpuFwdKernel, float, int)
+MS_REG_GPU_KERNEL_TWO(
+  SparseGatherV2,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
+  GatherV2GpuFwdKernel, half, int)
 } // namespace kernel
 } // namespace mindspore
@@ -27,7 +27,7 @@ namespace kernel {
 template <typename T, typename S>
 class GatherV2GpuFwdKernel : public GpuKernel {
  public:
-  GatherV2GpuFwdKernel() : axis_(0), handle_(nullptr) {}
+  GatherV2GpuFwdKernel() { ResetResource(); }
   ~GatherV2GpuFwdKernel() = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -52,9 +52,9 @@ class GatherV2GpuFwdKernel : public GpuKernel {
     if (input_num != 2) {
       MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but GatherGpuV2FwdKernel needs 2.";
     }
-    input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-    indices_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
-    output_shapes_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    input_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
+    indices_shapes_ = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
+    output_shapes_ = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
     axis_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
     if (axis_ < 0) {
@@ -65,9 +65,18 @@ class GatherV2GpuFwdKernel : public GpuKernel {
     InitSizeLists();
     return true;
   }
+  void ResetResource() noexcept override {
+    input_shapes_.clear();
+    indices_shapes_.clear();
+    output_shapes_.clear();
+    std::fill(dims_, dims_ + 3, 0);
+    axis_ = 0;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
  protected:
-  void InitResource() override { handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); }
   void InitSizeLists() override {
     size_t size = GetSize(input_shapes_);
     input_size_list_.push_back(size);
@@ -118,7 +127,6 @@ class GatherV2GpuFwdKernel : public GpuKernel {
   size_t dims_[3] = {};
   int axis_;
-  cudnnHandle_t handle_;
   std::vector<size_t> input_size_list_;
   std::vector<size_t> output_size_list_;
@@ -28,14 +28,7 @@ namespace kernel {
 template <typename T>
 class SplitGpuFwdKernel : public GpuKernel {
  public:
-  SplitGpuFwdKernel()
-      : axis_(0),
-        output_num_(1),
-        input_size_(1),
-        axis_step_(1),
-        all_size_before_axis_(1),
-        all_size_axis_(1),
-        outputs_host_(nullptr) {}
+  SplitGpuFwdKernel() { ResetResource(); }
   ~SplitGpuFwdKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -59,7 +52,7 @@ class SplitGpuFwdKernel : public GpuKernel {
   bool Init(const CNodePtr &kernel_node) override {
     axis_ = static_cast<int64_t>(GetAttr<int64_t>(kernel_node, "axis"));
     if (axis_ < 0) {
-      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+      auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
       axis_ += SizeToInt(input_shape.size());
     }
     output_num_ = static_cast<int64_t>(GetAttr<int64_t>(kernel_node, "output_num"));
@@ -68,7 +61,7 @@ class SplitGpuFwdKernel : public GpuKernel {
       return false;
     }
-    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     input_size_ = 1;
     all_size_before_axis_ = 1;
     all_size_axis_ = 1;
@@ -88,7 +81,7 @@ class SplitGpuFwdKernel : public GpuKernel {
     for (int i = 0; i < output_num_; i++) {
       size_t output_size = 1;
-      auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i);
+      auto output_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i);
       for (size_t j = 0; j < output_shape.size(); j++) {
         output_size *= output_shape[j];
       }
@@ -100,6 +93,19 @@ class SplitGpuFwdKernel : public GpuKernel {
     return true;
   }
+  void ResetResource() noexcept override {
+    axis_ = 0;
+    output_num_ = 1;
+    input_size_ = 1;
+    axis_step_ = 1;
+    all_size_before_axis_ = 1;
+    all_size_axis_ = 1;
+    outputs_host_ = nullptr;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
  protected:
   void InitSizeLists() override {}
@@ -62,7 +62,7 @@ class TransposeGpuFwdKernel : public GpuKernel {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output.";
       return false;
     }
-    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
+    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     shape_size_ = input_shape.size();
     if (shape_size_ > TRANSPOSE_MAX_DIMENSION) {
       MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION
@@ -27,8 +27,7 @@ namespace kernel {
 template <typename T, typename S>
 class UnsortedSegmentSumGpuKernel : public GpuKernel {
  public:
-  UnsortedSegmentSumGpuKernel()
-      : input_dim0_(1), input_dim1_(1), output_dim0_(1), output_dim1_(1), is_null_input_(false) {}
+  UnsortedSegmentSumGpuKernel() { ResetResource(); }
   ~UnsortedSegmentSumGpuKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -53,15 +52,15 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
   }
   bool Init(const CNodePtr &kernel_node) override {
-    auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shapes);
     if (is_null_input_) {
       MS_LOG(WARNING) << "UnsortedSegmentSum input is null";
       InitSizeLists();
       return true;
     }
-    auto ids_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
-    auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
+    auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
     auto axis = ids_shapes.size();
     for (size_t i = 0; i < input_shapes.size(); i++) {
@@ -81,6 +80,17 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
     return true;
   }
+  void ResetResource() noexcept override {
+    input_dim0_ = 1;
+    input_dim1_ = 1;
+    output_dim0_ = 1;
+    output_dim1_ = 1;
+    is_null_input_ = false;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
  protected:
   void InitSizeLists() override {
     input_size_list_.push_back(input_dim0_ * input_dim1_ * sizeof(T));
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+namespace mindspore {
+namespace kernel {
+void GpuDynamicKernel::UpdateArgs() {
+  if (!is_input_dynamic_shape_ && is_output_dynamic_shape_ && !have_depends()) {
+    return;
+  }
+  MS_LOG(INFO) << "Update Args: " << cnode_ptr_->fullname_with_scope();
+  auto kernel_mod = AnfAlgo::GetKernelMod(cnode_ptr_);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  auto gpu_kernel_mod = dynamic_cast<GpuKernel *>(kernel_mod);
+  MS_EXCEPTION_IF_NULL(gpu_kernel_mod);
+  gpu_kernel_mod->DestroyResource();
+  gpu_kernel_mod->ResetResource();
+  gpu_kernel_mod->Init(cnode_ptr_);
+}
+} // namespace kernel
+} // namespace mindspore
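Note (not part of the patch): UpdateArgs() is the resize entry point added by this new file; on a shape change it tears the kernel down and rebuilds it through the three calls at the bottom. A sketch of how the executor side is assumed to drive it is below; only InitDynamicKernel(), DynamicKernel(), UpdateArgs() and AnfAlgo::GetKernelMod() from this patch are used, the surrounding wiring is illustrative.

    // Hypothetical call site in the GPU session/executor, after a kernel_mod is selected for `cnode`:
    auto kernel_mod = AnfAlgo::GetKernelMod(cnode);
    auto gpu_kernel = dynamic_cast<GpuKernel *>(kernel_mod);
    if (gpu_kernel != nullptr) {
      gpu_kernel->InitDynamicKernel(cnode);        // wrap the node in a GpuDynamicKernel
      auto dynamic_kernel = gpu_kernel->DynamicKernel();
      dynamic_kernel->UpdateArgs();                // DestroyResource -> ResetResource -> Init
    }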
@@ -23,11 +23,13 @@
 #include <vector>
 #include <utility>
 #include <map>
+#include <memory>
 #include "backend/kernel_compiler/kernel.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
 #include "runtime/device/gpu/gpu_device_manager.h"
 #include "runtime/device/gpu/gpu_common.h"
 #include "backend/session/anf_runtime_algorithm.h"
+#include "runtime/device/executor/dynamic_kernel.h"
 using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
 namespace mindspore {
@@ -45,10 +47,28 @@ static std::map<int, int> kNHWCToNCHWAxisMap = {
   {3, 1},
 };
+class GpuDynamicKernel : public device::DynamicKernel {
+ public:
+  explicit GpuDynamicKernel(const CNodePtr &cnode_ptr) : DynamicKernel(nullptr, cnode_ptr) {}
+  ~GpuDynamicKernel() = default;
+  void UpdateArgs() override;
+  void PostExecute() final { MS_LOG(EXCEPTION) << "`PostExecute()` should not invoked with gpu backend"; };
+  void Execute() final { MS_LOG(EXCEPTION) << "`Execute()` should not invoked with gpu backend"; }
+};
 class GpuKernel : public KernelMod {
  public:
   virtual ~GpuKernel() = default;
   virtual bool Init(const CNodePtr &kernel_node) = 0;
+  virtual void ResetResource() noexcept {
+    MS_LOG(EXCEPTION) << "kernel must override the `ResetResource()` method when dynamic shape";
+  }
+  virtual void DestroyResource() noexcept {}
+  virtual void PostExecute() {}
+  void InitDynamicKernel(const CNodePtr &cnode_ptr) { dynamic_kernel_ = std::make_shared<GpuDynamicKernel>(cnode_ptr); }
+  device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }
  protected:
   virtual void InitResource() {}
@@ -228,7 +248,10 @@ class GpuKernel : public KernelMod {
     }
     return type->second;
   }
+  device::DynamicKernelPtr dynamic_kernel_;
 };
 } // namespace kernel
 } // namespace mindspore
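Note (not part of the patch): the base-class additions give every kernel the same resize contract: DestroyResource() becomes a public, noexcept virtual so the resize path can free cuDNN descriptors created by the previous Init() before a new Init() recreates them, while the default ResetResource() throws, so only migrated kernels can be resized. A self-contained sketch of why the destroy-before-reinit ordering matters follows; BarKernel is hypothetical and only standard cuDNN calls are used.

    // Assumption-based sketch of the destroy -> reset -> init resize sequence for a
    // kernel that owns one cuDNN tensor descriptor.
    #include <cudnn.h>

    class BarKernel {
     public:
      BarKernel() { ResetResource(); }
      ~BarKernel() { DestroyResource(); }
      void Init() { cudnnCreateTensorDescriptor(&x_desc_); }   // rebuilds per-shape state
      void DestroyResource() noexcept {
        if (x_desc_ != nullptr) {
          cudnnDestroyTensorDescriptor(x_desc_);               // free the old descriptor first
        }
      }
      void ResetResource() noexcept { x_desc_ = nullptr; }     // forget stale state

     private:
      cudnnTensorDescriptor_t x_desc_;
    };

    // Resize sequence mirrored from GpuDynamicKernel::UpdateArgs():
    //   bar.DestroyResource();  bar.ResetResource();  bar.Init();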
@@ -123,6 +123,10 @@ class AddNGpuFwdKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -141,9 +145,6 @@ class AddNGpuFwdKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed");
-  }
   cudnnHandle_t cudnn_handle_;
   cudnnTensorDescriptor_t input_descriptor_;
   cudnnDataType_t cudnn_data_type_;
@@ -112,6 +112,12 @@ class BiasAddGpuKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed");
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -129,12 +135,6 @@ class BiasAddGpuKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(op_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(b_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyOpTensorDescriptor failed");
-  }
   cudnnHandle_t cudnn_handle_;
   cudnnDataType_t cudnn_data_type_;
   cudnnTensorDescriptor_t x_desc_;
@@ -31,13 +31,7 @@ constexpr int MAX_DIMS = 7;
 template <typename T>
 class BroadcastOpGpuKernel : public GpuKernel {
  public:
-  BroadcastOpGpuKernel()
-      : op_type_(BROADCAST_TYPE_INVALID),
-        need_broadcast_(false),
-        is_comp_op_(false),
-        input1_num_(1),
-        input2_num_(1),
-        output_num_(1) {}
+  BroadcastOpGpuKernel() { ResetResource(); }
   ~BroadcastOpGpuKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -71,9 +65,9 @@ class BroadcastOpGpuKernel : public GpuKernel {
   }
   bool Init(const CNodePtr &kernel_node) override {
     GetOpType(kernel_node);
-    auto shape1 = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
-    auto shape2 = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
-    auto shape3 = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
+    auto shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
+    auto shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
+    auto shape3 = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
     need_broadcast_ = IsBroadcast(shape1, shape2);
     if (need_broadcast_ && shape1.size() > 7) {
       MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
@@ -106,6 +100,20 @@ class BroadcastOpGpuKernel : public GpuKernel {
     InitSizeLists();
     return true;
   }
+  void ResetResource() noexcept override {
+    op_type_ = BROADCAST_TYPE_INVALID;
+    need_broadcast_ = false;
+    is_comp_op_ = false;
+    input1_num_ = 1;
+    input2_num_ = 1;
+    output_num_ = 1;
+    lhs_shape_.clear();
+    rhs_shape_.clear();
+    output_shape_.clear();
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
  protected:
   void InitResource() override { return; }
@@ -30,14 +30,7 @@ namespace kernel {
 template <typename T>
 class BroadcastOpGradGpuKernel : public GpuKernel {
  public:
-  BroadcastOpGradGpuKernel()
-      : op_type_(BROADCAST_GRAD_TYPE_INVALID),
-        need_broadcast_(false),
-        input1_num_(1),
-        input2_num_(1),
-        output_num_(1),
-        grad_x_(false),
-        grad_y_(false) {}
+  BroadcastOpGradGpuKernel() { ResetResource(); }
   ~BroadcastOpGradGpuKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -105,6 +98,22 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
     return true;
   }
+  void ResetResource() noexcept override {
+    op_type_ = BROADCAST_GRAD_TYPE_INVALID;
+    need_broadcast_ = false;
+    input1_num_ = 1;
+    input2_num_ = 1;
+    output_num_ = 1;
+    std::fill(x1_shape_, x1_shape_ + 4, 1);
+    std::fill(x2_shape_, x2_shape_ + 4, 1);
+    std::fill(dy_shape_, dy_shape_ + 4, 1);
+    grad_x_ = false;
+    grad_y_ = false;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
  protected:
   void InitResource() override { return; }
   void InitSizeLists() override {
@@ -69,21 +69,15 @@ static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY
 template <typename T>
 class UnaryOpGpuKernel : public GpuKernel {
  public:
-  UnaryOpGpuKernel()
-      : unary_op_type_(UNARY_OP_INVALID_TYPE),
-        input_size_(sizeof(T)),
-        output_size_(sizeof(T)),
-        workspace_size_(0),
-        is_null_input_(false) {}
+  UnaryOpGpuKernel() { ResetResource(); }
   ~UnaryOpGpuKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
   const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    VARIABLE_NOT_USED(workspace);
     T *input_addr = GetDeviceAddress<T>(inputs, 0);
     T *output_addr = GetDeviceAddress<T>(outputs, 0);
@@ -184,7 +178,7 @@ class UnaryOpGpuKernel : public GpuKernel {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output.";
       return false;
     }
-    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shape);
     if (is_null_input_) {
       MS_LOG(WARNING) << "UnaryOpGpuKernel input is null";
@@ -198,6 +192,16 @@ class UnaryOpGpuKernel : public GpuKernel {
     InitSizeLists();
     return true;
   }
+  void ResetResource() noexcept override {
+    unary_op_type_ = UNARY_OP_INVALID_TYPE;
+    input_size_ = sizeof(T);
+    output_size_ = sizeof(T);
+    workspace_size_ = 0;
+    is_null_input_ = false;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
  protected:
   void InitSizeLists() override {
@@ -29,16 +29,7 @@ namespace kernel {
 template <typename T>
 class ActivationGpuFwdKernel : public GpuKernel {
  public:
-  ActivationGpuFwdKernel()
-      : cudnn_handle_(nullptr),
-        activation_desc_(nullptr),
-        mode_(CUDNN_ACTIVATION_RELU),
-        data_descriptor_(nullptr),
-        is_null_input_(false),
-        cudnn_data_type_(CUDNN_DATA_FLOAT),
-        input_size_(0),
-        output_size_(0),
-        workspace_size_(0) {}
+  ActivationGpuFwdKernel() { ResetResource(); }
   ~ActivationGpuFwdKernel() override { DestroyResource(); }
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -75,7 +66,7 @@ class ActivationGpuFwdKernel : public GpuKernel {
       MS_LOG(ERROR) << "Argument number is " << input_num << ", but ActivationGpuFwdKernel needs 1.";
       return false;
     }
-    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
+    auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shape);
     if (is_null_input_) {
       MS_LOG(WARNING) << "ActivationGpuFwdKernel input is null.";
@@ -113,6 +104,27 @@ class ActivationGpuFwdKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
+                               "cudnnDestroyActivationDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
+  }
+  void ResetResource() noexcept override {
+    cudnn_handle_ = nullptr;
+    activation_desc_ = nullptr;
+    mode_ = CUDNN_ACTIVATION_RELU;
+    data_descriptor_ = nullptr;
+    is_null_input_ = false;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+    cudnn_data_type_ = CUDNN_DATA_FLOAT;
+    input_size_ = 0;
+    output_size_ = 0;
+    workspace_size_ = 0;
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -132,12 +144,6 @@ class ActivationGpuFwdKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
-                               "cudnnDestroyActivationDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
-  }
   std::map<std::string, cudnnActivationMode_t> kernel_map = {{"ReLU", CUDNN_ACTIVATION_RELU},
                                                              {"ReLU6", CUDNN_ACTIVATION_CLIPPED_RELU},
                                                              {"Tanh", CUDNN_ACTIVATION_TANH},
@@ -29,14 +29,7 @@ namespace kernel {
 template <typename T>
 class ActivationGradGpuKernel : public GpuKernel {
  public:
-  ActivationGradGpuKernel()
-      : cudnn_handle_(nullptr),
-        activation_desc_(nullptr),
-        mode_(CUDNN_ACTIVATION_RELU),
-        data_descriptor_(nullptr),
-        is_null_input_(false),
-        cudnn_data_type_(CUDNN_DATA_FLOAT),
-        input_size_(0) {}
+  ActivationGradGpuKernel() { ResetResource(); }
   ~ActivationGradGpuKernel() override { DestroyResource(); }
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -117,6 +110,25 @@ class ActivationGradGpuKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
+                               "cudnnDestroyActivationDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
+  }
+  void ResetResource() noexcept override {
+    cudnn_handle_ = nullptr;
+    activation_desc_ = nullptr;
+    mode_ = CUDNN_ACTIVATION_RELU;
+    data_descriptor_ = nullptr;
+    is_null_input_ = false;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+    cudnn_data_type_ = CUDNN_DATA_FLOAT;
+    input_size_ = 0;
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -135,12 +147,6 @@ class ActivationGradGpuKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
-                               "cudnnDestroyActivationDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(data_descriptor_), "cudnnDestroyTensorDescriptor failed");
-  }
   std::map<std::string, cudnnActivationMode_t> kernel_map = {{"ReluGrad", CUDNN_ACTIVATION_RELU},
                                                              {"ReLU6Grad", CUDNN_ACTIVATION_CLIPPED_RELU},
                                                              {"TanhGrad", CUDNN_ACTIVATION_TANH},
@@ -121,6 +121,13 @@ class BatchNormGradGpuKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
+  }
  protected:
   void InitResource() override {
     handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -152,13 +159,6 @@ class BatchNormGradGpuKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
-  }
   int batch_;
   int channel_;
   int height_;
@@ -111,6 +111,13 @@ class BiasAddGradGpuKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_),
+                                "cudnnDestroyReduceTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed");
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -137,13 +144,6 @@ class BiasAddGradGpuKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnDestroyReduceTensorDescriptor(op_desc_),
-                                "cudnnDestroyReduceTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(db_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyOpTensorDescriptor failed");
-  }
   bool same_dims_;
   cudnnHandle_t cudnn_handle_;
   cudnnDataType_t cudnn_data_type_;
@@ -198,6 +198,15 @@ class Conv2dGpuFwdKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
+                               "cudnnDestroyConvolutionDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed");
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -243,14 +252,6 @@ class Conv2dGpuFwdKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
-                               "cudnnDestroyConvolutionDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed");
-  }
   bool CheckParam(const CNodePtr &kernel_node) {
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 2) {
@@ -199,6 +199,15 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
+                               "cudnnDestroyConvolutionDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed");
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -243,14 +252,6 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
-                               "cudnnDestroyConvolutionDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "cudnnDestroyFilterDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "cudnnDestroyTensorDescriptor failed");
-  }
   bool CheckParam(const CNodePtr &kernel_node) {
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 2) {
@@ -203,6 +203,15 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
+                               "cudnnDestroyConvolutionDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed");
+  }
  protected:
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -244,14 +253,6 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_),
-                               "cudnnDestroyConvolutionDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "cudnnDestroyFilterDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "cudnnDestroyTensorDescriptor failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "cudnnDestroyTensorDescriptor failed");
-  }
   bool CheckParam(const CNodePtr &kernel_node) {
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 2) {
@@ -27,7 +27,7 @@ namespace kernel {
 template <typename T>
 class FlattenGpuFwdKernel : public GpuKernel {
  public:
-  FlattenGpuFwdKernel() : input_size_(0), output_size_(0), workspace_size_(0) {}
+  FlattenGpuFwdKernel() : input_size_(0) {}
   ~FlattenGpuFwdKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -47,7 +47,7 @@ class FlattenGpuFwdKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
-    auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     input_size_ = sizeof(T);
     for (size_t i = 0; i < shape.size(); ++i) {
       input_size_ *= shape[i];
@@ -55,12 +55,17 @@ class FlattenGpuFwdKernel : public GpuKernel {
     InitSizeLists();
     return true;
   }
+  void ResetResource() noexcept override {
+    input_size_ = 0;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
  protected:
   void InitSizeLists() override {
     input_size_list_.push_back(input_size_);
-    output_size_ = input_size_;
-    output_size_list_.push_back(output_size_);
+    output_size_list_.push_back(input_size_);
   }
  private:
@@ -69,8 +74,6 @@ class FlattenGpuFwdKernel : public GpuKernel {
   std::vector<size_t> workspace_size_list_;
   size_t input_size_;
-  size_t output_size_;
-  size_t workspace_size_;
 };
 } // namespace kernel
 } // namespace mindspore
@@ -27,7 +27,7 @@ namespace kernel {
 template <typename T>
 class FlattenGardGpuBkwKernel : public GpuKernel {
  public:
-  FlattenGardGpuBkwKernel() : input_size_(0), output_size_(0), workspace_size_(0) {}
+  FlattenGardGpuBkwKernel() { ResetResource(); }
   ~FlattenGardGpuBkwKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -54,7 +54,7 @@ class FlattenGardGpuBkwKernel : public GpuKernel {
       return false;
     }
-    auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
     for (size_t i = 0; i < shape.size(); ++i) {
       if (input_size_ == 0) {
         input_size_ = 1;
@@ -67,11 +67,17 @@ class FlattenGardGpuBkwKernel : public GpuKernel {
     return true;
   }
+  void ResetResource() noexcept override {
+    input_size_ = 0;
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+  }
  protected:
   void InitSizeLists() override {
     input_size_list_.push_back(input_size_);
-    output_size_ = input_size_;
-    output_size_list_.push_back(output_size_);
+    output_size_list_.push_back(input_size_);
   }
  private:
@@ -80,8 +86,6 @@ class FlattenGardGpuBkwKernel : public GpuKernel {
   std::vector<size_t> workspace_size_list_;
   size_t input_size_;
-  size_t output_size_;
-  size_t workspace_size_;
 };
 } // namespace kernel
 } // namespace mindspore
@@ -140,6 +140,20 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
+    if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
+      CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed");
+    }
+    if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
+      CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
+                                 "Destroy activation descriptor failed");
+    }
+  }
  protected:
   void InitResource() override {
     handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -238,20 +252,6 @@ class FusedBatchNormExGpuKernel : public GpuKernel {
     }
   }
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
-    if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) {
-      CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(z_desc_), "Destroy z desc failed");
-    }
-    if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) {
-      CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_),
-                                 "Destroy activation descriptor failed");
-    }
-  }
   size_t input_x_size_;
   size_t input_z_size_;
   size_t para_size_;
@@ -133,6 +133,12 @@ class FusedBatchNormGpuKernel : public GpuKernel {
     return true;
   }
+  void DestroyResource() noexcept override {
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
+    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
+  }
  protected:
   void InitResource() override {
     handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -165,12 +171,6 @@ class FusedBatchNormGpuKernel : public GpuKernel {
   }
  private:
-  void DestroyResource() noexcept {
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
-    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed");
-  }
   int batch_;
   int channel_;
   int height_;
| @@ -201,6 +201,21 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { | |||||
| workspace_size_list_.push_back(workspace_size_); | workspace_size_list_.push_back(workspace_size_); | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); | |||||
| if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), | |||||
| "Destroy activation descriptor failed"); | |||||
| } | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); | |||||
| if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy dz desc failed"); | |||||
| } | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed"); | |||||
| } | |||||
| private: | private: | ||||
| void SetTensorDescriptor(const std::string &format, const std::vector<size_t> &shape) { | void SetTensorDescriptor(const std::string &format, const std::vector<size_t> &shape) { | ||||
| @@ -255,22 +270,6 @@ class FusedBatchNormGradExGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| } | } | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); | |||||
| if (bn_ops_ != CUDNN_BATCHNORM_OPS_BN) { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyActivationDescriptor(activation_desc_), | |||||
| "Destroy activation descriptor failed"); | |||||
| } | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); | |||||
| if (bn_ops_ == CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dz_desc_), "Destroy z desc failed"); | |||||
| } | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_diff_desc_), "Destroy para desc failed"); | |||||
| } | |||||
| size_t x_size_; | size_t x_size_; | ||||
| size_t para_size_; | size_t para_size_; | ||||
| size_t workspace_size_; | size_t workspace_size_; | ||||
| @@ -117,6 +117,13 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | ||||
| @@ -146,13 +153,6 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_desc_), "Destroy para desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); | |||||
| } | |||||
| int batch_; | int batch_; | ||||
| int channel_; | int channel_; | ||||
| int height_; | int height_; | ||||
| @@ -123,6 +123,15 @@ class Im2ColGpuFwdKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), | |||||
| "cudnnDestroyConvolutionDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyFilterDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | ||||
| @@ -152,14 +161,6 @@ class Im2ColGpuFwdKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyConvolutionDescriptor(conv_desc_), | |||||
| "cudnnDestroyConvolutionDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(filter_desc_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_desc_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_desc_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_desc_), "cudnnDestroyTensorDescriptor failed"); | |||||
| } | |||||
| bool CheckParam(const CNodePtr &kernel_node) { | bool CheckParam(const CNodePtr &kernel_node) { | ||||
| cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); | cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))); | ||||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | ||||
| @@ -157,6 +157,21 @@ class LstmGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed"); | |||||
| for (size_t i = 0; i < IntToSize(seq_len_); ++i) { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); | |||||
| } | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | ||||
| @@ -195,21 +210,6 @@ class LstmGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cy_desc_), "destroy cy_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hy_desc_), "destroy hy_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc failed"); | |||||
| for (size_t i = 0; i < IntToSize(seq_len_); ++i) { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_[i]), "destroy y_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); | |||||
| } | |||||
| } | |||||
| int batch_size_; | int batch_size_; | ||||
| int seq_len_; | int seq_len_; | ||||
| int input_size_; | int input_size_; | ||||
| @@ -150,6 +150,18 @@ class LstmGradDataGpuKernel : public GpuKernel { | |||||
| InitSizeLists(); | InitSizeLists(); | ||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed"); | |||||
| DestroyTensorDescGrp(); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| @@ -195,18 +207,6 @@ class LstmGradDataGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcx_desc_), "destroy dcx_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhx_desc_), "destroy dhx_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(w_desc_), "destroy w_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(cx_desc_), "destroy cx_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dcy_desc_), "destroy dcy_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dhy_desc_), "destroy dhy_desc_ failed"); | |||||
| DestroyTensorDescGrp(); | |||||
| } | |||||
| void CreateTensorDescGrp() { | void CreateTensorDescGrp() { | ||||
| int x_dims[3]{batch_size_, input_size_, 1}; | int x_dims[3]{batch_size_, input_size_, 1}; | ||||
| int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1}; | int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1}; | ||||
| @@ -162,6 +162,13 @@ class LstmGradWeightGpuKernel : public GpuKernel { | |||||
| "get workspace size failed"); | "get workspace size failed"); | ||||
| workspace_size_list_.push_back(workspace_size); | workspace_size_list_.push_back(workspace_size); | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); | |||||
| DestroyTensorDescGrp(); | |||||
| } | |||||
| private: | private: | ||||
| void CreateTensorDescGrp() { | void CreateTensorDescGrp() { | ||||
| @@ -187,13 +194,6 @@ class LstmGradWeightGpuKernel : public GpuKernel { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); | CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_[i]), "destroy x_desc failed"); | ||||
| } | } | ||||
| } | } | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyRNNDescriptor(rnn_desc_), "destroy rnn_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyDropoutDescriptor(dropout_desc_), "destroy dropout_desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyFilterDescriptor(dw_desc_), "destroy dw_desc_ failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(hx_desc_), "destroy hx_desc_ failed"); | |||||
| DestroyTensorDescGrp(); | |||||
| } | |||||
| int batch_size_; | int batch_size_; | ||||
| int seq_len_; | int seq_len_; | ||||
| @@ -113,6 +113,13 @@ class PoolingGpuFwdKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), | |||||
| "cudnnDestroyPoolingDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() { | void InitResource() { | ||||
| cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | ||||
| @@ -196,12 +203,6 @@ class PoolingGpuFwdKernel : public GpuKernel { | |||||
| 2, windowDimA, paddingA, strideA), | 2, windowDimA, paddingA, strideA), | ||||
| "cudnnSetPoolingNdDescriptor failed"); | "cudnnSetPoolingNdDescriptor failed"); | ||||
| } | } | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), | |||||
| "cudnnDestroyPoolingDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| } | |||||
| cudnnHandle_t cudnn_handle_; | cudnnHandle_t cudnn_handle_; | ||||
| cudnnTensorDescriptor_t input_descriptor_; | cudnnTensorDescriptor_t input_descriptor_; | ||||
| @@ -129,6 +129,14 @@ class PoolingGradGpuKernel : public GpuKernel { | |||||
| InitSizeLists(); | InitSizeLists(); | ||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), | |||||
| "cudnnDestroyPoolingDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| @@ -230,14 +238,6 @@ class PoolingGradGpuKernel : public GpuKernel { | |||||
| pad_value_ = kSignedMinFloat; | pad_value_ = kSignedMinFloat; | ||||
| } | } | ||||
| } | } | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_), | |||||
| "cudnnDestroyPoolingDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_descriptor_), "cudnnDestroyTensorDescriptor failed"); | |||||
| } | |||||
| cudnnHandle_t cudnn_handle_; | cudnnHandle_t cudnn_handle_; | ||||
| cudnnPoolingDescriptor_t pooling_descriptor_; | cudnnPoolingDescriptor_t pooling_descriptor_; | ||||
| @@ -101,6 +101,13 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), | |||||
| "cudnnDestroyTensorDescriptor failed."); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), | |||||
| "cudnnDestroyTensorDescriptor failed."); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | ||||
| @@ -118,12 +125,6 @@ class SoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), | |||||
| "cudnnDestroyTensorDescriptor failed."); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), | |||||
| "cudnnDestroyTensorDescriptor failed."); | |||||
| } | |||||
| void InferInputOutputSize(const CNodePtr &kernel_node) { | void InferInputOutputSize(const CNodePtr &kernel_node) { | ||||
| auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | ||||
| is_null_input_ = CHECK_NULL_INPUT(logits_shape); | is_null_input_ = CHECK_NULL_INPUT(logits_shape); | ||||
| @@ -140,6 +140,11 @@ class SoftmaxGpuKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed"); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | ||||
| @@ -159,11 +164,6 @@ class SoftmaxGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(output_descriptor_), "destroy output_descriptor failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed"); | |||||
| } | |||||
| void InitSizeByAxis(const std::vector<size_t> &input_shape, const int &axis) { | void InitSizeByAxis(const std::vector<size_t> &input_shape, const int &axis) { | ||||
| if (input_shape.size() == 2) { | if (input_shape.size() == 2) { | ||||
| InitSizeByAxis2D(input_shape, axis); | InitSizeByAxis2D(input_shape, axis); | ||||
| @@ -142,6 +142,10 @@ class SoftmaxGradGpuKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed"); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | ||||
| @@ -161,10 +165,6 @@ class SoftmaxGradGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(y_desc_), "destroy output_descriptor failed"); | |||||
| } | |||||
| void InitSizeByAxis(const std::vector<size_t> input_shape, const int axis) { | void InitSizeByAxis(const std::vector<size_t> input_shape, const int axis) { | ||||
| axis_ = axis; | axis_ = axis; | ||||
| if (axis_ < 0) { | if (axis_ < 0) { | ||||
| @@ -103,6 +103,13 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), | |||||
| "cudnnDestroyTensorDescriptor failed."); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), | |||||
| "cudnnDestroyTensorDescriptor failed."); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitResource() override { | void InitResource() override { | ||||
| cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); | ||||
| @@ -120,12 +127,6 @@ class SparseSoftmaxCrossEntropyWithLogitsGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(softmax_output_descriptor_), | |||||
| "cudnnDestroyTensorDescriptor failed."); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(logits_descriptor_), | |||||
| "cudnnDestroyTensorDescriptor failed."); | |||||
| } | |||||
| void InferInputOutputSize(const CNodePtr &kernel_node) { | void InferInputOutputSize(const CNodePtr &kernel_node) { | ||||
| auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | ||||
| is_null_input_ = CHECK_NULL_INPUT(logits_shape); | is_null_input_ = CHECK_NULL_INPUT(logits_shape); | ||||
| @@ -113,8 +113,6 @@ class BatchNormFold2GpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept {} | |||||
| cudnnHandle_t cudnn_handle_; | cudnnHandle_t cudnn_handle_; | ||||
| bool is_null_input_; | bool is_null_input_; | ||||
| size_t batch_size_; | size_t batch_size_; | ||||
| @@ -152,6 +152,11 @@ class BatchNormFoldGpuKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void DestroyResource() noexcept override { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); | |||||
| } | |||||
| protected: | protected: | ||||
| void InitSizeLists() override { | void InitSizeLists() override { | ||||
| // x, mean, variance, current_step | // x, mean, variance, current_step | ||||
| @@ -177,11 +182,6 @@ class BatchNormFoldGpuKernel : public GpuKernel { | |||||
| } | } | ||||
| private: | private: | ||||
| void DestroyResource() noexcept { | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed"); | |||||
| CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_), "Destroy para desc failed"); | |||||
| } | |||||
| size_t input_size_; | size_t input_size_; | ||||
| size_t output_size_; | size_t output_size_; | ||||
| std::vector<size_t> input_size_list_; | std::vector<size_t> input_size_list_; | ||||
| @@ -81,8 +81,6 @@ class CorrectionMulGpuKernel : public GpuKernel { | |||||
| void InitResource() override {} | void InitResource() override {} | ||||
| private: | private: | ||||
| void DestroyResource() noexcept {} | |||||
| size_t batch_size_; | size_t batch_size_; | ||||
| size_t channel_; | size_t channel_; | ||||
| size_t height_; | size_t height_; | ||||
| @@ -89,8 +89,6 @@ class CorrectionMulGradGpuKernel : public GpuKernel { | |||||
| void InitResource() override {} | void InitResource() override {} | ||||
| private: | private: | ||||
| void DestroyResource() noexcept {} | |||||
| size_t batch_size_; | size_t batch_size_; | ||||
| size_t channel_; | size_t channel_; | ||||
| size_t height_; | size_t height_; | ||||
| @@ -237,6 +237,10 @@ void MemSwapManager::SaveUserKernelTopoOrder() { | |||||
| continue; | continue; | ||||
| } | } | ||||
| if (opt::IsNopNode(user_kernel)) { | |||||
| continue; | |||||
| } | |||||
| size_t user_kernel_topo_sort = SearchKernelExecutionInfo(user_kernel).topo_order_; | size_t user_kernel_topo_sort = SearchKernelExecutionInfo(user_kernel).topo_order_; | ||||
| auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(user_kernel, node_pair.second - 1); | auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(user_kernel, node_pair.second - 1); | ||||
| auto &output_idx = kernel_with_index.second; | auto &output_idx = kernel_with_index.second; | ||||
| @@ -50,6 +50,10 @@ bool IsShapeDynamic(const abstract::ShapePtr &shape) { | |||||
| return std::any_of(shape->shape().begin(), shape->shape().end(), [](int s) { return s < 0; }); | return std::any_of(shape->shape().begin(), shape->shape().end(), [](int s) { return s < 0; }); | ||||
| } | } | ||||
| bool IsShapeDynamic(const std::vector<size_t> &shape) { | |||||
| return std::any_of(shape.begin(), shape.end(), [](int s) { return s < 0; }); | |||||
| } | |||||
| std::vector<size_t> TransShapeToSizet(const abstract::ShapePtr &shape) { | std::vector<size_t> TransShapeToSizet(const abstract::ShapePtr &shape) { | ||||
| MS_EXCEPTION_IF_NULL(shape); | MS_EXCEPTION_IF_NULL(shape); | ||||
| std::vector<size_t> shape_size_t; | std::vector<size_t> shape_size_t; | ||||
| @@ -1389,5 +1393,29 @@ bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) { | |||||
| } | } | ||||
| return false; | return false; | ||||
| } | } | ||||
| std::vector<size_t> AnfRuntimeAlgorithm::GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) { | |||||
| auto device_shape = GetInputDeviceShape(anf_node, index); | |||||
| // Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse. | |||||
| if (IsShapeDynamic(device_shape)) { | |||||
| auto max_shape = GetInputMaxShape(anf_node, index); | |||||
| std::transform(max_shape.begin(), max_shape.end(), device_shape.begin(), IntToSize); | |||||
| auto format = GetInputFormat(anf_node, index); | |||||
| trans::TransShapeToDevice(device_shape, format); | |||||
| } | |||||
| return device_shape; | |||||
| } | |||||
| std::vector<size_t> AnfRuntimeAlgorithm::GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) { | |||||
| auto device_shape = GetOutputDeviceShape(anf_node, index); | |||||
| // Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse. | |||||
| if (IsShapeDynamic(device_shape)) { | |||||
| auto max_shape = GetOutputMaxShape(anf_node, index); | |||||
| std::transform(max_shape.begin(), max_shape.end(), device_shape.begin(), IntToSize); | |||||
| auto format = GetOutputFormat(anf_node, index); | |||||
| trans::TransShapeToDevice(device_shape, format); | |||||
| } | |||||
| return device_shape; | |||||
| } | |||||
| } // namespace session | } // namespace session | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
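The two helpers above fall back to the node's max shape when the device shape is still dynamic, so a kernel initialized at compile time can size its buffers once at the largest possible extent. A small self-contained sketch of the buffer-size computation a kernel's Init() would perform on the returned shape (the helper name here is illustrative):

    #include <cstddef>
    #include <vector>

    // Multiply the per-element size by every dimension of the (possibly max) device shape.
    size_t SizeOfDeviceShape(const std::vector<size_t> &device_shape, size_t type_size) {
      size_t bytes = type_size;
      for (size_t dim : device_shape) {
        bytes *= dim;
      }
      return bytes;
    }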
| @@ -230,6 +230,8 @@ class AnfRuntimeAlgorithm { | |||||
| static std::vector<int64_t> GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index); | static std::vector<int64_t> GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index); | ||||
| static std::vector<int64_t> GetOutputMinShape(const AnfNodePtr &anf_node, size_t index); | static std::vector<int64_t> GetOutputMinShape(const AnfNodePtr &anf_node, size_t index); | ||||
| static bool IsNodeDynamicShape(const AnfNodePtr &node); | static bool IsNodeDynamicShape(const AnfNodePtr &node); | ||||
| static std::vector<size_t> GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); | |||||
| static std::vector<size_t> GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); | |||||
| }; | }; | ||||
| } // namespace session | } // namespace session | ||||
| using AnfAlgo = session::AnfRuntimeAlgorithm; | using AnfAlgo = session::AnfRuntimeAlgorithm; | ||||
| @@ -306,7 +306,9 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr | |||||
| if (save_graphs) { | if (save_graphs) { | ||||
| DumpIRProto(graph, "before_removeNop_" + std::to_string(graph_id)); | DumpIRProto(graph, "before_removeNop_" + std::to_string(graph_id)); | ||||
| } | } | ||||
| // Update Graph Dynamic Shape Attr. | |||||
| UpdateGraphDynamicShapeAttr(NOT_NULL(graph)); | |||||
| graph->UpdateGraphDynamicAttr(); | |||||
| // Hide NopOp from execution graph | // Hide NopOp from execution graph | ||||
| opt::HideNopNode(graph.get()); | opt::HideNopNode(graph.get()); | ||||
| // Build kernel if node is cnode | // Build kernel if node is cnode | ||||
| @@ -317,13 +319,10 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr | |||||
| graph->set_execution_order(execution_order); | graph->set_execution_order(execution_order); | ||||
| // Get summary nodes. | // Get summary nodes. | ||||
| SetSummaryNodes(graph.get()); | SetSummaryNodes(graph.get()); | ||||
| // Remove NopOp from execution graph | |||||
| opt::RemoveNopNode(graph.get()); | |||||
| // Dump .pb graph after graph optimization | // Dump .pb graph after graph optimization | ||||
| if (save_graphs) { | if (save_graphs) { | ||||
| DumpIRProto(graph, "after_opt_" + std::to_string(graph_id)); | DumpIRProto(graph, "after_opt_" + std::to_string(graph_id)); | ||||
| } | } | ||||
| // Set graph manager. | // Set graph manager. | ||||
| MS_EXCEPTION_IF_NULL(context_); | MS_EXCEPTION_IF_NULL(context_); | ||||
| FuncGraphManagerPtr manager = MakeManager({graph}); | FuncGraphManagerPtr manager = MakeManager({graph}); | ||||
| @@ -1,5 +1,5 @@ | |||||
| file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" | file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" | ||||
| "kernel_info.cc" "executor/dynamic_kernel.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" | |||||
| "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc" "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" | |||||
| ) | ) | ||||
| if (ENABLE_GPU) | if (ENABLE_GPU) | ||||
| @@ -48,7 +48,7 @@ | |||||
| #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" | #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" | ||||
| #endif | #endif | ||||
| #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" | #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" | ||||
| #include "runtime/device/ascend/executor/executor_callback.h" | |||||
| #include "runtime/device/executor/executor_callback.h" | |||||
| #include "runtime/device/ascend/executor/hccl_dynamic_kernel.h" | #include "runtime/device/ascend/executor/hccl_dynamic_kernel.h" | ||||
| #include "profiler/device/ascend/ascend_profiling.h" | #include "profiler/device/ascend/ascend_profiling.h" | ||||
| #include "profiler/device/ascend/profiling_context.h" | #include "profiler/device/ascend/profiling_context.h" | ||||
| @@ -22,7 +22,7 @@ | |||||
| #include "runtime/kernel.h" | #include "runtime/kernel.h" | ||||
| #include "backend/session/anf_runtime_algorithm.h" | #include "backend/session/anf_runtime_algorithm.h" | ||||
| #include "backend/kernel_compiler/aicpu/aicpu_util.h" | #include "backend/kernel_compiler/aicpu/aicpu_util.h" | ||||
| #include "runtime/device/ascend/executor/executor_callback.h" | |||||
| #include "runtime/device/executor/executor_callback.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| @@ -14,12 +14,11 @@ | |||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #include "runtime/device/ascend/executor/executor_callback.h" | |||||
| #include "runtime/device/executor/executor_callback.h" | |||||
| #include "utils/log_adapter.h" | #include "utils/log_adapter.h" | ||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| namespace ascend { | |||||
| void ExecutorCallback::RegistCallback(const std::function<void()> &callback) { | void ExecutorCallback::RegistCallback(const std::function<void()> &callback) { | ||||
| std::lock_guard<std::mutex> guard(lock_); | std::lock_guard<std::mutex> guard(lock_); | ||||
| callback_queue_.push(callback); | callback_queue_.push(callback); | ||||
| @@ -36,6 +35,5 @@ void ExecutorCallback::Consume() { | |||||
| callback_func(); | callback_func(); | ||||
| } | } | ||||
| } | } | ||||
| } // namespace ascend | |||||
| } // namespace device | } // namespace device | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -14,8 +14,8 @@ | |||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ | |||||
| #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ | |||||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ | |||||
| #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ | |||||
| #include <queue> | #include <queue> | ||||
| #include <mutex> | #include <mutex> | ||||
| @@ -24,7 +24,6 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| namespace ascend { | |||||
| class ExecutorCallback { | class ExecutorCallback { | ||||
| public: | public: | ||||
| static ExecutorCallback &GetInstance() { | static ExecutorCallback &GetInstance() { | ||||
| @@ -43,7 +42,6 @@ class ExecutorCallback { | |||||
| std::queue<std::function<void()>> callback_queue_; | std::queue<std::function<void()>> callback_queue_; | ||||
| std::mutex lock_; | std::mutex lock_; | ||||
| }; | }; | ||||
| } // namespace ascend | |||||
| } // namespace device | } // namespace device | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_EXECUTOR_EXECUTOR_CALLBACK_H_ | |||||
| #endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_EXECUTOR_EXECUTOR_CALLBACK_H_ | |||||
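With the class lifted out of the ascend namespace, both the Ascend and GPU runtimes can share the same deferred-callback queue. An illustrative caller, using only the members visible above (GetInstance, RegistCallback, Consume):

    #include "runtime/device/executor/executor_callback.h"

    void Example() {
      auto &cb = mindspore::device::ExecutorCallback::GetInstance();
      cb.RegistCallback([]() { /* e.g. read back output shapes after a kernel finishes */ });
      cb.Consume();  // runs and clears every queued callback, in registration order
    }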
| @@ -67,6 +67,8 @@ void GpuBuild(const KernelGraphPtr &kernel_graph) { | |||||
| if (!gpu_kernel_ptr->Init(kernel)) { | if (!gpu_kernel_ptr->Init(kernel)) { | ||||
| MS_LOG(EXCEPTION) << "Initialize gpu kernel op[" << kernel->fullname_with_scope() << "] failed."; | MS_LOG(EXCEPTION) << "Initialize gpu kernel op[" << kernel->fullname_with_scope() << "] failed."; | ||||
| } | } | ||||
| gpu_kernel_ptr->InitDynamicKernel(kernel); | |||||
| gpu_kernel_ptr->DynamicKernel()->Initialize(); | |||||
| session::AnfRuntimeAlgorithm::SetKernelMod((kernel::KernelModPtr)gpu_kernel_ptr, kernel.get()); | session::AnfRuntimeAlgorithm::SetKernelMod((kernel::KernelModPtr)gpu_kernel_ptr, kernel.get()); | ||||
| } | } | ||||
| } | } | ||||
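These two calls assume new members on kernel::GpuKernel — an InitDynamicKernel factory and a DynamicKernel accessor. The gpu_kernel.h change is not included in this excerpt, so the sketch below only shows an assumed shape of that interface, with stub types standing in for CNodePtr and the GPU DynamicKernel subclass:

    #include <memory>

    // Stand-in types: in the real code these would be CNodePtr and device::DynamicKernelPtr.
    struct CNodeStub {};
    struct DynamicKernelStub {
      explicit DynamicKernelStub(std::shared_ptr<CNodeStub> cnode) : cnode_(std::move(cnode)) {}
      void Initialize() {}  // assumed hook called once at build time, as GpuBuild does above
      std::shared_ptr<CNodeStub> cnode_;
    };

    // Assumed additions to kernel::GpuKernel that GpuBuild relies on (illustrative only).
    class GpuKernelSketch {
     public:
      void InitDynamicKernel(std::shared_ptr<CNodeStub> cnode) {
        dynamic_kernel_ = std::make_shared<DynamicKernelStub>(std::move(cnode));
      }
      std::shared_ptr<DynamicKernelStub> DynamicKernel() const { return dynamic_kernel_; }

     private:
      std::shared_ptr<DynamicKernelStub> dynamic_kernel_;
    };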
| @@ -36,6 +36,8 @@ | |||||
| #include "profiler/device/gpu/gpu_profiling.h" | #include "profiler/device/gpu/gpu_profiling.h" | ||||
| #include "utils/shape_utils.h" | #include "utils/shape_utils.h" | ||||
| #include "debug/data_dump/dump_json_parser.h" | #include "debug/data_dump/dump_json_parser.h" | ||||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||||
| #include "runtime/device/executor/executor_callback.h" | |||||
| #ifdef ENABLE_DEBUGGER | #ifdef ENABLE_DEBUGGER | ||||
| #include "debug/debug_services.h" | #include "debug/debug_services.h" | ||||
| #endif | #endif | ||||
| @@ -588,6 +590,29 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo | |||||
| MS_LOG(INFO) << "[inplace optimizer] skip node: " << kernel->DebugString(); | MS_LOG(INFO) << "[inplace optimizer] skip node: " << kernel->DebugString(); | ||||
| continue; | continue; | ||||
| } | } | ||||
| // AKG kernels do not support dynamic shape for now. | |||||
| device::DynamicKernelPtr dynamic_kernel = nullptr; | |||||
| kernel::GpuKernel *gpu_kernel = nullptr; | |||||
| if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) != KernelType::AKG_KERNEL) { | |||||
| gpu_kernel = dynamic_cast<kernel::GpuKernel *>(kernel_mod); | |||||
| dynamic_kernel = gpu_kernel->DynamicKernel(); | |||||
| } | |||||
| if (dynamic_kernel && dynamic_kernel->have_depends()) { | |||||
| MS_LOG(INFO) << "Match Dynamic Kernel, Start SyncStream"; | |||||
| if (!SyncStream()) { | |||||
| MS_LOG(ERROR) << "SyncStream failed"; | |||||
| return false; | |||||
| } | |||||
| } | |||||
| if (dynamic_kernel && dynamic_kernel->is_dynamic_shape()) { | |||||
| ExecutorCallback::GetInstance().Consume(); | |||||
| dynamic_kernel->InferShape(); | |||||
| dynamic_kernel->UpdateArgs(); | |||||
| } | |||||
| AddressPtrList kernel_inputs; | AddressPtrList kernel_inputs; | ||||
| AddressPtrList kernel_workspaces; | AddressPtrList kernel_workspaces; | ||||
| AddressPtrList kernel_outputs; | AddressPtrList kernel_outputs; | ||||
| @@ -615,6 +640,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo | |||||
| } else { | } else { | ||||
| LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); | LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); | ||||
| } | } | ||||
| ExecutorCallback::GetInstance().RegistCallback([gpu_kernel] { | |||||
| if (gpu_kernel) gpu_kernel->PostExecute(); | |||||
| }); | |||||
| // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) | // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) | ||||
| LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, | LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, | ||||
| dump_enabled); | dump_enabled); | ||||
| @@ -633,6 +662,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo | |||||
| // collect weights and bias for dump mode | // collect weights and bias for dump mode | ||||
| debugger_->LoadParametersAndConst(); | debugger_->LoadParametersAndConst(); | ||||
| CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); | CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); | ||||
| ExecutorCallback::GetInstance().Consume(); | |||||
| } | } | ||||
| ClearSwapInfo(mock); | ClearSwapInfo(mock); | ||||
| return true; | return true; | ||||
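Condensing the control flow the hunks above add: PostExecute touches device results, so it cannot run at launch time; it is queued instead and only executed after a stream synchronization — either when the next dynamic-shape kernel calls Consume() before InferShape(), or after the final SyncStream at the end of the graph. A hedged restatement of the per-kernel sequence (only the calls shown in the diff are taken as given):

    // Per-kernel dynamic-shape sequence in LaunchKernelDynamic (illustrative ordering only):
    //  1. if dynamic_kernel->have_depends(): SyncStream(), so value-depend inputs are on host
    //  2. if dynamic_kernel->is_dynamic_shape():
    //       ExecutorCallback::GetInstance().Consume();  // earlier kernels publish their real output shapes
    //       dynamic_kernel->InferShape();               // re-infer this kernel's shapes on the host
    //       dynamic_kernel->UpdateArgs();               // rebuild descriptors/args for the new shapes
    //  3. launch the kernel asynchronously on the CUDA stream
    //  4. RegistCallback(PostExecute);                  // deferred until the next SyncStream + Consume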
| @@ -19,12 +19,19 @@ | |||||
| #include "runtime/device/ascend/executor/rts/profiling_rts_dynamic_kernel.h" | #include "runtime/device/ascend/executor/rts/profiling_rts_dynamic_kernel.h" | ||||
| #include "runtime/device/ascend/executor/ai_core_dynamic_kernel.h" | #include "runtime/device/ascend/executor/ai_core_dynamic_kernel.h" | ||||
| #include "profiler/device/ascend/rt_callback_manager.h" | #include "profiler/device/ascend/rt_callback_manager.h" | ||||
| #include "runtime/device/ascend/executor/executor_callback.h" | |||||
| #include "runtime/device/executor/executor_callback.h" | |||||
| #include "profiler/device/ascend/ascend_profiling.h" | #include "profiler/device/ascend/ascend_profiling.h" | ||||
| #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" | #include "runtime/device/ascend/executor/tiling/op_tiling_calculater.h" | ||||
| #include "backend/kernel_compiler/host/host_kernel_metadata.h" | #include "backend/kernel_compiler/host/host_kernel_metadata.h" | ||||
| #include "backend/kernel_compiler/host/host_kernel_build.h" | #include "backend/kernel_compiler/host/host_kernel_build.h" | ||||
| namespace mindspore { | |||||
| namespace device { | |||||
| void ExecutorCallback::RegistCallback(const std::function<void()> &callback) {} | |||||
| void ExecutorCallback::Consume() {} | |||||
| } // namespace device | |||||
| } // namespace mindspore | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| namespace ascend { | namespace ascend { | ||||
| @@ -45,13 +52,11 @@ void AiCoreDynamicKernel::PostExecute() {} | |||||
| bool HcclExecutorManager::Initialize() { return true; } | bool HcclExecutorManager::Initialize() { return true; } | ||||
| bool HcclExecutorManager::Finalize() { return true; } | bool HcclExecutorManager::Finalize() { return true; } | ||||
| void ExecutorCallback::RegistCallback(const std::function<void()> &callback) {} | |||||
| void ExecutorCallback::Consume() {} | |||||
| void OpTilingCalculater::Init() {} | void OpTilingCalculater::Init() {} | ||||
| void OpTilingCalculater::CalculateTiling(const NotNull<CNodePtr> &cnode, const NotNull<std::shared_ptr<nlohmann::json>> &compile_info_json, | |||||
| const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, | |||||
| NotNull<optiling::OpRunInfo *> op_run_info) {} | |||||
| void OpTilingCalculater::CalculateTiling(const NotNull<CNodePtr> &cnode, | |||||
| const NotNull<std::shared_ptr<nlohmann::json>> &compile_info_json, | |||||
| const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, | |||||
| NotNull<optiling::OpRunInfo *> op_run_info) {} | |||||
| } // namespace ascend | } // namespace ascend | ||||
| } // namespace device | } // namespace device | ||||
| } // namespace mindspore | } // namespace mindspore | ||||