From e3a7b7ab923828c14f5373fb03531789b6d0a27f Mon Sep 17 00:00:00 2001 From: wilfChen Date: Tue, 10 Nov 2020 18:02:58 +0800 Subject: [PATCH] gpu support dynamic shape --- .../kernel_compiler/gpu/gpu_kernel_factory.h | 28 ++++++-------- .../pass/convert_const_input_to_attr.cc | 2 +- .../runtime/device/executor/dynamic_kernel.cc | 37 +++++++++++-------- .../runtime/device/executor/dynamic_kernel.h | 6 ++- .../runtime/device/gpu/gpu_kernel_runtime.cc | 20 +++------- 5 files changed, 43 insertions(+), 50 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h index d131b10e4c..6bb64d5ba5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h @@ -78,44 +78,38 @@ class GpuKernelRegister { // variable has been created. #define uchar unsigned char -#define UNIQUE_KERNEL_NAME(kernel) KERNEL_NAME(kernel, __COUNTER__) +#define UNIQUE_KERNEL_NAME(kernel) KERNEL_NAME(g_##kernel##_gpu_kernel_reg, __COUNTER__) #define KERNEL_NAME(kernel, cnt) MERGE(kernel, cnt) #define MERGE(kernel, cnt) kernel##cnt -#define MS_REG_GPU_KERNEL(OPNAME, OPCLASS) \ - static_assert(std::is_base_of::value, " must be base of GpuKernel"); \ - static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_gpu_kernel_reg)(#OPNAME, KernelAttr(), \ - []() { return new OPCLASS(); }); +#define MS_REG_GPU_KERNEL(OPNAME, OPCLASS) \ + static_assert(std::is_base_of::value, " must be base of GpuKernel"); \ + static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, KernelAttr(), []() { return new OPCLASS(); }); // regular register of fixed accuracy kernels -#define MS_REG_GPU_KERNEL_REGULAR(OPNAME, ATTR, OPCLASS) \ - static_assert(std::is_base_of::value, " must be base of GpuKernel"); \ - static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_gpu_kernel_reg)(#OPNAME, ATTR, \ - []() { return new OPCLASS(); }); +#define MS_REG_GPU_KERNEL_REGULAR(OPNAME, ATTR, OPCLASS) \ + static_assert(std::is_base_of::value, " must be base of GpuKernel"); \ + static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS(); }); // register of mixed accuracy kernels which use template and maintain one typename, ignore input num #define MS_REG_GPU_KERNEL_SAME(OPNAME, ATTR, OPCLASS, T) \ static_assert(std::is_base_of>::value, " must be base of GpuKernel"); \ - static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_##T##_gpu_kernel_reg)( \ - #OPNAME, ATTR, []() { return new OPCLASS(); }); + static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS(); }); // register of mixed accuracy kernels which use template and maintain one typename #define MS_REG_GPU_KERNEL_ONE(OPNAME, ATTR, OPCLASS, T) \ static_assert(std::is_base_of>::value, " must be base of GpuKernel"); \ - static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_##T##_gpu_kernel_reg)( \ - #OPNAME, ATTR, []() { return new OPCLASS(); }); + static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS(); }); // register of mixed accuracy kernels which use template and maintain two typename #define MS_REG_GPU_KERNEL_TWO(OPNAME, ATTR, OPCLASS, T, S) \ static_assert(std::is_base_of>::value, " must be base of GpuKernel"); \ - static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_##T##_##S##_gpu_kernel_reg)( \ - #OPNAME, ATTR, []() { return new OPCLASS(); }); + static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS(); }); // register of mixed accuracy kernels which use template and maintain three typename #define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \ static_assert(std::is_base_of>::value, " must be base of GpuKernel"); \ - static const GpuKernelRegister UNIQUE_KERNEL_NAME(g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg)( \ - #OPNAME, ATTR, []() { return new OPCLASS(); }); + static const GpuKernelRegister UNIQUE_KERNEL_NAME(OPNAME)(#OPNAME, ATTR, []() { return new OPCLASS(); }); } // namespace kernel } // namespace mindspore #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_GPUKERNELFACTORY_H_ diff --git a/mindspore/ccsrc/backend/optimizer/pass/convert_const_input_to_attr.cc b/mindspore/ccsrc/backend/optimizer/pass/convert_const_input_to_attr.cc index 640d8d9801..bd284c1c96 100644 --- a/mindspore/ccsrc/backend/optimizer/pass/convert_const_input_to_attr.cc +++ b/mindspore/ccsrc/backend/optimizer/pass/convert_const_input_to_attr.cc @@ -62,7 +62,7 @@ const AnfNodePtr ConvertConstInputToAttr::Process(const FuncGraphPtr &, const An continue; } } - if (AnfAlgo::IsDynamicShape(cnode) && + if (AnfAlgo::IsNodeDynamicShape(cnode) && DynamicShapeConstInputToAttr.find(AnfAlgo::GetCNodeName(cnode)) == DynamicShapeConstInputToAttr.end()) { MS_LOG(INFO) << "current node is dynamic shape " << cnode->fullname_with_scope(); continue; diff --git a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc index 9f002a1c6c..e4512e3b5c 100644 --- a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc +++ b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.cc @@ -42,24 +42,10 @@ void DynamicKernel::Initialize() { return; } MS_LOG(INFO) << "Have depends"; - std::vector depends_list; std::vector depends_list_me = AnfAlgo::GetNodeAttr>(cnode_ptr_, kDynamicShapeDepends); - (void)std::transform(depends_list_me.begin(), depends_list_me.end(), std::back_inserter(depends_list), + (void)std::transform(depends_list_me.begin(), depends_list_me.end(), std::back_inserter(depend_list_), [](const int64_t &value) { return static_cast(value); }); - // Save depend input tensor. Sync data in InferShape. - for (auto depend : depends_list) { - auto pre_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode_ptr_, depend); - auto output_addr = AnfAlgo::GetPrevNodeMutableOutputAddr(cnode_ptr_, depend); - std::vector shapes = trans::GetRuntimePaddingShape(pre_node_with_index.first, pre_node_with_index.second); - auto host_type = AnfAlgo::GetOutputInferDataType(pre_node_with_index.first, pre_node_with_index.second); - auto out_tensor = std::make_shared(host_type, shapes); - out_tensor->set_device_address(output_addr); - auto ret = depend_tensor_map_.try_emplace(depend, out_tensor); - if (!ret.second) { - MS_LOG(EXCEPTION) << "Insert map failed"; - } - } MS_LOG(INFO) << "Init End"; } @@ -74,6 +60,22 @@ bool IsTupleGetItem(const AnfNodePtr &anf_node) { return IsPrimitive(input0, prim::kPrimTupleGetItem); } +void DynamicKernel::RebuildDependTensor() { + depend_tensor_map_.clear(); + for (auto depend : depend_list_) { + auto pre_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode_ptr_, depend); + auto output_addr = AnfAlgo::GetPrevNodeMutableOutputAddr(cnode_ptr_, depend); + std::vector shapes = trans::GetRuntimePaddingShape(pre_node_with_index.first, pre_node_with_index.second); + auto host_type = AnfAlgo::GetOutputInferDataType(pre_node_with_index.first, pre_node_with_index.second); + auto out_tensor = std::make_shared(host_type, shapes); + out_tensor->set_device_address(output_addr); + auto ret = depend_tensor_map_.try_emplace(depend, out_tensor); + if (!ret.second) { + MS_LOG(EXCEPTION) << "Insert map failed"; + } + } +} + void DynamicKernel::InferShape() { if (!is_input_dynamic_shape_ && is_output_dynamic_shape_ && !have_depends()) { return; @@ -88,12 +90,15 @@ void DynamicKernel::InferShape() { AbstractBasePtrList args_spec_list; auto primitive = GetValueNode(inputs[0]); + // rebuild depend tensor map for gpu dynamic memory allocation. + RebuildDependTensor(); + auto input_size = AnfAlgo::GetInputTensorNum(cnode_ptr_); for (size_t i = 0; i < input_size; ++i) { auto input_with_index = AnfAlgo::GetPrevNodeOutput(cnode_ptr_, i); auto real_input = input_with_index.first; - MS_EXCEPTION_IF_NULL(real_input); + auto ret = depend_tensor_map_.find(i); if (ret != depend_tensor_map_.end()) { auto tensor_ptr = ret->second; diff --git a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.h b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.h index 4689126af6..649358d1a7 100644 --- a/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.h +++ b/mindspore/ccsrc/runtime/device/executor/dynamic_kernel.h @@ -19,6 +19,7 @@ #include #include +#include #include #include "ir/anf.h" #include "ir/tensor.h" @@ -44,16 +45,19 @@ class DynamicKernel { bool is_dynamic_shape() const { return is_dynamic_shape_; } bool is_input_dynamic_shape() const { return is_input_dynamic_shape_; } bool is_output_dynamic_shape() const { return is_output_dynamic_shape_; } - bool have_depends() const { return !depend_tensor_map_.empty(); } + bool have_depends() const { return !depend_list_.empty(); } virtual void Initialize(); std::string GetKernelName() { return cnode_ptr_->fullname_with_scope(); } protected: + void RebuildDependTensor(); + void *stream_; const CNodePtr cnode_ptr_; bool is_dynamic_shape_; bool is_input_dynamic_shape_; bool is_output_dynamic_shape_; + std::vector depend_list_; std::map depend_tensor_map_; }; using DynamicKernelPtr = std::shared_ptr; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 314a44b32c..c65a1f3dd8 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -37,7 +37,6 @@ #include "utils/shape_utils.h" #include "debug/data_dump/dump_json_parser.h" #include "backend/kernel_compiler/gpu/gpu_kernel.h" -#include "runtime/device/executor/executor_callback.h" #ifdef ENABLE_DEBUGGER #include "debug/debug_services.h" #endif @@ -369,7 +368,7 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) { bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) { bool ret = true; auto graph_id = graph->graph_id(); - if (!is_first_step_map_[graph_id]) { + if (!is_first_step_map_[graph_id] || graph->is_dynamic_shape()) { // Normally run graph ret = LaunchKernelDynamic(graph); } else { @@ -603,16 +602,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo dynamic_kernel = gpu_kernel->DynamicKernel(); } - if (dynamic_kernel && dynamic_kernel->have_depends()) { - MS_LOG(INFO) << "Match Dynamic Kernel, Start SyncStream"; - if (!SyncStream()) { - MS_LOG(ERROR) << "SyncStream failed"; - return false; - } - } - if (dynamic_kernel && dynamic_kernel->is_dynamic_shape()) { - ExecutorCallback::GetInstance().Consume(); dynamic_kernel->InferShape(); dynamic_kernel->UpdateArgs(); } @@ -645,9 +635,10 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); } - ExecutorCallback::GetInstance().RegistCallback([&gpu_kernel] { - if (gpu_kernel) gpu_kernel->PostExecute(); - }); + if (gpu_kernel && dynamic_kernel && dynamic_kernel->is_dynamic_shape()) { + gpu_kernel->PostExecute(); + } + // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, dump_enabled); @@ -666,7 +657,6 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo // collect weights and bias for dump mode debugger_->LoadParametersAndConst(); CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); - ExecutorCallback::GetInstance().Consume(); } ClearSwapInfo(mock); return true;