From c0070d3d49724b6d2dea2ed13bffe0879c066500 Mon Sep 17 00:00:00 2001 From: Zhang Qinghua Date: Mon, 7 Sep 2020 11:19:58 +0800 Subject: [PATCH] Use the unified Execute function to run Graph or Single Op Graph. --- .../ccsrc/backend/session/ascend_session.cc | 30 +- .../ccsrc/backend/session/ascend_session.h | 5 +- .../ccsrc/backend/session/cpu_session.cc | 2 +- .../ccsrc/backend/session/gpu_session.cc | 4 +- .../device/ascend/ascend_kernel_runtime.cc | 10 +- .../device/ascend/ascend_kernel_runtime.h | 4 +- .../runtime/device/cpu/cpu_kernel_runtime.cc | 2 +- .../runtime/device/cpu/cpu_kernel_runtime.h | 2 +- .../runtime/device/gpu/gpu_kernel_runtime.cc | 2300 ++++++++--------- .../runtime/device/gpu/gpu_kernel_runtime.h | 224 +- .../ccsrc/runtime/device/kernel_runtime.cc | 2 +- .../ccsrc/runtime/device/kernel_runtime.h | 4 +- 12 files changed, 1291 insertions(+), 1298 deletions(-) diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index d89c053299..019fabecbd 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -318,7 +318,7 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector &kernel_graph) const { - MS_LOG(INFO) << "Start!"; - auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); - MS_EXCEPTION_IF_NULL(runtime_instance); - bool ret_ok = runtime_instance->LaunchKernel(kernel_graph.get()); - if (!ret_ok) { - MS_LOG(EXCEPTION) << "Run task error!"; - } - MS_LOG(INFO) << "Finish!"; -} - bool AscendSession::GraphCacheExist(const GraphInfo &graph_info) const { return run_op_graphs_.find(graph_info) != run_op_graphs_.end(); } @@ -398,7 +387,7 @@ void AscendSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_i // load input data to device LoadInputData(graph, input_tensors); // run op - RunOpExecTask(graph); + Execute(graph, false); // get output if (op_run_info.value != nullptr) { std::vector pre_output_tensors; @@ -552,21 +541,30 @@ void AscendSession::RunOpMemoryClear(const KernelGraph *kernel_graph) const { void AscendSession::Load(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool is_task_sink = context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK); (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); - bool ret_ok = runtime_instance->Load(kernel_graph.get()); + bool ret_ok = runtime_instance->Load(kernel_graph.get(), is_task_sink); if (!ret_ok) { MS_LOG(EXCEPTION) << "Load task error!"; } MS_LOG(INFO) << "Finish!"; } -void AscendSession::Execute(const std::shared_ptr &kernel_graph) const { +void AscendSession::Execute(const std::shared_ptr &kernel_graph, bool is_task) const { MS_LOG(INFO) << "Start!"; + bool is_task_sink = false; + if (is_task) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + is_task_sink = context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK); + } auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); - bool ret_ok = runtime_instance->Run(kernel_graph.get()); + bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink); if (!ret_ok) { 
MS_LOG(EXCEPTION) << "run task error!"; } diff --git a/mindspore/ccsrc/backend/session/ascend_session.h b/mindspore/ccsrc/backend/session/ascend_session.h index 4b2f2c232d..5338ebe0e7 100755 --- a/mindspore/ccsrc/backend/session/ascend_session.h +++ b/mindspore/ccsrc/backend/session/ascend_session.h @@ -13,8 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #ifndef MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H #define MINDSPORE_CCSRC_BACKEND_SESSION_ASCEND_SESSION_H + #include #include #include @@ -82,13 +84,12 @@ class AscendSession : public SessionBasic { KernelGraph *kernel_graph) const; void RunOpMemoryClear(const KernelGraph *kernel_graph) const; void Load(const std::shared_ptr &kernel_graph) const; - void Execute(const std::shared_ptr &kernel_graph) const; + void Execute(const std::shared_ptr &kernel_graph, bool is_task) const; void Dump(const std::shared_ptr &kernel_graph) const; void DumpAllGraphs(const std::vector &all_graphs); void LoadTensor(const std::shared_ptr &kernel_graph) const; // below functions are used for run op void RunOpHardwareOptimize(const std::shared_ptr &kernel_graph) const; - void RunOpExecTask(const std::shared_ptr &kernel_graph) const; static void BackendOptimization(const std::vector &all_graphs); static void LinkChildGraphs(NotNull graph); diff --git a/mindspore/ccsrc/backend/session/cpu_session.cc b/mindspore/ccsrc/backend/session/cpu_session.cc index db5663b31b..ce24f8b6f4 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.cc +++ b/mindspore/ccsrc/backend/session/cpu_session.cc @@ -118,7 +118,7 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vectorPreExecute(kernel_graph); } #endif - bool ret = runtime_.Run(kernel_graph.get()); + bool ret = runtime_.Run(kernel_graph.get(), false); if (!ret) { MS_LOG(EXCEPTION) << "Run graph failed"; } diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index e8969d0dd2..eb5332b486 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -191,9 +191,9 @@ void GPUSession::Execute(const std::shared_ptr &kernel_graph) const auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); #ifdef ENABLE_DEBUGGER - if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) { + if (!runtime_instance->Run(kernel_graph.get(), false, debugger_.get())) { #else - if (!runtime_instance->Run(kernel_graph.get())) { + if (!runtime_instance->Run(kernel_graph.get(), false)) { #endif MS_LOG(EXCEPTION) << "GPU execute graph failed!"; } diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 6511e227a2..8b9f267bde 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -454,10 +454,7 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size return std::make_shared(device_ptr, device_size, format, type_id); } -bool AscendKernelRuntime::Load(session::KernelGraph *graph) { - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - bool is_task_sink = context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK); +bool AscendKernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { if (!is_task_sink) { return true; } 
@@ -609,17 +606,14 @@ void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) { } } -bool AscendKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) { +bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) { bool ret = false; - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); #if defined(_WIN32) || defined(_WIN64) auto start_time = std::chrono::steady_clock::now(); #else struct timeval start_time, end_time; (void)gettimeofday(&start_time, nullptr); #endif - bool is_task_sink = context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK); if (is_task_sink) { ret = RunTask(graph); } else { diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h index f68a9c36c0..f23da565ff 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -44,8 +44,8 @@ class AscendKernelRuntime : public KernelRuntime { bool GenTask(const session::KernelGraph *graph); bool LoadTask(const session::KernelGraph *graph); bool RunTask(const session::KernelGraph *graph); - bool Load(session::KernelGraph *graph) override; - bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override; + bool Load(session::KernelGraph *graph, bool is_task_sink) override; + bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override; void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector &inputs, const std::unordered_set &value_nodes, const std::vector &execution_order) override; diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc index a6679414a3..430c3a3e9d 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc @@ -287,7 +287,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput resource_manager_.DecreaseSummaryRefCount(summary_outputs); } -bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) { +bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink, Debugger *debugger) { MS_EXCEPTION_IF_NULL(kernel_graph); resource_manager_.IncreaseAddressRefCount(kernel_graph); diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h index ff448f1569..42520e9c3f 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h @@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime { ~CPUKernelRuntime() override = default; bool Init() override { return true; } - bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override; + bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override; void AssignKernelAddress(session::KernelGraph *kernel_graph); void BindInputOutput(session::KernelGraph *kernel_graph, const std::vector &inputs, VectorRef *outputs); diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 68059baca7..3d44d285ca 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -1,1150 +1,1150 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - 
* Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "runtime/device/gpu/gpu_kernel_runtime.h" -#include -#include "runtime/device/gpu/gpu_device_address.h" -#include "runtime/device/gpu/cuda_driver.h" -#include "runtime/device/gpu/gpu_buffer_mgr.h" -#include "runtime/device/gpu/gpu_device_manager.h" -#include "runtime/device/gpu/gpu_memory_allocator.h" -#include "runtime/device/gpu/distribution/collective_init.h" -#include "utils/convert_utils.h" -#include "utils/ms_context.h" -#include "runtime/device/kernel_runtime_manager.h" -#include "runtime/device/gpu/gpu_common.h" -#include "utils/ms_utils.h" -#include "runtime/device/gpu/gpu_memory_manager.h" -#include "backend/kernel_compiler/common_utils.h" -#include "runtime/device/gpu/gpu_memory_copy_manager.h" -#include "common/trans.h" -#include "ir/dtype.h" -#include "profiler/device/gpu/gpu_profiling.h" -#include "utils/shape_utils.h" -#ifdef ENABLE_DEBUGGER -#include "debug/debug_services.h" -#endif - -namespace mindspore { -namespace device { -namespace gpu { -using mindspore::device::memswap::MemSwapInfoSet; -using mindspore::device::memswap::MemSwapManager; -using mindspore::device::memswap::SwapKind; -static const size_t PARAMETER_OUTPUT_INDEX = 0; -bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); } - -bool GPUKernelRuntime::Init() { - if (device_init_ == true) { - GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory(); - return true; - } - bool ret = false; -#ifdef ENABLE_DUMP_E2E - ret = SetDumpConf(); - if (!ret) { - MS_LOG(INFO) << "No dump conf to set!"; - } -#endif - - ret = InitDevice(); - if (!ret) { - MS_LOG(ERROR) << "InitDevice error."; - return ret; - } - mem_manager_ = std::make_shared(); - MS_EXCEPTION_IF_NULL(mem_manager_); - mem_manager_->MallocDeviceMemory(); - const void *collective_handle_ = CollectiveInitializer::instance().collective_handle(); - bool collective_inited = CollectiveInitializer::instance().collective_inited(); - if (collective_inited && collective_handle_ != nullptr) { - auto init_nccl_comm_funcptr = - reinterpret_cast(dlsym(const_cast(collective_handle_), "InitNCCLComm")); - MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr); - (*init_nccl_comm_funcptr)(); - } - device_init_ = true; - return ret; -} - -#ifdef ENABLE_DUMP_E2E -namespace { -void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf, - Debugger *debugger) { - MS_EXCEPTION_IF_NULL(graph); - MS_EXCEPTION_IF_NULL(dump_conf); - bool trans_flag = dump_conf->trans_flag(); - const auto &apply_kernels = graph->execution_order(); - for (const auto &node : apply_kernels) { - MS_EXCEPTION_IF_NULL(node); - auto node_name = AnfAlgo::GetCNodeName(node); - std::string kernel_name = node->fullname_with_scope(); - if (!dump_conf->IsKernelNeedDump(kernel_name)) { - continue; - } - const std::string strsrc = "/"; - const std::string strdst = "--"; - std::string::size_type pos = 0; - std::string::size_type srclen = strsrc.size(); - std::string::size_type 
dstlen = strdst.size(); - while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) { - kernel_name.replace(pos, srclen, strdst); - pos += dstlen; - } - auto output_size = AnfAlgo::GetOutputTensorNum(node); - for (size_t j = 0; j < output_size; ++j) { - auto addr = AnfAlgo::GetOutputAddr(node, j); - TypeId addr_type_id = addr->type_id(); - std::string addr_format = addr->format(); - ShapeVector int_shapes; - if (trans_flag) { - int_shapes = trans::GetRuntimePaddingShape(node, j); - } else { - auto shape = AnfAlgo::GetOutputDeviceShape(node, j); - (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), - [](size_t inner_item) { return SizeToInt(inner_item); }); - } - - auto type = AnfAlgo::GetOutputInferDataType(node, j); - - auto format = kOpFormat_DEFAULT; - string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j); - - DebugServices *debug_services = debugger->debug_services(); - TensorLoader *tensor_loader = debug_services->tensor_loader(); - std::string original_kernel_name = node->fullname_with_scope(); - size_t slot = j; - auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type, - addr_type_id, addr_format, slot); - - if (!ret) { - std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath + - ", host_format:" + format + ".!"; - } - } - } -} - -void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf, - Debugger *debugger) { - MS_EXCEPTION_IF_NULL(graph); - MS_EXCEPTION_IF_NULL(dump_conf); - bool trans_flag = dump_conf->trans_flag(); - const auto ¶meters = graph->inputs(); - for (auto &item : parameters) { - if (!item->isa()) { - continue; - } - std::string parameter_name = item->fullname_with_scope(); - if (!dump_conf->IsKernelNeedDump(parameter_name)) { - continue; - } - auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX); - TypeId addr_type_id = addr->type_id(); - std::string addr_format = addr->format(); - ShapeVector int_shapes; - if (trans_flag) { - int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX); - } else { - auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX); - (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), - [](size_t inner_item) { return SizeToInt(inner_item); }); - } - - auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX); - - auto format = kOpFormat_DEFAULT; - string filepath = dump_path + '/' + parameter_name + '_' + "output_0"; - - DebugServices *debug_services = debugger->debug_services(); - TensorLoader *tensor_loader = debug_services->tensor_loader(); - std::string original_kernel_name = parameter_name; - size_t slot = 0; - auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type, - addr_type_id, addr_format, slot); - - if (!ret) { - std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath + - ", host_format:" + format + ".!"; - } - } -} -} // namespace - -bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) { - MS_EXCEPTION_IF_NULL(graph); - MS_LOG(INFO) << "Start dump step"; - DumpConfPtr dump_conf = GetDumpConf(); - MS_EXCEPTION_IF_NULL(dump_conf); - dump_conf->UpdataCurIter(); - bool dump_flag = dump_conf->dump_enable(); - if (!dump_flag) { - MS_LOG(INFO) << "Dump flag is disable, pass dump step"; - 
return true; - } - uint32_t cur_iter = dump_conf->cur_iter(); - if (dump_conf->dump_iter() != 0) { - if (cur_iter != dump_conf->dump_iter()) { - return true; - } - } - MS_LOG(INFO) << "Cur iter is " << cur_iter; - std::string net_name = dump_conf->dump_net_name(); - std::string iterator = std::to_string(cur_iter); - std::string dump_path = dump_conf->dump_path(); - if (dump_path.back() == '/') { - dump_path = dump_path + net_name + '/' + iterator; - } else { - dump_path = dump_path + '/' + net_name + '/' + iterator; - } - - // dump output - DumpOutput(graph, dump_path, dump_conf, debugger); - // dump parameters - DumpParameters(graph, dump_path, dump_conf, debugger); - - return true; -} -#endif - -#ifdef ENABLE_DEBUGGER -namespace { -void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, - const std::vector &kernel_inputs, - const std::vector &kernel_workspaces, - const std::vector &kernel_outputs, int exec_order, void *stream_ptr, - bool dump_enabled) { - // check if we should read the kernel data - bool read_data = false; - std::string kernel_name = kernel->fullname_with_scope(); - if (debugger) { - debugger->SetCurNode(kernel_name); - if (dump_enabled) { - read_data = true; - } else if (debugger->debugger_enabled()) { - read_data = debugger->ReadNodeDataRequired(); - } - } - - if (!read_data) { - return; - } - - // get inputs - auto input_size = AnfAlgo::GetInputTensorNum(kernel); - for (size_t j = 0; j < input_size; ++j) { - auto input_kernel = kernel->input(j + 1); - std::string input_kernel_name = input_kernel->fullname_with_scope(); - auto addr = kernel_inputs[j]; - auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX); - auto format = kOpFormat_DEFAULT; - auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type); - string input_tensor_name = input_kernel_name + ':' + "0"; - ShapeVector int_shapes; - auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX); - (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), - [](size_t inner_item) { return SizeToInt(inner_item); }); - auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false); - if (!ret) { - MS_LOG(ERROR) << "LoadMemToHost:" - << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!"; - } - } - - // get outputs - auto output_size = AnfAlgo::GetOutputTensorNum(kernel); - for (size_t j = 0; j < output_size; ++j) { - auto addr = kernel_outputs[j]; - auto type = AnfAlgo::GetOutputInferDataType(kernel, j); - auto format = kOpFormat_DEFAULT; - auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type); - string tensor_name = kernel_name + ':' + std::to_string(j); - ShapeVector int_shapes; - auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j); - (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), - [](size_t inner_item) { return SizeToInt(inner_item); }); - auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false); - if (!ret) { - MS_LOG(ERROR) << "LoadMemToHost:" - << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; - } - } - - debugger->PostExecuteNode(); -} - -void UpdateStepNum(Debugger *debugger, bool dump_enabled) { - if (debugger && (debugger->debugger_enabled() || dump_enabled)) { - auto cur_step_num = debugger->step_num(); - cur_step_num = cur_step_num + 1; - debugger->SetStepNum(cur_step_num); - } -} - -void LoadParameters(const 
session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) { - MS_EXCEPTION_IF_NULL(graph); - if (!(debugger && dump_enabled)) { - return; - } - const auto ¶meters = graph->inputs(); - // for parameters, set its execution order to be 0; - int exec_order = 0; - for (auto &item : parameters) { - if (!item->isa()) { - continue; - } - std::string parameter_name = item->fullname_with_scope(); - auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX); - auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX); - auto format = kOpFormat_DEFAULT; - string tensor_name = parameter_name + ':' + "0"; - auto gpu_addr = dynamic_cast(addr); - ShapeVector int_shapes; - auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX); - (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), - [](size_t inner_item) { return SizeToInt(inner_item); }); - auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true); - if (!ret) { - MS_LOG(ERROR) << "LoadMemToHost:" - << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; - } - } -} - -void ClearCurrentData(Debugger *debugger, bool dump_enabled) { - if (debugger && (debugger->debugger_enabled() || dump_enabled)) { - DebugServices *debug_services = debugger->debug_services(); - TensorLoader *tensor_loader = debug_services->tensor_loader(); - tensor_loader->EmptyCurrentTensor(); - } -} -} // namespace -#endif - -DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, - TypeId type_id) { - return std::make_shared(device_ptr, device_size, format, type_id); -} - -bool GPUKernelRuntime::InitDevice() { - if (GPUDeviceManager::GetInstance().device_count() <= 0) { - MS_LOG(ERROR) << "No GPU device found."; - return false; - } - const void *collective_handle_ = CollectiveInitializer::instance().collective_handle(); - bool collective_inited = CollectiveInitializer::instance().collective_inited(); - if (collective_inited && collective_handle_ != nullptr) { - auto get_local_rank_funcptr = - reinterpret_cast(dlsym(const_cast(collective_handle_), "local_rank_id")); - MS_EXCEPTION_IF_NULL(get_local_rank_funcptr); - device_id_ = IntToUint((*get_local_rank_funcptr)()); - } - if (!GPUDeviceManager::GetInstance().is_device_id_init()) { - if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) { - MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_); - return false; - } - } - GPUDeviceManager::GetInstance().InitDevice(); - stream_ = GPUDeviceManager::GetInstance().default_stream(); - if (stream_ == nullptr) { - MS_LOG(ERROR) << "No default CUDA stream found."; - return false; - } - return true; -} - -void GPUKernelRuntime::ReleaseDeviceRes() { - // For dataset mode. - if (GpuBufferMgr::GetInstance().IsInit()) { - if (!GpuBufferMgr::GetInstance().IsClosed()) { - if (!GpuBufferMgr::GetInstance().CloseNotify()) { - MS_LOG(EXCEPTION) << "Could not close gpu data queue."; - } - } - CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue."); - } - - // Destroy remaining memory swap events and free host memory. 
- for (auto &item : mem_swap_map_) { - auto &mem_swap_manager = item.second; - MS_EXCEPTION_IF_NULL(mem_swap_manager); - if (mem_swap_manager->trigger_swap()) { - mem_swap_manager->ClearSwapQueue(false); - mem_swap_manager->ReleaseHostPinnedMem(); - } - } - - GPUDeviceManager::GetInstance().ReleaseDevice(); - if (mem_manager_ != nullptr) { - mem_manager_->FreeDeviceMemory(); - } - - kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); - MS_EXCEPTION_IF_NULL(bin_map); - bin_map->RemoveKernelCache(); -} - -void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector &inputs, - const std::unordered_set &value_nodes, - const std::vector &execution_order) { - MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource"; - // Release the kernel resource. - for (const auto &kernel : execution_order) { - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - if (kernel_mod == nullptr) { - continue; - } - kernel_mod->ReleaseResource(); - } - // Clear the output address of graph. - ClearOutputAddress(inputs, value_nodes, execution_order); -} - -void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) { - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - MS_EXCEPTION_IF_NULL(mem_manager_); - mem_manager_->ResetDynamicMemory(); - AssignStaticMemoryInput(graph); - AssignStaticMemoryValueNode(graph); - bool is_enable_dynamic_mem = context_ptr->get_param(MS_CTX_ENABLE_DYNAMIC_MEM_POOL); - if (is_enable_dynamic_mem) { - // Use the dynamic memory pool. - InitKernelRefCount(graph); - InitMemorySwapInfo(graph); - InitKernelOutputAddress(graph); - InitKernelWorkspaceAddress(graph); - SaveGraphOutputNode(graph); - } else { - AssignDynamicMemory(graph); - } -} - -bool GPUKernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) { - struct timeval start_time, end_time; - (void)gettimeofday(&start_time, nullptr); - bool ret = true; - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - bool is_enable_dynamic_mem = context_ptr->get_param(MS_CTX_ENABLE_DYNAMIC_MEM_POOL); - bool is_enable_pynative_infer = context_ptr->get_param(MS_CTX_ENABLE_PYNATIVE_INFER); - if (is_enable_dynamic_mem && !is_enable_pynative_infer) { - auto graph_id = graph->graph_id(); - auto iter = mem_swap_map_.find(graph_id); - if (iter == mem_swap_map_.end()) { - MS_LOG(EXCEPTION) << "Find memory swap map failed."; - } - mem_swap_manager_ = iter->second; - MS_EXCEPTION_IF_NULL(mem_swap_manager_); - auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id); - if (mem_reuse_iter == mem_reuse_util_map_.end()) { - MS_LOG(EXCEPTION) << "Find memory reuse map failed."; - } - mem_reuse_util_ = mem_reuse_iter->second; - MS_EXCEPTION_IF_NULL(mem_reuse_util_); - - ret = RunOneStep(graph, debugger); - } else { - ret = LaunchKernel(graph); - } - (void)gettimeofday(&end_time, nullptr); - const uint64_t kUSecondInSecond = 1000000; - uint64_t cost = kUSecondInSecond * static_cast(end_time.tv_sec - start_time.tv_sec); - cost += static_cast(end_time.tv_usec - start_time.tv_usec); - MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us"; - return ret; -} - -bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) { - bool ret = true; - auto graph_id = graph->graph_id(); - if (!is_first_step_map_[graph_id]) { - // Normally run graph - ret = LaunchKernelDynamic(graph, debugger); - } else { - // Mock run first step - ret = LaunchKernelDynamic(graph, debugger, true, false); - if (ret) { - // 
Normally run graph - ret = LaunchKernelDynamic(graph, debugger); - } else { - // Trigger memory swap - ret = SearchMemSwapScheme(graph, debugger); - } - is_first_step_map_[graph_id] = false; - } - return ret; -} - -bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) { - MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment."; - bool ret = false; - ClearKernelOldOutputAndWorkspace(graph); - if (!mem_swap_manager_->mem_swap_init()) { - if (!mem_swap_manager_->Init(graph)) { - return false; - } - } - - while (!ret) { - if (!mem_swap_manager_->RetreatSwapInfo()) { - return false; - } - ret = LaunchKernelDynamic(graph, debugger, true, false); - if (!ret) { - ClearKernelOldOutputAndWorkspace(graph); - } - } - mem_swap_manager_->AssignHostMemory(); - - // Time profiling - ret = LaunchKernelDynamic(graph, debugger, false, true); - if (!ret) { - return ret; - } - return RefineMemSwapScheme(graph, debugger); -} - -bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) { - MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment."; - auto &kernels = graph->execution_order(); - for (const auto &kernel : kernels) { - if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) { - continue; - } - - size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel); - for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) { - bool ret = false; - while (!ret) { - mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx); - ret = LaunchKernelDynamic(graph, debugger, true, false); - if (!ret) { - ClearKernelOldOutputAndWorkspace(graph); - ClearSwapInfo(true); - } - } - } - } - return true; -} - -void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared(); - MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); - // Init the kernel reference count. - if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) { - MS_LOG(EXCEPTION) << "Init kernel reference count failed"; - } - mem_reuse_util_ptr->SetKernelDefMap(); - mem_reuse_util_ptr->SetReuseRefCount(); - // Can't free the device address of graph output, so set the reference count of graph output specially. - mem_reuse_util_ptr->SetGraphOutputRefCount(); - // Can't free the device address of summary nodes, so set the reference count of summary nodes specially. 
- mem_reuse_util_ptr->SetSummaryNodesRefCount(); - auto graph_id = graph->graph_id(); - mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr; -} - -void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared(); - MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager); - MemSwapManagerPtr mem_swap_manager = std::make_shared(gpu_mem_copy_manager); - MS_EXCEPTION_IF_NULL(mem_swap_manager); - auto graph_id = graph->graph_id(); - mem_swap_map_[graph_id] = mem_swap_manager; - is_first_step_map_[graph_id] = true; -} - -void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto &kernels = graph->execution_order(); - for (const auto &kernel : kernels) { - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - auto output_sizes = kernel_mod->GetOutputSizeList(); - for (size_t i = 0; i < output_sizes.size(); ++i) { - if (AnfAlgo::OutputAddrExist(kernel, i)) { - continue; - } - std::string output_format = AnfAlgo::GetOutputFormat(kernel, i); - auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); - auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type); - AnfAlgo::SetOutputAddr(device_address, i, kernel.get()); - } - } -} - -void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto &kernels = graph->execution_order(); - for (const auto &kernel : kernels) { - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - auto workspace_sizes = kernel_mod->GetWorkspaceSizeList(); - for (size_t i = 0; i < workspace_sizes.size(); ++i) { - auto device_address = CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown); - AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get()); - } - } -} - -void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto graph_id = graph->graph_id(); - const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem}); - for (const auto &node : output_nodes) { - graph_output_map_[graph_id].insert(node); - } -} - -bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const { - MS_EXCEPTION_IF_NULL(graph); - auto graph_id = graph->graph_id(); - auto iter = graph_output_map_.find(graph_id); - if (iter == graph_output_map_.end()) { - MS_LOG(EXCEPTION) << "Find graph output info failed."; - } - auto &graph_output_set = iter->second; - return (graph_output_set.find(kernel) != graph_output_set.end()); -} - -void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) { - ClearKernelOutputAddress(graph); - ClearKernelWorkspaceAddress(graph); -} - -void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto &kernels = graph->execution_order(); - for (const auto &kernel : kernels) { - if (IsGraphOutput(graph, kernel)) { - continue; - } - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - auto output_sizes = kernel_mod->GetOutputSizeList(); - for (size_t i = 0; i < output_sizes.size(); ++i) { - if (!AnfAlgo::OutputAddrExist(kernel, i)) { - continue; - } - auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); - MS_EXCEPTION_IF_NULL(device_address); - if 
(device_address->ptr_) { - mem_manager_->FreeMemFromMemPool(device_address); - } - device_address->set_status(DeviceAddressStatus::kInDevice); - } - } -} - -void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto &kernels = graph->execution_order(); - for (const auto &kernel : kernels) { - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - auto workspace_sizes = kernel_mod->GetWorkspaceSizeList(); - for (size_t i = 0; i < workspace_sizes.size(); ++i) { - auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i); - MS_EXCEPTION_IF_NULL(device_address); - if (device_address->ptr_) { - mem_manager_->FreeMemFromMemPool(device_address); - } - } - } -} - -bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock, - bool profiling) { - MS_EXCEPTION_IF_NULL(graph); - MS_EXCEPTION_IF_NULL(mem_reuse_util_); - // Reset the reference count. - mem_reuse_util_->ResetDynamicUsedRefCount(); - // The inputs and outputs memory of communication kernel need be continuous, so separate processing. - AllocCommunicationOpDynamicRes(graph); - -#ifdef ENABLE_DEBUGGER - bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); - if (!mock) { - UpdateStepNum(debugger, dump_enabled); - } -#endif - auto &kernels = graph->execution_order(); - int exec_order = 1; - - auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); - MS_EXCEPTION_IF_NULL(profiler_inst); - - for (const auto &kernel : kernels) { - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - AddressPtrList kernel_inputs; - AddressPtrList kernel_workspaces; - AddressPtrList kernel_outputs; - auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock); - if (!ret) { -#ifdef ENABLE_DEBUGGER - if (!mock) { - // invalidate current data collected by the debugger - ClearCurrentData(debugger, dump_enabled); - } -#endif - return false; - } - if (!mock) { - if (!profiling) { - if (profiler_inst->GetEnableFlag()) { - profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_); - } - CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_), - "Launch kernel failed."); - if (profiler_inst->GetEnableFlag()) { - profiler_inst->OpDataProducerEnd(); - if (profiler_inst->GetSyncEnableFlag()) { - CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed."); - } - } - } else { - LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); - } -#ifdef ENABLE_DEBUGGER - // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) - LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, - dump_enabled); -#endif - } - exec_order = exec_order + 1; - FreeKernelDynamicRes(kernel); - if (!UpdateMemorySwapTask(kernel, mock, profiling)) { -#ifdef ENABLE_DEBUGGER - if (!mock) { - // invalidate current data collected by the debugger - ClearCurrentData(debugger, dump_enabled); - } -#endif - return false; - } - } - if (!mock) { -#ifdef ENABLE_DEBUGGER - // collect weights and bias for dump mode - LoadParameters(graph, debugger, dump_enabled); -#endif - CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); - } - ClearSwapInfo(mock); - return true; -} - -void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList 
&inputs, - const AddressPtrList &workspace, const AddressPtrList &outputs) { - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - float cost_time = 0; - DeviceEvent start = nullptr; - DeviceEvent end = nullptr; - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create event."); - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create event."); - - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, stream_), "Failed to record event to stream."); - CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(inputs, workspace, outputs, stream_), "Launch kernel failed."); - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, stream_), "Failed to record event to stream."); - - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event."); - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event."); - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(&cost_time, start, end), "Failed to record elapsed time."); - - mem_swap_manager_->AddKernelExecutionPerform(kernel, cost_time); - - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event."); - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event."); -} - -bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) { - MS_EXCEPTION_IF_NULL(mem_swap_manager_); - const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel); - for (auto &mem_swap_info : mem_swap_info_set) { - auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_); - MS_EXCEPTION_IF_NULL(need_swap_kernel); - const HostAddress &host_address = - mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_); - auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false); - - if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) { - if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) { - mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock); - mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false); - } else { - mem_manager_->FreeMemFromMemPool(device_address); - device_address->set_status(DeviceAddressStatus::kInHost); - } - } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) { - auto status = device_address->status(); - if (status == DeviceAddressStatus::kInDeviceToHost) { - device_address->set_status(DeviceAddressStatus::kInDevice); - } else if (status == DeviceAddressStatus::kInHost) { - if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) { - return false; - } - float cost_time = 0; - mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling, - &cost_time); - if (profiling) { - mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_, - std::make_pair(0, cost_time)); - } - } - } - } - return true; -} - -bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) { - MS_EXCEPTION_IF_NULL(mem_swap_manager_); - if (!mem_swap_manager_->trigger_swap()) { - return true; - } - if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) { - if (!mock) { - CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); - } - if (!AddMemorySwapTask(kernel, mock, profiling)) { - return false; - } 
- if (!mock) { - CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed."); - } - } - return true; -} - -void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) { - MS_EXCEPTION_IF_NULL(mem_swap_manager_); - if (!mem_swap_manager_->trigger_swap()) { - return; - } - while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) { - device_address_swap_in->set_status(DeviceAddressStatus::kInDevice); - } - - auto status = device_address->status(); - switch (status) { - case DeviceAddressStatus::kInDevice: - break; - case DeviceAddressStatus::kInDeviceToHost: { - device_address->set_status(DeviceAddressStatus::kInDevice); - break; - } - case DeviceAddressStatus::kInHostToDevice: { - while (device_address->status() != DeviceAddressStatus::kInDevice) { - while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) { - device_address_swap_in->set_status(DeviceAddressStatus::kInDevice); - } - } - break; - } - case DeviceAddressStatus::kInHost: - MS_LOG(WARNING) << "Unexpected device address status: " << status; - break; - default: - MS_LOG(EXCEPTION) << "Invaild device address status: " << status; - } -} - -void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) { - MS_EXCEPTION_IF_NULL(mem_swap_manager_); - if (!mem_swap_manager_->trigger_swap()) { - return; - } - while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) { - if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) { - device_address_swap_out->set_status(DeviceAddressStatus::kInHost); - mem_manager_->FreeMemFromMemPool(device_address_swap_out); - } - } -} - -void GPUKernelRuntime::ClearSwapInfo(bool mock) { - MS_EXCEPTION_IF_NULL(mem_swap_manager_); - if (!mem_swap_manager_->trigger_swap()) { - return; - } - mem_swap_manager_->ClearSwapQueue(mock); - mem_swap_manager_->ResetHostAddrIsDirty(); -} - -bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) { - MS_EXCEPTION_IF_NULL(mem_manager_); - MS_EXCEPTION_IF_NULL(mem_swap_manager_); - auto ret = mem_manager_->MallocMemFromMemPool(device_address, size); - if (!ret) { - if (!mem_swap_manager_->trigger_swap()) { - return false; - } - if (!mock) { - mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost); - } - UpdateHostSwapOutQueue(mock); - - ret = mem_manager_->MallocMemFromMemPool(device_address, size); - if (!ret) { - return false; - } - } - return true; -} - -bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, - const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, - AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs, - bool mock) { - if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) { - return false; - } - if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) { - return false; - } - if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) { - return false; - } - return true; -} - -bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, - bool mock) { - MS_EXCEPTION_IF_NULL(kernel); - MS_EXCEPTION_IF_NULL(kernel_inputs); - MS_EXCEPTION_IF_NULL(mem_reuse_util_); - for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { - DeviceAddressPtr 
device_address; - if (mem_reuse_util_->is_all_nop_node()) { - // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. - device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); - } else { - // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. - device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); - } - MS_EXCEPTION_IF_NULL(device_address); - UpdateHostSwapInQueue(device_address, mock); - MS_EXCEPTION_IF_NULL(device_address->ptr_); - kernel::AddressPtr input = std::make_shared(); - MS_EXCEPTION_IF_NULL(input); - input->addr = device_address->ptr_; - input->size = device_address->size_; - kernel_inputs->emplace_back(input); - } - return true; -} - -bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, - const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_outputs, - bool mock) { - MS_EXCEPTION_IF_NULL(kernel); - MS_EXCEPTION_IF_NULL(kernel_outputs); - UpdateHostSwapOutQueue(mock); - auto output_sizes = kernel_mod.GetOutputSizeList(); - for (size_t i = 0; i < output_sizes.size(); ++i) { - auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); - MS_EXCEPTION_IF_NULL(device_address); - if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) { - return false; - } - kernel::AddressPtr output = std::make_shared(); - MS_EXCEPTION_IF_NULL(output); - output->addr = device_address->ptr_; - output->size = output_sizes[i]; - kernel_outputs->emplace_back(output); - } - return true; -} - -bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, - const mindspore::AnfNodePtr &kernel, - AddressPtrList *kernel_workspaces, bool mock) { - MS_EXCEPTION_IF_NULL(kernel); - MS_EXCEPTION_IF_NULL(kernel_workspaces); - auto workspace_sizes = kernel_mod.GetWorkspaceSizeList(); - for (size_t i = 0; i < workspace_sizes.size(); ++i) { - if (workspace_sizes[i] == 0) { - kernel_workspaces->emplace_back(nullptr); - continue; - } - auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i); - if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) { - return false; - } - kernel::AddressPtr workspace = std::make_shared(); - MS_EXCEPTION_IF_NULL(workspace); - workspace->addr = device_address->ptr_; - workspace->size = workspace_sizes[i]; - kernel_workspaces->emplace_back(workspace); - } - return true; -} - -void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto &kernels = graph->execution_order(); - for (auto &kernel : kernels) { - MS_EXCEPTION_IF_NULL(kernel); - if (AnfAlgo::IsCommunicationOp(kernel)) { - AllocCommunicationOpInputDynamicRes(kernel); - AllocCommunicationOpOutputDynamicRes(kernel); - } - } -} - -void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) { - MS_EXCEPTION_IF_NULL(kernel); - MS_EXCEPTION_IF_NULL(mem_reuse_util_); - bool is_need_alloc_memory = false; - bool is_need_free_memory = false; - size_t total_size = 0; - std::vector size_list; - DeviceAddressPtrList addr_list; - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - auto intput_sizes = kernel_mod->GetInputSizeList(); - for (size_t i = 0; i < intput_sizes.size(); ++i) { - DeviceAddressPtr device_address; - if (mem_reuse_util_->is_all_nop_node()) { - // Graph may be all nop nodes 
and not remove nop node, so this can not skip nop node. - device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); - } else { - // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. - device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); - } - MS_EXCEPTION_IF_NULL(device_address); - if (device_address->ptr_ == nullptr) { - is_need_alloc_memory = true; - } else { - is_need_free_memory = true; - } - total_size += intput_sizes[i]; - size_list.emplace_back(intput_sizes[i]); - addr_list.emplace_back(device_address); - } - AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list); -} - -void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) { - MS_EXCEPTION_IF_NULL(kernel); - bool is_need_alloc_memory = false; - bool is_need_free_memory = false; - size_t total_size = 0; - std::vector size_list; - DeviceAddressPtrList addr_list; - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - auto output_sizes = kernel_mod->GetOutputSizeList(); - for (size_t i = 0; i < output_sizes.size(); ++i) { - auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); - MS_EXCEPTION_IF_NULL(device_address); - if (device_address->ptr_ == nullptr) { - is_need_alloc_memory = true; - } else { - is_need_free_memory = true; - } - total_size += output_sizes[i]; - size_list.emplace_back(output_sizes[i]); - addr_list.emplace_back(device_address); - } - AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list); -} - -void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory, - const DeviceAddressPtrList addr_list, size_t total_size, - std::vector size_list) { - MS_EXCEPTION_IF_NULL(mem_manager_); - if (!is_need_alloc_memory) { - return; - } - if (is_need_free_memory) { - for (const auto &iter : addr_list) { - MS_EXCEPTION_IF_NULL(iter); - // Free the inputs/outputs of communication kernel which are not released. - if (iter->ptr_ != nullptr) { - mem_manager_->FreeMemFromMemPool(iter); - } - } - } - auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list); - if (!ret) { - MS_LOG(EXCEPTION) << "Malloc device memory failed."; - } -} - -void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) { - MS_EXCEPTION_IF_NULL(kernel); - MS_EXCEPTION_IF_NULL(mem_manager_); - MS_EXCEPTION_IF_NULL(mem_reuse_util_); - auto cnode = kernel->cast(); - MS_EXCEPTION_IF_NULL(cnode); - if (AnfAlgo::IsCommunicationOp(kernel)) { - return; - } - // Free the input of kernel by reference count. - for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { - auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i); - if (kernel_ref_count_ptr == nullptr) { - continue; - } - kernel_ref_count_ptr->ref_count_dynamic_use_--; - if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) { - MS_LOG(EXCEPTION) << "Check dynamic reference count failed."; - } - if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) { - DeviceAddressPtr device_address; - if (mem_reuse_util_->is_all_nop_node()) { - // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. - device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); - } else { - // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. 
- device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); - } - mem_manager_->FreeMemFromMemPool(device_address); - device_address->set_status(DeviceAddressStatus::kInDevice); - } - } - // Free the output of kernel, if output has no reference. - for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) { - auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i); - if (kernel_ref_count_ptr == nullptr) { - continue; - } - if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) { - auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); - mem_manager_->FreeMemFromMemPool(device_address); - device_address->set_status(DeviceAddressStatus::kInDevice); - } - } - // Free the workspace of kernel. - auto kernel_mod = AnfAlgo::GetKernelMod(kernel); - MS_EXCEPTION_IF_NULL(kernel_mod); - for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) { - auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i); - MS_EXCEPTION_IF_NULL(device_address); - if (device_address->ptr_) { - mem_manager_->FreeMemFromMemPool(device_address); - } - } -} -} // namespace gpu -} // namespace device -} // namespace mindspore +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "runtime/device/gpu/gpu_kernel_runtime.h" +#include +#include "runtime/device/gpu/gpu_device_address.h" +#include "runtime/device/gpu/cuda_driver.h" +#include "runtime/device/gpu/gpu_buffer_mgr.h" +#include "runtime/device/gpu/gpu_device_manager.h" +#include "runtime/device/gpu/gpu_memory_allocator.h" +#include "runtime/device/gpu/distribution/collective_init.h" +#include "utils/convert_utils.h" +#include "utils/ms_context.h" +#include "runtime/device/kernel_runtime_manager.h" +#include "runtime/device/gpu/gpu_common.h" +#include "utils/ms_utils.h" +#include "runtime/device/gpu/gpu_memory_manager.h" +#include "backend/kernel_compiler/common_utils.h" +#include "runtime/device/gpu/gpu_memory_copy_manager.h" +#include "common/trans.h" +#include "ir/dtype.h" +#include "profiler/device/gpu/gpu_profiling.h" +#include "utils/shape_utils.h" +#ifdef ENABLE_DEBUGGER +#include "debug/debug_services.h" +#endif + +namespace mindspore { +namespace device { +namespace gpu { +using mindspore::device::memswap::MemSwapInfoSet; +using mindspore::device::memswap::MemSwapManager; +using mindspore::device::memswap::SwapKind; +static const size_t PARAMETER_OUTPUT_INDEX = 0; +bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); } + +bool GPUKernelRuntime::Init() { + if (device_init_ == true) { + GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory(); + return true; + } + bool ret = false; +#ifdef ENABLE_DUMP_E2E + ret = SetDumpConf(); + if (!ret) { + MS_LOG(INFO) << "No dump conf to set!"; + } +#endif + + ret = InitDevice(); + if (!ret) { + MS_LOG(ERROR) << "InitDevice error."; + return ret; + } + mem_manager_ = std::make_shared(); + MS_EXCEPTION_IF_NULL(mem_manager_); + mem_manager_->MallocDeviceMemory(); + const void *collective_handle_ = CollectiveInitializer::instance().collective_handle(); + bool collective_inited = CollectiveInitializer::instance().collective_inited(); + if (collective_inited && collective_handle_ != nullptr) { + auto init_nccl_comm_funcptr = + reinterpret_cast(dlsym(const_cast(collective_handle_), "InitNCCLComm")); + MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr); + (*init_nccl_comm_funcptr)(); + } + device_init_ = true; + return ret; +} + +#ifdef ENABLE_DUMP_E2E +namespace { +void DumpOutput(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf, + Debugger *debugger) { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(dump_conf); + bool trans_flag = dump_conf->trans_flag(); + const auto &apply_kernels = graph->execution_order(); + for (const auto &node : apply_kernels) { + MS_EXCEPTION_IF_NULL(node); + auto node_name = AnfAlgo::GetCNodeName(node); + std::string kernel_name = node->fullname_with_scope(); + if (!dump_conf->IsKernelNeedDump(kernel_name)) { + continue; + } + const std::string strsrc = "/"; + const std::string strdst = "--"; + std::string::size_type pos = 0; + std::string::size_type srclen = strsrc.size(); + std::string::size_type dstlen = strdst.size(); + while ((pos = kernel_name.find(strsrc, pos)) != std::string::npos) { + kernel_name.replace(pos, srclen, strdst); + pos += dstlen; + } + auto output_size = AnfAlgo::GetOutputTensorNum(node); + for (size_t j = 0; j < output_size; ++j) { + auto addr = AnfAlgo::GetOutputAddr(node, j); + TypeId addr_type_id = addr->type_id(); + std::string addr_format = addr->format(); + ShapeVector int_shapes; + if (trans_flag) { + int_shapes = trans::GetRuntimePaddingShape(node, j); + } else { + auto shape = AnfAlgo::GetOutputDeviceShape(node, j); + 
+          (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                               [](size_t inner_item) { return SizeToInt(inner_item); });
+        }
+
+        auto type = AnfAlgo::GetOutputInferDataType(node, j);
+
+        auto format = kOpFormat_DEFAULT;
+        string filepath = dump_path + '/' + kernel_name + '_' + "output_" + std::to_string(j);
+
+        DebugServices *debug_services = debugger->debug_services();
+        TensorLoader *tensor_loader = debug_services->tensor_loader();
+        std::string original_kernel_name = node->fullname_with_scope();
+        size_t slot = j;
+        auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes,
+                                                   type, addr_type_id, addr_format, slot);
+
+        if (!ret) {
+          std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
+                              ", host_format:" + format + "!";
+          MS_LOG(ERROR) << error;
+        }
+      }
+    }
+  }
+}
+
+void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_path, DumpConfPtr dump_conf,
+                    Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  bool trans_flag = dump_conf->trans_flag();
+  const auto &parameters = graph->inputs();
+  for (auto &item : parameters) {
+    if (!item->isa<Parameter>()) {
+      continue;
+    }
+    std::string parameter_name = item->fullname_with_scope();
+    if (!dump_conf->IsKernelNeedDump(parameter_name)) {
+      continue;
+    }
+    auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX);
+    TypeId addr_type_id = addr->type_id();
+    std::string addr_format = addr->format();
+    ShapeVector int_shapes;
+    if (trans_flag) {
+      int_shapes = trans::GetRuntimePaddingShape(item, PARAMETER_OUTPUT_INDEX);
+    } else {
+      auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
+      (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                           [](size_t inner_item) { return SizeToInt(inner_item); });
+    }
+
+    auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX);
+
+    auto format = kOpFormat_DEFAULT;
+    string filepath = dump_path + '/' + parameter_name + '_' + "output_0";
+
+    DebugServices *debug_services = debugger->debug_services();
+    TensorLoader *tensor_loader = debug_services->tensor_loader();
+    std::string original_kernel_name = parameter_name;
+    size_t slot = 0;
+    auto ret = tensor_loader->DumpTensorToFile(original_kernel_name, trans_flag, filepath, format, int_shapes, type,
+                                               addr_type_id, addr_format, slot);
+
+    if (!ret) {
+      std::string error = "DumpTensorToFile Failed: flag:" + std::to_string(trans_flag) + ", path:" + filepath +
+                          ", host_format:" + format + "!";
+      MS_LOG(ERROR) << error;
+    }
+  }
+}
+}  // namespace
+
+bool GPUKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_LOG(INFO) << "Start dump step";
+  DumpConfPtr dump_conf = GetDumpConf();
+  MS_EXCEPTION_IF_NULL(dump_conf);
+  dump_conf->UpdataCurIter();
+  bool dump_flag = dump_conf->dump_enable();
+  if (!dump_flag) {
+    MS_LOG(INFO) << "Dump flag is disabled, skip dump step";
+    return true;
+  }
+  uint32_t cur_iter = dump_conf->cur_iter();
+  if (dump_conf->dump_iter() != 0 && cur_iter != dump_conf->dump_iter()) {
+    return true;
+  }
+  MS_LOG(INFO) << "Cur iter is " << cur_iter;
+  std::string net_name = dump_conf->dump_net_name();
+  std::string iterator = std::to_string(cur_iter);
+  std::string dump_path = dump_conf->dump_path();
+  if (dump_path.back() == '/') {
+    dump_path = dump_path + net_name + '/' + iterator;
+  } else {
+    dump_path = dump_path + '/' + net_name + '/' + iterator;
+  }
+
+  // dump output
DumpOutput(graph, dump_path, dump_conf, debugger); + // dump parameters + DumpParameters(graph, dump_path, dump_conf, debugger); + + return true; +} +#endif + +#ifdef ENABLE_DEBUGGER +namespace { +void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, + const std::vector &kernel_inputs, + const std::vector &kernel_workspaces, + const std::vector &kernel_outputs, int exec_order, void *stream_ptr, + bool dump_enabled) { + // check if we should read the kernel data + bool read_data = false; + std::string kernel_name = kernel->fullname_with_scope(); + if (debugger) { + debugger->SetCurNode(kernel_name); + if (dump_enabled) { + read_data = true; + } else if (debugger->debugger_enabled()) { + read_data = debugger->ReadNodeDataRequired(); + } + } + + if (!read_data) { + return; + } + + // get inputs + auto input_size = AnfAlgo::GetInputTensorNum(kernel); + for (size_t j = 0; j < input_size; ++j) { + auto input_kernel = kernel->input(j + 1); + std::string input_kernel_name = input_kernel->fullname_with_scope(); + auto addr = kernel_inputs[j]; + auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX); + auto format = kOpFormat_DEFAULT; + auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type); + string input_tensor_name = input_kernel_name + ':' + "0"; + ShapeVector int_shapes; + auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX); + (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), + [](size_t inner_item) { return SizeToInt(inner_item); }); + auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false); + if (!ret) { + MS_LOG(ERROR) << "LoadMemToHost:" + << ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!"; + } + } + + // get outputs + auto output_size = AnfAlgo::GetOutputTensorNum(kernel); + for (size_t j = 0; j < output_size; ++j) { + auto addr = kernel_outputs[j]; + auto type = AnfAlgo::GetOutputInferDataType(kernel, j); + auto format = kOpFormat_DEFAULT; + auto gpu_addr = std::make_unique(addr->addr, addr->size, format, type); + string tensor_name = kernel_name + ':' + std::to_string(j); + ShapeVector int_shapes; + auto shape = AnfAlgo::GetOutputDeviceShape(kernel, j); + (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes), + [](size_t inner_item) { return SizeToInt(inner_item); }); + auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, debugger, false); + if (!ret) { + MS_LOG(ERROR) << "LoadMemToHost:" + << ", tensor_name:" << tensor_name << ", host_format:" << format << ".!"; + } + } + + debugger->PostExecuteNode(); +} + +void UpdateStepNum(Debugger *debugger, bool dump_enabled) { + if (debugger && (debugger->debugger_enabled() || dump_enabled)) { + auto cur_step_num = debugger->step_num(); + cur_step_num = cur_step_num + 1; + debugger->SetStepNum(cur_step_num); + } +} + +void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) { + MS_EXCEPTION_IF_NULL(graph); + if (!(debugger && dump_enabled)) { + return; + } + const auto ¶meters = graph->inputs(); + // for parameters, set its execution order to be 0; + int exec_order = 0; + for (auto &item : parameters) { + if (!item->isa()) { + continue; + } + std::string parameter_name = item->fullname_with_scope(); + auto addr = AnfAlgo::GetOutputAddr(item, PARAMETER_OUTPUT_INDEX); + auto type = AnfAlgo::GetOutputInferDataType(item, PARAMETER_OUTPUT_INDEX); + auto format 
= kOpFormat_DEFAULT;
+    string tensor_name = parameter_name + ':' + "0";
+    auto gpu_addr = dynamic_cast<const GPUDeviceAddress *>(addr);
+    ShapeVector int_shapes;
+    auto shape = AnfAlgo::GetOutputDeviceShape(item, PARAMETER_OUTPUT_INDEX);
+    (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
+                         [](size_t inner_item) { return SizeToInt(inner_item); });
+    auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
+    if (!ret) {
+      MS_LOG(ERROR) << "LoadMemToHost failed: tensor_name:" << tensor_name << ", host_format:" << format << "!";
+    }
+  }
+}
+
+void ClearCurrentData(Debugger *debugger, bool dump_enabled) {
+  if (debugger && (debugger->debugger_enabled() || dump_enabled)) {
+    DebugServices *debug_services = debugger->debug_services();
+    TensorLoader *tensor_loader = debug_services->tensor_loader();
+    tensor_loader->EmptyCurrentTensor();
+  }
+}
+}  // namespace
+#endif
+
+DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
+                                                       TypeId type_id) {
+  return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
+}
+
+bool GPUKernelRuntime::InitDevice() {
+  if (GPUDeviceManager::GetInstance().device_count() <= 0) {
+    MS_LOG(ERROR) << "No GPU device found.";
+    return false;
+  }
+  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
+  bool collective_inited = CollectiveInitializer::instance().collective_inited();
+  if (collective_inited && collective_handle_ != nullptr) {
+    auto get_local_rank_funcptr =
+      reinterpret_cast<GetLocalRankId>(dlsym(const_cast<void *>(collective_handle_), "local_rank_id"));
+    MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
+    device_id_ = IntToUint((*get_local_rank_funcptr)());
+  }
+  if (!GPUDeviceManager::GetInstance().is_device_id_init()) {
+    if (!GPUDeviceManager::GetInstance().set_cur_device_id(device_id_)) {
+      MS_LOG(ERROR) << "Failed to set current device to " << SizeToInt(device_id_);
+      return false;
+    }
+  }
+  GPUDeviceManager::GetInstance().InitDevice();
+  stream_ = GPUDeviceManager::GetInstance().default_stream();
+  if (stream_ == nullptr) {
+    MS_LOG(ERROR) << "No default CUDA stream found.";
+    return false;
+  }
+  return true;
+}
+
+void GPUKernelRuntime::ReleaseDeviceRes() {
+  // For dataset mode.
+  if (GpuBufferMgr::GetInstance().IsInit()) {
+    if (!GpuBufferMgr::GetInstance().IsClosed() && !GpuBufferMgr::GetInstance().CloseNotify()) {
+      MS_LOG(EXCEPTION) << "Could not close gpu data queue.";
+    }
+    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
+  }
+
+  // Destroy remaining memory swap events and free host memory.
+  for (auto &item : mem_swap_map_) {
+    auto &mem_swap_manager = item.second;
+    MS_EXCEPTION_IF_NULL(mem_swap_manager);
+    if (mem_swap_manager->trigger_swap()) {
+      mem_swap_manager->ClearSwapQueue(false);
+      mem_swap_manager->ReleaseHostPinnedMem();
+    }
+  }
+
+  GPUDeviceManager::GetInstance().ReleaseDevice();
+  if (mem_manager_ != nullptr) {
+    mem_manager_->FreeDeviceMemory();
+  }
+
+  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
+  MS_EXCEPTION_IF_NULL(bin_map);
+  bin_map->RemoveKernelCache();
+}
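InitDevice and Init above resolve collective-communication entry points from an already-opened shared-library handle instead of linking against it directly. A minimal sketch of that dlsym lookup pattern, under the assumption that the handle comes from CollectiveInitializer as in the code above (names simplified, not the MindSpore API):

  #include <dlfcn.h>

  // Resolve a function pointer from an opaque library handle, the way the
  // runtime looks up "InitNCCLComm" and "local_rank_id" above.
  using GetRankFunc = int (*)();

  GetRankFunc ResolveLocalRankId(const void *handle) {
    // dlsym takes a non-const handle, so the stored const void * must be cast.
    return reinterpret_cast<GetRankFunc>(dlsym(const_cast<void *>(handle), "local_rank_id"));
    // Caller checks for nullptr, cf. MS_EXCEPTION_IF_NULL above.
  }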
+
+void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector<AnfNodePtr> &inputs,
+                                                 const std::unordered_set<ValueNodePtr> &value_nodes,
+                                                 const std::vector<CNodePtr> &execution_order) {
+  MS_LOG(INFO) << "Clear graph:" << graph_id << " GPU runtime resource";
+  // Release the kernel resource.
+  for (const auto &kernel : execution_order) {
+    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
+    if (kernel_mod == nullptr) {
+      continue;
+    }
+    kernel_mod->ReleaseResource();
+  }
+  // Clear the output address of graph.
+  ClearOutputAddress(inputs, value_nodes, execution_order);
+}
+
+void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  mem_manager_->ResetDynamicMemory();
+  AssignStaticMemoryInput(graph);
+  AssignStaticMemoryValueNode(graph);
+  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
+  if (is_enable_dynamic_mem) {
+    // Use the dynamic memory pool.
+    InitKernelRefCount(graph);
+    InitMemorySwapInfo(graph);
+    InitKernelOutputAddress(graph);
+    InitKernelWorkspaceAddress(graph);
+    SaveGraphOutputNode(graph);
+  } else {
+    AssignDynamicMemory(graph);
+  }
+}
+
+bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger) {
+  // The GPU runtime has no task sink mode, so is_task_sink is unused here.
+  struct timeval start_time, end_time;
+  (void)gettimeofday(&start_time, nullptr);
+  bool ret = true;
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  bool is_enable_dynamic_mem = context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL);
+  bool is_enable_pynative_infer = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
+  if (is_enable_dynamic_mem && !is_enable_pynative_infer) {
+    auto graph_id = graph->graph_id();
+    auto iter = mem_swap_map_.find(graph_id);
+    if (iter == mem_swap_map_.end()) {
+      MS_LOG(EXCEPTION) << "Find memory swap map failed.";
+    }
+    mem_swap_manager_ = iter->second;
+    MS_EXCEPTION_IF_NULL(mem_swap_manager_);
+    auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id);
+    if (mem_reuse_iter == mem_reuse_util_map_.end()) {
+      MS_LOG(EXCEPTION) << "Find memory reuse map failed.";
+    }
+    mem_reuse_util_ = mem_reuse_iter->second;
+    MS_EXCEPTION_IF_NULL(mem_reuse_util_);
+
+    ret = RunOneStep(graph, debugger);
+  } else {
+    ret = LaunchKernel(graph);
+  }
+  (void)gettimeofday(&end_time, nullptr);
+  const uint64_t kUSecondInSecond = 1000000;
+  uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
+  cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
+  MS_LOG(DEBUG) << "GPU kernel runtime run graph in " << cost << " us";
+  return ret;
+}
+
+bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph, Debugger *debugger) {
+  bool ret = true;
+  auto graph_id = graph->graph_id();
+  if (!is_first_step_map_[graph_id]) {
+    // Normally run graph
+    ret = LaunchKernelDynamic(graph, debugger);
+  } else {
+    // Mock run first step
+    ret = LaunchKernelDynamic(graph, debugger, true, false);
+    if (ret) {
+      // Mock run succeeded, so run the graph normally.
+      ret = LaunchKernelDynamic(graph, debugger);
+    } else {
+      // Mock run ran out of memory, so search for a memory swap scheme.
+      ret = SearchMemSwapScheme(graph, debugger);
+    }
+    is_first_step_map_[graph_id] = false;
+  }
+  return ret;
+}
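The first-step logic above uses cheap mock runs (allocation bookkeeping without kernel launches) to test whether a graph fits in device memory before committing to a swap scheme. In outline, the search that follows behaves like this sketch, assuming RetreatSwapInfo() makes swapping progressively more aggressive (illustrative control flow only, not the actual MindSpore code):

  // Sketch of SearchMemSwapScheme / RefineMemSwapScheme below.
  bool SearchScheme() {
    while (!MockRunFits()) {      // LaunchKernelDynamic(graph, debugger, /*mock=*/true, /*profiling=*/false)
      if (!RetreatSwapInfo()) {   // no more swap candidates to add
        return false;             // even maximal swapping cannot fit the graph
      }
    }
    ProfileRun();                 // LaunchKernelDynamic(..., /*profiling=*/true) times kernels and copies
    RefineScheme();               // AdjustSwapInPos() moves swap-ins to hide copy latency
    return true;
  }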
+
+bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
+  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
+  bool ret = false;
+  ClearKernelOldOutputAndWorkspace(graph);
+  if (!mem_swap_manager_->mem_swap_init()) {
+    if (!mem_swap_manager_->Init(graph)) {
+      return false;
+    }
+  }
+
+  while (!ret) {
+    if (!mem_swap_manager_->RetreatSwapInfo()) {
+      return false;
+    }
+    ret = LaunchKernelDynamic(graph, debugger, true, false);
+    if (!ret) {
+      ClearKernelOldOutputAndWorkspace(graph);
+    }
+  }
+  mem_swap_manager_->AssignHostMemory();
+
+  // Time profiling
+  ret = LaunchKernelDynamic(graph, debugger, false, true);
+  if (!ret) {
+    return ret;
+  }
+  return RefineMemSwapScheme(graph, debugger);
+}
+
+bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger) {
+  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
+  auto &kernels = graph->execution_order();
+  for (const auto &kernel : kernels) {
+    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
+      continue;
+    }
+
+    size_t swap_in_task_num = mem_swap_manager_->QueryKernelTriggerSwapInTaskNum(kernel);
+    for (size_t swap_in_task_idx = 0; swap_in_task_idx < swap_in_task_num; swap_in_task_idx++) {
+      bool ret = false;
+      while (!ret) {
+        mem_swap_manager_->AdjustSwapInPos(kernel, swap_in_task_idx);
+        ret = LaunchKernelDynamic(graph, debugger, true, false);
+        if (!ret) {
+          ClearKernelOldOutputAndWorkspace(graph);
+          ClearSwapInfo(true);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<MemReuseUtil>();
+  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
+  // Init the kernel reference count.
+  if (!mem_reuse_util_ptr->InitDynamicKernelRef(graph)) {
+    MS_LOG(EXCEPTION) << "Init kernel reference count failed";
+  }
+  mem_reuse_util_ptr->SetKernelDefMap();
+  mem_reuse_util_ptr->SetReuseRefCount();
+  // Can't free the device address of graph output, so set the reference count of graph output specially.
+  mem_reuse_util_ptr->SetGraphOutputRefCount();
+  // Can't free the device address of summary nodes, so set the reference count of summary nodes specially.
+ mem_reuse_util_ptr->SetSummaryNodesRefCount(); + auto graph_id = graph->graph_id(); + mem_reuse_util_map_[graph_id] = mem_reuse_util_ptr; +} + +void GPUKernelRuntime::InitMemorySwapInfo(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + GPUMemCopyManagerPtr gpu_mem_copy_manager = std::make_shared(); + MS_EXCEPTION_IF_NULL(gpu_mem_copy_manager); + MemSwapManagerPtr mem_swap_manager = std::make_shared(gpu_mem_copy_manager); + MS_EXCEPTION_IF_NULL(mem_swap_manager); + auto graph_id = graph->graph_id(); + mem_swap_map_[graph_id] = mem_swap_manager; + is_first_step_map_[graph_id] = true; +} + +void GPUKernelRuntime::InitKernelOutputAddress(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto &kernels = graph->execution_order(); + for (const auto &kernel : kernels) { + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + auto output_sizes = kernel_mod->GetOutputSizeList(); + for (size_t i = 0; i < output_sizes.size(); ++i) { + if (AnfAlgo::OutputAddrExist(kernel, i)) { + continue; + } + std::string output_format = AnfAlgo::GetOutputFormat(kernel, i); + auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); + auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type); + AnfAlgo::SetOutputAddr(device_address, i, kernel.get()); + } + } +} + +void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto &kernels = graph->execution_order(); + for (const auto &kernel : kernels) { + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + auto workspace_sizes = kernel_mod->GetWorkspaceSizeList(); + for (size_t i = 0; i < workspace_sizes.size(); ++i) { + auto device_address = CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown); + AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get()); + } + } +} + +void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto graph_id = graph->graph_id(); + const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem}); + for (const auto &node : output_nodes) { + graph_output_map_[graph_id].insert(node); + } +} + +bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const { + MS_EXCEPTION_IF_NULL(graph); + auto graph_id = graph->graph_id(); + auto iter = graph_output_map_.find(graph_id); + if (iter == graph_output_map_.end()) { + MS_LOG(EXCEPTION) << "Find graph output info failed."; + } + auto &graph_output_set = iter->second; + return (graph_output_set.find(kernel) != graph_output_set.end()); +} + +void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) { + ClearKernelOutputAddress(graph); + ClearKernelWorkspaceAddress(graph); +} + +void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto &kernels = graph->execution_order(); + for (const auto &kernel : kernels) { + if (IsGraphOutput(graph, kernel)) { + continue; + } + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + auto output_sizes = kernel_mod->GetOutputSizeList(); + for (size_t i = 0; i < output_sizes.size(); ++i) { + if (!AnfAlgo::OutputAddrExist(kernel, i)) { + continue; + } + auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); + MS_EXCEPTION_IF_NULL(device_address); + if 
(device_address->ptr_) { + mem_manager_->FreeMemFromMemPool(device_address); + } + device_address->set_status(DeviceAddressStatus::kInDevice); + } + } +} + +void GPUKernelRuntime::ClearKernelWorkspaceAddress(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto &kernels = graph->execution_order(); + for (const auto &kernel : kernels) { + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + auto workspace_sizes = kernel_mod->GetWorkspaceSizeList(); + for (size_t i = 0; i < workspace_sizes.size(); ++i) { + auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i); + MS_EXCEPTION_IF_NULL(device_address); + if (device_address->ptr_) { + mem_manager_->FreeMemFromMemPool(device_address); + } + } + } +} + +bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger, bool mock, + bool profiling) { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(mem_reuse_util_); + // Reset the reference count. + mem_reuse_util_->ResetDynamicUsedRefCount(); + // The inputs and outputs memory of communication kernel need be continuous, so separate processing. + AllocCommunicationOpDynamicRes(graph); + +#ifdef ENABLE_DEBUGGER + bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration(); + if (!mock) { + UpdateStepNum(debugger, dump_enabled); + } +#endif + auto &kernels = graph->execution_order(); + int exec_order = 1; + + auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance(); + MS_EXCEPTION_IF_NULL(profiler_inst); + + for (const auto &kernel : kernels) { + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + AddressPtrList kernel_inputs; + AddressPtrList kernel_workspaces; + AddressPtrList kernel_outputs; + auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock); + if (!ret) { +#ifdef ENABLE_DEBUGGER + if (!mock) { + // invalidate current data collected by the debugger + ClearCurrentData(debugger, dump_enabled); + } +#endif + return false; + } + if (!mock) { + if (!profiling) { + if (profiler_inst->GetEnableFlag()) { + profiler_inst->OpDataProducerBegin(kernel->fullname_with_scope(), stream_); + } + CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_), + "Launch kernel failed."); + if (profiler_inst->GetEnableFlag()) { + profiler_inst->OpDataProducerEnd(); + if (profiler_inst->GetSyncEnableFlag()) { + CHECK_OP_RET_WITH_ERROR(SyncStream(), "Profiler SyncStream failed."); + } + } + } else { + LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); + } +#ifdef ENABLE_DEBUGGER + // called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost) + LoadKernelData(debugger, kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_, + dump_enabled); +#endif + } + exec_order = exec_order + 1; + FreeKernelDynamicRes(kernel); + if (!UpdateMemorySwapTask(kernel, mock, profiling)) { +#ifdef ENABLE_DEBUGGER + if (!mock) { + // invalidate current data collected by the debugger + ClearCurrentData(debugger, dump_enabled); + } +#endif + return false; + } + } + if (!mock) { +#ifdef ENABLE_DEBUGGER + // collect weights and bias for dump mode + LoadParameters(graph, debugger, dump_enabled); +#endif + CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); + } + ClearSwapInfo(mock); + return true; +} + +void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList 
&inputs, + const AddressPtrList &workspace, const AddressPtrList &outputs) { + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + float cost_time = 0; + DeviceEvent start = nullptr; + DeviceEvent end = nullptr; + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create event."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create event."); + + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, stream_), "Failed to record event to stream."); + CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(inputs, workspace, outputs, stream_), "Launch kernel failed."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, stream_), "Failed to record event to stream."); + + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(&cost_time, start, end), "Failed to record elapsed time."); + + mem_swap_manager_->AddKernelExecutionPerform(kernel, cost_time); + + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event."); +} + +bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) { + MS_EXCEPTION_IF_NULL(mem_swap_manager_); + const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel); + for (auto &mem_swap_info : mem_swap_info_set) { + auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_); + MS_EXCEPTION_IF_NULL(need_swap_kernel); + const HostAddress &host_address = + mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_); + auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false); + + if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) { + if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) { + mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock); + mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false); + } else { + mem_manager_->FreeMemFromMemPool(device_address); + device_address->set_status(DeviceAddressStatus::kInHost); + } + } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) { + auto status = device_address->status(); + if (status == DeviceAddressStatus::kInDeviceToHost) { + device_address->set_status(DeviceAddressStatus::kInDevice); + } else if (status == DeviceAddressStatus::kInHost) { + if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) { + return false; + } + float cost_time = 0; + mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling, + &cost_time); + if (profiling) { + mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_, + std::make_pair(0, cost_time)); + } + } + } + } + return true; +} + +bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) { + MS_EXCEPTION_IF_NULL(mem_swap_manager_); + if (!mem_swap_manager_->trigger_swap()) { + return true; + } + if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) { + if (!mock) { + CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); + } + if (!AddMemorySwapTask(kernel, mock, profiling)) { + return false; + } 
+    if (!mock) {
+      CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost),
+                               "SyncCopyStream failed.");
+    }
+  }
+  return true;
+}
+
+void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) {
+  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
+  if (!mem_swap_manager_->trigger_swap()) {
+    return;
+  }
+  while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
+    device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
+  }
+
+  auto status = device_address->status();
+  switch (status) {
+    case DeviceAddressStatus::kInDevice:
+      break;
+    case DeviceAddressStatus::kInDeviceToHost: {
+      device_address->set_status(DeviceAddressStatus::kInDevice);
+      break;
+    }
+    case DeviceAddressStatus::kInHostToDevice: {
+      while (device_address->status() != DeviceAddressStatus::kInDevice) {
+        while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) {
+          device_address_swap_in->set_status(DeviceAddressStatus::kInDevice);
+        }
+      }
+      break;
+    }
+    case DeviceAddressStatus::kInHost:
+      MS_LOG(WARNING) << "Unexpected device address status: " << status;
+      break;
+    default:
+      MS_LOG(EXCEPTION) << "Invalid device address status: " << status;
+  }
+}
+
+void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) {
+  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
+  if (!mem_swap_manager_->trigger_swap()) {
+    return;
+  }
+  while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) {
+    if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) {
+      device_address_swap_out->set_status(DeviceAddressStatus::kInHost);
+      mem_manager_->FreeMemFromMemPool(device_address_swap_out);
+    }
+  }
+}
+
+void GPUKernelRuntime::ClearSwapInfo(bool mock) {
+  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
+  if (!mem_swap_manager_->trigger_swap()) {
+    return;
+  }
+  mem_swap_manager_->ClearSwapQueue(mock);
+  mem_swap_manager_->ResetHostAddrIsDirty();
+}
+
+bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) {
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  MS_EXCEPTION_IF_NULL(mem_swap_manager_);
+  auto ret = mem_manager_->MallocMemFromMemPool(device_address, size);
+  if (!ret) {
+    if (!mem_swap_manager_->trigger_swap()) {
+      return false;
+    }
+    if (!mock) {
+      mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost);
+    }
+    UpdateHostSwapOutQueue(mock);
+
+    ret = mem_manager_->MallocMemFromMemPool(device_address, size);
+    if (!ret) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
+                                             const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
+                                             AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs,
+                                             bool mock) {
+  if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) {
+    return false;
+  }
+  if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) {
+    return false;
+  }
+  if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) {
+    return false;
+  }
+  return true;
+}
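AttemptMallocMem above encodes the pool's fallback path: if the first pool allocation fails and swapping is enabled, synchronize the device-to-host copy stream, retire finished swap-outs (freeing their device blocks), and retry exactly once. A condensed sketch of that shape, with illustrative stand-in names:

  // Allocate-or-swap-retry pattern, condensed from AttemptMallocMem above.
  bool AllocWithSwapRetry(size_t size) {
    if (TryAlloc(size)) {
      return true;               // fast path: the pool had room
    }
    if (!swap_enabled) {
      return false;              // no swap configured; genuine out-of-memory
    }
    SyncCopyStream();            // wait for pending device->host copies
    DrainFinishedSwapOuts();     // mark swapped tensors kInHost, free device blocks
    return TryAlloc(size);       // single retry after reclaiming memory
  }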
+
+bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs,
+                                                  bool mock) {
+  MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(kernel_inputs);
+  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
+  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
+    DeviceAddressPtr device_address;
+    if (mem_reuse_util_->is_all_nop_node()) {
+      // Graph may consist entirely of nop nodes with none removed, so nop nodes cannot be skipped here.
+      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
+    } else {
+      // Graph may be "nop node + depend + node"; the input of the node is the depend, so skip nop nodes here.
+      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
+    }
+    MS_EXCEPTION_IF_NULL(device_address);
+    UpdateHostSwapInQueue(device_address, mock);
+    MS_EXCEPTION_IF_NULL(device_address->ptr_);
+    kernel::AddressPtr input = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(input);
+    input->addr = device_address->ptr_;
+    input->size = device_address->size_;
+    kernel_inputs->emplace_back(input);
+  }
+  return true;
+}
+
+bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
+                                                   const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_outputs,
+                                                   bool mock) {
+  MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(kernel_outputs);
+  UpdateHostSwapOutQueue(mock);
+  auto output_sizes = kernel_mod.GetOutputSizeList();
+  for (size_t i = 0; i < output_sizes.size(); ++i) {
+    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
+    MS_EXCEPTION_IF_NULL(device_address);
+    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
+      return false;
+    }
+    kernel::AddressPtr output = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(output);
+    output->addr = device_address->ptr_;
+    output->size = output_sizes[i];
+    kernel_outputs->emplace_back(output);
+  }
+  return true;
+}
+
+bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod,
+                                                      const mindspore::AnfNodePtr &kernel,
+                                                      AddressPtrList *kernel_workspaces, bool mock) {
+  MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(kernel_workspaces);
+  auto workspace_sizes = kernel_mod.GetWorkspaceSizeList();
+  for (size_t i = 0; i < workspace_sizes.size(); ++i) {
+    if (workspace_sizes[i] == 0) {
+      kernel_workspaces->emplace_back(nullptr);
+      continue;
+    }
+    auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
+    if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
+      return false;
+    }
+    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(workspace);
+    workspace->addr = device_address->ptr_;
+    workspace->size = workspace_sizes[i];
+    kernel_workspaces->emplace_back(workspace);
+  }
+  return true;
+}
+
+void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph *graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  auto &kernels = graph->execution_order();
+  for (auto &kernel : kernels) {
+    MS_EXCEPTION_IF_NULL(kernel);
+    if (AnfAlgo::IsCommunicationOp(kernel)) {
+      AllocCommunicationOpInputDynamicRes(kernel);
+      AllocCommunicationOpOutputDynamicRes(kernel);
+    }
+  }
+}
+
+void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
+  MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
+  bool is_need_alloc_memory = false;
+  bool is_need_free_memory = false;
+  size_t total_size = 0;
+  std::vector<size_t> size_list;
+  DeviceAddressPtrList addr_list;
+  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  auto input_sizes = kernel_mod->GetInputSizeList();
+  for (size_t i = 0; i < input_sizes.size(); ++i) {
+    DeviceAddressPtr device_address;
+    if (mem_reuse_util_->is_all_nop_node()) {
+      // Graph may consist entirely of nop nodes with none removed, so nop nodes cannot be skipped here.
+      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
+    } else {
+      // Graph may be "nop node + depend + node"; the input of the node is the depend, so skip nop nodes here.
+      device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true);
+    }
+    MS_EXCEPTION_IF_NULL(device_address);
+    if (device_address->ptr_ == nullptr) {
+      is_need_alloc_memory = true;
+    } else {
+      is_need_free_memory = true;
+    }
+    total_size += input_sizes[i];
+    size_list.emplace_back(input_sizes[i]);
+    addr_list.emplace_back(device_address);
+  }
+  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
+}
+
+void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
+  MS_EXCEPTION_IF_NULL(kernel);
+  bool is_need_alloc_memory = false;
+  bool is_need_free_memory = false;
+  size_t total_size = 0;
+  std::vector<size_t> size_list;
+  DeviceAddressPtrList addr_list;
+  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  auto output_sizes = kernel_mod->GetOutputSizeList();
+  for (size_t i = 0; i < output_sizes.size(); ++i) {
+    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
+    MS_EXCEPTION_IF_NULL(device_address);
+    if (device_address->ptr_ == nullptr) {
+      is_need_alloc_memory = true;
+    } else {
+      is_need_free_memory = true;
+    }
+    total_size += output_sizes[i];
+    size_list.emplace_back(output_sizes[i]);
+    addr_list.emplace_back(device_address);
+  }
+  AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
+}
+
+void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
+                                                  const DeviceAddressPtrList addr_list, size_t total_size,
+                                                  std::vector<size_t> size_list) {
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  if (!is_need_alloc_memory) {
+    return;
+  }
+  if (is_need_free_memory) {
+    for (const auto &iter : addr_list) {
+      MS_EXCEPTION_IF_NULL(iter);
+      // Free the inputs/outputs of communication kernel which are not released.
+      if (iter->ptr_ != nullptr) {
+        mem_manager_->FreeMemFromMemPool(iter);
+      }
+    }
+  }
+  auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
+  if (!ret) {
+    MS_LOG(EXCEPTION) << "Malloc device memory failed.";
+  }
+}
+
+void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) {
+  MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  MS_EXCEPTION_IF_NULL(mem_reuse_util_);
+  auto cnode = kernel->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  if (AnfAlgo::IsCommunicationOp(kernel)) {
+    return;
+  }
+  // Free the input of kernel by reference count.
+  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
+    auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i);
+    if (kernel_ref_count_ptr == nullptr) {
+      continue;
+    }
+    kernel_ref_count_ptr->ref_count_dynamic_use_--;
+    if (kernel_ref_count_ptr->ref_count_dynamic_use_ < 0) {
+      MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
+    }
+    if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
+      DeviceAddressPtr device_address;
+      if (mem_reuse_util_->is_all_nop_node()) {
+        // Graph may consist entirely of nop nodes with none removed, so nop nodes cannot be skipped here.
+        device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
+      } else {
+        // Graph may be "nop node + depend + node"; the input of the node is the depend, so skip nop nodes here.
+ device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); + } + mem_manager_->FreeMemFromMemPool(device_address); + device_address->set_status(DeviceAddressStatus::kInDevice); + } + } + // Free the output of kernel, if output has no reference. + for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) { + auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i); + if (kernel_ref_count_ptr == nullptr) { + continue; + } + if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) { + auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); + mem_manager_->FreeMemFromMemPool(device_address); + device_address->set_status(DeviceAddressStatus::kInDevice); + } + } + // Free the workspace of kernel. + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + for (size_t i = 0; i < kernel_mod->GetWorkspaceSizeList().size(); ++i) { + auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i); + MS_EXCEPTION_IF_NULL(device_address); + if (device_address->ptr_) { + mem_manager_->FreeMemFromMemPool(device_address); + } + } +} +} // namespace gpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h index 8ff8b773fb..714b8d82c5 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h @@ -1,112 +1,112 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_ -#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_ - -#include -#include -#include -#include -#include -#include -#include -#include "runtime/device/kernel_runtime.h" -#include "runtime/device/kernel_runtime_manager.h" -#include "backend/optimizer/mem_reuse/mem_swap_manager.h" - -namespace mindspore { -namespace device { -namespace gpu { -using mindspore::device::memswap::MemSwapManagerPtr; -class GPUKernelRuntime : public KernelRuntime { - public: - GPUKernelRuntime() = default; - ~GPUKernelRuntime() override = default; - bool Init() override; - void ReleaseDeviceRes() override; - void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector &inputs, - const std::unordered_set &value_nodes, - const std::vector &execution_order) override; - void AssignMemory(session::KernelGraph *graph) override; - bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override; -#ifdef ENABLE_DUMP_E2E - bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override; -#endif - - protected: - DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, - TypeId type_id) override; - bool SyncStream() override; - - private: - GPUKernelRuntime(const GPUKernelRuntime &); - GPUKernelRuntime &operator=(const GPUKernelRuntime &); - bool InitDevice(); - bool device_init_{false}; - - // The related functions and members for using dynamic memory pool. - void InitKernelRefCount(const session::KernelGraph *graph); - void InitKernelOutputAddress(const session::KernelGraph *graph); - void InitKernelWorkspaceAddress(const session::KernelGraph *graph); - void InitMemorySwapInfo(const session::KernelGraph *graph); - void SaveGraphOutputNode(const session::KernelGraph *graph); - bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const; - void ClearKernelOutputAddress(const session::KernelGraph *graph); - void ClearKernelWorkspaceAddress(const session::KernelGraph *graph); - void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph); - bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr); - bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr); - bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr); - bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false, - bool profiling = false); - void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs, - const AddressPtrList &workspace, const AddressPtrList &outputs); - bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock); - bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, - AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces, - AddressPtrList *kernel_outputs, bool mock); - bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock); - bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, - AddressPtrList *kernel_outputs, bool mock); - bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, - const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces, - bool mock); - void AllocCommunicationOpDynamicRes(const session::KernelGraph 
*graph); - void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel); - void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel); - void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory, - const DeviceAddressPtrList addr_list, size_t total_size, - std::vector size_list); - void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel); - bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling); - bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling); - void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock); - void UpdateHostSwapOutQueue(bool mock); - void ClearSwapInfo(bool mock); - std::unordered_map mem_reuse_util_map_; - std::unordered_map mem_swap_map_; - std::unordered_map is_first_step_map_; - std::unordered_map> graph_output_map_; - - MemReuseUtilPtr mem_reuse_util_{nullptr}; - MemSwapManagerPtr mem_swap_manager_{nullptr}; -}; -MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime); -} // namespace gpu -} // namespace device -} // namespace mindspore -#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_ +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "runtime/device/kernel_runtime.h" +#include "runtime/device/kernel_runtime_manager.h" +#include "backend/optimizer/mem_reuse/mem_swap_manager.h" + +namespace mindspore { +namespace device { +namespace gpu { +using mindspore::device::memswap::MemSwapManagerPtr; +class GPUKernelRuntime : public KernelRuntime { + public: + GPUKernelRuntime() = default; + ~GPUKernelRuntime() override = default; + bool Init() override; + void ReleaseDeviceRes() override; + void ClearGraphRuntimeResource(uint32_t graph_id, const std::vector &inputs, + const std::unordered_set &value_nodes, + const std::vector &execution_order) override; + void AssignMemory(session::KernelGraph *graph) override; + bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override; +#ifdef ENABLE_DUMP_E2E + bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override; +#endif + + protected: + DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, + TypeId type_id) override; + bool SyncStream() override; + + private: + GPUKernelRuntime(const GPUKernelRuntime &); + GPUKernelRuntime &operator=(const GPUKernelRuntime &); + bool InitDevice(); + bool device_init_{false}; + + // The related functions and members for using dynamic memory pool. 
+ void InitKernelRefCount(const session::KernelGraph *graph); + void InitKernelOutputAddress(const session::KernelGraph *graph); + void InitKernelWorkspaceAddress(const session::KernelGraph *graph); + void InitMemorySwapInfo(const session::KernelGraph *graph); + void SaveGraphOutputNode(const session::KernelGraph *graph); + bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const; + void ClearKernelOutputAddress(const session::KernelGraph *graph); + void ClearKernelWorkspaceAddress(const session::KernelGraph *graph); + void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph); + bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr); + bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr); + bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr); + bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false, + bool profiling = false); + void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs, + const AddressPtrList &workspace, const AddressPtrList &outputs); + bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock); + bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, + AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces, + AddressPtrList *kernel_outputs, bool mock); + bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock); + bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, + AddressPtrList *kernel_outputs, bool mock); + bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, + const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces, + bool mock); + void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph); + void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel); + void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel); + void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory, + const DeviceAddressPtrList addr_list, size_t total_size, + std::vector size_list); + void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel); + bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling); + bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling); + void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock); + void UpdateHostSwapOutQueue(bool mock); + void ClearSwapInfo(bool mock); + std::unordered_map mem_reuse_util_map_; + std::unordered_map mem_swap_map_; + std::unordered_map is_first_step_map_; + std::unordered_map> graph_output_map_; + + MemReuseUtilPtr mem_reuse_util_{nullptr}; + MemSwapManagerPtr mem_swap_manager_{nullptr}; +}; +MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime); +} // namespace gpu +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_KERNEL_RUNTIME_H_ diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index ec213d4189..d3cd2cda98 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -40,7 +40,7 @@ KernelRuntime::~KernelRuntime() { #endif } -bool 
KernelRuntime::Load(session::KernelGraph *graph) { return true; } +bool KernelRuntime::Load(session::KernelGraph *graph, bool is_task_sink) { return true; } bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) { if (graph != nullptr) { diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 5265f4666d..636b5c8884 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -59,8 +59,8 @@ class KernelRuntime { bool DumpDataEnabled(); bool DumpDataEnabledIteration(); virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr); - virtual bool Load(session::KernelGraph *graph); - virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) = 0; + virtual bool Load(session::KernelGraph *graph, bool is_task_sink); + virtual bool Run(session::KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0; bool LaunchKernel(const session::KernelGraph *graph); bool LaunchTaskBasedOnSingleKernel(kernel::KernelModPtr kernel_mod_ptr, const AddressPtrList &kernel_inputs, const AddressPtrList &kernel_outputs,
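The interface change above threads is_task_sink through the shared KernelRuntime contract, so every backend now runs through the same Execute path; backends without a task-sink mode simply ignore the flag. A minimal sketch of the resulting override relationship, with stand-in forward declarations rather than the real class definitions:

  // Simplified sketch of the unified interface after this patch.
  class KernelGraph;
  class Debugger;

  class KernelRuntime {
   public:
    virtual ~KernelRuntime() = default;
    // Default Load is a no-op; only backends with a task-sink path override it.
    virtual bool Load(KernelGraph *graph, bool is_task_sink) { return true; }
    virtual bool Run(KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) = 0;
  };

  class GPUKernelRuntime : public KernelRuntime {
   public:
    // The GPU backend has no task-sink mode, so callers pass false and the
    // flag goes unused; the real body dispatches to LaunchKernel(Dynamic).
    bool Run(KernelGraph *graph, bool is_task_sink, Debugger *debugger = nullptr) override {
      (void)is_task_sink;
      return true;
    }
  };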