From 2078b7156f2b35ea9ac88d28fee66f1b65fadcce Mon Sep 17 00:00:00 2001 From: yitongh Date: Thu, 24 Dec 2020 04:01:01 +0000 Subject: [PATCH] Add support for CPU memory reuse --- .../ccsrc/backend/session/cpu_session.cc | 24 +++-- .../runtime/device/cpu/cpu_kernel_runtime.cc | 52 ++++++++-- .../runtime/device/cpu/cpu_kernel_runtime.h | 5 +- ...ource_manager.cc => cpu_memory_manager.cc} | 95 +++++++++++++++---- ...esource_manager.h => cpu_memory_manager.h} | 28 ++++-- .../ccsrc/runtime/device/device_address.h | 4 +- 6 files changed, 161 insertions(+), 47 deletions(-) rename mindspore/ccsrc/runtime/device/cpu/{cpu_resource_manager.cc => cpu_memory_manager.cc} (61%) rename mindspore/ccsrc/runtime/device/cpu/{cpu_resource_manager.h => cpu_memory_manager.h} (64%) diff --git a/mindspore/ccsrc/backend/session/cpu_session.cc b/mindspore/ccsrc/backend/session/cpu_session.cc index 2787116251..f313d9746a 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.cc +++ b/mindspore/ccsrc/backend/session/cpu_session.cc @@ -81,6 +81,14 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr #endif MS_LOG(INFO) << "Build kernel"; BuildKernel(graph.get()); + // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph + auto execution_order = graph->execution_order(); + Reorder(&execution_order); + graph->set_execution_order(execution_order); + // runtime init + if (!runtime_.Init()) { + MS_LOG(EXCEPTION) << "Kernel runtime init error."; + } MS_LOG(INFO) << "Assign kernel address"; runtime_.AssignKernelAddress(graph.get()); return graph_id; @@ -116,11 +124,8 @@ void CPUSession::RunGraphImpl(const GraphId &graph_id, const std::vectorexecution_order(); - Reorder(&execution_order); bool enable_summary = summary_callback_ != nullptr; - kernel_graph->set_execution_order(execution_order); NamedSummaryOutputs summary_outputs; if (enable_summary) { SetSummaryNodes(kernel_graph.get()); @@ -181,16 +186,21 @@ void CPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info, auto kernel_graph = run_op_graphs_[graph_info]; MS_EXCEPTION_IF_NULL(kernel_graph); + // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph + auto execution_order = kernel_graph->execution_order(); + Reorder(&execution_order); + kernel_graph->set_execution_order(execution_order); + + // runtime init + if (!runtime_.Init()) { + MS_LOG(EXCEPTION) << "Kernel runtime init error."; + } runtime_.AssignKernelAddress(kernel_graph.get()); std::map tensor_to_node; runtime_.CreateOutputTensors(kernel_graph.get(), *input_tensors, outputs, &tensor_to_node); runtime_.BindInputOutput(kernel_graph.get(), *input_tensors, outputs); MS_LOG(INFO) << "Run Op start"; - auto execution_order = kernel_graph->execution_order(); - Reorder(&execution_order); - - kernel_graph->set_execution_order(execution_order); bool ret = runtime_.Run(kernel_graph.get(), false); if (!ret) { diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc index 3c56162763..f35999209d 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.cc @@ -24,6 +24,7 @@ #include #include "backend/kernel_compiler/kernel.h" #include "runtime/device/cpu/cpu_device_address.h" +#include "runtime/device/cpu/cpu_memory_manager.h" #include "utils/ms_context.h" #include "backend/session/anf_runtime_algorithm.h" #include "backend/session/session_basic.h" @@ -31,16 +32,47 @@ #include "utils/shape_utils.h" #include "utils/profile.h" #include "utils/trace_base.h" +#ifdef MEM_REUSE_DEBUG +#include "backend/optimizer/mem_reuse/mem_reuse_checker.h" +#endif namespace mindspore { namespace device { namespace cpu { + +bool CPUKernelRuntime::Init() { + if (initialized_) { + return true; + } + mem_manager_ = std::make_shared(); + MS_EXCEPTION_IF_NULL(mem_manager_); + initialized_ = true; + return true; +} + const size_t INIT_NODE_REF = 1; void CPUKernelRuntime::AssignKernelAddress(session::KernelGraph *kernel_graph) { AssignValueNodeAddress(kernel_graph); AssignInputNodeAddress(kernel_graph); - AssignKernelOutputAddress(kernel_graph); - resource_manager_.AssignMemory(kernel_graph); + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool is_enable_mem_reuse = context_ptr->get_param(MS_CTX_ENABLE_MEM_REUSE); + if (context_ptr->get_param(MS_CTX_EXECUTION_MODE) == kPynativeMode) { + // disable mem reuse for kPynativeMode + is_enable_mem_reuse = false; + } + if (is_enable_mem_reuse) { + MS_EXCEPTION_IF_NULL(mem_manager_); + mem_manager_->ResetDynamicMemory(); + AssignDynamicMemory(kernel_graph); +#ifdef MEM_REUSE_DEBUG + // Get normal graph ir for memreuse + mindspore::memreuse::MemReuseChecker::GetInstance().CheckNormalIR(kernel_graph); +#endif + } else { + AssignKernelOutputAddress(kernel_graph); + static_cast(mem_manager_.get())->AssignMemory(kernel_graph); + } } void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph) { @@ -75,7 +107,7 @@ void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph if (tensor->data_type() == output_type_id) { address->ptr_ = tensor->data_c(); } else { - address->ptr_ = resource_manager_.MemMalloc(tensor_size); + address->ptr_ = static_cast(mem_manager_.get())->StaticMemMalloc(tensor_size); if (!address->SyncHostToDevice(data_shape, LongToSize(tensor->data().nbytes()), tensor->data_type(), tensor->data_c())) { MS_LOG(EXCEPTION) << "Value node sync host to device failed!"; @@ -169,7 +201,7 @@ tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput( size_t type_size = GetTypeByte(TypeIdToType(device_type_id)); ShapeVector data_shape = tensor->shape(); size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies()); - address->ptr_ = resource_manager_.MemMalloc(tensor_size); + address->ptr_ = static_cast(mem_manager_.get())->StaticMemMalloc(tensor_size); tensor->set_sync_status(kNeedSyncDeviceToHostImmediately); } else { tensor->set_sync_status(kNoNeedSync); @@ -268,7 +300,7 @@ void CPUKernelRuntime::BindInputTensorAddressPtr(const session::KernelGraph &ker ShapeVector data_shape = tensor->shape(); size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), GetTypeByte(TypeIdToType(address->type_id_)), std::multiplies()); - address->ptr_ = resource_manager_.MemMalloc(tensor_size); + address->ptr_ = static_cast(mem_manager_.get())->StaticMemMalloc(tensor_size); if (!address->SyncHostToDevice(data_shape, LongToSize(tensor->data().nbytes()), tensor->data_type(), tensor->data_c())) { MS_LOG(EXCEPTION) << "Parameter node sync host to device failed!"; @@ -322,7 +354,7 @@ void CPUKernelRuntime::AddRuntimeAddress(DeviceAddress *address, std::vector(); MS_EXCEPTION_IF_NULL(input); if (address->ptr_ == nullptr) { - address->ptr_ = resource_manager_.MemMalloc(address->size_); + address->ptr_ = static_cast(mem_manager_.get())->StaticMemMalloc(address->size_); } MS_EXCEPTION_IF_NULL(address->ptr_); input->addr = address->ptr_; @@ -331,16 +363,16 @@ void CPUKernelRuntime::AddRuntimeAddress(DeviceAddress *address, std::vector(mem_manager_.get())->IncreaseSummaryRefCount(summary_outputs); } void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { - resource_manager_.DecreaseSummaryRefCount(summary_outputs); + static_cast(mem_manager_.get())->DecreaseSummaryRefCount(summary_outputs); } bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink) { MS_EXCEPTION_IF_NULL(kernel_graph); - resource_manager_.IncreaseAddressRefCount(kernel_graph); + static_cast(mem_manager_.get())->IncreaseAddressRefCount(kernel_graph); auto kernels = kernel_graph->execution_order(); for (const auto &kernel : kernels) { @@ -381,7 +413,7 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink if (!ret) { MS_LOG(EXCEPTION) << "Launch kernel failed. Trace:" << trace::DumpSourceLines(kernel); } - resource_manager_.DecreaseAddressRefCount(kernel); + static_cast(mem_manager_.get())->DecreaseAddressRefCount(kernel); #ifdef ENABLE_PROFILE double cost_time = GetTime() - start_time; MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us"; diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h index 4574bb4fd9..d1581f5e50 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h @@ -24,7 +24,6 @@ #include "runtime/device/kernel_runtime.h" #include "backend/session/kernel_graph.h" #include "backend/session/session_basic.h" -#include "runtime/device/cpu/cpu_resource_manager.h" #include "backend/session/anf_runtime_algorithm.h" #include "utils/any.h" namespace mindspore { @@ -35,7 +34,7 @@ class CPUKernelRuntime : public KernelRuntime { CPUKernelRuntime() = default; ~CPUKernelRuntime() override = default; - bool Init() override { return true; } + bool Init(); bool Run(session::KernelGraph *graph, bool is_task_sink) override; void AssignKernelAddress(session::KernelGraph *kernel_graph); void CreateOutputTensors(session::KernelGraph *kernel_graph, const std::vector &inputs, @@ -63,9 +62,9 @@ class CPUKernelRuntime : public KernelRuntime { void AssignInputNodeAddress(const session::KernelGraph *kernel_graph); void AssignKernelOutputAddress(const session::KernelGraph *kernel_graph); void AddRuntimeAddress(DeviceAddress *address, std::vector *input_list); - CPUResourceManager resource_manager_; std::set bound_addresses_; std::map input_param_tensor_map_; + bool initialized_{false}; }; } // namespace cpu } // namespace device diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.cc b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc similarity index 61% rename from mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.cc rename to mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc index f8917893f8..c1c5008717 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.cc +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,28 +13,90 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "runtime/device/cpu/cpu_resource_manager.h" -#include "backend/session/anf_runtime_algorithm.h" +#include "runtime/device/cpu/cpu_memory_manager.h" +#include "backend/session/anf_runtime_algorithm.h" +#include "utils/ms_context.h" +#include "utils/convert_utils.h" namespace mindspore { namespace device { namespace cpu { -CPUResourceManager::~CPUResourceManager() { MemFree(); } -void CPUResourceManager::MemFree() { +uint8_t *CPUMemoryManager::MallocStaticMem(size_t size, bool) { + void *ptr = malloc(size); + if (ptr != nullptr) { + memset_s(ptr, size, 0, size); + static_mem_[ptr] = size; + return reinterpret_cast(ptr); + } else { + MS_LOG(EXCEPTION) << "Malloc memory failed: size " << size; + } +} + +uint8_t *CPUMemoryManager::MallocDynamicMem(size_t size, bool) { + void *ptr = nullptr; + size_t min_size = 0; + // first find the smallest cached_mem_ which fits the size + for (auto &&iter : cached_mem_) { + if (iter.second >= size) { + if (min_size == 0) { + ptr = iter.first; + min_size = iter.second; + } else if (iter.second < min_size) { + ptr = iter.first; + min_size = iter.second; + } + } + } + if (ptr != nullptr) { + memset_s(ptr, size, 0, size); + dynamic_mem_[ptr] = min_size; + (void)cached_mem_.erase(ptr); + return reinterpret_cast(ptr); + } + // if not found, malloc + ptr = malloc(size); + if (ptr != nullptr) { + memset_s(ptr, size, 0, size); + dynamic_mem_[ptr] = size; + return reinterpret_cast(ptr); + } else { + MS_LOG(EXCEPTION) << "Malloc memory failed: size " << size; + } +} + +void CPUMemoryManager::ResetDynamicMemory() { + // don't free, for multi graph + for (auto &&iter : dynamic_mem_) { + cached_mem_[iter.first] = iter.second; + } + dynamic_mem_.clear(); +} + +CPUMemoryManager::~CPUMemoryManager() { MemFree(); } + +void CPUMemoryManager::MemFree() { if (mem_ptr_ != nullptr) { free(mem_ptr_); mem_ptr_ = nullptr; mem_size_ = 0; } + for (auto &&iter : static_mem_) { + free(iter.first); + } + static_mem_.clear(); for (auto &&iter : dynamic_mem_) { free(iter.first); } dynamic_mem_.clear(); + for (auto &&iter : cached_mem_) { + free(iter.first); + } + cached_mem_.clear(); } -void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) { +void CPUMemoryManager::AssignMemory(const session::KernelGraph *graph) { size_t graph_mem_size = mem_plan_.MemPlan(graph); if (graph_mem_size > mem_size_) { if (mem_size_ > 0) { @@ -43,6 +105,7 @@ void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) { } mem_ptr_ = reinterpret_cast(malloc(graph_mem_size)); if (mem_ptr_ != nullptr) { + MS_LOG(INFO) << "Simple MemPlan GraphMemSize [" << graph_mem_size << "]"; mem_size_ = graph_mem_size; dynamic_malloc_ = false; } else { @@ -56,26 +119,26 @@ void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) { mem_plan_.MemAssign(graph, mem_ptr_); } -void *CPUResourceManager::MemMalloc(size_t mem_size) { +void *CPUMemoryManager::StaticMemMalloc(size_t mem_size) { void *ptr = malloc(mem_size); if (ptr != nullptr) { memset_s(ptr, mem_size, 0, mem_size); - dynamic_mem_[ptr] = mem_size; + static_mem_[ptr] = mem_size; return ptr; } else { MS_LOG(EXCEPTION) << "Malloc memory failed: size " << mem_size; } } -void CPUResourceManager::MemFree(void *ptr) { - auto iter = dynamic_mem_.find(ptr); - if (iter != dynamic_mem_.end()) { - (void)dynamic_mem_.erase(iter); +void CPUMemoryManager::MemFree(void *ptr) { + auto iter = static_mem_.find(ptr); + if (iter != static_mem_.end()) { + (void)static_mem_.erase(iter); free(ptr); } } -void CPUResourceManager::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { +void CPUMemoryManager::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { if (!dynamic_malloc_) { return; } @@ -93,7 +156,7 @@ void CPUResourceManager::IncreaseSummaryRefCount(const session::NamedSummaryOutp } } -void CPUResourceManager::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { +void CPUMemoryManager::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { if (!dynamic_malloc_) { return; } @@ -115,7 +178,7 @@ void CPUResourceManager::DecreaseSummaryRefCount(const session::NamedSummaryOutp } } -void CPUResourceManager::IncreaseAddressRefCount(const session::KernelGraph *graph) { +void CPUMemoryManager::IncreaseAddressRefCount(const session::KernelGraph *graph) { if (!dynamic_malloc_) { return; } @@ -140,7 +203,7 @@ void CPUResourceManager::IncreaseAddressRefCount(const session::KernelGraph *gra } } -void CPUResourceManager::DecreaseAddressRefCount(const AnfNodePtr &kernel) { +void CPUMemoryManager::DecreaseAddressRefCount(const AnfNodePtr &kernel) { if (!dynamic_malloc_) { return; } diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.h b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h similarity index 64% rename from mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.h rename to mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h index 5e476cac69..08f0052b7a 100644 --- a/mindspore/ccsrc/runtime/device/cpu/cpu_resource_manager.h +++ b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,31 +13,40 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_ -#define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_ +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_ +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_ #include #include #include "backend/session/kernel_graph.h" #include "backend/session/session_basic.h" #include "runtime/device/device_address.h" +#include "runtime/device/memory_manager.h" #include "runtime/device/cpu/cpu_simple_mem_plan.h" namespace mindspore { namespace device { namespace cpu { -class CPUResourceManager { +class CPUMemoryManager : public MemoryManager { public: - CPUResourceManager() = default; - ~CPUResourceManager(); + CPUMemoryManager() = default; + virtual ~CPUMemoryManager(); + + void MallocDeviceMemory() override {} + void FreeDeviceMemory() override {} + void ResetDynamicMemory() override; void AssignMemory(const session::KernelGraph *graph); void IncreaseAddressRefCount(const session::KernelGraph *graph); void DecreaseAddressRefCount(const AnfNodePtr &kernel); - void *MemMalloc(size_t mem_size); + void *StaticMemMalloc(size_t mem_size); void MemFree(void *ptr); void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); + protected: + uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; + uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override; + private: void MemFree(); CPUSimpleMemPlan mem_plan_; @@ -46,9 +55,10 @@ class CPUResourceManager { uint8_t *mem_ptr_{nullptr}; bool dynamic_malloc_{false}; std::map dynamic_mem_; + std::map static_mem_; + std::map cached_mem_; }; } // namespace cpu } // namespace device } // namespace mindspore - -#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_ +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_ diff --git a/mindspore/ccsrc/runtime/device/device_address.h b/mindspore/ccsrc/runtime/device/device_address.h index 7d32d11af5..8de942fd8b 100644 --- a/mindspore/ccsrc/runtime/device/device_address.h +++ b/mindspore/ccsrc/runtime/device/device_address.h @@ -28,7 +28,7 @@ namespace mindspore { namespace device { namespace cpu { class CPUSimpleMemPlan; -class CPUResourceManager; +class CPUMemoryManager; class CPUKernelRuntime; } // namespace cpu namespace ascend { @@ -93,7 +93,7 @@ class DeviceAddress : public mindspore::DeviceSync { friend class MemoryManager; friend class mindspore::device::ascend::tasksink::TaskGenerator; friend class mindspore::device::cpu::CPUSimpleMemPlan; - friend class mindspore::device::cpu::CPUResourceManager; + friend class mindspore::device::cpu::CPUMemoryManager; friend class mindspore::device::cpu::CPUKernelRuntime; friend class mindspore::device::gpu::GPUKernelRuntime; friend class mindspore::device::gpu::GPUMemoryManager;