| @@ -81,6 +81,14 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr | |||
| #endif | |||
| MS_LOG(INFO) << "Build kernel"; | |||
| BuildKernel(graph.get()); | |||
| // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph | |||
| auto execution_order = graph->execution_order(); | |||
| Reorder(&execution_order); | |||
| graph->set_execution_order(execution_order); | |||
| // runtime init | |||
| if (!runtime_.Init()) { | |||
| MS_LOG(EXCEPTION) << "Kernel runtime init error."; | |||
| } | |||
| MS_LOG(INFO) << "Assign kernel address"; | |||
| runtime_.AssignKernelAddress(graph.get()); | |||
| return graph_id; | |||
| @@ -116,11 +124,8 @@ void CPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor: | |||
| #endif | |||
| MS_LOG(INFO) << "Run graph start"; | |||
| auto execution_order = kernel_graph->execution_order(); | |||
| Reorder(&execution_order); | |||
| bool enable_summary = summary_callback_ != nullptr; | |||
| kernel_graph->set_execution_order(execution_order); | |||
| NamedSummaryOutputs summary_outputs; | |||
| if (enable_summary) { | |||
| SetSummaryNodes(kernel_graph.get()); | |||
| @@ -181,16 +186,21 @@ void CPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info, | |||
| auto kernel_graph = run_op_graphs_[graph_info]; | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph | |||
| auto execution_order = kernel_graph->execution_order(); | |||
| Reorder(&execution_order); | |||
| kernel_graph->set_execution_order(execution_order); | |||
| // runtime init | |||
| if (!runtime_.Init()) { | |||
| MS_LOG(EXCEPTION) << "Kernel runtime init error."; | |||
| } | |||
| runtime_.AssignKernelAddress(kernel_graph.get()); | |||
| std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node; | |||
| runtime_.CreateOutputTensors(kernel_graph.get(), *input_tensors, outputs, &tensor_to_node); | |||
| runtime_.BindInputOutput(kernel_graph.get(), *input_tensors, outputs); | |||
| MS_LOG(INFO) << "Run Op start"; | |||
| auto execution_order = kernel_graph->execution_order(); | |||
| Reorder(&execution_order); | |||
| kernel_graph->set_execution_order(execution_order); | |||
| bool ret = runtime_.Run(kernel_graph.get(), false); | |||
| if (!ret) { | |||
| @@ -24,6 +24,7 @@ | |||
| #include <exception> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/cpu_memory_manager.h" | |||
| #include "utils/ms_context.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/session/session_basic.h" | |||
| @@ -31,16 +32,47 @@ | |||
| #include "utils/shape_utils.h" | |||
| #include "utils/profile.h" | |||
| #include "utils/trace_base.h" | |||
| #ifdef MEM_REUSE_DEBUG | |||
| #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace cpu { | |||
// Prepares the CPU kernel runtime for use by creating its CPUMemoryManager.
// Returns true immediately when initialized_ is already set, so the memory manager is
// only created on the first call. Every path visible here returns true.
| bool CPUKernelRuntime::Init() { | |||
| if (initialized_) { | |||
| return true; | |||
| } | |||
| mem_manager_ = std::make_shared<CPUMemoryManager>(); | |||
// presumably raises if the shared_ptr is empty — MS_EXCEPTION_IF_NULL is defined elsewhere
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| initialized_ = true; | |||
| return true; | |||
| } | |||
| const size_t INIT_NODE_REF = 1; | |||
// Assigns addresses/memory for the nodes of kernel_graph.
// Value-node and input-node addresses are handled first; what follows depends on the
// MS_CTX_ENABLE_MEM_REUSE context flag, which is forced off when the execution mode is
// kPynativeMode.
// NOTE(review): this listing looks like a diff rendering with removed and added rows
// interleaved. The AssignKernelOutputAddress / resource_manager_.AssignMemory rows directly
// after AssignInputNodeAddress appear to be the pre-change body — confirm against the real file.
| void CPUKernelRuntime::AssignKernelAddress(session::KernelGraph *kernel_graph) { | |||
| AssignValueNodeAddress(kernel_graph); | |||
| AssignInputNodeAddress(kernel_graph); | |||
| AssignKernelOutputAddress(kernel_graph); | |||
| resource_manager_.AssignMemory(kernel_graph); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| bool is_enable_mem_reuse = context_ptr->get_param<bool>(MS_CTX_ENABLE_MEM_REUSE); | |||
| if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) { | |||
| // disable mem reuse for kPynativeMode | |||
| is_enable_mem_reuse = false; | |||
| } | |||
| if (is_enable_mem_reuse) { | |||
// Reuse path: hand previously issued dynamic blocks back to the manager's cache, then run
// AssignDynamicMemory for this graph.
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| mem_manager_->ResetDynamicMemory(); | |||
| AssignDynamicMemory(kernel_graph); | |||
| #ifdef MEM_REUSE_DEBUG | |||
| // Get normal graph ir for memreuse | |||
| mindspore::memreuse::MemReuseChecker::GetInstance().CheckNormalIR(kernel_graph); | |||
| #endif | |||
| } else { | |||
// No-reuse path: assign kernel output addresses, then let CPUMemoryManager::AssignMemory
// size and assign memory for the graph.
| AssignKernelOutputAddress(kernel_graph); | |||
| static_cast<CPUMemoryManager *>(mem_manager_.get())->AssignMemory(kernel_graph); | |||
| } | |||
| } | |||
| void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph) { | |||
| @@ -75,7 +107,7 @@ void CPUKernelRuntime::AssignValueNodeAddress(session::KernelGraph *kernel_graph | |||
| if (tensor->data_type() == output_type_id) { | |||
| address->ptr_ = tensor->data_c(); | |||
| } else { | |||
| address->ptr_ = resource_manager_.MemMalloc(tensor_size); | |||
| address->ptr_ = static_cast<CPUMemoryManager *>(mem_manager_.get())->StaticMemMalloc(tensor_size); | |||
| if (!address->SyncHostToDevice(data_shape, LongToSize(tensor->data().nbytes()), tensor->data_type(), | |||
| tensor->data_c())) { | |||
| MS_LOG(EXCEPTION) << "Value node sync host to device failed!"; | |||
| @@ -169,7 +201,7 @@ tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput( | |||
| size_t type_size = GetTypeByte(TypeIdToType(device_type_id)); | |||
| ShapeVector data_shape = tensor->shape(); | |||
| size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies<size_t>()); | |||
| address->ptr_ = resource_manager_.MemMalloc(tensor_size); | |||
| address->ptr_ = static_cast<CPUMemoryManager *>(mem_manager_.get())->StaticMemMalloc(tensor_size); | |||
| tensor->set_sync_status(kNeedSyncDeviceToHostImmediately); | |||
| } else { | |||
| tensor->set_sync_status(kNoNeedSync); | |||
| @@ -268,7 +300,7 @@ void CPUKernelRuntime::BindInputTensorAddressPtr(const session::KernelGraph &ker | |||
| ShapeVector data_shape = tensor->shape(); | |||
| size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), | |||
| GetTypeByte(TypeIdToType(address->type_id_)), std::multiplies<size_t>()); | |||
| address->ptr_ = resource_manager_.MemMalloc(tensor_size); | |||
| address->ptr_ = static_cast<CPUMemoryManager *>(mem_manager_.get())->StaticMemMalloc(tensor_size); | |||
| if (!address->SyncHostToDevice(data_shape, LongToSize(tensor->data().nbytes()), tensor->data_type(), | |||
| tensor->data_c())) { | |||
| MS_LOG(EXCEPTION) << "Parameter node sync host to device failed!"; | |||
| @@ -322,7 +354,7 @@ void CPUKernelRuntime::AddRuntimeAddress(DeviceAddress *address, std::vector<ker | |||
| kernel::AddressPtr input = std::make_shared<kernel::Address>(); | |||
| MS_EXCEPTION_IF_NULL(input); | |||
| if (address->ptr_ == nullptr) { | |||
| address->ptr_ = resource_manager_.MemMalloc(address->size_); | |||
| address->ptr_ = static_cast<CPUMemoryManager *>(mem_manager_.get())->StaticMemMalloc(address->size_); | |||
| } | |||
| MS_EXCEPTION_IF_NULL(address->ptr_); | |||
| input->addr = address->ptr_; | |||
| @@ -331,16 +363,16 @@ void CPUKernelRuntime::AddRuntimeAddress(DeviceAddress *address, std::vector<ker | |||
| } | |||
// Forwards summary_outputs to the memory manager's IncreaseSummaryRefCount.
// NOTE(review): two forwarding rows are shown (resource_manager_ and the CPUMemoryManager cast);
// they look like the removed and added sides of a diff — confirm which one is live.
| void CPUKernelRuntime::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { | |||
| resource_manager_.IncreaseSummaryRefCount(summary_outputs); | |||
| static_cast<CPUMemoryManager *>(mem_manager_.get())->IncreaseSummaryRefCount(summary_outputs); | |||
| } | |||
// Forwards summary_outputs to the memory manager's DecreaseSummaryRefCount.
// NOTE(review): two forwarding rows are shown (resource_manager_ and the CPUMemoryManager cast);
// they look like the removed and added sides of a diff — confirm which one is live.
| void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { | |||
| resource_manager_.DecreaseSummaryRefCount(summary_outputs); | |||
| static_cast<CPUMemoryManager *>(mem_manager_.get())->DecreaseSummaryRefCount(summary_outputs); | |||
| } | |||
| bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink) { | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| resource_manager_.IncreaseAddressRefCount(kernel_graph); | |||
| static_cast<CPUMemoryManager *>(mem_manager_.get())->IncreaseAddressRefCount(kernel_graph); | |||
| auto kernels = kernel_graph->execution_order(); | |||
| for (const auto &kernel : kernels) { | |||
| @@ -381,7 +413,7 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Launch kernel failed. Trace:" << trace::DumpSourceLines(kernel); | |||
| } | |||
| resource_manager_.DecreaseAddressRefCount(kernel); | |||
| static_cast<CPUMemoryManager *>(mem_manager_.get())->DecreaseAddressRefCount(kernel); | |||
| #ifdef ENABLE_PROFILE | |||
| double cost_time = GetTime() - start_time; | |||
| MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us"; | |||
| @@ -24,7 +24,6 @@ | |||
| #include "runtime/device/kernel_runtime.h" | |||
| #include "backend/session/kernel_graph.h" | |||
| #include "backend/session/session_basic.h" | |||
| #include "runtime/device/cpu/cpu_resource_manager.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/any.h" | |||
| namespace mindspore { | |||
| @@ -35,7 +34,7 @@ class CPUKernelRuntime : public KernelRuntime { | |||
| CPUKernelRuntime() = default; | |||
| ~CPUKernelRuntime() override = default; | |||
| bool Init() override { return true; } | |||
| bool Init(); | |||
| bool Run(session::KernelGraph *graph, bool is_task_sink) override; | |||
| void AssignKernelAddress(session::KernelGraph *kernel_graph); | |||
| void CreateOutputTensors(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs, | |||
| @@ -63,9 +62,9 @@ class CPUKernelRuntime : public KernelRuntime { | |||
| void AssignInputNodeAddress(const session::KernelGraph *kernel_graph); | |||
| void AssignKernelOutputAddress(const session::KernelGraph *kernel_graph); | |||
| void AddRuntimeAddress(DeviceAddress *address, std::vector<kernel::AddressPtr> *input_list); | |||
| CPUResourceManager resource_manager_; | |||
| std::set<DeviceAddressPtr> bound_addresses_; | |||
| std::map<AnfNodePtr, tensor::TensorPtr> input_param_tensor_map_; | |||
| bool initialized_{false}; | |||
| }; | |||
| } // namespace cpu | |||
| } // namespace device | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,28 +13,90 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "runtime/device/cpu/cpu_resource_manager.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/device/cpu/cpu_memory_manager.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/ms_context.h" | |||
| #include "utils/convert_utils.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace cpu { | |||
| CPUResourceManager::~CPUResourceManager() { MemFree(); } | |||
| void CPUResourceManager::MemFree() { | |||
// Allocates `size` bytes with malloc, zero-fills them and records the block in static_mem_
// (pointer -> size) before returning it as uint8_t *.
// The second, unnamed bool parameter is ignored by this implementation.
// When malloc returns nullptr the failure is reported through MS_LOG(EXCEPTION); that branch
// has no return statement.
| uint8_t *CPUMemoryManager::MallocStaticMem(size_t size, bool) { | |||
| void *ptr = malloc(size); | |||
| if (ptr != nullptr) { | |||
| memset_s(ptr, size, 0, size); | |||
| static_mem_[ptr] = size; | |||
| return reinterpret_cast<uint8_t *>(ptr); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Malloc memory failed: size " << size; | |||
| } | |||
| } | |||
// Returns a zero-filled block of at least `size` bytes, preferring a block from cached_mem_
// over a fresh malloc. The second, unnamed bool parameter is ignored.
// Whichever block is returned is recorded in dynamic_mem_. When malloc fails the error is
// reported through MS_LOG(EXCEPTION); that branch has no return statement.
| uint8_t *CPUMemoryManager::MallocDynamicMem(size_t size, bool) { | |||
| void *ptr = nullptr; | |||
// min_size == 0 doubles as the "no candidate chosen yet" marker in the scan below.
| size_t min_size = 0; | |||
| // first find the smallest cached_mem_ which fits the size | |||
| for (auto &&iter : cached_mem_) { | |||
| if (iter.second >= size) { | |||
| if (min_size == 0) { | |||
| ptr = iter.first; | |||
| min_size = iter.second; | |||
| } else if (iter.second < min_size) { | |||
| ptr = iter.first; | |||
| min_size = iter.second; | |||
| } | |||
| } | |||
| } | |||
| if (ptr != nullptr) { | |||
// Reuse the cached block: only the first `size` bytes are zeroed, but the block is tracked
// with its full cached size (min_size) and removed from the cache.
| memset_s(ptr, size, 0, size); | |||
| dynamic_mem_[ptr] = min_size; | |||
| (void)cached_mem_.erase(ptr); | |||
| return reinterpret_cast<uint8_t *>(ptr); | |||
| } | |||
| // if not found, malloc | |||
| ptr = malloc(size); | |||
| if (ptr != nullptr) { | |||
| memset_s(ptr, size, 0, size); | |||
| dynamic_mem_[ptr] = size; | |||
| return reinterpret_cast<uint8_t *>(ptr); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Malloc memory failed: size " << size; | |||
| } | |||
| } | |||
// Moves every block tracked in dynamic_mem_ into cached_mem_ (keeping its recorded size) and
// then empties dynamic_mem_. Nothing is freed here; the blocks remain allocated so that
// MallocDynamicMem can hand them out again.
| void CPUMemoryManager::ResetDynamicMemory() { | |||
| // don't free, for multi graph | |||
| for (auto &&iter : dynamic_mem_) { | |||
| cached_mem_[iter.first] = iter.second; | |||
| } | |||
| dynamic_mem_.clear(); | |||
| } | |||
// Destructor: releases all memory still tracked by this manager via the no-argument MemFree().
| CPUMemoryManager::~CPUMemoryManager() { MemFree(); } | |||
// Frees everything this manager tracks: the buffer at mem_ptr_ (when set) and every block
// recorded in static_mem_, dynamic_mem_ and cached_mem_. Each map is cleared after its blocks
// are freed, and mem_ptr_/mem_size_ are reset.
| void CPUMemoryManager::MemFree() { | |||
| if (mem_ptr_ != nullptr) { | |||
| free(mem_ptr_); | |||
| mem_ptr_ = nullptr; | |||
| mem_size_ = 0; | |||
| } | |||
| for (auto &&iter : static_mem_) { | |||
| free(iter.first); | |||
| } | |||
| static_mem_.clear(); | |||
| for (auto &&iter : dynamic_mem_) { | |||
| free(iter.first); | |||
| } | |||
| dynamic_mem_.clear(); | |||
| for (auto &&iter : cached_mem_) { | |||
| free(iter.first); | |||
| } | |||
| cached_mem_.clear(); | |||
| } | |||
| void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) { | |||
| void CPUMemoryManager::AssignMemory(const session::KernelGraph *graph) { | |||
| size_t graph_mem_size = mem_plan_.MemPlan(graph); | |||
| if (graph_mem_size > mem_size_) { | |||
| if (mem_size_ > 0) { | |||
| @@ -43,6 +105,7 @@ void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) { | |||
| } | |||
| mem_ptr_ = reinterpret_cast<uint8_t *>(malloc(graph_mem_size)); | |||
| if (mem_ptr_ != nullptr) { | |||
| MS_LOG(INFO) << "Simple MemPlan GraphMemSize [" << graph_mem_size << "]"; | |||
| mem_size_ = graph_mem_size; | |||
| dynamic_malloc_ = false; | |||
| } else { | |||
| @@ -56,26 +119,26 @@ void CPUResourceManager::AssignMemory(const session::KernelGraph *graph) { | |||
| mem_plan_.MemAssign(graph, mem_ptr_); | |||
| } | |||
// Allocates mem_size bytes with malloc, zero-fills them, records the block in a tracking map
// and returns the raw pointer. A malloc failure is reported through MS_LOG(EXCEPTION); that
// branch has no return statement.
// NOTE(review): two signature rows (CPUResourceManager::MemMalloc and
// CPUMemoryManager::StaticMemMalloc) and two bookkeeping rows (dynamic_mem_ and static_mem_)
// appear here; they look like the removed and added sides of a diff — confirm which applies.
| void *CPUResourceManager::MemMalloc(size_t mem_size) { | |||
| void *CPUMemoryManager::StaticMemMalloc(size_t mem_size) { | |||
| void *ptr = malloc(mem_size); | |||
| if (ptr != nullptr) { | |||
| memset_s(ptr, mem_size, 0, mem_size); | |||
| dynamic_mem_[ptr] = mem_size; | |||
| static_mem_[ptr] = mem_size; | |||
| return ptr; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Malloc memory failed: size " << mem_size; | |||
| } | |||
| } | |||
// Frees a single block, but only when ptr is found in the tracking map; the map entry is
// erased before free(ptr) is called. Pointers that are not tracked are left untouched.
// NOTE(review): the opening rows are shown twice (CPUResourceManager with dynamic_mem_, then
// CPUMemoryManager with static_mem_); they look like the removed and added sides of a diff —
// confirm which map the live code consults.
| void CPUResourceManager::MemFree(void *ptr) { | |||
| auto iter = dynamic_mem_.find(ptr); | |||
| if (iter != dynamic_mem_.end()) { | |||
| (void)dynamic_mem_.erase(iter); | |||
| void CPUMemoryManager::MemFree(void *ptr) { | |||
| auto iter = static_mem_.find(ptr); | |||
| if (iter != static_mem_.end()) { | |||
| (void)static_mem_.erase(iter); | |||
| free(ptr); | |||
| } | |||
| } | |||
| void CPUResourceManager::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { | |||
| void CPUMemoryManager::IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { | |||
| if (!dynamic_malloc_) { | |||
| return; | |||
| } | |||
| @@ -93,7 +156,7 @@ void CPUResourceManager::IncreaseSummaryRefCount(const session::NamedSummaryOutp | |||
| } | |||
| } | |||
| void CPUResourceManager::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { | |||
| void CPUMemoryManager::DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs) { | |||
| if (!dynamic_malloc_) { | |||
| return; | |||
| } | |||
| @@ -115,7 +178,7 @@ void CPUResourceManager::DecreaseSummaryRefCount(const session::NamedSummaryOutp | |||
| } | |||
| } | |||
| void CPUResourceManager::IncreaseAddressRefCount(const session::KernelGraph *graph) { | |||
| void CPUMemoryManager::IncreaseAddressRefCount(const session::KernelGraph *graph) { | |||
| if (!dynamic_malloc_) { | |||
| return; | |||
| } | |||
| @@ -140,7 +203,7 @@ void CPUResourceManager::IncreaseAddressRefCount(const session::KernelGraph *gra | |||
| } | |||
| } | |||
| void CPUResourceManager::DecreaseAddressRefCount(const AnfNodePtr &kernel) { | |||
| void CPUMemoryManager::DecreaseAddressRefCount(const AnfNodePtr &kernel) { | |||
| if (!dynamic_malloc_) { | |||
| return; | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,31 +13,40 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_ | |||
| #define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_ | |||
| #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_ | |||
| #define MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_ | |||
| #include <vector> | |||
| #include <map> | |||
| #include "backend/session/kernel_graph.h" | |||
| #include "backend/session/session_basic.h" | |||
| #include "runtime/device/device_address.h" | |||
| #include "runtime/device/memory_manager.h" | |||
| #include "runtime/device/cpu/cpu_simple_mem_plan.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace cpu { | |||
| class CPUResourceManager { | |||
| class CPUMemoryManager : public MemoryManager { | |||
| public: | |||
| CPUResourceManager() = default; | |||
| ~CPUResourceManager(); | |||
| CPUMemoryManager() = default; | |||
| virtual ~CPUMemoryManager(); | |||
| void MallocDeviceMemory() override {} | |||
| void FreeDeviceMemory() override {} | |||
| void ResetDynamicMemory() override; | |||
| void AssignMemory(const session::KernelGraph *graph); | |||
| void IncreaseAddressRefCount(const session::KernelGraph *graph); | |||
| void DecreaseAddressRefCount(const AnfNodePtr &kernel); | |||
| void *MemMalloc(size_t mem_size); | |||
| void *StaticMemMalloc(size_t mem_size); | |||
| void MemFree(void *ptr); | |||
| void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); | |||
| void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); | |||
| protected: | |||
| uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; | |||
| uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override; | |||
| private: | |||
| void MemFree(); | |||
| CPUSimpleMemPlan mem_plan_; | |||
| @@ -46,9 +55,10 @@ class CPUResourceManager { | |||
| uint8_t *mem_ptr_{nullptr}; | |||
| bool dynamic_malloc_{false}; | |||
| std::map<void *, size_t> dynamic_mem_; | |||
| std::map<void *, size_t> static_mem_; | |||
| std::map<void *, size_t> cached_mem_; | |||
| }; | |||
| } // namespace cpu | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_RESOURCE_MANAGER_H_ | |||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_CPU_CPU_MEMORY_MANAGER_H_ | |||
| @@ -28,7 +28,7 @@ namespace mindspore { | |||
| namespace device { | |||
| namespace cpu { | |||
| class CPUSimpleMemPlan; | |||
| class CPUResourceManager; | |||
| class CPUMemoryManager; | |||
| class CPUKernelRuntime; | |||
| } // namespace cpu | |||
| namespace ascend { | |||
| @@ -93,7 +93,7 @@ class DeviceAddress : public mindspore::DeviceSync { | |||
| friend class MemoryManager; | |||
| friend class mindspore::device::ascend::tasksink::TaskGenerator; | |||
| friend class mindspore::device::cpu::CPUSimpleMemPlan; | |||
| friend class mindspore::device::cpu::CPUResourceManager; | |||
| friend class mindspore::device::cpu::CPUMemoryManager; | |||
| friend class mindspore::device::cpu::CPUKernelRuntime; | |||
| friend class mindspore::device::gpu::GPUKernelRuntime; | |||
| friend class mindspore::device::gpu::GPUMemoryManager; | |||