@@ -132,6 +132,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "kernel/kash/*.cc"
        "device/kernel_info.cc"
        "device/kernel_runtime.cc"
        "device/memory_manager.cc"
        "device/kernel_runtime_manager.cc"
        "device/convert_tensor_utils.cc"
        "pre_activate/common/*.cc"
@@ -37,6 +37,7 @@
#include "kernel/tbe/tbe_utils.h"
#include "kernel/tbe/tbe_python_funcs.h"
#include "pre_activate/mem_reuse/mem_reuse_checker.h"
#include "device/ascend/ascend_memory_manager.h"
using mindspore::device::ascend::ProfilingManager;
using mindspore::device::ascend::ProfilingUtils;
@@ -47,8 +48,6 @@ using std::vector;
namespace mindspore {
namespace device {
namespace ascend {
static const uint64_t ASCEND_MEM_SIZE = 20;
static const uint64_t ASCEND_MEM_SIZE_BYTE = (ASCEND_MEM_SIZE << 30);
static const size_t PRAMATER_OUTPUT_INDEX = 0;
AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); }
@@ -86,7 +85,8 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
    MS_EXCEPTION(DeviceProcessError) << "rtSetDevice, ret[" << static_cast<int>(ret) << "]";
  }
  FreeDeviceMemory();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->FreeDeviceMemory();
  (void)DestroyHccl();
  (void)ResetDevice();
  (void)ProfilingManager::GetInstance().StopProfiling();
@@ -109,11 +109,9 @@ bool AscendKernelRuntime::Init() {
  if (!ret) {
    return ret;
  }
  ret = MallocDeviceMemory();
  if (!ret) {
    return ret;
  }
  mem_manager_ = std::make_shared<AscendMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->MallocDeviceMemory();
  ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
  if (!ret) {
@@ -239,13 +237,6 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size
  return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
}
void AscendKernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int) {
  auto device_ptr = AscendMemoryAllocator::GetInstance().AllocTensorMem(size);
  MS_EXCEPTION_IF_NULL(device_ptr);
  address->ptr_ = device_ptr;
  address->mem_dynamic_alloc_ = true;
}
bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
@@ -474,42 +465,6 @@ bool AscendKernelRuntime::DestroyHccl() {
  context_ptr->set_enable_hccl(false);
  return true;
}
bool AscendKernelRuntime::MallocDeviceMemory() {
  device_mem_size_ = ASCEND_MEM_SIZE_BYTE;
  static_mem_offset_ = FloatToSize(device_mem_size_ * GRAPH_INIT_ASCEND_MEM_RATIO);
  auto ret = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM);
  if (ret != RT_ERROR_NONE) {
    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]";
  }
  device_mem_pool_size_ = FloatToSize(device_mem_size_ * (1 - GRAPH_INIT_ASCEND_MEM_RATIO));
  ret = rtMalloc(reinterpret_cast<void **>(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM);
  if (ret != RT_ERROR_NONE) {
    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]";
  }
  AscendMemoryAllocator::GetInstance().set_device_mem_pool_base(device_mem_pool_base_);
  AscendMemoryAllocator::GetInstance().set_device_mem_pool_size(device_mem_pool_size_);
  return true;
}
void AscendKernelRuntime::FreeDeviceMemory() {
  if (device_mem_base_ != nullptr) {
    auto ret = rtFree(device_mem_base_);
    if (ret != RT_ERROR_NONE) {
      MS_LOG(ERROR) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]";
    }
    device_mem_base_ = nullptr;
  }
  if (device_mem_pool_base_ != nullptr) {
    auto ret = rtFree(device_mem_pool_base_);
    if (ret != RT_ERROR_NONE) {
      MS_LOG(ERROR) << "rtFree mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]";
    }
    device_mem_pool_base_ = nullptr;
  }
}
void AscendKernelRuntime::FreeHostMemory() { dynamic_mem_offset_ = 0; }
} // namespace ascend
} // namespace device
} // namespace mindspore
@@ -39,13 +39,11 @@ class AscendKernelRuntime : public KernelRuntime {
  bool GenTask(const session::KernelGraph *graph) override;
  bool RunTask(const session::KernelGraph *graph) override;
  bool LoadTask(const session::KernelGraph *graph) override;
  void FreeHostMemory() override;
 protected:
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) override;
  bool SyncStream() override;
  void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) override;
 private:
  bool InitDevice();
@@ -53,8 +51,7 @@ class AscendKernelRuntime : public KernelRuntime {
  bool HcclInit();
  bool NeedDestroyHccl();
  bool DestroyHccl();
  bool MallocDeviceMemory();
  void FreeDeviceMemory();
  void ClearGraphModelMap();
  void ReleaseDeviceRes() override;
  uint32_t GetGraphModelId(const session::KernelGraph *kernel_graph);
@@ -0,0 +1,65 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "device/ascend/ascend_memory_manager.h"
#include "device/ascend/ascend_memory_allocator.h"
#include "utils/context/ms_context.h"
#include "runtime/mem.h"
namespace mindspore {
namespace device {
namespace ascend {
static const uint64_t ASCEND_MEM_SIZE = 20;
static const uint64_t ASCEND_MEM_SIZE_BYTE = (ASCEND_MEM_SIZE << 30);
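// ASCEND_MEM_SIZE << 30 is 20 * 2^30 bytes, so a fixed 20 GiB of device HBM is
// reserved up front. MallocDeviceMemory() below carves that budget into two
// rtMalloc regions: the GRAPH_INIT_ASCEND_MEM_RATIO share backs the
// static/dynamic offset allocator, and the remainder is handed to
// AscendMemoryAllocator as the dynamic tensor pool.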
void AscendMemoryManager::MallocDeviceMemory() {
  device_mem_size_ = ASCEND_MEM_SIZE_BYTE;
  static_mem_offset_ = FloatToSize(device_mem_size_ * GRAPH_INIT_ASCEND_MEM_RATIO);
  auto ret = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM);
  if (ret != RT_ERROR_NONE) {
    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]";
  }
  device_mem_pool_size_ = FloatToSize(device_mem_size_ * (1 - GRAPH_INIT_ASCEND_MEM_RATIO));
  ret = rtMalloc(reinterpret_cast<void **>(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM);
  if (ret != RT_ERROR_NONE) {
    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]";
  }
  AscendMemoryAllocator::GetInstance().set_device_mem_pool_base(device_mem_pool_base_);
  AscendMemoryAllocator::GetInstance().set_device_mem_pool_size(device_mem_pool_size_);
}
void AscendMemoryManager::FreeDeviceMemory() {
  if (device_mem_base_ != nullptr) {
    auto ret = rtFree(device_mem_base_);
    if (ret != RT_ERROR_NONE) {
      MS_LOG(ERROR) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]";
    }
    device_mem_base_ = nullptr;
  }
  if (device_mem_pool_base_ != nullptr) {
    auto ret = rtFree(device_mem_pool_base_);
    if (ret != RT_ERROR_NONE) {
      MS_LOG(ERROR) << "rtFree mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]";
    }
    device_mem_pool_base_ = nullptr;
  }
}
void *AscendMemoryManager::AllocTensorMemDynamic(size_t size) {
  return AscendMemoryAllocator::GetInstance().AllocTensorMem(size);
}
} // namespace ascend
} // namespace device
} // namespace mindspore
@@ -0,0 +1,35 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_
#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_
#include "device/memory_manager.h"
namespace mindspore {
namespace device {
namespace ascend {
class AscendMemoryManager : public MemoryManager {
 public:
  AscendMemoryManager() = default;
  virtual ~AscendMemoryManager() = default;
  void MallocDeviceMemory() override;
  void FreeDeviceMemory() override;
  void *AllocTensorMemDynamic(size_t size) override;
};
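// Minimal usage sketch (mirroring the runtime changes earlier in this diff,
// not part of this header): the kernel runtime owns the manager through the
// mem_manager_ member added to KernelRuntime.
//
//   mem_manager_ = std::make_shared<AscendMemoryManager>();
//   MS_EXCEPTION_IF_NULL(mem_manager_);
//   mem_manager_->MallocDeviceMemory();  // reserve HBM during Init()
//   ...
//   mem_manager_->FreeDeviceMemory();    // release in ReleaseDeviceRes()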
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif  // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_
@@ -33,12 +33,14 @@ class CPUKernelRuntime;
} // namespace cpu
namespace ascend {
class AscendKernelRuntime;
class AscendMemoryManager;
namespace tasksink {
class TaskGenerator;
} // namespace tasksink
} // namespace ascend
namespace gpu {
class GPUKernelRuntime;
class GPUMemoryManager;
} // namespace gpu
} // namespace device
} // namespace mindspore
@@ -70,12 +72,15 @@ class DeviceAddress {
  TypeId type_id_{kNumberTypeFloat16};
  bool mem_dynamic_alloc_{false};
  friend class KernelRuntime;
  friend class MemoryManager;
  friend class mindspore::device::ascend::tasksink::TaskGenerator;
  friend class mindspore::device::cpu::CPUSimpleMemPlan;
  friend class mindspore::device::cpu::CPUResourceManager;
  friend class mindspore::device::cpu::CPUKernelRuntime;
  friend class mindspore::device::gpu::GPUKernelRuntime;
  friend class mindspore::device::gpu::GPUMemoryManager;
  friend class mindspore::device::ascend::AscendKernelRuntime;
  friend class mindspore::device::ascend::AscendMemoryManager;
};
using DeviceAddressPtr = std::shared_ptr<DeviceAddress>;
@@ -26,6 +26,7 @@
#include "device/kernel_runtime_manager.h"
#include "device/gpu/gpu_common.h"
#include "common/utils.h"
#include "device/gpu/gpu_memory_manager.h"
namespace mindspore {
namespace device {
@@ -36,26 +37,14 @@ bool GPUKernelRuntime::Init() {
  if (device_init_ == true) {
    return true;
  }
  auto ret = InitDevice();
  if (!ret) {
    MS_LOG(ERROR) << "InitDevice error.";
    return ret;
  }
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  // If use the dynamic memory pool, then alloc the first memory block to init.
  if (context_ptr->enable_dynamic_mem_pool()) {
    auto device_addr = AllocTensorMemDynamic(1);
    if (!device_addr) {
      MS_LOG(ERROR) << "Dynamic memory pool init error.";
      return false;
    }
  } else {
    MallocDeviceMemory();
  }
  mem_manager_ = std::make_shared<GPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->MallocDeviceMemory();
  const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
  bool collective_inited = CollectiveInitializer::instance().collective_inited();
  if (collective_inited && collective_handle_ != nullptr) {
@@ -101,16 +90,6 @@ bool GPUKernelRuntime::InitDevice() {
  return true;
}
void GPUKernelRuntime::MallocDeviceMemory() {
  // Need to reserve 20% space for dynamic memory
  const float init_gpu_mem_ratio = 0.8;
  size_t mem_size = FloatToSize(GPUMemoryAllocator::GetInstance().free_mem_size() * init_gpu_mem_ratio);
  auto alloc_size =
    GPUMemoryAllocator::GetInstance().AllocDeviceMem(mem_size, reinterpret_cast<void **>(&device_mem_base_));
  device_mem_size_ = alloc_size;
  static_mem_offset_ = device_mem_size_;
}
void GPUKernelRuntime::ReleaseDeviceRes() {
  // For dataset mode.
  if (GpuBufferMgr::GetInstance().IsInit()) {
@@ -122,39 +101,22 @@ void GPUKernelRuntime::ReleaseDeviceRes() {
    CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
  }
  GPUDeviceManager::GetInstance().ReleaseDevice();
  if (device_mem_base_ != nullptr) {
    if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) {
      MS_LOG(EXCEPTION) << "Could not free gpu device memory.";
    }
  }
  GPUMemoryAllocator::GetInstance().ReleaseDeviceRes();
}
void GPUKernelRuntime::FreeHostMemory() { dynamic_mem_offset_ = 0; }
void *GPUKernelRuntime::AllocTensorMemDynamic(size_t size) {
  return GPUMemoryAllocator::GetInstance().AllocTensorMem(size);
}
void GPUKernelRuntime::FreeTensorMemDynamic(void *device_ptr) {
  GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->FreeDeviceMemory();
}
void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->ResetDynamicMemory();
  AssignStaticMemory(graph);
  bool is_enable_mem_reuse = context_ptr->enable_mem_reuse();
  bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
  if (is_enable_dynamic_mem) {
    // Use the dynamic memory pool.
    InitKernelRefCount(graph);
    InitKernelOutputAddress(graph);
  } else if (is_enable_mem_reuse) {
    // Use the memory reuse.
    ReuseAssignDynamicMemory(graph);
  } else {
    // Normal way.
    AssignDynamicMemory(graph);
  }
}
@@ -179,32 +141,6 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
  return ret;
}
uint8_t *GPUKernelRuntime::MallocStaticMem(size_t size, bool) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  if (context_ptr->enable_dynamic_mem_pool()) {
    auto device_ptr = AllocTensorMemDynamic(size);
    MS_EXCEPTION_IF_NULL(device_ptr);
    return AddressOffset(device_ptr, 0);
  }
  auto align_size = GetCommonAlignSize(size);
  if (static_mem_offset_ < align_size) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }
  auto offset = static_mem_offset_ - align_size;
  if (dynamic_mem_offset_ > offset) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }
  total_static_size_ += align_size;
  static_mem_offset_ = offset;
  return device_mem_base_ + offset;
}
void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
@@ -273,6 +209,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
  MS_EXCEPTION_IF_NULL(kernel_inputs);
  MS_EXCEPTION_IF_NULL(kernel_workspaces);
  MS_EXCEPTION_IF_NULL(kernel_outputs);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, i);
    MS_EXCEPTION_IF_NULL(device_address);
@@ -290,7 +227,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
    MS_EXCEPTION_IF_NULL(device_address);
    auto device_ptr = device_address->ptr_;
    if (device_ptr == nullptr) {
      device_ptr = AllocTensorMemDynamic(output_sizes[i]);
      device_ptr = mem_manager_->AllocTensorMemDynamic(output_sizes[i]);
      MS_EXCEPTION_IF_NULL(device_ptr);
      device_address->ptr_ = device_ptr;
    }
@@ -307,7 +244,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
      kernel_workspaces->emplace_back(nullptr);
      continue;
    }
    auto device_ptr = AllocTensorMemDynamic(workspace_sizes[i]);
    auto device_ptr = mem_manager_->AllocTensorMemDynamic(workspace_sizes[i]);
    MS_EXCEPTION_IF_NULL(device_ptr);
    kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
    MS_EXCEPTION_IF_NULL(workspace);
@@ -333,6 +270,7 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph
void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // The reference count of communication kernel input is not 0.
  if (communication_op_input_ref_count_ != 0) {
    MS_LOG(ERROR) << "The reference count of communication kernel input is not 0.";
@@ -354,7 +292,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN
    addr_size.emplace_back(device_address.get(), output_size);
  }
  auto device_mem_ptr = AllocTensorMemDynamic(total);
  auto device_mem_ptr = mem_manager_->AllocTensorMemDynamic(total);
  MS_EXCEPTION_IF_NULL(device_mem_ptr);
  for (const auto &iter : addr_size) {
    MS_EXCEPTION_IF_NULL(iter.first);
@@ -366,6 +304,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN
void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // The reference count of communication kernel output is not 0.
  if (communication_op_output_ref_count_ != 0) {
    MS_LOG(ERROR) << "The reference count of communication kernel output is not 0.";
@@ -389,7 +328,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
    addr_size.emplace_back(device_address.get(), output_sizes[i]);
  }
  auto device_mem_ptr = AllocTensorMemDynamic(total);
  auto device_mem_ptr = mem_manager_->AllocTensorMemDynamic(total);
  MS_EXCEPTION_IF_NULL(device_mem_ptr);
  for (const auto &iter : addr_size) {
    MS_EXCEPTION_IF_NULL(iter.first);
@@ -402,6 +341,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
                                            const AddressPtrList &kernel_workspaces) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto cnode = kernel->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  // Free the input of kernel by reference count.
@@ -421,7 +361,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
      auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
      MS_EXCEPTION_IF_NULL(device_address);
      MS_EXCEPTION_IF_NULL(device_address->ptr_);
      FreeTensorMemDynamic(device_address->ptr_);
      mem_manager_->FreeTensorMemDynamic(device_address->ptr_);
      device_address->ptr_ = nullptr;
    }
  }
@@ -432,7 +372,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
    auto workspace = kernel_workspaces[i];
    if (workspace != nullptr) {
      MS_EXCEPTION_IF_NULL(workspace->addr);
      FreeTensorMemDynamic(workspace->addr);
      mem_manager_->FreeTensorMemDynamic(workspace->addr);
      workspace->addr = nullptr;
    }
  }
@@ -441,6 +381,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx,
                                                     bool *is_communication_op) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  // The inputs memory of communication kernel is one piece memory, need release together.
  if (AnfAlgo::GetCNodeName(kernel) == kAllReduceOpName) {
    communication_op_input_ref_count_--;
@@ -448,7 +389,7 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr
      auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, 0);
      MS_EXCEPTION_IF_NULL(device_address);
      MS_EXCEPTION_IF_NULL(device_address->ptr_);
      FreeTensorMemDynamic(device_address->ptr_);
      mem_manager_->FreeTensorMemDynamic(device_address->ptr_);
      device_address->ptr_ = nullptr;
    }
    *is_communication_op = true;
@@ -470,19 +411,12 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr
      auto device_address = AnfAlgo::GetMutableOutputAddr(kernel_input.first, 0);
      MS_EXCEPTION_IF_NULL(device_address);
      MS_EXCEPTION_IF_NULL(device_address->ptr_);
      FreeTensorMemDynamic(device_address->ptr_);
      mem_manager_->FreeTensorMemDynamic(device_address->ptr_);
      device_address->ptr_ = nullptr;
    }
    *is_communication_op = true;
  }
}
void GPUKernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int) {
  auto device_ptr = AllocTensorMemDynamic(size);
  MS_EXCEPTION_IF_NULL(device_ptr);
  address->ptr_ = device_ptr;
  address->mem_dynamic_alloc_ = true;
}
} // namespace gpu
} // namespace device
} // namespace mindspore
@@ -33,7 +33,6 @@ class GPUKernelRuntime : public KernelRuntime {
  ~GPUKernelRuntime() override = default;
  bool Init() override;
  void ReleaseDeviceRes() override;
  void FreeHostMemory() override;
  void AssignMemory(session::KernelGraph *graph) override;
  bool Run(session::KernelGraph *graph) override;
@@ -41,18 +40,11 @@ class GPUKernelRuntime : public KernelRuntime {
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                       TypeId type_id) override;
  bool SyncStream() override;
  // Alloc memory use the dynamic memory pool.
  void *AllocTensorMemDynamic(size_t size) override;
  // Free memory use the dynamic memory pool.
  void FreeTensorMemDynamic(void *device_ptr) override;
  void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) override;
  uint8_t *MallocStaticMem(size_t size, bool communication_mem) override;
 private:
  GPUKernelRuntime(const GPUKernelRuntime &);
  GPUKernelRuntime &operator=(const GPUKernelRuntime &);
  bool InitDevice();
  void MallocDeviceMemory();
  bool device_init_{false};
  // The related functions and members for using dynamic memory pool.
@@ -69,6 +61,7 @@ class GPUKernelRuntime : public KernelRuntime {
  void FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx, bool *is_communication_op);
  size_t communication_op_input_ref_count_{0};
  size_t communication_op_output_ref_count_{0};
  MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};
};
MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
} // namespace gpu
@@ -0,0 +1,88 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "device/gpu/gpu_memory_manager.h"
#include "device/gpu/gpu_memory_allocator.h"
#include "utils/context/ms_context.h"
#include "utils/convert_utils.h"
namespace mindspore {
namespace device {
namespace gpu {
void *GPUMemoryManager::AllocTensorMemDynamic(size_t size) {
  return GPUMemoryAllocator::GetInstance().AllocTensorMem(size);
}
void GPUMemoryManager::FreeTensorMemDynamic(void *device_ptr) {
  GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr);
}
void GPUMemoryManager::MallocDeviceMemory() {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  // If use the dynamic memory pool, then alloc the first memory block to init.
  if (context_ptr->enable_dynamic_mem_pool()) {
    auto device_addr = AllocTensorMemDynamic(1);
    if (!device_addr) {
      MS_LOG(ERROR) << "Dynamic memory pool init error.";
    }
  } else {
    // Need to reserve 20% space for dynamic memory
    const float init_gpu_mem_ratio = 0.8;
    size_t mem_size = FloatToSize(GPUMemoryAllocator::GetInstance().free_mem_size() * init_gpu_mem_ratio);
    auto alloc_size =
      GPUMemoryAllocator::GetInstance().AllocDeviceMem(mem_size, reinterpret_cast<void **>(&device_mem_base_));
    device_mem_size_ = alloc_size;
    static_mem_offset_ = device_mem_size_;
  }
}
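// In pool mode the 1-byte AllocTensorMemDynamic call only forces the pool to
// create its first block; nothing else is reserved here. Otherwise 80% of the
// currently free GPU memory becomes one static region, and seeding
// static_mem_offset_ with device_mem_size_ starts the static cursor at the
// very top of that region (static allocations grow downward, see
// MallocStaticMem below).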
void GPUMemoryManager::FreeDeviceMemory() {
  if (device_mem_base_ != nullptr) {
    if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) {
      MS_LOG(EXCEPTION) << "Could not free gpu device memory.";
    }
  }
  GPUMemoryAllocator::GetInstance().ReleaseDeviceRes();
}
uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  if (context_ptr->enable_dynamic_mem_pool()) {
    auto device_ptr = AllocTensorMemDynamic(size);
    MS_EXCEPTION_IF_NULL(device_ptr);
    return AddressOffset(device_ptr, 0);
  }
  auto align_size = GetCommonAlignSize(size);
  if (static_mem_offset_ < align_size) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }
  auto offset = static_mem_offset_ - align_size;
  if (dynamic_mem_offset_ > offset) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }
  total_static_size_ += align_size;
  static_mem_offset_ = offset;
  return device_mem_base_ + offset;
}
} // namespace gpu
} // namespace device
} // namespace mindspore
@@ -0,0 +1,40 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_
#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_
#include "device/memory_manager.h"
namespace mindspore {
namespace device {
namespace gpu {
class GPUMemoryManager : public MemoryManager {
 public:
  GPUMemoryManager() = default;
  virtual ~GPUMemoryManager() = default;
  void MallocDeviceMemory() override;
  void FreeDeviceMemory() override;
  void *AllocTensorMemDynamic(size_t size) override;
  void FreeTensorMemDynamic(void *device_ptr) override;
 protected:
  uint8_t *MallocStaticMem(size_t size, bool communication_mem);
};
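// Unlike AscendMemoryManager earlier in this diff, GPUMemoryManager overrides
// MallocStaticMem: when enable_dynamic_mem_pool is set, even "static" requests
// are served from the dynamic pool (see the override in gpu_memory_manager.cc
// above), while the base class always uses the offset arena.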
} // namespace gpu
} // namespace device
} // namespace mindspore
#endif  // MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_
@@ -31,18 +31,13 @@
#include "ir/value.h"
using mindspore::kernel::Address;
using mindspore::kernel::AddressPtr;
using mindspore::memreuse::BestFitMemReuse;
using mindspore::memreuse::MemReuseUtilPtr;
namespace mindspore {
namespace device {
KernelRuntime::~KernelRuntime() {
  device_mem_base_ = nullptr;
  device_mem_pool_base_ = nullptr;
#ifdef ENABLE_DUMP_E2E
  dump_conf_ptr_ = nullptr;
#endif
  mem_reuse_util_ptr_ = nullptr;
}
bool KernelRuntime::Run(session::KernelGraph *graph) {
@@ -88,11 +83,6 @@ bool KernelRuntime::LoadTask(const session::KernelGraph *graph) {
  return false;
}
void KernelRuntime::FreeHostMemory() {
  dynamic_mem_offset_ = 0;
  static_mem_offset_ = 0;
}
// for D to impl
bool KernelRuntime::RunTask(const session::KernelGraph *graph) {
  if (graph != nullptr) {
@@ -126,13 +116,11 @@ size_t KernelRuntime::CountNodeDeviceMemorySize(const mindspore::AnfNodePtr &nod
void KernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->ResetDynamicMemory();
  AssignStaticMemory(graph);
  bool is_enable_mem_reuse = context_ptr->enable_mem_reuse();
  if (is_enable_mem_reuse) {
    ReuseAssignDynamicMemory(graph);
  } else {
    AssignDynamicMemory(graph);
  }
  AssignDynamicMemory(graph);
  UpdateRefNodeOutputMem(graph);
}
@@ -159,6 +147,7 @@ void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) {
void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> &input_tensors,
                                           const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  for (size_t input_index = 0; input_index < graph->inputs().size(); ++input_index) {
    auto item = graph->inputs()[input_index];
    MS_EXCEPTION_IF_NULL(item);
@@ -180,7 +169,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
      auto device_address =
        CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
      MS_EXCEPTION_IF_NULL(device_address);
      MallocOpMemory(device_address, tensor_size, kStaticMem);
      mem_manager_->MallocOpMemory(device_address, tensor_size);
      AnfAlgo::SetOutputAddr(device_address, index, item.get());
    }
  }
@@ -188,6 +177,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
@@ -208,13 +198,14 @@ void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) {
    auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
    auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
    MS_EXCEPTION_IF_NULL(device_address);
    MallocOpMemory(device_address, output_sizes[i], kDynamicMem);
    mem_manager_->MallocOpMemory(device_address, output_sizes[i]);
    AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
  }
}
void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (kernel->isa<CNode>()) {
    auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
    MS_EXCEPTION_IF_NULL(kernel_mod);
@@ -222,7 +213,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
    for (size_t i = 0; i < workspace_lists.size(); ++i) {
      auto device_address = CreateDeviceAddress(nullptr, workspace_lists[i], "", kTypeUnknown);
      MS_EXCEPTION_IF_NULL(device_address);
      MallocOpMemory(device_address, workspace_lists[i], kDynamicMem);
      mem_manager_->MallocOpMemory(device_address, workspace_lists[i]);
      AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
    }
  }
@@ -230,6 +221,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  for (auto &item : graph->inputs()) {
    MS_EXCEPTION_IF_NULL(item);
    if (!item->isa<Parameter>()) {
@@ -247,7 +239,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
        output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
      }
      auto tensor_size = CountNodeDeviceMemorySize(item, index);
      auto ptr = MallocStaticMem(tensor_size, false);
      auto ptr = mem_manager_->MallocMem(kStaticMem, tensor_size);
      auto address = CreateDeviceAddress(ptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
      AnfAlgo::SetOutputAddr(address, index, item.get());
    }
@@ -301,6 +293,7 @@ void KernelRuntime::UpdateRefNodeOutputMem(const session::KernelGraph *graph) {
void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto kernel_mod = AnfAlgo::GetKernelMod(node);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  auto output_sizes = kernel_mod->GetOutputSizeList();
@@ -314,12 +307,12 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr
  std::vector<size_t> align_size_list;
  for (uint64_t mem_size : output_sizes) {
    if (context_ptr->enable_hccl()) {
      mem_size = GetCommonAlignSize(mem_size);
      mem_size = mem_manager_->GetCommonAlignSize(mem_size);
    }
    total_size += mem_size;
    align_size_list.emplace_back(mem_size);
  }
  uint8_t *output_ptr = CalDeviceMem(node, total_size, flag, 0);
  uint8_t *output_ptr = mem_manager_->MallocOutputMem(node, 0, flag, total_size);
  for (size_t j = 0; j < align_size_list.size(); ++j) {
    std::string output_format = AnfAlgo::GetOutputFormat(node, j);
    auto output_type = AnfAlgo::GetOutputDeviceDataType(node, j);
@@ -333,6 +326,7 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  size_t total_size = 0;
  std::vector<std::pair<mindspore::device::DeviceAddress *, size_t>> addr_size;
  for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(node); ++i) {
@@ -340,12 +334,12 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
    MS_EXCEPTION_IF_NULL(address);
    auto mem_size = address->size();
    if (context_ptr->enable_hccl()) {
      mem_size = GetCommonAlignSize(mem_size);
      mem_size = mem_manager_->GetCommonAlignSize(mem_size);
    }
    total_size += mem_size;
    addr_size.emplace_back(address.get(), mem_size);
  }
  uint8_t *input_ptr = CalDeviceMem(node, total_size, kDynamicMem, 0);
  uint8_t *input_ptr = mem_manager_->MallocOutputMem(node, 0, kDynamicMem, total_size);
  for (const auto &iter : addr_size) {
    MS_EXCEPTION_IF_NULL(iter.first);
    iter.first->set_ptr(input_ptr);
@@ -355,7 +349,8 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index) {
  MS_EXCEPTION_IF_NULL(node);
  if (IsCommunicationOp(node)) {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (AnfAlgo::IsCommunicationOp(node)) {
    UpdateCommunicationOpInputMem(node);
    AssignCommunicationNodeOutputMem(flag, node);
    return;
@@ -375,7 +370,7 @@ void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int in
      MS_LOG(INFO) << "Already malloc index:" << i;
      continue;
    }
    auto ptr = CalDeviceMem(node, output_sizes[i], flag, i);
    auto ptr = mem_manager_->MallocOutputMem(node, i, flag, output_sizes[i]);
    if (ptr == nullptr) {
      // reused ptr, no need alloc, continue;
      continue;
@@ -390,6 +385,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
                                          size_t output_idx) {
  MS_EXCEPTION_IF_NULL(value_node);
  MS_EXCEPTION_IF_NULL(node_value);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto tensor = node_value->cast<TensorPtr>();
  if (tensor == nullptr) {
    MS_LOG(WARNING) << "Tensor is null";
@@ -397,7 +393,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
  }
  size_t tensor_size = tensor->data().nbytes();
  auto node_size = CountNodeDeviceMemorySize(value_node, output_idx);
  auto ptr = MallocStaticMem(node_size, false);
  auto ptr = mem_manager_->MallocMem(kStaticMem, node_size);
  TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(value_node, output_idx);
  if (output_type_id == kTypeUnknown) {
    output_type_id = AnfAlgo::GetOutputInferDataType(value_node, output_idx);
@@ -414,6 +410,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  for (auto &value_node : graph->graph_value_nodes()) {
    MS_EXCEPTION_IF_NULL(value_node);
    if (AnfAlgo::OutputAddrExist(value_node, 0)) {
@@ -440,7 +437,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
    } else if (node_value->isa<StringImm>()) {
      auto value = GetValue<std::string>(node_value);
      size_t tensor_size = value.size();
      auto ptr = MallocStaticMem(tensor_size, false);
      auto ptr = mem_manager_->MallocMem(kStaticMem, tensor_size);
      auto address = CreateDeviceAddress(ptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8);
      MS_EXCEPTION_IF_NULL(address);
      AnfAlgo::SetOutputAddr(address, 0, value_node.get());
@@ -452,103 +449,37 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
  }
}
void KernelRuntime::AssignDynamicMemory(const session::KernelGraph *graph) {
void KernelRuntime::AssignDynamicMemory(session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  // reset dynamic mem offset
  dynamic_mem_offset_ = 0;
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    AssignNodeOutputMem(kDynamicMem, kernel, kGetAllOuts);
    AssignWorkSpaceMem(kernel);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool is_enable_mem_reuse = context_ptr->enable_mem_reuse();
  auto mem_flag = kDynamicMem;
  if (is_enable_mem_reuse) {
    mem_manager_->InitReuseDynamicMemory(graph);
    mem_flag = kReuseDynamicMem;
  }
}
void KernelRuntime::ReuseAssignDynamicMemory(session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  dynamic_mem_offset_ = 0;
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // set all infos
  mem_reuse_util_ptr->SetAllInfo(graph);
  auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>();
  MS_EXCEPTION_IF_NULL(bestfit_mem_reuse);
  bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
  size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize();
  MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]";
  mem_reuse_util_ptr_ = mem_reuse_util_ptr;
  auto base_ptr = MallocDynamicMem(total_allocated_size, false);
  mem_reuse_util_ptr_->set_mem_base(base_ptr);
  auto &kernels = graph->execution_order();
  for (auto &kernel : kernels) {
    AssignNodeOutputMem(kReuseDynamicMem, kernel, kGetAllOuts);
    AssignReuseWorkSpaceMem(kernel);
    AssignNodeOutputMem(mem_flag, kernel, kGetAllOuts);
    AssignWorkSpaceMem(mem_flag, kernel);
  }
}
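// The reuse branch folds into AssignDynamicMemory: planning happens once in
// mem_manager_->InitReuseDynamicMemory(graph), after which the per-kernel
// calls pass kReuseDynamicMem and MallocOutputMem/MallocWorkSpaceMem (later in
// this diff) simply look up the precomputed offsets instead of bumping the
// dynamic cursor.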
void KernelRuntime::AssignReuseWorkSpaceMem(const AnfNodePtr &node) {
void KernelRuntime::AssignWorkSpaceMem(int flag, const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  auto kernel_mod = AnfAlgo::GetKernelMod(node);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  size_t index = 0;
  for (auto &size : kernel_mod->GetWorkspaceSizeList()) {
    auto wk_ptr = mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
    AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(wk_ptr, size, "", kTypeUnknown), index, node.get());
    auto ptr = mem_manager_->MallocWorkSpaceMem(node, flag, index, size);
    AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(ptr, size, "", kTypeUnknown), index, node.get());
    index++;
  }
}
void KernelRuntime::AssignWorkSpaceMem(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  if (node->isa<CNode>()) {
    auto kernel_mod = AnfAlgo::GetKernelMod(node);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    size_t index = 0;
    for (auto &size : kernel_mod->GetWorkspaceSizeList()) {
      auto ptr = MallocDynamicMem(size, false);
      AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(ptr, size, "", kTypeUnknown), index, node.get());
      index++;
    }
  }
}
bool KernelRuntime::IsCommunicationOp(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  auto kernel_name = AnfAlgo::GetCNodeName(node);
  auto kernel_type = AnfAlgo::GetKernelType(node);
  if (kernel_name == kAllReduceOpName || kernel_type == HCCL_KERNEL) {
    return true;
  }
  return false;
}
uint8_t *KernelRuntime::CalDeviceMem(const AnfNodePtr &node, size_t size, int flag, size_t index) {
  MS_EXCEPTION_IF_NULL(node);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  uint8_t *ptr = nullptr;
  if (IsCommunicationOp(node)) {
    bool communication_mem = false;
    if (context_ptr->enable_hccl()) {
      communication_mem = true;
    }
    if (flag == kStaticMem) {
      ptr = MallocStaticMem(size, communication_mem);
    } else {
      ptr = MallocDynamicMem(size, communication_mem);
    }
    return ptr;
  }
  if (flag == kStaticMem) {
    ptr = MallocStaticMem(size, false);
  } else if (flag == kDynamicMem) {
    ptr = MallocDynamicMem(size, false);
  } else if (flag == kReuseDynamicMem) {
    ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index);
  }
  return ptr;
}
void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                                  AddressPtrList *kernel_inputs, AddressPtrList *const kernel_workspaces,
                                  AddressPtrList *kernel_outputs) {
@@ -659,65 +590,6 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
  return true;
}
size_t KernelRuntime::GetCommonAlignSize(size_t input_size) const {
  return (input_size + mem_align_size_ + 31) / mem_align_size_ * mem_align_size_;
}
size_t KernelRuntime::GetCommunicationAlignSize(size_t input_size) const {
  return (input_size + mem_align_size_ - 1) / mem_align_size_ * mem_align_size_ + 2 * mem_align_size_;
}
uint8_t *KernelRuntime::MallocStaticMem(size_t size, bool communication_mem) {
  size_t align_size = 0;
  if (communication_mem) {
    align_size = GetCommunicationAlignSize(size);
  } else {
    align_size = GetCommonAlignSize(size);
  }
  if (static_mem_offset_ < align_size) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }
  total_static_size_ += align_size;
  auto offset = static_mem_offset_ - align_size;
  if (dynamic_mem_offset_ > offset) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }
  static_mem_offset_ = offset;
  if (communication_mem) {
    return device_mem_base_ + offset + mem_align_size_;
  } else {
    return device_mem_base_ + offset;
  }
}
uint8_t *KernelRuntime::MallocDynamicMem(size_t size, bool communication_mem) {
  size_t align_size = 0;
  if (communication_mem) {
    align_size = GetCommunicationAlignSize(size);
  } else {
    align_size = GetCommonAlignSize(size);
  }
  uint64_t offset = dynamic_mem_offset_;
  auto new_offset = dynamic_mem_offset_ + align_size;
  if (new_offset > static_mem_offset_) {
    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
                      << "] static[" << total_static_size_ << "])"
                      << " malloc [" << align_size << "] failed!";
  }
  total_dynamic_size_ += align_size;
  dynamic_mem_offset_ = new_offset;
  if (communication_mem) {
    return device_mem_base_ + offset + mem_align_size_;
  } else {
    return device_mem_base_ + offset;
  }
}
bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  if (!LaunchKernelMod(*graph)) {
@@ -731,29 +603,6 @@ bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) {
  return true;
}
void KernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) {
  if (flag == kStaticMem) {
    address->ptr_ = MallocStaticMem(size, false);
  } else if (flag == kDynamicMem) {
    address->ptr_ = MallocDynamicMem(size, false);
  } else {
    MS_LOG(EXCEPTION) << "Unknown memory type!";
  }
}
void *KernelRuntime::AllocTensorMemDynamic(size_t size) {
  if (size == 0) {
    MS_LOG(ERROR) << "AllocTensorMemDynamic size is 0.";
  }
  return nullptr;
}
void KernelRuntime::FreeTensorMemDynamic(void *device_ptr) {
  if (device_ptr == nullptr) {
    MS_LOG(ERROR) << "FreeTensorMemDynamic device_ptr is null.";
  }
}
#ifdef ENABLE_DUMP_E2E
bool KernelRuntime::SetDumpConf() {
  dump_conf_ptr_ = std::make_shared<Dump>();
@@ -20,8 +20,7 @@
#include <memory>
#include <string>
#include <map>
#include "pre_activate/mem_reuse/mem_reuse.h"
#include "pre_activate/mem_reuse/mem_reuse_allocator.h"
#include "device/device_address.h"
#include "ir/meta_tensor.h"
#include "predict/generator/utils/ir_model_util.h"
@@ -32,21 +31,16 @@
#include "session/anf_runtime_algorithm.h"
#include "kernel/kernel.h"
#include "utils/context/ms_context.h"
#include "device/memory_manager.h"
// using mindspore::session::KernelGraph;
using mindspore::tensor::Tensor;
using TensorPtr = std::shared_ptr<Tensor>;
using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr;
using mindspore::kernel::AddressPtr;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
namespace mindspore {
namespace device {
const int kStaticMem = 0;
const int kDynamicMem = 1;
const int kReuseDynamicMem = 2;
const int kGetAllOuts = -1;
class KernelRuntime {
 public:
  KernelRuntime() = default;
@@ -65,7 +59,6 @@ class KernelRuntime {
  DumpConfPtr GetDumpConf();
#endif
  virtual bool LoadTask(const session::KernelGraph *graph);
  virtual void FreeHostMemory();
  // for GPU and D to impl
  virtual void ReleaseDeviceRes() {}
  void set_device_id(uint32_t device_id) { device_id_ = device_id; }
@@ -75,29 +68,17 @@ class KernelRuntime {
                                               TypeId type_id) = 0;
  virtual bool SyncStream() = 0;
  void AssignStaticMemory(session::KernelGraph *graph);
  void AssignDynamicMemory(const session::KernelGraph *graph);
  void AssignDynamicMemory(session::KernelGraph *graph);
  void ReuseAssignDynamicMemory(session::KernelGraph *graph);
  void AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index);
  void AssignWorkSpaceMem(const AnfNodePtr &node);
  void AssignWorkSpaceMem(int flag, const AnfNodePtr &node);
  void AssignReuseWorkSpaceMem(const AnfNodePtr &node);
  void AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node);
  void UpdateRefNodeOutputMem(const session::KernelGraph *graph);
  void UpdateCommunicationOpInputMem(const AnfNodePtr &node);
  bool IsCommunicationOp(const AnfNodePtr &node);
  size_t GetCommonAlignSize(size_t input_size) const;
  size_t GetCommunicationAlignSize(size_t input_size) const;
  uint8_t *CalDeviceMem(const AnfNodePtr &node, size_t size, int flag, size_t index);
  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem);
  uint8_t *MallocDynamicMem(size_t size, bool communication_mem);
#ifdef ENABLE_DUMP_E2E
  bool SetDumpConf();
#endif
  // Alloc memory use the dynamic memory pool.
  virtual void *AllocTensorMemDynamic(size_t size);
  // Free memory use the dynamic memory pool.
  virtual void FreeTensorMemDynamic(void *device_ptr);
  virtual void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag);
 private:
  void AssignStaticMemoryOutput(const session::KernelGraph *graph);
@@ -114,20 +95,11 @@ class KernelRuntime {
 protected:
  uint32_t device_id_{0};
  uint8_t *device_mem_base_{nullptr};
  uint8_t *device_mem_pool_base_{nullptr};
  uint64_t device_mem_size_{0};
  uint64_t device_mem_pool_size_{0};
  uint64_t dynamic_mem_offset_{0};
  uint64_t static_mem_offset_{0};
  const uint64_t mem_align_size_ = 512;
#ifdef ENABLE_DUMP_E2E
  DumpConfPtr dump_conf_ptr_;
#endif
  void *stream_ = nullptr;
  size_t total_static_size_ = 0;
  size_t total_dynamic_size_ = 0;
  MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};
  std::shared_ptr<MemoryManager> mem_manager_{nullptr};
};
using KernelRuntimePtr = std::shared_ptr<KernelRuntime>;
} // namespace device
@@ -0,0 +1,170 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "device/memory_manager.h"
#include "session/anf_runtime_algorithm.h"
#include "utils/context/ms_context.h"
using mindspore::memreuse::BestFitMemReuse;
using mindspore::memreuse::MemReuseUtilPtr;
namespace mindspore {
namespace device {
MemoryManager::~MemoryManager() {
  device_mem_base_ = nullptr;
  device_mem_pool_base_ = nullptr;
  mem_reuse_util_ptr_ = nullptr;
}
size_t MemoryManager::GetCommonAlignSize(size_t input_size) const {
  return (input_size + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize;
}
size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) const {
  return (input_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize + 2 * kMemAlignSize;
}
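// Worked example, assuming kMemAlignSize keeps the 512-byte value of the old
// KernelRuntime::mem_align_size_: GetCommonAlignSize(1) = (1 + 512 + 31) / 512
// * 512 = 512, while GetCommunicationAlignSize(1) = (1 + 511) / 512 * 512 +
// 2 * 512 = 1536. The extra 2 * kMemAlignSize reserves one aligned block
// before and one after a communication buffer; MallocStaticMem and
// MallocDynamicMem below return device_mem_base_ + offset + kMemAlignSize for
// communication memory, skipping the leading pad.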
| void MemoryManager::InitReuseDynamicMemory(session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>(); | |||
| MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); | |||
| // set all infos | |||
| mem_reuse_util_ptr->SetAllInfo(graph); | |||
| auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>(); | |||
| MS_EXCEPTION_IF_NULL(bestfit_mem_reuse); | |||
| bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get()); | |||
| size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize(); | |||
| MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]"; | |||
| mem_reuse_util_ptr_ = mem_reuse_util_ptr; | |||
| auto base_ptr = MallocDynamicMem(total_allocated_size, false); | |||
| mem_reuse_util_ptr_->set_mem_base(base_ptr); | |||
| } | |||
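`InitReuseDynamicMemory` first lets `BestFitMemReuse` plan which tensors may share storage, then reserves a single dynamic-memory block of the planned size and hands its base address to the reuse util, so every later `kReuseDynamicMem` lookup resolves to an offset inside that block. A minimal sketch of the per-graph call order, assuming the caller holds a `MemoryManager` pointer (`AssignGraphMemory` is an assumed name, not from this patch):

```cpp
#include "device/memory_manager.h"  // MemoryManager (assumed include path)

// Sketch only; the real callers live in the kernel-runtime classes.
void AssignGraphMemory(mindspore::device::MemoryManager *mem_manager,
                       mindspore::session::KernelGraph *graph) {
  mem_manager->ResetDynamicMemory();           // rewind the dynamic offset for this graph
  mem_manager->InitReuseDynamicMemory(graph);  // plan reuse and reserve one backing block
  // Per-node MallocOutputMem / MallocWorkSpaceMem calls follow; with
  // kReuseDynamicMem they read pointers straight out of the reuse plan.
}
```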
| uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| uint8_t *ptr = nullptr; | |||
| if (AnfAlgo::IsCommunicationOp(node)) { | |||
|     bool communication_mem = context_ptr->enable_hccl(); | |||
| if (flag == kStaticMem) { | |||
| ptr = MallocStaticMem(size, communication_mem); | |||
| } else { | |||
| ptr = MallocDynamicMem(size, communication_mem); | |||
| } | |||
| return ptr; | |||
| } | |||
| if (flag == kStaticMem) { | |||
| ptr = MallocStaticMem(size, false); | |||
| } else if (flag == kDynamicMem) { | |||
| ptr = MallocDynamicMem(size, false); | |||
| } else if (flag == kReuseDynamicMem) { | |||
| ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); | |||
| } | |||
| return ptr; | |||
| } | |||
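Note the dispatch order in `MallocOutputMem`: communication ops are handled first and always get communication alignment (static or dynamic, never reuse), while ordinary nodes fall through to the three-way `flag` switch. In the `kReuseDynamicMem` branch the pointer comes out of the reuse plan, so `InitReuseDynamicMemory` must already have run for this graph. A hedged usage fragment (the node, index, and size are assumed to come from the caller's graph walk):

```cpp
// Sketch: fetch the reuse-planned address for one output of one node.
uint8_t *out_ptr =
    mem_manager->MallocOutputMem(node, /*index=*/0, mindspore::device::kReuseDynamicMem, output_size);
MS_EXCEPTION_IF_NULL(out_ptr);
```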
| uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size) { | |||
| if (flag == kReuseDynamicMem) { | |||
| return mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); | |||
| } | |||
| return MallocDynamicMem(size, false); | |||
| } | |||
| uint8_t *MemoryManager::MallocMem(int flag, size_t size) { | |||
| uint8_t *ptr = nullptr; | |||
| if (flag == kStaticMem) { | |||
| ptr = MallocStaticMem(size, false); | |||
| } else if (flag == kDynamicMem) { | |||
| ptr = MallocDynamicMem(size, false); | |||
| } | |||
| return ptr; | |||
| } | |||
| uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem) { | |||
| size_t align_size = 0; | |||
| if (communication_mem) { | |||
| align_size = GetCommunicationAlignSize(size); | |||
| } else { | |||
| align_size = GetCommonAlignSize(size); | |||
| } | |||
| if (static_mem_offset_ < align_size) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| total_static_size_ += align_size; | |||
| auto offset = static_mem_offset_ - align_size; | |||
| if (dynamic_mem_offset_ > offset) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| static_mem_offset_ = offset; | |||
| if (communication_mem) { | |||
| return device_mem_base_ + offset + kMemAlignSize; | |||
| } else { | |||
| return device_mem_base_ + offset; | |||
| } | |||
| } | |||
| uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) { | |||
| size_t align_size = 0; | |||
| if (communication_mem) { | |||
| align_size = GetCommunicationAlignSize(size); | |||
| } else { | |||
| align_size = GetCommonAlignSize(size); | |||
| } | |||
| uint64_t offset = dynamic_mem_offset_; | |||
| auto new_offset = dynamic_mem_offset_ + align_size; | |||
| if (new_offset > static_mem_offset_) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| total_dynamic_size_ += align_size; | |||
| dynamic_mem_offset_ = new_offset; | |||
| if (communication_mem) { | |||
| return device_mem_base_ + offset + kMemAlignSize; | |||
| } else { | |||
| return device_mem_base_ + offset; | |||
| } | |||
| } | |||
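Taken together, the two allocators carve opposite ends of one device block: `MallocStaticMem` walks `static_mem_offset_` downward from wherever the device-specific `MallocDeviceMemory` placed it, `MallocDynamicMem` walks `dynamic_mem_offset_` upward from zero, and both throw the out-of-memory exception once the offsets would cross. A short trace under an assumed 4 KB arena with `static_mem_offset_` starting at 4096 (illustrative numbers, not from the patch):

```cpp
// MallocDynamicMem(100, false) -> returns device_mem_base_ + 0
//                                 dynamic_mem_offset_ : 0    -> 512
// MallocStaticMem(100, false)  -> returns device_mem_base_ + 3584
//                                 static_mem_offset_  : 4096 -> 3584
// Any request whose aligned size would push dynamic_mem_offset_ past
// static_mem_offset_ (or vice versa) throws "Out of memory".
```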
| void MemoryManager::MallocOpMemory(const DeviceAddressPtr address, size_t size) { | |||
| auto device_ptr = AllocTensorMemDynamic(size); | |||
| MS_EXCEPTION_IF_NULL(device_ptr); | |||
| address->ptr_ = device_ptr; | |||
| address->mem_dynamic_alloc_ = true; | |||
| } | |||
| void *MemoryManager::AllocTensorMemDynamic(size_t size) { | |||
|   if (size == 0) { | |||
|     MS_LOG(ERROR) << "AllocTensorMemDynamic size is 0."; | |||
|   } | |||
|   // Base-class default: there is no dynamic pool here; device subclasses override this. | |||
|   return nullptr; | |||
| } | |||
| void MemoryManager::FreeTensorMemDynamic(void *device_ptr) { | |||
|   if (device_ptr == nullptr) { | |||
|     MS_LOG(ERROR) << "FreeTensorMemDynamic device_ptr is null."; | |||
|   } | |||
|   // Base-class default is a no-op; device subclasses override this. | |||
| } | |||
| } // namespace device | |||
| } // namespace mindspore | |||
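The two pool hooks above are deliberately inert in the base class; a device backend overrides them together with the pure-virtual arena methods. A minimal subclass sketch, assuming `device/memory_manager.h` is on the include path (the pool type and its method names are hypothetical stand-ins, not this patch's Ascend allocator):

```cpp
#include <cstdlib>
#include "device/memory_manager.h"

// Stand-in pool so the sketch is self-contained; a real backend would route
// these calls to its device allocator.
struct FakeDevicePool {
  static void *AllocTensorMem(size_t size) { return std::malloc(size); }
  static void FreeTensorMem(void *ptr) { std::free(ptr); }
};

class FakeMemoryManager : public mindspore::device::MemoryManager {
 public:
  void MallocDeviceMemory() override { /* reserve the arena, set device_mem_base_ and offsets */ }
  void FreeDeviceMemory() override { /* release the arena */ }
  void *AllocTensorMemDynamic(size_t size) override { return FakeDevicePool::AllocTensorMem(size); }
  void FreeTensorMemDynamic(void *device_ptr) override { FakeDevicePool::FreeTensorMem(device_ptr); }
};
```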
| @@ -0,0 +1,71 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ | |||
| #include <memory> | |||
| #include "pre_activate/mem_reuse/mem_reuse.h" | |||
| #include "pre_activate/mem_reuse/mem_reuse_allocator.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| const int kStaticMem = 0; | |||
| const int kDynamicMem = 1; | |||
| const int kReuseDynamicMem = 2; | |||
| const int kGetAllOuts = -1; | |||
| const uint64_t kMemAlignSize = 512; | |||
| using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr; | |||
| class MemoryManager { | |||
| public: | |||
| MemoryManager() = default; | |||
| virtual ~MemoryManager(); | |||
| virtual void MallocDeviceMemory() = 0; | |||
| virtual void FreeDeviceMemory() = 0; | |||
| void ResetDynamicMemory() { | |||
| total_dynamic_size_ = 0; | |||
| dynamic_mem_offset_ = 0; | |||
| } | |||
| void InitReuseDynamicMemory(session::KernelGraph *graph); | |||
| uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size); | |||
| uint8_t *MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size); | |||
| virtual uint8_t *MallocMem(int flag, size_t size); | |||
|   // Allocate memory using the dynamic memory pool. | |||
| virtual void *AllocTensorMemDynamic(size_t size); | |||
|   // Free memory using the dynamic memory pool. | |||
| virtual void FreeTensorMemDynamic(void *device_ptr); | |||
| virtual void MallocOpMemory(const DeviceAddressPtr address, size_t size); | |||
| size_t GetCommonAlignSize(size_t input_size) const; | |||
| size_t GetCommunicationAlignSize(size_t input_size) const; | |||
| protected: | |||
| virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem); | |||
| virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem); | |||
| uint8_t *device_mem_base_{nullptr}; | |||
| uint8_t *device_mem_pool_base_{nullptr}; | |||
| uint64_t device_mem_size_{0}; | |||
| uint64_t device_mem_pool_size_{0}; | |||
| uint64_t dynamic_mem_offset_{0}; | |||
| uint64_t static_mem_offset_{0}; | |||
| size_t total_static_size_ = 0; | |||
| size_t total_dynamic_size_ = 0; | |||
| MemReuseUtilPtr mem_reuse_util_ptr_{nullptr}; | |||
| }; | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ | |||
| @@ -857,5 +857,15 @@ void AnfRuntimeAlgorithm::SetNodeInput(const CNodePtr &node, const AnfNodePtr &i | |||
| MS_EXCEPTION_IF_NULL(input_node); | |||
| node->set_input(index + 1, input_node); | |||
| } | |||
| bool AnfRuntimeAlgorithm::IsCommunicationOp(const AnfNodePtr &node) { | |||
|   MS_EXCEPTION_IF_NULL(node); | |||
|   auto kernel_name = AnfAlgo::GetCNodeName(node); | |||
|   auto kernel_type = AnfAlgo::GetKernelType(node); | |||
|   return kernel_name == kAllReduceOpName || kernel_type == HCCL_KERNEL; | |||
| } | |||
| } // namespace session | |||
| } // namespace mindspore | |||
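The new `IsCommunicationOp` helper centralizes the AllReduce/HCCL test that `MallocOutputMem` relies on for its padded-alignment path; a caller sizing buffers by hand could combine it with the alignment helpers roughly like this (illustrative fragment, not from the patch):

```cpp
// Sketch: pick the alignment rule based on the node kind.
size_t aligned_size = AnfAlgo::IsCommunicationOp(node)
                        ? mem_manager->GetCommunicationAlignSize(size)
                        : mem_manager->GetCommonAlignSize(size);
```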
| @@ -166,6 +166,7 @@ class AnfRuntimeAlgorithm { | |||
| static bool IsFeatureMapInput(const AnfNodePtr &node, size_t input_index); | |||
| // get real input index for some tbe ops which input order is different between me and tbe impl | |||
| static size_t GetRealInputIndex(const AnfNodePtr &anf_node, const size_t cur_index); | |||
| static bool IsCommunicationOp(const AnfNodePtr &node); | |||
| }; | |||
| } // namespace session | |||
| using AnfAlgo = session::AnfRuntimeAlgorithm; | |||
| @@ -102,10 +102,6 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList | |||
| graph->set_execution_order(execution_order); | |||
| // Alloc memory, including static memory and dynamic memory | |||
| AllocateMemory(graph.get()); | |||
| // Reset memory resource | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| runtime_instance->FreeHostMemory(); | |||
| return graph_id; | |||
| } | |||
| @@ -85,6 +85,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/kernel/oplib/*.cc" | |||
| "../../../mindspore/ccsrc/kernel/tbe/*.cc" | |||
| "../../../mindspore/ccsrc/device/kernel_runtime.cc" | |||
| "../../../mindspore/ccsrc/device/memory_manager.cc" | |||
| "../../../mindspore/ccsrc/device/kernel_runtime_manager.cc" | |||
| "../../../mindspore/ccsrc/device/kernel_info.cc" | |||
| "../../../mindspore/ccsrc/device/ascend/profiling/*.cc" | |||
| @@ -92,6 +93,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/device/convert_tensor_utils.cc" | |||
| "../../../mindspore/ccsrc/device/ascend/kernel_build_ascend.cc" | |||
| "../../../mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc" | |||
| "../../../mindspore/ccsrc/device/ascend/ascend_memory_manager.cc" | |||
| "../../../mindspore/ccsrc/device/ascend/ascend_device_address.cc" | |||
| "../../../mindspore/ccsrc/device/ascend/ascend_memory_allocator.cc" | |||
| "../../../mindspore/ccsrc/predict/generator/utils/ir_model_util.cc" | |||