Merge pull request !30729 from limingqi107/new_actor_runtimefeature/build-system-rewrite
| @@ -16,7 +16,6 @@ | |||
| #include "common/mem_reuse/mem_dynamic_allocator.h" | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "include/common/utils/convert_utils.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/ms_context.h" | |||
| @@ -30,6 +29,13 @@ constexpr size_t kGBToByte = 1024 << 20; | |||
| // Minimum allocation size, set empirically to 10 MB. | |||
| const size_t kMinimumAllocMem = 10 << 20; | |||
| thread_local AllocatorDebugInfo DynamicMemAllocatorDebugInfo::debug_info_; | |||
| static const std::map<DynamicMemBufStatus, std::string> kBufStatusString = { | |||
| {kMemBufIdle, "idle"}, | |||
| {kMemBufUsed, "used"}, | |||
| }; | |||
| DynamicMemPoolBestFit::~DynamicMemPoolBestFit() { | |||
| persistent_mem_->clear(); | |||
| common_mem_->clear(); | |||
| @@ -43,8 +49,11 @@ DeviceMemPtr DynamicMemPoolBestFit::AllocTensorMem(size_t size, bool from_persis | |||
| if (!device_addr) { | |||
| device_addr = AddMemBlockAndMemBuf(align_size, from_persistent_mem); | |||
| } | |||
| // Alloc memory failed and dump the info. | |||
| if (!device_addr) { | |||
| DumpDynamicMemPoolInfo(); | |||
| DumpDynamicMemPoolDebugInfo(); | |||
| DumpDynamicMemPoolStateInfo(); | |||
| } | |||
| return device_addr; | |||
| } | |||
| @@ -79,7 +88,9 @@ std::vector<DeviceMemPtr> DynamicMemPoolBestFit::AllocContinuousTensorMem(size_t | |||
| DynamicMemBufPtr continuous_mem_buf; | |||
| auto buf_addr = device_addr; | |||
| for (size_t i : size_list) { | |||
| continuous_mem_buf = std::make_shared<DynamicMemBuf>(buf_addr, kMemBufUsed, i); | |||
| continuous_mem_buf = | |||
| std::make_shared<DynamicMemBuf>(buf_addr, kMemBufUsed, i, DynamicMemAllocatorDebugInfo::GetDebugInfo().name_); | |||
| MS_EXCEPTION_IF_NULL(continuous_mem_buf); | |||
| (void)mem_block->block_all_mem_buf_map_.emplace(buf_addr, continuous_mem_buf); | |||
| device_addr_list.emplace_back(buf_addr); | |||
| buf_addr = AddressOffset(buf_addr, i); | |||
| @@ -111,6 +122,7 @@ DeviceMemPtr DynamicMemPoolBestFit::FindIdleMemBuf(size_t size, bool from_persis | |||
| << "] mem_buf_address[" << mem_buf->device_addr_ << "]."; | |||
| } | |||
| mem_buf->status_ = kMemBufUsed; | |||
| mem_buf->allocator_name_ = DynamicMemAllocatorDebugInfo::GetDebugInfo().name_; | |||
| // Remove map of old idle memory buf | |||
| (void)mem_mng->idle_mem_buf_map_.erase(iter); | |||
| // Divide memory buf | |||
| @@ -200,7 +212,8 @@ DeviceMemPtr DynamicMemPoolBestFit::AddMemBlockAndMemBuf(size_t size, bool from_ | |||
| std::upper_bound(mem_mng->mem_block_list_.begin(), mem_mng->mem_block_list_.end(), device_addr, CmpMemBlock); | |||
| (void)mem_mng->mem_block_list_.insert(iter, mem_block); | |||
| // Add new memory buf | |||
| auto mem_buf = std::make_shared<DynamicMemBuf>(device_addr, kMemBufUsed, real_alloc_size); | |||
| auto mem_buf = std::make_shared<DynamicMemBuf>(device_addr, kMemBufUsed, real_alloc_size, | |||
| DynamicMemAllocatorDebugInfo::GetDebugInfo().name_); | |||
| MS_EXCEPTION_IF_NULL(mem_buf); | |||
| // Add map of new memory buf in the block | |||
| (void)mem_block->block_all_mem_buf_map_.emplace(device_addr, mem_buf); | |||
| @@ -395,7 +408,7 @@ void DynamicMemPoolBestFit::ReleaseDeviceRes() { | |||
| fn(persistent_mem_); | |||
| } | |||
| void DynamicMemPoolBestFit::DumpDynamicMemPoolInfo() { | |||
| void DynamicMemPoolBestFit::DumpDynamicMemPoolStateInfo() { | |||
| auto fn = [](const MemStatusManagerPtr &mem_mng, const std::string &mem_type) { | |||
| if (mem_mng->mem_block_list_.empty()) { | |||
| return; | |||
| @@ -409,17 +422,73 @@ void DynamicMemPoolBestFit::DumpDynamicMemPoolInfo() { | |||
| idle_size += mb->second->size_; | |||
| } | |||
| } | |||
| buf << ", block[" << i << "] idle size " << idle_size; | |||
| buf << ", block[" << i << "] block size:" << mem_mng->mem_block_list_[i]->mem_block_size_ | |||
| << " idle size:" << idle_size; | |||
| } | |||
| // Dump all the memory buf info | |||
| MS_LOG(WARNING) << mem_type << "pool info: block size " << mem_mng->unit_size_ << ", block counts " | |||
| MS_LOG(WARNING) << mem_type << " pool info: block size " << mem_mng->unit_size_ << ", block counts " | |||
| << mem_mng->mem_block_list_.size() << buf.str() << ". Total allocated mem " | |||
| << mem_mng->mps_.total_mem_size_ << ", peak used mem " << mem_mng->mps_.used_mem_peak_size_ | |||
| << ", in used mem " << mem_mng->mps_.total_used_mem_size_ << ", total idle mem " | |||
| << mem_mng->mps_.total_mem_size_ - mem_mng->mps_.total_used_mem_size_; | |||
| }; | |||
| fn(common_mem_, std::string(kCommonMem)); | |||
| fn(persistent_mem_, std::string(kPersistentParamMem)); | |||
| } | |||
| void DynamicMemPoolBestFit::DumpDynamicMemPoolDebugInfo() { | |||
| auto fn = [](const MemStatusManagerPtr &mem_mng, const std::string &mem_type) { | |||
| size_t total_mem = 0; | |||
| size_t total_used_mem = 0; | |||
| size_t total_idle_mem1 = 0; | |||
| size_t total_idle_mem2 = 0; | |||
| // Dump the memory block info and memory buf info. | |||
| MS_LOG(INFO) << mem_type << " all mem_block info: counts[" << mem_mng->mem_block_list_.size() << "]."; | |||
| for (auto iter = mem_mng->mem_block_list_.begin(); iter != mem_mng->mem_block_list_.end(); ++iter) { | |||
| total_mem += (*iter)->size(); | |||
| auto mem_buf_map = (*iter)->block_all_mem_buf_map_; | |||
| MS_LOG(INFO) << " MemBlock info: number[" << iter - mem_mng->mem_block_list_.begin() << "] mem_buf_counts[" | |||
| << mem_buf_map.size() << "] base_address[" << (*iter)->device_addr() << "] block_size[" | |||
| << (*iter)->size() << "]."; | |||
| for (auto iter_mem_buf = mem_buf_map.begin(); iter_mem_buf != mem_buf_map.end(); ++iter_mem_buf) { | |||
| auto mem_buf = iter_mem_buf->second; | |||
| MS_EXCEPTION_IF_NULL(mem_buf); | |||
| if (mem_buf->status_ == kMemBufIdle) { | |||
| total_idle_mem1 += mem_buf->size_; | |||
| } else { | |||
| total_used_mem += mem_buf->size_; | |||
| } | |||
| auto user_name = (mem_buf->status_ == kMemBufUsed) ? ("] name[" + mem_buf->allocator_name_ + "].") : "]."; | |||
| MS_LOG(INFO) << " MemBuf info: address[" << mem_buf->device_addr_ << "] size[" << mem_buf->size_ << "] status[" | |||
| << kBufStatusString.at(mem_buf->status_) << user_name; | |||
| } | |||
| } | |||
| // Dump all the idle memory buf info. | |||
| MS_LOG(INFO) << mem_type << " all idle mem_buf info: counts[" << mem_mng->idle_mem_buf_map_.size() << "]."; | |||
| for (auto iter_idle = mem_mng->idle_mem_buf_map_.begin(); iter_idle != mem_mng->idle_mem_buf_map_.end(); | |||
| ++iter_idle) { | |||
| auto mem_buf = iter_idle->second; | |||
| MS_EXCEPTION_IF_NULL(mem_buf); | |||
| total_idle_mem2 += mem_buf->size_; | |||
| MS_LOG(INFO) << " Idle mem_buf info: size[" << mem_buf->size_ << "] address[" << mem_buf->device_addr_ | |||
| << "] status[" << kBufStatusString.at(mem_buf->status_) << "]."; | |||
| } | |||
| // Dump the memory statistical info. | |||
| MS_LOG(INFO) << mem_type << " total allocated memory[" << total_mem << "], used memory[" << total_used_mem | |||
| << "], idle memory[" << total_idle_mem1 << "]."; | |||
| if (total_idle_mem1 != total_idle_mem2) { | |||
| MS_LOG(ERROR) << "Check error: the idle memory in the mem_block is not equal the global idle memory."; | |||
| } | |||
| if (total_mem != total_used_mem + total_idle_mem1) { | |||
| MS_LOG(ERROR) << "Check error: the the total memory is not equal the sum of used memory and idle memory."; | |||
| } | |||
| }; | |||
| MS_LOG(INFO) << "Start dump dynamic memory pool debug info."; | |||
| fn(common_mem_, std::string(kCommonMem)); | |||
| fn(persistent_mem_, std::string(kPersistentParamMem)); | |||
| MS_LOG(INFO) << "Finish dump dynamic memory pool debug info."; | |||
| } | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -24,6 +24,8 @@ | |||
| #include <utility> | |||
| #include <thread> | |||
| #include <mutex> | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| @@ -43,13 +45,42 @@ struct DeviceAddrCmp { | |||
| bool operator()(const DeviceMemPtr &addr1, const DeviceMemPtr &addr2) const { return addr1 < addr2; } | |||
| }; | |||
// Information recorded for debugging the memory allocator.
struct AllocatorDebugInfo {
  // Name stored with the record; "Unknown" until it is overwritten.
  std::string name_ = "Unknown";
  // Input index stored with the record; -1 by default.
  int input_index_ = -1;
  // Output index stored with the record; -1 by default.
  int output_index_ = -1;
};
| // The AllocatorDebugInfo warpper which is the local thread for the dynamic memory pool. | |||
| class DynamicMemAllocatorDebugInfo { | |||
| public: | |||
| static AllocatorDebugInfo &GetDebugInfo() noexcept { return debug_info_; } | |||
| // Set the debug info when memory alloc. | |||
| static void SetDebugInfo(const std::string &name, int input_index = -1, int output_index = -1) { | |||
| debug_info_.name_ = name; | |||
| debug_info_.input_index_ = input_index; | |||
| debug_info_.output_index_ = output_index; | |||
| } | |||
| private: | |||
| DynamicMemAllocatorDebugInfo() = default; | |||
| virtual ~DynamicMemAllocatorDebugInfo() = default; | |||
| DISABLE_COPY_AND_ASSIGN(DynamicMemAllocatorDebugInfo); | |||
| static thread_local AllocatorDebugInfo debug_info_; | |||
| }; | |||
| // Memory buf is the smallest operation object of dynamic memory pool. | |||
| struct DynamicMemBuf { | |||
| DynamicMemBuf(DeviceMemPtr addr, DynamicMemBufStatus status, size_t size) | |||
| : device_addr_(addr), status_(status), size_(size) {} | |||
| DynamicMemBuf(DeviceMemPtr addr, DynamicMemBufStatus status, size_t size, | |||
| const std::string &allocator_name = "Unknown") | |||
| : device_addr_(addr), status_(status), size_(size), allocator_name_(allocator_name) {} | |||
| DeviceMemPtr device_addr_; | |||
| DynamicMemBufStatus status_; | |||
| size_t size_; | |||
| std::string allocator_name_; | |||
| }; | |||
| using DynamicMemBufPtr = std::shared_ptr<DynamicMemBuf>; | |||
| // Multimap key is the tensor size, for finding the idle memory buf by tensor size. | |||
| @@ -123,6 +154,8 @@ class DynamicMemPoolBestFit { | |||
| void SetMemAllocUintSize(size_t common_size, size_t persist_size = DYNAMIC_MEM_ALLOC_UNIT_SIZE); | |||
| // Set mem pool block size | |||
| void SetMemPoolBlockSize(size_t available_device_mem_size); | |||
| // The statistics information. | |||
| size_t TotalMemStatistics() const { | |||
| return common_mem_->mps_.total_mem_size_ + persistent_mem_->mps_.total_mem_size_; | |||
| } | |||
| @@ -133,6 +166,11 @@ class DynamicMemPoolBestFit { | |||
| return common_mem_->mps_.used_mem_peak_size_ + persistent_mem_->mps_.used_mem_peak_size_; | |||
| } | |||
| // Display the brief state information of memory block and memory buf. | |||
| void DumpDynamicMemPoolStateInfo(); | |||
| // Display the detailed debug information of memory block and memory buf. | |||
| void DumpDynamicMemPoolDebugInfo(); | |||
| // Interfaces for the real device memory operations; each device type must override them. | |||
| virtual size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) = 0; | |||
| virtual bool FreeDeviceMem(const DeviceMemPtr &addr) = 0; | |||
| @@ -164,8 +202,6 @@ class DynamicMemPoolBestFit { | |||
| const MemStatusManagerPtr &mem_mng); | |||
| // Erase the idle memory buf by size and device address when idle memory buf is combined. | |||
| void EraseIdleMemBuf(size_t size, const DeviceMemPtr &device_addr, const MemStatusManagerPtr &mem_mng); | |||
| // Display the information of memory block and memory buf. | |||
| void DumpDynamicMemPoolInfo(); | |||
| // Support multi-thread. | |||
| std::mutex mutex_; | |||
| @@ -394,6 +394,7 @@ void ControlActor::UpdateOutputData(OpData<DeviceTensor> *const output_data, con | |||
| const auto &device_context = device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext( | |||
| {device_tensor->device_name(), device_tensor->device_id()}); | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(GetAID().Name(), 0); | |||
| if ((device_tensor->GetPtr() == nullptr) && | |||
| (!device_context->AllocateMemory(device_tensor.get(), device_tensor->GetSize()))) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, *context, *device_context, | |||
| @@ -175,6 +175,7 @@ void ExitActor::CopyDeviceAddress(OpContext<DeviceTensor> *const context) { | |||
| // If the address ptr can't be changed, then alloc the new device memory and copy the data. | |||
| if (input_device_tensor->is_ptr_persisted()) { | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(GetAID().Name()); | |||
| if (!device_contexts_[i]->AllocateMemory(new_device_tensor.get(), new_device_tensor->GetSize())) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, *context, *device_contexts_[i], | |||
| GetAID().Name(), new_device_tensor->GetSize()); | |||
| @@ -38,6 +38,7 @@ void SyncTensorData(const TensorPtr &host_tensor, const DeviceTensorPtr &device_ | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| MS_EXCEPTION_IF_NULL(context); | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(node->fullname_with_scope(), 0); | |||
| if ((device_tensor->GetPtr() == nullptr) && | |||
| (!device_context->AllocateMemory(device_tensor.get(), device_tensor->GetSize()))) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(strategy, *context, *device_context, node->fullname_with_scope(), | |||
| @@ -128,6 +129,7 @@ void PrepareDataForValue(const ValuePtr &value, const KernelWithIndex &node_with | |||
| return; | |||
| } | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(node->fullname_with_scope(), 0); | |||
| if (!device_context->AllocateMemory(device_tensor.get(), device_tensor->GetSize())) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, *context, *device_context, | |||
| node->fullname_with_scope(), device_tensor->GetSize()); | |||
| @@ -555,6 +557,7 @@ void DataPrepareActor::PrepareDataForControlValueNode(const KernelWithIndex &nod | |||
| tensor->set_device_address(device_tensor); | |||
| UpdateRefCount(device_tensor.get(), true); | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(node->DebugString(), 0); | |||
| if (!device_context->AllocateMemory(device_tensor.get(), device_tensor->GetSize())) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(real_strategy_, *context, *device_context, | |||
| node->fullname_with_scope(), device_tensor->GetSize()); | |||
| @@ -594,6 +597,7 @@ void DataPrepareActor::PrepareDataForValueNode(const ValueNodePtr &node, const D | |||
| } | |||
| MS_LOG(INFO) << "Prepare device data for value node: " << node->fullname_with_scope(); | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(node->fullname_with_scope(), 0); | |||
| if (!device_context->AllocateMemory(device_tensor.get(), device_tensor->GetSize())) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(real_strategy_, *context, *device_context, | |||
| node->fullname_with_scope(), device_tensor->GetSize()); | |||
| @@ -625,6 +629,7 @@ void DataPrepareActor::CopyDataFromHostToOtherDevice(const AnfNodePtr &front_nod | |||
| const auto &another_device_context = device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext( | |||
| {device::kDeviceTypeToName.at(another_device_type), device_context->device_context_key().device_id_}); | |||
| MS_EXCEPTION_IF_NULL(another_device_context); | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(backend_node->fullname_with_scope(), 0); | |||
| if ((another_device_tensor->GetPtr() == nullptr) && | |||
| (!another_device_context->AllocateMemory(another_device_tensor.get(), another_device_tensor->GetSize()))) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(real_strategy_, *context, *another_device_context, | |||
| @@ -319,6 +319,7 @@ void KernelActor::CopyInputDeviceTensor(const OpData<DeviceTensor> *input_data, | |||
| // Update the input device tensor. | |||
| input_device_tensors_[input_data->index_] = new_device_tensor.get(); | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(GetAID().Name(), input_data->index_); | |||
| if ((new_device_tensor->GetPtr() == nullptr) && | |||
| (!device_contexts_[0]->AllocateMemory(new_device_tensor.get(), new_device_tensor->GetSize()))) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(strategy_, *context, *(device_contexts_[0]), GetAID().Name(), | |||
| @@ -74,6 +74,7 @@ void MemoryManagerActor::AllocateMemory(const std::vector<DeviceTensor *> *alloc | |||
| } | |||
| try { | |||
| // Allocate memory through the device context. | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(from_aid.Name()); | |||
| if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) { | |||
| SetOpContextMemoryAllocFail(from_aid.Name(), device_context, device_tensor->GetSize(), op_context); | |||
| return; | |||
| @@ -112,6 +113,7 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector< | |||
| auto &device_context = (*device_contexts)[i]; | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| // Allocate memory through the device context. | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(from_aid.Name()); | |||
| if (!device_context->AllocateContinuousMemory(alloc_list, total_size, size_list)) { | |||
| SetOpContextMemoryAllocFail(from_aid.Name(), device_context, total_size, op_context); | |||
| return; | |||
| @@ -144,6 +146,7 @@ void MemoryManagerActor::AllocateBatchMemory(const std::vector<DeviceTensor *> * | |||
| try { | |||
| // Allocate memory through the device context. | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(from_aid.Name()); | |||
| if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) { | |||
| SetOpContextMemoryAllocFail(from_aid.Name(), device_context, device_tensor->GetSize(), op_context); | |||
| return; | |||
| @@ -193,6 +193,7 @@ void OutputActor::UpdateOutputDeviceAddress() { | |||
| if (output_node->isa<ValueNode>() || output_node->isa<Parameter>() || device_tensor->is_ptr_persisted()) { | |||
| auto device_context = device_contexts_[i]; | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| device::DynamicMemAllocatorDebugInfo::SetDebugInfo(GetAID().Name()); | |||
| if (!device_context->AllocateMemory(tensor_device_address.get(), tensor_device_address->GetSize())) { | |||
| MS_LOG(EXCEPTION) << "Device(id:" << device_context->device_context_key().device_id_ | |||
| << ") memory isn't enough and alloc failed, kernel name: " | |||