GitOrigin-RevId: da2cc22436
tags/v1.7.0
@@ -25,6 +25,11 @@ __all__ = [
     "set_default_device",
     "get_mem_status_bytes",
     "get_cuda_compute_capability",
+    "get_allocated_memory",
+    "get_reserved_memory",
+    "get_max_reserved_memory",
+    "get_max_allocated_memory",
+    "reset_max_memory_stats",
     "set_prealloc_config",
     "coalesce_free_memory",
     "DeviceType",
@@ -157,6 +162,61 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
     return _get_cuda_compute_capability(device, device_type)
 
 
+def get_allocated_memory(device: Optional[str] = None):
+    r"""Returns the current memory occupied by tensors on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_used_memory
+
+
+def get_reserved_memory(device: Optional[str] = None):
+    r"""Returns the current memory managed by the caching allocator on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_reserved_memory
+
+
+def get_max_reserved_memory(device: Optional[str] = None):
+    r"""Returns the maximum memory managed by the caching allocator on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_max_reserved_memory
+
+
+def get_max_allocated_memory(device: Optional[str] = None):
+    r"""Returns the maximum memory occupied by tensors on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_max_used_memory
+
+
+def reset_max_memory_stats(device: Optional[str] = None):
+    r"""Resets the maximum memory stats on the computing device.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to properly reset memory stats.
+    """
+    if device is None:
+        device = get_default_device()
+    CompNode.reset_max_memory_stats(device)
+
+
 set_default_device(os.getenv("MGE_DEFAULT_DEVICE", "xpux"))
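Taken together, the new helpers give a consolidated view of device memory from Python. A minimal usage sketch, assuming a CUDA build and that the helpers are exported from megengine.device alongside the other entries in the __all__ list above (tensor shape and printed values are illustrative only):

import numpy as np
import megengine as mge
from megengine import device

x = mge.Tensor(np.ones((1024, 1024), dtype="float32"))  # put ~4 MiB on the default device

mge._full_sync()                           # flush async execution so the counters are accurate
print(device.get_allocated_memory())       # bytes currently occupied by live tensors
print(device.get_reserved_memory())        # bytes currently held by the caching allocator
print(device.get_max_allocated_memory())   # peak tensor usage since startup or the last reset
print(device.get_max_reserved_memory())    # peak allocator usage since startup or the last reset

device.reset_max_memory_stats()            # clear both recorded peaks (they restart from zero)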
@@ -73,6 +73,26 @@ void init_common(py::module m) {
                     [](const CompNode& cn) {
                         return cn.get_mem_status_bytes();
                     })
+            .def_property_readonly(
+                    "get_used_memory",
+                    [](const CompNode& cn) { return cn.get_used_memory(); })
+            .def_property_readonly(
+                    "get_max_used_memory",
+                    [](const CompNode& cn) { return cn.get_max_used_memory(); })
+            .def_property_readonly(
+                    "get_reserved_memory",
+                    [](const CompNode& cn) { return cn.get_reserved_memory(); })
+            .def_property_readonly(
+                    "get_max_reserved_memory",
+                    [](const CompNode& cn) {
+                        return cn.get_max_reserved_memory();
+                    })
+            .def_static(
+                    "reset_max_memory_stats",
+                    [](const CompNode& cn) {
+                        cn.reset_max_used_memory();
+                        cn.reset_max_reserved_memory();
+                    })
             .def("create_event", &CompNode::create_event,
                  py::arg("flags") = 0ul)
             .def_static("_set_default_device", &set_default_device)
@@ -208,20 +208,7 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
 public:
     CompNodeImpl() : Impl(static_free_device, static_free_host) {}
 
-    void* alloc_device(size_t size) override {
-        activate();
-#if MGB_BUILD_SLIM_SERVING
-        return m_mem_alloc->alloc(size);
-#else
-        void* ptr = m_mem_alloc->alloc(size);
-        {
-            MGB_LOCK_GUARD(m_update_mem);
-            ptr2size[ptr] = size;
-            m_used_mem += size;
-        }
-        return ptr;
-#endif
-    }
+    void* alloc_device(size_t size) override;
 
     void free_device(void* ptr);
@@ -311,20 +298,30 @@ public:
     uint64_t get_uid() override { return m_uid; }
 
 #if !MGB_BUILD_SLIM_SERVING
-    size_t get_used_memory() override { return m_used_mem; }
+    size_t get_used_memory() override;
+    size_t get_max_used_memory() override;
+    size_t get_reserved_memory() override;
+    size_t get_max_reserved_memory() override;
+    void reset_max_used_memory() override;
+    void reset_max_reserved_memory() override;
 #endif
 
 private:
     uint64_t m_uid;
 #if !MGB_BUILD_SLIM_SERVING
     std::unordered_map<void*, size_t> ptr2size;
-    size_t m_used_mem = 0;
 #endif
 };
 
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
 
 struct CudaCompNodeImpl::DeviceInfo {
     int dev_num = -1;
+    std::atomic_size_t m_used_mem{0};
+    std::atomic_size_t m_max_used_mem{0};
     std::unique_ptr<mem_alloc::DevMemAlloc> mem_alloc;
 
     bool init_done() const { return mem_alloc.get(); }
@@ -438,6 +435,24 @@ void CudaCompNodeImpl::fini() {
     m_initialized = false;
 }
 
+void* CudaCompNodeImpl::alloc_device(size_t size) {
+    activate();
+#if MGB_BUILD_SLIM_SERVING
+    return m_mem_alloc->alloc(size);
+#else
+    void* ptr = m_mem_alloc->alloc(size);
+    {
+        MGB_LOCK_GUARD(m_update_mem);
+        ptr2size[ptr] = size;
+        m_device_info->m_used_mem += size;
+        if (m_device_info->m_used_mem > m_device_info->m_max_used_mem) {
+            m_device_info->m_max_used_mem = m_device_info->m_used_mem.load();
+        }
+    }
+    return ptr;
+#endif
+}
+
 void CudaCompNodeImpl::free_device(void* ptr) {
     if (check_global_finalized())
         return;
@@ -447,13 +462,39 @@ void CudaCompNodeImpl::free_device(void* ptr) {
     {
         MGB_LOCK_GUARD(m_update_mem);
         mgb_assert(ptr2size.find(ptr) != ptr2size.end(), "ptr %p not found!", ptr);
-        m_used_mem -= ptr2size.at(ptr);
+        m_device_info->m_used_mem -= ptr2size.at(ptr);
         ptr2size.erase(ptr);
     }
 #endif
 
     m_mem_alloc->free(ptr);
 }
 
+#if !MGB_BUILD_SLIM_SERVING
+size_t CudaCompNodeImpl::get_used_memory() {
+    return m_device_info->m_used_mem.load();
+}
+
+size_t CudaCompNodeImpl::get_max_used_memory() {
+    return m_device_info->m_max_used_mem.load();
+}
+
+void CudaCompNodeImpl::reset_max_used_memory() {
+    m_device_info->m_max_used_mem = 0;
+}
+
+size_t CudaCompNodeImpl::get_reserved_memory() {
+    return m_device_info->mem_alloc->get_used_memory();
+}
+
+size_t CudaCompNodeImpl::get_max_reserved_memory() {
+    return m_device_info->mem_alloc->get_max_used_memory();
+}
+
+void CudaCompNodeImpl::reset_max_reserved_memory() {
+    m_device_info->mem_alloc->reset_max_used_memory();
+}
+#endif
+
 void* CudaCompNodeImpl::alloc_host(size_t size) {
     // need activate because it creates a cuda context on the current device
     activate();
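The CUDA implementation follows a simple high-water-mark pattern: each allocation records its size in ptr2size, bumps the per-device m_used_mem counter under m_update_mem, and raises m_max_used_mem whenever the new total exceeds it; free_device subtracts the recorded size again, and reset_max_used_memory sets the peak back to zero rather than to the current usage. A minimal Python sketch of the same bookkeeping (names are illustrative, not MegEngine API):

import threading

class WatermarkTracker:
    """Toy model of the used/peak accounting in CudaCompNodeImpl."""

    def __init__(self):
        self._lock = threading.Lock()  # stands in for MGB_LOCK_GUARD(m_update_mem)
        self._ptr2size = {}            # ptr -> size, like the ptr2size map
        self.used = 0                  # m_used_mem
        self.max_used = 0              # m_max_used_mem

    def on_alloc(self, ptr, size):
        with self._lock:
            self._ptr2size[ptr] = size
            self.used += size
            if self.used > self.max_used:  # raise the high-water mark
                self.max_used = self.used

    def on_free(self, ptr):
        with self._lock:
            self.used -= self._ptr2size.pop(ptr)

    def reset_max(self):
        self.max_used = 0              # peak restarts from zero, as in reset_max_used_memory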
@@ -226,6 +226,9 @@ StreamMemAlloc* DevMemAllocImpl::add_stream(StreamKey stream) {
 MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc(size_t size) {
     auto addr = do_alloc(size, true);
     m_used_size += size;
+    if (m_used_size > m_max_used_size) {
+        m_max_used_size = m_used_size.load();
+    }
     return addr;
 }
@@ -291,6 +294,9 @@ MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc_from_parent(size_t size) {
         // exception would be thrown from here
         auto t = do_alloc(size, false, true);
         m_used_size += size;
+        if (m_used_size > m_max_used_size) {
+            m_max_used_size = m_used_size.load();
+        }
         return t;
     }
 }
@@ -419,6 +425,9 @@ void DevMemAllocImpl::insert_free_unsafe(const FreeBlock& block) {
             child->insert_free_unsafe(block);
         }
         m_used_size += block.size;
+        if (m_used_size > m_max_used_size) {
+            m_max_used_size = m_used_size.load();
+        }
     } else {
         MemAllocImplHelper::insert_free_unsafe(block);
     }
@@ -171,6 +171,7 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper {
     size_t m_tot_allocated_from_raw = 0;
     std::atomic_size_t m_used_size{0};
+    std::atomic_size_t m_max_used_size{0};
 
     /*!
      * \brief gather all free blocks from child streams, and release full chunks
@@ -197,6 +198,10 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper {
     size_t get_used_memory() override { return m_used_size.load(); }
 
+    size_t get_max_used_memory() override { return m_max_used_size.load(); }
+
+    void reset_max_used_memory() override { m_max_used_size = 0; }
+
     void insert_free_unsafe(const FreeBlock& block) override;
 
     /*!
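These allocator-level counters are what back the reserved statistics one level below CompNode: DevMemAllocImpl's m_used_size / m_max_used_size track memory held by the caching allocator, while m_used_mem / m_max_used_mem in CudaCompNodeImpl track bytes occupied by live tensors, i.e. the allocated figures. The gap between the two is roughly the memory the allocator is caching for reuse. A quick way to see it from Python, under the same assumptions as the earlier sketch:

import megengine as mge
from megengine import device

mge._full_sync()
allocated = device.get_allocated_memory()
reserved = device.get_reserved_memory()
print(f"allocated={allocated} B, reserved={reserved} B, cached~={reserved - allocated} B")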
@@ -335,11 +335,23 @@ public:
     size_t get_used_memory() const { return m_impl->get_used_memory(); }
 
+    size_t get_reserved_memory() const { return m_impl->get_reserved_memory(); }
+
+    size_t get_max_reserved_memory() const { return m_impl->get_max_reserved_memory(); }
+
+    size_t get_max_used_memory() const { return m_impl->get_max_used_memory(); }
+
     size_t get_max_block_size_available() const {
         return m_impl->get_max_block_size_available();
     }
 
     size_t get_free_mem() const { return m_impl->get_free_mem(); }
 
+    void reset_max_reserved_memory() const {
+        return m_impl->reset_max_reserved_memory();
+    }
+
+    void reset_max_used_memory() const { return m_impl->reset_max_used_memory(); }
 #endif
 
     //! change to another stream on the same memory node
@@ -533,8 +545,13 @@ protected:
         return {x - x, y - y};
     }
 
     virtual size_t get_used_memory() { return 0; }
+    virtual size_t get_reserved_memory() { return 0; }
+    virtual size_t get_max_reserved_memory() { return 0; }
+    virtual size_t get_max_used_memory() { return 0; }
     virtual size_t get_max_block_size_available() { return 0; }
     virtual size_t get_free_mem() { return 0; }
+    virtual void reset_max_reserved_memory() {}
+    virtual void reset_max_used_memory() {}
 #endif
 
     virtual Locator locator() = 0;
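The zero-returning defaults above are the fallback for any comp node whose Impl does not override the new virtuals, and the whole group sits under #if !MGB_BUILD_SLIM_SERVING, so on such backends the Python helpers simply report 0 rather than failing. For example, with an explicit device argument (device name illustrative, and whether a given backend overrides these virtuals is an assumption):

from megengine import device

print(device.get_max_allocated_memory("cpu0"))  # 0 if the CPU impl keeps the default stubs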
@@ -275,6 +275,10 @@ public:
     const PreAllocConfig& prealloc_config() { return m_prealloc_config; }
 
     virtual size_t get_used_memory() { return 0; }
+
+    virtual size_t get_max_used_memory() { return 0; }
+
+    virtual void reset_max_used_memory() {}
 
 private:
     size_t m_alignment = 1;
     PreAllocConfig m_prealloc_config;