test(mgb): add SimpleCachingAlloc test
GitOrigin-RevId: 17f381e4ac
tags/v1.0.0-rc1
| @@ -99,6 +99,46 @@ public: | |||
| } | |||
| }; | |||
| /*! | |||
|  * \brief RawAllocator that serves requests with cudaHostAlloc() and | |||
|  *      releases them with cudaFreeHost() | |||
|  */ | |||
| class CudaHostAllocator : public RawAllocator { | |||
| public: | |||
|     /*! | |||
|      * \brief allocate \p size bytes with cudaHostAlloc(cudaHostAllocDefault) | |||
|      * \return the allocated address, or nullptr when CUDA reports | |||
|      *      cudaErrorMemoryAllocation (the failure is logged and the CUDA | |||
|      *      error is cleared before returning) | |||
|      * | |||
|      * Any other CUDA failure is raised as MemAllocError. | |||
|      */ | |||
|     void* alloc(size_t size) override { | |||
|         void* addr; | |||
|         cudaError_t cuda_error = | |||
|                 cudaHostAlloc(&addr, size, cudaHostAllocDefault); | |||
|         if (cuda_error == cudaSuccess) { | |||
|             mgb_assert(addr); | |||
|             return addr; | |||
|         } | |||
|         // size is a size_t, hence %zu rather than the signed %zd | |||
|         auto msg = mgb_ssprintf_log( | |||
|                 "cudaHostAlloc failed while requesting %zu bytes (%.3fMiB)" | |||
|                 " of pinned host memory; error: %s", | |||
|                 size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error)); | |||
|         msg.append(CudaError::get_cuda_extra_info()); | |||
|         if (cuda_error == cudaErrorMemoryAllocation) { | |||
|             mgb_log_error("%s", msg.c_str()); | |||
|             // clear cuda error | |||
|             cudaGetLastError(); | |||
|             mgb_assert(cudaGetLastError() == cudaSuccess); | |||
|             return nullptr; | |||
|         } | |||
|         mgb_throw_raw(MemAllocError{msg}); | |||
|     } | |||
|     /*! | |||
|      * \brief release \p ptr with cudaFreeHost() | |||
|      * | |||
|      * A failing cudaFreeHost() is raised as MemAllocError. | |||
|      */ | |||
|     void free(void* ptr) override { | |||
|         cudaError_t cuda_error = cudaFreeHost(ptr); | |||
|         if (cuda_error == cudaSuccess) | |||
|             return; | |||
|         auto msg = ssprintf("cudaFreeHost failed for %p: %s", ptr, | |||
|                             cudaGetErrorString(cuda_error)); | |||
|         msg.append(CudaError::get_cuda_extra_info()); | |||
|         mgb_throw_raw(MemAllocError{msg}); | |||
|     } | |||
|     //! no memory statistics are provided: both outputs are set to 0 | |||
|     void get_mem_info(size_t& free, size_t& tot) override { | |||
|         free = 0; | |||
|         tot = 0; | |||
|     } | |||
| }; | |||
| class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy { | |||
| public: | |||
| CompNode::DeviceType device_type() override { | |||
| @@ -175,19 +215,9 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl { | |||
| void free_device(void *ptr); | |||
| //! allocate \p size bytes of host memory with cudaMallocHost() after | |||
| //! activate() has been called | |||
| void *alloc_host(size_t size) override { | |||
|     activate(); | |||
|     void *host_ptr = nullptr; | |||
|     MGB_CUDA_CHECK(cudaMallocHost(&host_ptr, size)); | |||
|     return host_ptr; | |||
| } | |||
| void *alloc_host(size_t size) override; | |||
| //! release \p ptr with cudaFreeHost(); activate() is called first unless | |||
| //! check_global_finalized() returns true | |||
| void free_host(void *ptr) { | |||
|     const bool finalized = check_global_finalized(); | |||
|     if (!finalized) | |||
|         activate(); | |||
|     MGB_CUDA_CHECK(cudaFreeHost(ptr)); | |||
| } | |||
| void free_host(void *ptr); | |||
| void copy_to_host(void *host_ptr, | |||
| const void *device_ptr, size_t size) override { | |||
| @@ -284,14 +314,18 @@ struct CudaCompNodeImpl::StaticData { | |||
| mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; | |||
| std::unique_ptr<mem_alloc::SimpleCachingAlloc> host_alloc; | |||
| CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE]; | |||
| DeviceInfo dev_info[MAX_NR_DEVICE]; | |||
| int nr_node = 0, //!< number of loaded node[] | |||
| nr_dev_used = 0; //!< number of used dev_info[] | |||
| StaticData() { | |||
| StaticData() : host_alloc( | |||
| mem_alloc::SimpleCachingAlloc::make( | |||
| std::make_unique<mem_alloc::CudaHostAllocator>())) { | |||
| prealloc_config.max_overhead = 0; | |||
| prealloc_config.alignment = 1; | |||
| host_alloc->alignment(1); | |||
| } | |||
| ~StaticData() { | |||
| @@ -388,6 +422,18 @@ void CudaCompNodeImpl::free_device(void *ptr) { | |||
| m_mem_alloc->free(ptr); | |||
| } | |||
| //! allocate \p size bytes of host memory from the shared host allocator | |||
| void* CudaCompNodeImpl::alloc_host(size_t size) { | |||
|     // activate() is deliberately not called: under unified addressing, | |||
|     // host memory can be accessed and freed on any device | |||
|     auto&& host_allocator = sd->host_alloc; | |||
|     return host_allocator->alloc(size); | |||
| } | |||
| //! give \p ptr back to the shared host allocator; does nothing when | |||
| //! check_global_finalized() returns true | |||
| void CudaCompNodeImpl::free_host(void* ptr) { | |||
|     if (!check_global_finalized()) { | |||
|         sd->host_alloc->free(ptr); | |||
|     } | |||
| } | |||
| void CudaCompNodeImpl::peer_copy_to( | |||
| Impl *dest_impl, void *dest, const void *src, size_t size) { | |||
| if (dest_impl->same_type<CudaCompNodeImpl>()) { | |||
| @@ -364,5 +364,57 @@ DevMemAllocImpl::~DevMemAllocImpl() { | |||
| m_raw_allocator->free(i.first); | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| //! factory: wrap \p raw_alloc (ownership is transferred) in a | |||
| //! SimpleCachingAllocImpl | |||
| std::unique_ptr<SimpleCachingAlloc> SimpleCachingAlloc::make( | |||
|         std::unique_ptr<RawAllocator> raw_alloc) { | |||
|     std::unique_ptr<SimpleCachingAlloc> impl = | |||
|             std::make_unique<SimpleCachingAllocImpl>(std::move(raw_alloc)); | |||
|     return impl; | |||
| } | |||
| SimpleCachingAllocImpl::SimpleCachingAllocImpl(std::unique_ptr<RawAllocator> raw_alloc) | |||
| : m_raw_alloc(std::move(raw_alloc)) {} | |||
| //! allocate a block for \p size bytes and record it as in use | |||
| void* SimpleCachingAllocImpl::alloc(size_t size) { | |||
|     // the request size is first adjusted to m_alignment | |||
|     const size_t aligned_size = get_aligned_power2(size, m_alignment); | |||
|     MemAddr addr = do_alloc(aligned_size, true); | |||
|     auto ptr = addr.addr_ptr(); | |||
|     // bookkeeping below is done while holding m_mutex | |||
|     MGB_LOCK_GUARD(m_mutex); | |||
|     m_allocated_blocks[ptr] = {addr.is_head, aligned_size}; | |||
|     m_used_size += aligned_size; | |||
|     return ptr; | |||
| } | |||
| //! return a block previously handed out by alloc() to the free list; | |||
| //! asserts that \p ptr is a known allocated block | |||
| void SimpleCachingAllocImpl::free(void* ptr) { | |||
|     MGB_LOCK_GUARD(m_mutex); | |||
|     auto blk = m_allocated_blocks.find(ptr); | |||
|     mgb_assert(blk != m_allocated_blocks.end(), | |||
|                "releasing bad pointer: %p", ptr); | |||
|     // copy the record out before the map entry is erased | |||
|     const AllocatedBlock info = blk->second; | |||
|     m_allocated_blocks.erase(blk); | |||
|     FreeBlock released{ | |||
|             MemAddr{info.is_head, reinterpret_cast<size_t>(ptr)}, | |||
|             info.size}; | |||
|     merge_free_unsafe(released); | |||
|     m_used_size -= info.size; | |||
| } | |||
| //! hand every chunk recorded in m_alloc_from_raw back to the raw allocator | |||
| SimpleCachingAllocImpl::~SimpleCachingAllocImpl() { | |||
|     for (auto it = m_alloc_from_raw.begin(); | |||
|          it != m_alloc_from_raw.end(); ++it) { | |||
|         m_raw_alloc->free(it->first); | |||
|     } | |||
| } | |||
| /*! | |||
|  * \brief request a new chunk of \p size bytes from the raw allocator | |||
|  * \return head address of the new chunk | |||
|  * | |||
|  * The raw allocator may report failure by returning nullptr; this is | |||
|  * raised as MemAllocError so that a null chunk is never recorded in | |||
|  * m_alloc_from_raw nor handed out as a valid block. | |||
|  */ | |||
| SimpleCachingAllocImpl::MemAddr SimpleCachingAllocImpl::alloc_from_parent( | |||
|         size_t size) { | |||
|     void* ptr = m_raw_alloc->alloc(size); | |||
|     if (!ptr) { | |||
|         mgb_throw_raw(MemAllocError{ssprintf( | |||
|                 "SimpleCachingAlloc: raw allocator failed to allocate " | |||
|                 "%zu bytes", | |||
|                 size)}); | |||
|     } | |||
|     // remember the chunk so that the destructor can release it | |||
|     m_alloc_from_raw[ptr] = size; | |||
|     return {true, reinterpret_cast<size_t>(ptr)}; | |||
| } | |||
| //! name of this allocator implementation | |||
| std::string SimpleCachingAllocImpl::get_name() const { | |||
|     const char* const name = "SimpleCachingAllocImpl"; | |||
|     return name; | |||
| } | |||
| //! total size of the blocks currently handed out by alloc() | |||
| size_t SimpleCachingAllocImpl::get_used_memory() { | |||
|     const size_t used = m_used_size; | |||
|     return used; | |||
| } | |||
| //! same statistics as get_free_memory() | |||
| FreeMemStat SimpleCachingAllocImpl::get_free_memory_dev() { | |||
|     FreeMemStat stat = get_free_memory(); | |||
|     return stat; | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -211,7 +211,32 @@ public: | |||
| FreeMemStat get_free_memory_dev() override; | |||
| }; | |||
| /*! | |||
|  * \brief SimpleCachingAlloc implementation on top of MemAllocImplHelper | |||
|  *      that owns the raw allocator it obtains memory from | |||
|  */ | |||
| class SimpleCachingAllocImpl : public SimpleCachingAlloc, | |||
|                                public MemAllocImplHelper { | |||
|     //! record kept for every block handed out by alloc() | |||
|     struct AllocatedBlock { | |||
|         bool is_head; | |||
|         size_t size; | |||
|     }; | |||
|     std::unique_ptr<RawAllocator> m_raw_alloc; | |||
|     //! chunks obtained from m_raw_alloc: address -> size | |||
|     std::unordered_map<void*, size_t> m_alloc_from_raw; | |||
|     //! blocks currently handed out: address -> record | |||
|     std::unordered_map<void*, AllocatedBlock> m_allocated_blocks; | |||
|     //! sum of the sizes of all blocks in m_allocated_blocks | |||
|     size_t m_used_size = 0; | |||
| public: | |||
|     //! takes ownership of \p raw_alloc; explicit to forbid implicit | |||
|     //! conversion from a unique_ptr | |||
|     explicit SimpleCachingAllocImpl(std::unique_ptr<RawAllocator> raw_alloc); | |||
|     ~SimpleCachingAllocImpl() override; | |||
|     void* alloc(size_t size) override; | |||
|     void free(void* ptr) override; | |||
|     size_t get_used_memory() override; | |||
|     FreeMemStat get_free_memory_dev() override; | |||
| protected: | |||
|     MemAddr alloc_from_parent(size_t size) override; | |||
|     std::string get_name() const override; | |||
| }; | |||
| } | |||
| } | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -341,6 +341,32 @@ public: | |||
| FwdDevMemAlloc(const std::shared_ptr<RawAllocator>& ra) : m_raw_alloc(ra) {} | |||
| }; | |||
| /* ===================== SimpleCachingAlloc ===================== */ | |||
| /*! | |||
|  * \brief An allocator that caches allocations to reduce the number of | |||
|  *      calls to the raw allocator. Mainly used for CUDA pinned memory. | |||
|  */ | |||
| class SimpleCachingAlloc : virtual public MemAllocBase { | |||
| protected: | |||
|     //! alignment setting, 1 unless changed through alignment(size_t) | |||
|     size_t m_alignment = 1; | |||
| public: | |||
|     virtual ~SimpleCachingAlloc() = default; | |||
|     //! create an allocator that takes ownership of \p raw_alloc | |||
|     static std::unique_ptr<SimpleCachingAlloc> make( | |||
|             std::unique_ptr<RawAllocator> raw_alloc); | |||
|     virtual void* alloc(size_t size) = 0; | |||
|     virtual void free(void* ptr) = 0; | |||
|     //! set the alignment; returns *this so that calls can be chained | |||
|     SimpleCachingAlloc& alignment(size_t alignment) { | |||
|         m_alignment = alignment; | |||
|         return *this; | |||
|     } | |||
|     //! current alignment setting | |||
|     size_t alignment() const { return m_alignment; } | |||
| }; | |||
| } // mem_alloc | |||
| } // mgb | |||
| @@ -440,6 +440,54 @@ TEST(TestMemAlloc, RandomOprs) { | |||
| ASSERT_EQ(dummy_alloc->nr_alloc(), dummy_alloc->nr_free()); | |||
| } | |||
| // Walks a SimpleCachingAlloc through alloc/free/split/merge on top of a | |||
| // DummyAllocator and checks the counters of both after every step. | |||
| TEST(TestSimpleCachingAlloc, Basic) { | |||
|     constexpr size_t TOT = 2048, REQ = 1000; | |||
|     static_assert(TOT > REQ * 2, ""); | |||
|     // raw pointer kept only for inspection; ownership goes to the cache | |||
|     auto raw = new DummyAllocator(TOT); | |||
|     auto cache = SimpleCachingAlloc::make(std::unique_ptr<RawAllocator>(raw)); | |||
|     // first request is served by the raw allocator | |||
|     auto blk_a = cache->alloc(REQ); | |||
|     EXPECT_EQ(TOT - REQ, raw->free_size()); | |||
|     EXPECT_EQ(REQ, cache->get_used_memory()); | |||
|     EXPECT_EQ(0u, cache->get_free_memory().tot); | |||
|     // freeing keeps the memory cached instead of returning it | |||
|     cache->free(blk_a); | |||
|     EXPECT_EQ(0u, raw->nr_free()); | |||
|     EXPECT_EQ(REQ, cache->get_free_memory().tot); | |||
|     // a half-sized request is carved out of the cached block | |||
|     blk_a = cache->alloc(REQ / 2); | |||
|     EXPECT_EQ(1u, raw->nr_alloc()); | |||
|     EXPECT_EQ(REQ / 2, cache->get_used_memory()); | |||
|     EXPECT_EQ(REQ - REQ / 2, cache->get_free_memory().tot); | |||
|     // the second half directly follows the first one | |||
|     auto blk_b = cache->alloc(REQ / 2); | |||
|     EXPECT_EQ(1u, raw->nr_alloc()); | |||
|     EXPECT_EQ(REQ / 2 * 2, cache->get_used_memory()); | |||
|     EXPECT_EQ(REQ - REQ / 2 * 2, cache->get_free_memory().tot); | |||
|     EXPECT_EQ(REQ / 2, | |||
|               static_cast<char*>(blk_b) - static_cast<char*>(blk_a)); | |||
|     cache->free(blk_a); | |||
|     EXPECT_EQ(1u, raw->nr_alloc()); | |||
|     EXPECT_EQ(REQ / 2, cache->get_used_memory()); | |||
|     EXPECT_EQ(REQ - REQ / 2, cache->get_free_memory().tot); | |||
|     // the cached half is too small, so the raw allocator is used again | |||
|     blk_a = cache->alloc(REQ); | |||
|     EXPECT_EQ(2u, raw->nr_alloc()); | |||
|     EXPECT_EQ(TOT - REQ * 2, raw->free_size()); | |||
|     EXPECT_EQ(REQ + REQ / 2, cache->get_used_memory()); | |||
|     EXPECT_EQ(REQ - REQ / 2, cache->get_free_memory().tot); | |||
|     // once both halves are free again a full-sized request fits the cache | |||
|     cache->free(blk_b); | |||
|     blk_b = cache->alloc(REQ); | |||
|     EXPECT_EQ(2u, raw->nr_alloc()); | |||
|     EXPECT_EQ(REQ * 2, cache->get_used_memory()); | |||
|     EXPECT_EQ(0u, cache->get_free_memory().tot); | |||
|     // nothing has been handed back to the raw allocator so far | |||
|     cache->free(blk_a); | |||
|     cache->free(blk_b); | |||
|     EXPECT_EQ(0u, raw->nr_free()); | |||
| } | |||
| namespace { | |||
| class DevicePolicy { | |||
| public: | |||