GitOrigin-RevId: de9e7d7f16
tags/v1.7.0
@@ -10,7 +10,6 @@ import re
 from typing import Union
 
 from ..core._imperative_rt.core2 import set_option as _set_option
-from ..core._imperative_rt.utils import _set_defrag
 
 _eviction_threshold = 0
 _evictee_minimum_size = 1024 ** 2
@@ -216,9 +216,6 @@ void init_utils(py::module m) {
 #endif
 
     // Debug code, internal only
-    m.def("_set_defrag", [](bool enable) {
-        mgb::imperative::BlobManager::inst()->set_enable(enable);
-    });
     m.def("_defrag", [](const mgb::CompNode& cn) {
         mgb::imperative::BlobManager::inst()->defrag(cn);
     });
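Note: the removed `_set_defrag` binding and the surviving `_defrag` binding both follow the same pybind11 idiom: a C++ lambda registered as an underscore-prefixed internal helper. A minimal self-contained sketch of that idiom (the module name `demo` and the free function are placeholders, not MegEngine symbols):

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Placeholder for the C++ side; in the real binding the lambda forwards to
// mgb::imperative::BlobManager::inst()->defrag(cn).
void do_defrag() {}

PYBIND11_MODULE(demo, m) {
    // The leading underscore marks the helper as internal/debug-only.
    m.def("_defrag", []() { do_defrag(); });
}
```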
@@ -41,22 +41,14 @@ void BlobManagerImpl::unregister_blob(Blob* blob) {
 }
 
 void BlobManagerImpl::alloc_with_defrag(Blob* blob, size_t size) {
-    if (!m_enable) {
-        alloc_direct(blob, size);
-    } else {
-        // // debug
-        // defrag(blob->m_comp_node);
-        // alloc_direct(blob, storage, size);
-
-        // try alloc
-        MGB_TRY { alloc_direct(blob, size); }
-        // if fail, try defrag, alloc again
-        MGB_CATCH(MemAllocError&, {
-            mgb_log_warn("memory allocation failed for blob; try defragmenting");
-            defrag(blob->m_comp_node);
-            alloc_direct(blob, size);
-        });
-    }
+    // try alloc
+    MGB_TRY { alloc_direct(blob, size); }
+    // if fail, try defrag, alloc again
+    MGB_CATCH(MemAllocError&, {
+        mgb_log_warn("memory allocation failed for blob; try defragmenting");
+        defrag(blob->m_comp_node);
+        alloc_direct(blob, size);
+    });
 }
 
 void BlobManagerImpl::alloc_direct(Blob* blob, size_t size) {
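Note: after this change the try/defrag/retry path runs unconditionally. As a standalone illustration of the pattern (hypothetical `Pool`, `try_alloc`, and `compact` names; MegEngine's `MGB_TRY`/`MGB_CATCH` macros wrap plain try/catch when exceptions are enabled):

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <new>

// Hypothetical allocator that fails until its free space is compacted,
// standing in for a fragmented device memory pool.
struct Pool {
    bool fragmented = true;
    void* try_alloc(std::size_t size) {
        if (fragmented)
            throw std::bad_alloc();  // simulate OOM caused by fragmentation
        return std::malloc(size);
    }
    void compact() { fragmented = false; }  // stand-in for defrag()
};

// The pattern from alloc_with_defrag: attempt, defragment on failure, retry.
void* alloc_with_retry(Pool& pool, std::size_t size) {
    try {
        return pool.try_alloc(size);  // first attempt
    } catch (const std::bad_alloc&) {
        std::fputs("allocation failed; compacting and retrying\n", stderr);
        pool.compact();               // reclaim contiguous space
        return pool.try_alloc(size);  // second attempt may still throw
    }
}

int main() {
    Pool pool;
    void* p = alloc_with_retry(pool, 1024);
    std::free(p);
    return 0;
}
```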
@@ -69,16 +61,12 @@ void BlobManagerImpl::alloc_direct(Blob* blob, size_t size) {
 DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag(
         CompNode cn, TensorLayout layout) {
     DeviceTensorND dev_tensor;
-    if (!m_enable) {
-        dev_tensor = alloc_workspace(cn, layout);
-    } else {
-        MGB_TRY { dev_tensor = alloc_workspace(cn, layout); }
-        MGB_CATCH(MemAllocError&, {
-            mgb_log_warn("memory allocation failed for workspace; try defragmenting");
-            defrag(cn);
-            dev_tensor = alloc_workspace(cn, layout);
-        });
-    }
+    MGB_TRY { dev_tensor = alloc_workspace(cn, layout); }
+    MGB_CATCH(MemAllocError&, {
+        mgb_log_warn("memory allocation failed for workspace; try defragmenting");
+        defrag(cn);
+        dev_tensor = alloc_workspace(cn, layout);
+    });
     return dev_tensor;
 };
@@ -154,10 +142,6 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
     cn.sync();
 }
 
-void BlobManagerImpl::set_enable(bool flag) {
-    m_enable = flag;
-}
-
 struct BlobManagerStub : BlobManager {
     void alloc_direct(Blob* blob, size_t size) {
         mgb_assert(0, "prohibited after global variable destruction");
@@ -172,9 +156,6 @@ struct BlobManagerStub : BlobManager {
         mgb_assert(0, "prohibited after global variable destruction");
     };
     void unregister_blob(Blob* blob){};
-    void set_enable(bool flag) {
-        mgb_assert(0, "prohibited after global variable destruction");
-    };
     void defrag(const CompNode& cn) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
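Note: `BlobManagerStub` implements the stub-after-destruction idiom: once the real singleton's statics are torn down, late callers hit a loud assertion rather than a destroyed object. A generic sketch of the idiom (all names hypothetical; the real `BlobManager::inst()` wiring differs):

```cpp
#include <cassert>

struct Manager {
    virtual ~Manager() = default;
    virtual void defrag() = 0;
};

struct ManagerImpl final : Manager {
    void defrag() override { /* real work */ }
};

struct ManagerStub final : Manager {
    void defrag() override {
        assert(!"prohibited after global variable destruction");
    }
};

namespace {
bool g_destroyed = false;
ManagerStub g_stub;  // trivial state, safe to touch late

// Holder's destructor flips the flag when static destruction reaches it,
// so later inst() calls are redirected to the stub.
struct Holder {
    ManagerImpl impl;
    ~Holder() { g_destroyed = true; }
} g_holder;
}  // namespace

Manager* manager_inst() {
    return g_destroyed ? static_cast<Manager*>(&g_stub) : &g_holder.impl;
}
```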
@@ -38,7 +38,6 @@ class BlobManagerImpl final : public BlobManager {
     std::mutex m_mtx;
     CompNode::UnorderedMap<BlobSetWithMux> m_comp2blobs_map;
-    bool m_enable = true;
 
     void defrag(const CompNode& cn) override;
@@ -57,8 +56,6 @@ public:
     void register_blob(Blob* blob) override;
     void unregister_blob(Blob* blob) override;
-
-    void set_enable(bool flag) override;
 };
 
 } // namespace imperative
@@ -33,8 +33,6 @@ public:
     virtual void unregister_blob(Blob* blob) = 0;
 
-    virtual void set_enable(bool flag) = 0;
-
     virtual void defrag(const CompNode& cn) = 0;
 };
@@ -94,15 +94,13 @@ TEST(TestImperative, Split) {
 }
 
 #if MGB_CUDA && MGB_ENABLE_EXCEPTION
-void run_graph(size_t mem_reserved, bool enable_defrag) {
+void run_graph(size_t mem_reserved) {
     CompNode::try_coalesce_all_free_memory();
     CompNode::finalize();
     auto cn = CompNode::load("gpux");
     cn.sync(); // wait for async init to finish
-    BlobManager::inst()->set_enable(enable_defrag);
-
     HostTensorGenerator<> gen;
     using TensorPtr = std::shared_ptr<Tensor>;
     TensorPtr ptr_a[100];
@@ -159,10 +157,7 @@ TEST(TestImperative, Defragment) {
     }
     auto reserve_setting = ssprintf("b:%zu", reserve);
-    auto do_run = [reserve]() {
-        ASSERT_THROW(run_graph(reserve, false), MemAllocError);
-        run_graph(reserve, true);
-    };
+    auto do_run = [reserve]() { run_graph(reserve); };
 
     // reserve memory explicitly to avoid uncontrollable factors
     constexpr const char* KEY = "MGB_CUDA_RESERVE_MEMORY";
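Note: the test pins `MGB_CUDA_RESERVE_MEMORY` so the amount of free device memory, and hence the fragmentation it induces, is reproducible. One common way to scope such an override inside a test is an RAII guard; a sketch assuming POSIX `setenv`/`unsetenv` (`EnvGuard` is hypothetical, not the helper this test uses):

```cpp
#include <cstdlib>
#include <string>

// Restores the previous value (or unsets the variable) when the guard
// leaves scope, keeping the override local to one test.
class EnvGuard {
    std::string m_key;
    std::string m_old;
    bool m_had_old = false;

public:
    EnvGuard(const char* key, const char* value) : m_key(key) {
        if (const char* old = std::getenv(key)) {
            m_had_old = true;
            m_old = old;
        }
        setenv(key, value, /*overwrite=*/1);
    }
    ~EnvGuard() {
        if (m_had_old)
            setenv(m_key.c_str(), m_old.c_str(), 1);
        else
            unsetenv(m_key.c_str());
    }
};

// Usage: EnvGuard guard("MGB_CUDA_RESERVE_MEMORY", reserve_setting.c_str());
```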