code(127), as the Windows CUDA driver is unloaded before atexit functions run;
this may be removed after upgrading the CUDA runtime
GitOrigin-RevId: cac37ca3dd
tags/v1.5.0
| @@ -26,6 +26,13 @@ class CompNodeSyncManager : public CompNodeDepedentObject { | |||
| ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event; | |||
| std::mutex m_mtx; | |||
| public: | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| static bool is_into_atexit; | |||
| #endif | |||
| std::shared_ptr<void> on_comp_node_finalize() override { | |||
| MGB_LOCK_GUARD(m_mtx); | |||
| m_blob2event.clear(); | |||
| @@ -34,6 +41,16 @@ public: | |||
| //! Return the process-wide CompNodeSyncManager singleton. | |||
| static CompNodeSyncManager& inst() { | |||
| static CompNodeSyncManager sl_inst; | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| //! | |||
| //! The handler that sets is_into_atexit is registered exactly once via | |||
| //! a function-local static. inst() is called on every Blob destruction, | |||
| //! so registering on each call would pile up an unbounded number of | |||
| //! atexit handlers (only 32 are guaranteed by the standard). If | |||
| //! mgb_assert throws, the static stays uninitialized and registration | |||
| //! is retried on the next call. | |||
| static const bool sl_atexit_registered = [] { | |||
| auto err = atexit([] { is_into_atexit = true; }); | |||
| mgb_assert(!err, "failed to register atexit function"); | |||
| return true; | |||
| }(); | |||
| (void)sl_atexit_registered; | |||
| #endif | |||
| return sl_inst; | |||
| } | |||
| @@ -52,6 +69,13 @@ public: | |||
| m_blob2event.erase(blob); | |||
| } | |||
| }; | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| bool CompNodeSyncManager::is_into_atexit = false; | |||
| #endif | |||
| // Cache for small blobs | |||
| // 1. A blob has to be seen twice (within a window) to be eligible for cache | |||
| @@ -221,6 +245,15 @@ Blob::Blob(CompNode cn, size_t sz): | |||
| //! Unregister this blob from the BlobManager and, unless the process is | |||
| //! already running its atexit handlers on Windows+CUDA, from the | |||
| //! CompNodeSyncManager as well. | |||
| Blob::~Blob() { | |||
| BlobManager::inst()->unregister_blob(this); | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| const bool process_exiting = CompNodeSyncManager::is_into_atexit; | |||
| #else | |||
| const bool process_exiting = false; | |||
| #endif | |||
| if (!process_exiting) { | |||
| CompNodeSyncManager::inst().remove(this); | |||
| } | |||
| } | |||
| @@ -556,6 +556,13 @@ CompNode CompNode::load(const Locator& locator_physical, | |||
| } | |||
| void CompNode::finalize() { | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| return; | |||
| #endif | |||
| comp_node_detail::DepedentObjList::invoke_callback_and_clean(); | |||
| CudaCompNode::finalize(); | |||
| CpuCompNode::finalize(); | |||
| @@ -614,6 +614,18 @@ bool CudaCompNodeImpl::check_global_finalized() { | |||
| } | |||
| return true; | |||
| } | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| if (CudaCompNode::is_into_atexit) { | |||
| mgb_log_debug( | |||
| "windows cudaErrorCudartUnloading happened!!, resource " | |||
| "recovery by OS!!"); | |||
| return true; | |||
| } | |||
| #endif | |||
| return false; | |||
| } | |||
| @@ -733,11 +745,29 @@ void CudaCompNode::finalize() { | |||
| } | |||
| } | |||
| CompNode::Impl* CudaCompNode::load_cuda( | |||
| const Locator &locator, const Locator &locator_logical) { | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| bool CudaCompNode::is_into_atexit = false; | |||
| #endif | |||
| CompNode::Impl* CudaCompNode::load_cuda(const Locator& locator, | |||
| const Locator& locator_logical) { | |||
| int nr_gpu = get_device_count(); | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| if (!is_into_atexit) { | |||
| auto err = atexit([] { is_into_atexit = true; }); | |||
| mgb_assert(!err, "failed to register atexit function"); | |||
| } | |||
| #endif | |||
| mgb_assert(locator.device >= 0 && locator.device < nr_gpu, | |||
| "request gpu%d out of valid range [0, %d)", locator.device, nr_gpu); | |||
| "request gpu%d out of valid range [0, %d)", locator.device, | |||
| nr_gpu); | |||
| auto &&sdptr = CudaCompNodeImpl::sd; | |||
| { | |||
| @@ -36,6 +36,13 @@ namespace mgb { | |||
| static void set_prealloc_config(size_t alignment, size_t min_req, | |||
| size_t max_overhead, double growth_factor); | |||
| #if MGB_CUDA && defined(WIN32) | |||
| //! FIXME: on Windows the CUDA driver shuts down before atexit functions | |||
| //! are called, even if they are registered after the CUDA driver is | |||
| //! initialized. As a workaround, resource recovery is left to the OS | |||
| //! for now; this may be removed after upgrading the CUDA runtime. | |||
| static bool is_into_atexit; | |||
| #endif | |||
| }; | |||
| } | |||