GitOrigin-RevId: f78c79eb06
tags/v1.10.0
| @@ -8,6 +8,7 @@ | |||||
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| import atexit | import atexit | ||||
| import ctypes | import ctypes | ||||
| import re | |||||
| import os | import os | ||||
| import platform | import platform | ||||
| import sys | import sys | ||||
| @@ -89,6 +90,9 @@ if sys.platform == "win32": | |||||
| from .core._imperative_rt.core2 import close as _close | from .core._imperative_rt.core2 import close as _close | ||||
| from .core._imperative_rt.core2 import full_sync as _full_sync | from .core._imperative_rt.core2 import full_sync as _full_sync | ||||
| from .core._imperative_rt.core2 import sync as _sync | from .core._imperative_rt.core2 import sync as _sync | ||||
| from .core._imperative_rt.common import ( | |||||
| get_supported_sm_versions as _get_supported_sm_versions, | |||||
| ) | |||||
| from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func | from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func | ||||
| from .config import * | from .config import * | ||||
| from .device import * | from .device import * | ||||
| @@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools | |||||
| from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer | from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer | ||||
| from .version import __version__ | from .version import __version__ | ||||
| logger = get_logger(__name__) | |||||
| ngpus = get_device_count("gpu") | |||||
| supported_sm_versions = re.findall(r"sm_(\d+)", _get_supported_sm_versions()) | |||||
| for idx in range(ngpus): | |||||
| prop = get_cuda_device_property(idx) | |||||
| cur_sm = str(prop.major * 10 + prop.minor) | |||||
| if not cur_sm in supported_sm_versions: | |||||
| logger.warning( | |||||
| "{} with CUDA capability sm_{} is not compatible with the current MegEngine installation. The current MegEngine install supports CUDA {} {}. If you want to use the {} with MegEngine, please check the instructions at https://github.com/MegEngine/MegEngine/blob/master/scripts/cmake-build/BUILD_README.md".format( | |||||
| prop.name, | |||||
| cur_sm, | |||||
| "capabilities" if len(supported_sm_versions) > 1 else "capability", | |||||
| " ".join(["sm_" + v for v in supported_sm_versions]), | |||||
| prop.name, | |||||
| ) | |||||
| ) | |||||
| _set_fork_exec_path_for_timed_func( | _set_fork_exec_path_for_timed_func( | ||||
| sys.executable, | sys.executable, | ||||
| os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"), | os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"), | ||||
| @@ -11,9 +11,7 @@ import re | |||||
| from typing import Optional | from typing import Optional | ||||
| from .core._imperative_rt.common import CompNode, DeviceType | from .core._imperative_rt.common import CompNode, DeviceType | ||||
| from .core._imperative_rt.common import ( | |||||
| get_cuda_compute_capability as _get_cuda_compute_capability, | |||||
| ) | |||||
| from .core._imperative_rt.common import get_device_prop as _get_device_prop | |||||
| from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config | from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config | ||||
| from .core._imperative_rt.common import what_is_xpu as _what_is_xpu | from .core._imperative_rt.common import what_is_xpu as _what_is_xpu | ||||
| from .core._imperative_rt.utils import _try_coalesce_all_free_memory | from .core._imperative_rt.utils import _try_coalesce_all_free_memory | ||||
| @@ -25,6 +23,7 @@ __all__ = [ | |||||
| "set_default_device", | "set_default_device", | ||||
| "get_mem_status_bytes", | "get_mem_status_bytes", | ||||
| "get_cuda_compute_capability", | "get_cuda_compute_capability", | ||||
| "get_cuda_device_property", | |||||
| "get_allocated_memory", | "get_allocated_memory", | ||||
| "get_reserved_memory", | "get_reserved_memory", | ||||
| "get_max_reserved_memory", | "get_max_reserved_memory", | ||||
| @@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int | |||||
| Returns: | Returns: | ||||
| a version number, or `SM version`. | a version number, or `SM version`. | ||||
| """ | """ | ||||
| return _get_cuda_compute_capability(device, device_type) | |||||
| prop = _get_device_prop(device, device_type) | |||||
| return prop.major * 10 + prop.minor | |||||
| def get_cuda_device_property(device: int, device_type=DeviceType.CUDA): | |||||
| return _get_device_prop(device, device_type) | |||||
| def get_allocated_memory(device: Optional[str] = None): | def get_allocated_memory(device: Optional[str] = None): | ||||
| @@ -123,6 +123,23 @@ void init_common(py::module m) { | |||||
| py::implicitly_convertible<std::string, CompNode>(); | py::implicitly_convertible<std::string, CompNode>(); | ||||
| py::class_<CompNode::DeviceProperties>(m, "DeviceProperties") | |||||
| .def(py::init()) | |||||
| .def_property_readonly( | |||||
| "name", | |||||
| [](const CompNode::DeviceProperties prop) { return prop.name; }) | |||||
| .def_property_readonly( | |||||
| "total_memory", | |||||
| [](const CompNode::DeviceProperties prop) { | |||||
| return prop.total_memory; | |||||
| }) | |||||
| .def_property_readonly( | |||||
| "major", | |||||
| [](const CompNode::DeviceProperties prop) { return prop.major; }) | |||||
| .def_property_readonly("minor", [](const CompNode::DeviceProperties prop) { | |||||
| return prop.minor; | |||||
| }); | |||||
| def_TensorND<DeviceTensorND>(m, "DeviceTensorND") | def_TensorND<DeviceTensorND>(m, "DeviceTensorND") | ||||
| .def("numpy", [](const DeviceTensorND& self) { | .def("numpy", [](const DeviceTensorND& self) { | ||||
| HostTensorND hv; | HostTensorND hv; | ||||
| @@ -223,7 +240,12 @@ void init_common(py::module m) { | |||||
| m.def("set_prealloc_config", &CompNode::set_prealloc_config, | m.def("set_prealloc_config", &CompNode::set_prealloc_config, | ||||
| "specifies how to pre-allocate from raw dev allocator"); | "specifies how to pre-allocate from raw dev allocator"); | ||||
| m.def("get_cuda_compute_capability", &CompNode::get_compute_capability); | |||||
| m.def("get_device_prop", &CompNode::get_device_prop); | |||||
| m.def("get_supported_sm_versions", []() { | |||||
| static const char* mge_gen_code = MGE_CUDA_GENCODE; | |||||
| return mge_gen_code; | |||||
| }); | |||||
| m.def("what_is_xpu", | m.def("what_is_xpu", | ||||
| [] { return CompNode::Locator::parse("xpux").to_physical().type; }); | [] { return CompNode::Locator::parse("xpux").to_physical().type; }); | ||||
| @@ -431,13 +431,12 @@ void CompNode::set_prealloc_config( | |||||
| }; | }; | ||||
| } | } | ||||
| size_t CompNode::get_compute_capability(int dev, DeviceType device_type) { | |||||
| CompNode::DeviceProperties CompNode::get_device_prop(int dev, DeviceType device_type) { | |||||
| switch (device_type) { | switch (device_type) { | ||||
| case DeviceType::CUDA: | case DeviceType::CUDA: | ||||
| return CudaCompNode::get_compute_capability(dev); | |||||
| return CudaCompNode::get_device_prop(dev); | |||||
| default: | default: | ||||
| mgb_log_warn("unsupport device type for get_compute_capability"); | |||||
| return 0; | |||||
| mgb_throw(MegBrainError, "unsupport device type for get_device_prop"); | |||||
| }; | }; | ||||
| } | } | ||||
| @@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { | |||||
| //! return whether global finalized, and print warning in such case | //! return whether global finalized, and print warning in such case | ||||
| static inline bool check_global_finalized(); | static inline bool check_global_finalized(); | ||||
| static CompNode::DeviceProperties get_device_prop(int dev); | |||||
| //! enable peer copy from dev0 to dev1 | //! enable peer copy from dev0 to dev1 | ||||
| static void enable_peer_access(int dev0, int dev1); | static void enable_peer_access(int dev0, int dev1); | ||||
| static size_t get_compute_capability(int dev); | |||||
| static void static_free_device(ImplBase* self, void* ptr) { | static void static_free_device(ImplBase* self, void* ptr) { | ||||
| static_cast<CompNodeImpl*>(self)->free_device(ptr); | static_cast<CompNodeImpl*>(self)->free_device(ptr); | ||||
| } | } | ||||
| @@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { | |||||
| public: | public: | ||||
| CompNodeImpl() : Impl(static_free_device, static_free_host) {} | CompNodeImpl() : Impl(static_free_device, static_free_host) {} | ||||
| static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64; | |||||
| void* alloc_device(size_t size) override; | void* alloc_device(size_t size) override; | ||||
| void free_device(void* ptr); | void free_device(void* ptr); | ||||
| @@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo { | |||||
| }; | }; | ||||
| struct CudaCompNodeImpl::StaticData { | struct CudaCompNodeImpl::StaticData { | ||||
| static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64; | |||||
| std::recursive_mutex mtx; | std::recursive_mutex mtx; | ||||
| mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; | mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; | ||||
| @@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData { | |||||
| CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr; | CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr; | ||||
| Spinlock CudaCompNodeImpl::sd_mtx; | Spinlock CudaCompNodeImpl::sd_mtx; | ||||
| struct DevicePropRec { | |||||
| bool init = false; | |||||
| CompNode::DeviceProperties prop; | |||||
| Spinlock mtx_com; | |||||
| }; | |||||
| DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE]; | |||||
| void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) { | void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) { | ||||
| m_locator = locator; | m_locator = locator; | ||||
| m_locator_logical = locator_logical; | m_locator_logical = locator_logical; | ||||
| @@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() { | |||||
| } | } | ||||
| void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) { | void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) { | ||||
| static bool already_enabled[StaticData::MAX_NR_DEVICE][StaticData::MAX_NR_DEVICE]; | |||||
| static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE]; | |||||
| if (already_enabled[dev0][dev1]) | if (already_enabled[dev0][dev1]) | ||||
| return; | return; | ||||
| @@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) { | |||||
| return err; | return err; | ||||
| return err2; | return err2; | ||||
| } | } | ||||
| template <typename Func, typename... Args> | |||||
| CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) { | |||||
| auto err = func(val, len, args...); | |||||
| if (err != CUDA_ERROR_NOT_INITIALIZED) | |||||
| return err; | |||||
| // cuInit not called, call it in child process | |||||
| int fd[2]; | |||||
| mgb_assert(pipe(fd) == 0, "pipe() failed"); | |||||
| int fdr = fd[0], fdw = fd[1]; | |||||
| RAIICloseFD fdr_guard(fdr); | |||||
| RAIICloseFD fdw_guard(fdw); | |||||
| auto cpid = fork(); | |||||
| mgb_assert(cpid != -1, "fork() failed"); | |||||
| if (cpid == 0) { | |||||
| fdr_guard.close(); | |||||
| do { | |||||
| err = cuInit(0); | |||||
| if (err != CUDA_SUCCESS) | |||||
| break; | |||||
| err = func(val, len, args...); | |||||
| } while (0); | |||||
| auto sz = write(fdw, &err, sizeof(err)); | |||||
| if (sz == sizeof(err) && err == CUDA_SUCCESS) { | |||||
| sz = write(fdw, val, sizeof(*val) * len); | |||||
| } | |||||
| fdw_guard.close(); | |||||
| std::quick_exit(0); | |||||
| } | |||||
| fdw_guard.close(); | |||||
| auto sz = read(fdr, &err, sizeof(err)); | |||||
| mgb_assert(sz == sizeof(err), "failed to read error code from child"); | |||||
| if (err == CUDA_SUCCESS) { | |||||
| sz = read(fdr, val, sizeof(*val) * len); | |||||
| mgb_assert( | |||||
| static_cast<size_t>(sz) == sizeof(*val) * static_cast<size_t>(len), | |||||
| "failed to read value from child"); | |||||
| return err; | |||||
| } | |||||
| // try again, maybe another thread called cuInit while we fork | |||||
| auto err2 = func(val, len, args...); | |||||
| if (err2 == CUDA_SUCCESS) | |||||
| return err2; | |||||
| if (err2 == CUDA_ERROR_NOT_INITIALIZED) | |||||
| return err; | |||||
| return err2; | |||||
| } | |||||
| #endif | #endif | ||||
| const char* cu_get_error_string(CUresult err) { | const char* cu_get_error_string(CUresult err) { | ||||
| @@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda( | |||||
| } | } | ||||
| if (!available_node) { | if (!available_node) { | ||||
| mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, "too many CompNode allocated"); | |||||
| mgb_assert( | |||||
| sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE, | |||||
| "too many CompNode allocated"); | |||||
| available_node = &sd.node[sd.nr_node++]; | available_node = &sd.node[sd.nr_node++]; | ||||
| } | } | ||||
| mgb_assert(locator.device < sd.MAX_NR_DEVICE, "device number too large"); | |||||
| mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large"); | |||||
| mgb_assert(!available_node->m_initialized); | mgb_assert(!available_node->m_initialized); | ||||
| available_node->init(locator, locator_logical); | available_node->init(locator, locator_logical); | ||||
| @@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config( | |||||
| } | } | ||||
| } | } | ||||
| size_t CudaCompNode::get_compute_capability(int dev) { | |||||
| size_t cnt = get_device_count(); | |||||
| if (dev < 0 || dev >= static_cast<int>(cnt)) { | |||||
| mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt); | |||||
| return 0; | |||||
| } | |||||
| static Spinlock mtx_com; | |||||
| MGB_LOCK_GUARD(mtx_com); | |||||
| int pmajor; | |||||
| int pminor; | |||||
| auto err = call_cuda_forksafe( | |||||
| cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, | |||||
| dev); | |||||
| if (err != CUDA_SUCCESS) { | |||||
| return 0; | |||||
| } | |||||
| auto err2 = call_cuda_forksafe( | |||||
| cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, | |||||
| dev); | |||||
| if (err2 != CUDA_SUCCESS) { | |||||
| return 0; | |||||
| CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) { | |||||
| int cnt = static_cast<int>(get_device_count()); | |||||
| mgb_assert( | |||||
| dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev, | |||||
| cnt); | |||||
| auto&& rec = device_prop_rec[dev]; | |||||
| if (!rec.init) { | |||||
| MGB_LOCK_GUARD(rec.mtx_com); | |||||
| if (!rec.init) { | |||||
| char pname[256] = {0}; | |||||
| mgb_assert( | |||||
| call_cuda_forksafe( | |||||
| cuDeviceGetAttribute, &rec.prop.major, | |||||
| CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, | |||||
| dev) == CUDA_SUCCESS); | |||||
| mgb_assert( | |||||
| call_cuda_forksafe( | |||||
| cuDeviceGetAttribute, &rec.prop.minor, | |||||
| CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, | |||||
| dev) == CUDA_SUCCESS); | |||||
| mgb_assert( | |||||
| call_cuda_forksafe(cuDeviceGetName, pname, 255, dev) == | |||||
| CUDA_SUCCESS); | |||||
| mgb_assert( | |||||
| call_cuda_forksafe(cuDeviceTotalMem, &rec.prop.total_memory, dev) == | |||||
| CUDA_SUCCESS); | |||||
| rec.prop.name = pname; | |||||
| rec.init = true; | |||||
| } | |||||
| } | } | ||||
| return pmajor * 10 + pminor; | |||||
| return rec.prop; | |||||
| } | } | ||||
| #else | #else | ||||
| @@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {} | |||||
| void CudaCompNode::set_prealloc_config( | void CudaCompNode::set_prealloc_config( | ||||
| size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {} | size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {} | ||||
| size_t CudaCompNode::get_compute_capability(int dev) { | |||||
| return 0; | |||||
| CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) { | |||||
| return CompNode::DeviceProperties{}; | |||||
| } | } | ||||
| #undef err | #undef err | ||||
| @@ -31,7 +31,7 @@ public: | |||||
| static size_t get_device_count(bool warn = true); | static size_t get_device_count(bool warn = true); | ||||
| static Impl* load_cuda(const Locator& locator, const Locator& locator_logical); | static Impl* load_cuda(const Locator& locator, const Locator& locator_logical); | ||||
| static void sync_all(); | static void sync_all(); | ||||
| static size_t get_compute_capability(int dev); | |||||
| static DeviceProperties get_device_prop(int dev); | |||||
| static void set_prealloc_config( | static void set_prealloc_config( | ||||
| size_t alignment, size_t min_req, size_t max_overhead, | size_t alignment, size_t min_req, size_t max_overhead, | ||||
| @@ -80,6 +80,20 @@ public: | |||||
| static constexpr size_t NR_DEVICE_TYPE = | static constexpr size_t NR_DEVICE_TYPE = | ||||
| static_cast<size_t>(DeviceType::MAX_DEVICE_ID); | static_cast<size_t>(DeviceType::MAX_DEVICE_ID); | ||||
| struct DeviceProperties { | |||||
| DeviceProperties() { | |||||
| name = "unspec"; | |||||
| total_memory = major = minor = 0; | |||||
| } | |||||
| std::string name; | |||||
| size_t total_memory; | |||||
| //! for cuda | |||||
| int major; | |||||
| int minor; | |||||
| }; | |||||
| /*! | /*! | ||||
| * \brief an identifier to specify a computing node | * \brief an identifier to specify a computing node | ||||
| * | * | ||||
| @@ -301,10 +315,11 @@ public: | |||||
| MGE_WIN_DECLSPEC_FUC static void set_prealloc_config( | MGE_WIN_DECLSPEC_FUC static void set_prealloc_config( | ||||
| size_t alignment, size_t min_req, size_t max_overhead, double growth_factor, | size_t alignment, size_t min_req, size_t max_overhead, double growth_factor, | ||||
| DeviceType device_type); | DeviceType device_type); | ||||
| /*! | /*! | ||||
| * \brief get compute capability of the specified device | |||||
| * \brief get device property of the specified device | |||||
| */ | */ | ||||
| MGE_WIN_DECLSPEC_FUC static size_t get_compute_capability( | |||||
| MGE_WIN_DECLSPEC_FUC static DeviceProperties get_device_prop( | |||||
| int dev, DeviceType device_type); | int dev, DeviceType device_type); | ||||
| /* =================== synchronization ======================== */ | /* =================== synchronization ======================== */ | ||||
| @@ -268,5 +268,6 @@ | |||||
| #endif | #endif | ||||
| #define GIT_FULL_HASH "@GIT_FULL_HASH@" | #define GIT_FULL_HASH "@GIT_FULL_HASH@" | ||||
| #define MGE_CUDA_GENCODE "@MGE_CUDA_GENCODE@" | |||||
| #endif // _HEADER_MGB_BUILD_CONFIG | #endif // _HEADER_MGB_BUILD_CONFIG | ||||