You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

comp_node_env.cpp 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. /**
  2. * \file src/core/impl/comp_node_env.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megbrain/comp_node_env.h"
  12. #include "megbrain/exception.h"
  13. #include "megbrain/system.h"
  14. #include "megbrain/utils/metahelper.h"
  15. #include "megbrain/version_symbol.h"
  16. #include "megdnn/version.h"
  17. #if MGB_CUDA
  18. #include "megcore_cuda.h"
  19. #if MGB_ENABLE_DEBUG_UTIL
  20. #include <nvToolsExtCudaRt.h>
  21. #endif
  22. #endif
  23. #if MGB_CAMBRICON
  24. #include "megcore_cambricon.h"
  25. #endif
  26. #if MGB_ATLAS
  27. #include "acl/acl.h"
  28. #include "megcore_atlas.h"
  29. #endif
  30. using namespace mgb;
  31. /* =================== MegDNNHandle =================== */
  // Type-info object for MegDNNHandle (presumably what allows it to be stored
  // as CompNodeEnv user data -- TODO confirm against MGB_TYPEINFO_OBJ_IMPL).
  32. MGB_TYPEINFO_OBJ_IMPL(MegDNNHandle);
  // Default megdnn debug level; the MegDNNHandle constructor uses this value
  // unless the MGB_USE_MEGDNN_DBG environment variable is set.
  33. int MegDNNHandle::sm_default_dbg_level = 0;
  34. MegDNNHandle& MegDNNHandle::get(const CompNodeEnv& env) {
  35. auto maker = [&]() { return std::make_shared<MegDNNHandle>(env); };
  36. return env.get_user_data<MegDNNHandle>(maker);
  37. }
  38. MegDNNHandle::MegDNNHandle(const CompNodeEnv& env) {
  39. auto megdnn_version = megdnn::get_version();
  40. mgb_throw_if(
  41. megdnn_version.major != MEGDNN_MAJOR ||
  42. megdnn_version.minor < MEGDNN_MINOR,
  43. SystemError,
  44. "incompatible megdnn version: compiled with %d.%d, get %d.%d.%d "
  45. "at runtime",
  46. MEGDNN_MAJOR, MEGDNN_MINOR, megdnn_version.major,
  47. megdnn_version.minor, megdnn_version.patch);
  48. bool init = false;
  49. #if MGB_CUDA
  50. if (env.property().type == CompNode::DeviceType::CUDA) {
  51. megcoreCreateDeviceHandle(&m_dev_hdl, megcorePlatformCUDA,
  52. env.cuda_env().device, 0);
  53. megcore::createComputingHandleWithCUDAContext(&m_comp_hdl, m_dev_hdl, 0,
  54. {env.cuda_env().stream, make_async_error_info(env)});
  55. init = true;
  56. }
  57. #endif
  58. #if MGB_CAMBRICON
  59. if (env.property().type == CompNode::DeviceType::CAMBRICON) {
  60. CompNodeEnv::CnrtEnv::init_status.init();
  61. megcore::createDeviceHandleWithGlobalInitStatus(
  62. &m_dev_hdl, env.cnrt_env().device, 0, true);
  63. megcore::createComputingHandleWithCambriconContext(
  64. &m_comp_hdl, m_dev_hdl, 0, {env.cnrt_env().queue});
  65. init = true;
  66. }
  67. #endif
  68. #if MGB_ATLAS
  69. if (env.property().type == CompNode::DeviceType::ATLAS) {
  70. CompNodeEnv::AtlasEnv::init_status.init();
  71. megcore::createAtlasDeviceHandleWithGlobalInitStatus(
  72. &m_dev_hdl, env.atlas_env().device, 0, true);
  73. megcore::createComputingHandleWithAtlasContext(
  74. &m_comp_hdl, m_dev_hdl, 0, {env.atlas_env().stream});
  75. init = true;
  76. }
  77. #endif
  78. if (env.property().type == CompNode::DeviceType::CPU) {
  79. megcoreCreateDeviceHandle(&m_dev_hdl, megcorePlatformCPU);
  80. megcoreCreateComputingHandleWithCPUDispatcher(&m_comp_hdl, m_dev_hdl,
  81. env.cpu_env().dispatcher);
  82. init = true;
  83. }
  84. mgb_assert(init);
  85. int level = sm_default_dbg_level;
  86. if (auto set = MGB_GETENV("MGB_USE_MEGDNN_DBG")) {
  87. level = std::stol(set);
  88. mgb_log_warn("use megdnn handle with debug level: %d", level);
  89. }
  90. // handle may have been implemented when device type is cadence.
  91. if (!m_megdnn_handle) {
  92. m_megdnn_handle = megdnn::Handle::make(m_comp_hdl, level);
  93. }
  94. }
  95. MegDNNHandle::~MegDNNHandle() noexcept {
  96. m_megdnn_handle.reset();
  97. #if MGB_NEED_MEGDNN_ASYNC_ERROR
  98. m_async_error_info_devptr.reset();
  99. #endif
  100. if (m_comp_hdl) {
  101. megcoreDestroyComputingHandle(m_comp_hdl);
  102. }
  103. if (m_dev_hdl) {
  104. megcoreDestroyDeviceHandle(m_dev_hdl);
  105. }
  106. }
  107. #if MGB_NEED_MEGDNN_ASYNC_ERROR
  108. megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info(
  109. const CompNodeEnv& env) {
  110. auto cn = env.comp_node();
  111. auto del = [cn](megcore::AsyncErrorInfo* ptr) {
  112. if (ptr) {
  113. cn.free_device(ptr);
  114. }
  115. };
  116. megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0, 0, 0, 0}};
  117. auto ptr = static_cast<megcore::AsyncErrorInfo*>(
  118. env.comp_node().alloc_device(sizeof(zero_info)));
  119. cn.copy_to_device(ptr, &zero_info, sizeof(zero_info));
  120. cn.sync();
  121. m_async_error_info_devptr = {ptr, del};
  122. return m_async_error_info_devptr.get();
  123. }
  124. #endif
  125. /* =================== misc =================== */
  126. #if MGB_CUDA
  127. void mgb::_on_cuda_error(const char* expr, cudaError_t err, const char* file,
  128. const char* func, int line) {
  129. mgb_throw(CudaError, "cuda error %d: %s (%s at %s:%s:%d)", int(err),
  130. cudaGetErrorString(err), expr, file, func, line);
  131. }
  132. void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
  133. const ContinuationCtx<cudaStream_t>& cont) {
  134. m_comp_node = comp_node;
  135. mgb_assert(!m_user_data_container && !m_async_init_need_wait);
  136. m_cuda_env.device = dev;
  137. m_property.type = DeviceType::CUDA;
  138. MGB_CUDA_CHECK(cudaGetDeviceProperties(&m_cuda_env.device_prop, dev));
  139. {
  140. auto&& prop = m_cuda_env.device_prop;
  141. m_property.mem_alignment =
  142. std::max(prop.textureAlignment, prop.texturePitchAlignment);
  143. }
  144. std::atomic_bool tid_set{false};
  145. auto worker = [this, cont, &tid_set]() {
  146. sys::set_thread_name("async_cuda_init");
  147. m_async_init_tid = std::this_thread::get_id();
  148. tid_set.store(true);
  149. bool stream_done = false;
  150. MGB_MARK_USED_VAR(stream_done);
  151. MGB_TRY {
  152. m_cuda_env.activate();
  153. MGB_CUDA_CHECK(cudaStreamCreateWithFlags(&m_cuda_env.stream,
  154. cudaStreamNonBlocking));
  155. stream_done = true;
  156. m_user_data_container = std::make_unique<UserDataContainer>();
  157. #if MGB_ENABLE_DEBUG_UTIL
  158. nvtxNameCudaStreamA(m_cuda_env.stream,
  159. m_comp_node.to_string().c_str());
  160. #endif
  161. cont.next(m_cuda_env.stream);
  162. // megdnn is initialized here; must be placed after cont.next()
  163. // which handles comp node init
  164. mgb_assert(
  165. m_property.mem_alignment ==
  166. MegDNNHandle::get(*this).handle()->alignment_requirement());
  167. }
  168. MGB_CATCH(std::exception & exc, {
  169. mgb_log_error("async cuda init failed: %s", exc.what());
  170. if (stream_done) {
  171. cudaStreamDestroy(m_cuda_env.stream);
  172. }
  173. cont.err(exc);
  174. throw;
  175. })
  176. };
  177. m_async_init_need_wait = true;
  178. m_async_init_future = std::async(std::launch::async, worker);
  179. while (!tid_set.load())
  180. std::this_thread::yield();
  181. mgb_assert(m_async_init_tid != std::this_thread::get_id());
  182. }
  183. #endif
  184. #if MGB_ATLAS
  185. void mgb::_on_atlas_error(const char* expr, int err, const char* file,
  186. const char* func, int line) {
  187. mgb_throw(AtlasError, "atlas error %d: %s (%s at %s:%s:%d)", int(err),
  188. megcore::atlas::get_error_str(err), expr, file, func, line);
  189. }
  // Definition of the static AtlasEnv::init_status member; the MegDNNHandle
  // constructor calls init() on it before creating the Atlas megcore handles.
  190. CompNodeEnv::AtlasEnv::InitStatus CompNodeEnv::AtlasEnv::init_status;
  191. void CompNodeEnv::init_atlas(CompNode comp_node, const AtlasEnv& env) {
  192. m_comp_node = comp_node;
  193. m_atlas_env = env;
  194. m_property.type = DeviceType::ATLAS;
  195. m_property.mem_alignment = 64;
  196. m_atlas_env.activate();
  197. MGB_ATLAS_CHECK(aclrtCreateStream(&m_atlas_env.stream));
  198. m_user_data_container = std::make_unique<UserDataContainer>();
  199. mgb_assert(m_property.mem_alignment ==
  200. MegDNNHandle::get(*this).handle()->alignment_requirement());
  201. }
  202. #endif
  203. #if MGB_CAMBRICON
  204. const char* mgb::cnml_get_error_string(cnmlStatus_t err) {
  205. switch (err) {
  206. #define cb(_err) \
  207. case _err: \
  208. return #_err
  209. cb(CNML_STATUS_SUCCESS);
  210. cb(CNML_STATUS_NODEVICE);
  211. cb(CNML_STATUS_DOMAINERR);
  212. cb(CNML_STATUS_INVALIDARG);
  213. cb(CNML_STATUS_LENGTHERR);
  214. cb(CNML_STATUS_OUTOFRANGE);
  215. cb(CNML_STATUS_RANGEERR);
  216. cb(CNML_STATUS_OVERFLOWERR);
  217. cb(CNML_STATUS_UNDERFLOWERR);
  218. cb(CNML_STATUS_INVALIDPARAM);
  219. cb(CNML_STATUS_BADALLOC);
  220. cb(CNML_STATUS_BADTYPEID);
  221. cb(CNML_STATUS_BADCAST);
  222. cb(CNML_STATUS_UNSUPPORT);
  223. #undef cb
  224. }
  225. return "Unknown CNML error";
  226. }
  227. void mgb::_on_cnrt_error(const char* expr, cnrtRet_t err, const char* file,
  228. const char* func, int line) {
  229. mgb_throw(CnrtError, "cnrt error %d: %s (%s at %s:%s:%d)", int(err),
  230. cnrtGetErrorStr(err), expr, file, func, line);
  231. }
  232. void mgb::_on_cndev_error(const char* expr, cndevRet_t err, const char* file,
  233. const char* func, int line) {
  234. mgb_throw(CndevError, "cndev error %d: %s (%s at %s:%s:%d)", int(err),
  235. cndevGetErrorString(err), expr, file, func, line);
  236. }
  237. void mgb::_on_cnml_error(const char* expr, cnmlStatus_t err, const char* file,
  238. const char* func, int line) {
  239. mgb_throw(CnmlError, "cnml error %d: %s (%s at %s:%s:%d)", int(err),
  240. cnml_get_error_string(err), expr, file, func, line);
  241. }
  242. #endif
  243. void CompNodeEnv::init_cpu(const CpuEnv& env, CompNode comp_node) {
  244. m_comp_node = comp_node;
  245. mgb_assert(!m_user_data_container);
  246. m_property.type = DeviceType::CPU;
  247. m_cpu_env = env;
  248. m_user_data_container = std::make_unique<UserDataContainer>();
  249. m_property.mem_alignment =
  250. MegDNNHandle::get(*this).handle()->alignment_requirement();
  251. }
  252. #if MGB_CAMBRICON
  253. void CompNodeEnv::init_cnrt(int dev, CompNode comp_node,
  254. const ContinuationCtx<cnrtQueue_t>& cont) {
  255. m_comp_node = comp_node;
  256. m_cnrt_env.device = dev;
  257. m_property.type = DeviceType::CAMBRICON;
  258. MGB_CNRT_CHECK(cnrtGetDeviceInfo(&m_cnrt_env.device_info, dev));
  259. // FIXME: doc doesn't describe the aligment requirement for device memory
  260. // address
  261. m_property.mem_alignment = 1u;
  262. // ensure exception safe
  263. bool queue_created = false;
  264. MGB_MARK_USED_VAR(queue_created);
  265. MGB_TRY {
  266. m_cnrt_env.activate();
  267. MGB_CNRT_CHECK(cnrtCreateQueue(&m_cnrt_env.queue));
  268. queue_created = true;
  269. m_user_data_container = std::make_unique<UserDataContainer>();
  270. cont.next(m_cnrt_env.queue);
  271. // TODO: initialize megdnn handle
  272. mgb_assert(m_property.mem_alignment ==
  273. MegDNNHandle::get(*this).handle()->alignment_requirement());
  274. }
  275. MGB_CATCH(std::exception & exc, {
  276. mgb_log_error("cnrt init failed: %s", exc.what());
  277. if (queue_created) {
  278. MGB_CNRT_CHECK(cnrtDestroyQueue(m_cnrt_env.queue));
  279. }
  280. cont.err(exc);
  281. throw;
  282. })
  283. }
  // Definition of the static CnrtEnv::init_status member; the MegDNNHandle
  // constructor calls init() on it before creating the Cambricon megcore
  // handles.
  284. CompNodeEnv::CnrtEnv::InitStatus CompNodeEnv::CnrtEnv::init_status;
  285. #endif
  286. void CompNodeEnv::fini() {
  287. ensure_async_init_finished();
  288. m_user_data_container.reset();
  289. #if MGB_CUDA
  290. if (m_property.type == DeviceType::CUDA) {
  291. m_cuda_env.activate();
  292. MGB_CUDA_CHECK(cudaStreamDestroy(m_cuda_env.stream));
  293. }
  294. #endif
  295. #if MGB_CAMBRICON
  296. if (m_property.type == DeviceType::CAMBRICON) {
  297. m_cnrt_env.activate();
  298. MGB_CNRT_CHECK(cnrtDestroyQueue(m_cnrt_env.queue));
  299. }
  300. #endif
  301. #if MGB_ATLAS
  302. if (m_property.type == DeviceType::ATLAS) {
  303. m_atlas_env.activate();
  304. MGB_ATLAS_CHECK(aclrtDestroyStream(m_atlas_env.stream));
  305. }
  306. #endif
  307. }
  308. #if MGB_ENABLE_COMP_NODE_ASYNC_INIT
  309. void CompNodeEnv::wait_async_init() {
  310. if (std::this_thread::get_id() == m_async_init_tid)
  311. return;
  312. MGB_LOCK_GUARD(m_async_init_mtx);
  313. if (m_async_init_need_wait.load()) {
  314. m_async_init_future.wait();
  315. m_async_init_need_wait.store(false);
  316. m_async_init_future.get();
  317. }
  318. }
  319. #endif
  320. void CompNodeEnv::on_bad_device_type(DeviceType expected) const {
  321. mgb_throw(MegBrainError, "bad device type: expected=%d actual=%d",
  322. static_cast<int>(expected), static_cast<int>(m_property.type));
  323. }
  // Presumably records the megdnn version this file was compiled against as a
  // version symbol -- TODO confirm against megbrain/version_symbol.h.
  324. MGB_VERSION_SYMBOL3(MEGDNN, MEGDNN_MAJOR, MEGDNN_MINOR, MEGDNN_PATCH);
  325. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台