/**
 * \file src/core/test/comp_node.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "./comp_node_helper.h"

#include "megbrain/comp_node_env.h"
#include "megbrain/utils/comp_node_sync_manager.h"
#include "megbrain/utils/timer.h"
#include "megbrain/system.h"
#include "megbrain/test/helper.h"
#include "megbrain/opr/utility.h"

#include <chrono>
#if MGB_HAVE_THREAD
#include <thread>
#endif

using namespace mgb;
TEST(TestCompNode, Parse) {
    using L = CompNode::Locator;
    using D = CompNode::DeviceType;
    auto make_lc = [](D t, int dev, int s) -> L { return {t, dev, {s}}; };

    ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpux:23"), make_lc(D::UNSPEC, -1, 23));
    ASSERT_EQ(L::parse("xpu2:23"), make_lc(D::UNSPEC, 2, 23));
    ASSERT_EQ(L::parse("xpu21:23"), make_lc(D::UNSPEC, 21, 23));

    ASSERT_EQ(L::parse("cpux"), make_lc(D::CPU, -1, 0));
    ASSERT_EQ(L::parse("cpux:23"), make_lc(D::CPU, -1, 23));
    ASSERT_EQ(L::parse("cpu2:23"), make_lc(D::CPU, 2, 23));
    ASSERT_EQ(L::parse("cpu21:23"), make_lc(D::CPU, 21, 23));

    ASSERT_EQ(L::parse("rocmx"), make_lc(D::ROCM, -1, 0));
    ASSERT_EQ(L::parse("rocm2"), make_lc(D::ROCM, 2, 0));
    ASSERT_EQ(L::parse("rocm2:3"), make_lc(D::ROCM, 2, 3));

    ASSERT_EQ(L::parse("cambriconx"), make_lc(D::CAMBRICON, -1, 0));
    ASSERT_EQ(L::parse("cambricon2"), make_lc(D::CAMBRICON, 2, 0));
    ASSERT_EQ(L::parse("cambricon2:3"), make_lc(D::CAMBRICON, 2, 3));

    ASSERT_EQ(L::parse("atlasx"), make_lc(D::ATLAS, -1, 0));
    ASSERT_EQ(L::parse("atlas2"), make_lc(D::ATLAS, 2, 0));
    ASSERT_EQ(L::parse("atlas2:3"), make_lc(D::ATLAS, 2, 3));

    ASSERT_EQ(L::parse("xpu"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpu23"), make_lc(D::UNSPEC, 23, 0));
    ASSERT_EQ(L::parse("xpu23:1"), make_lc(D::UNSPEC, 23, 1));

    ASSERT_EQ(L::parse("cpu:default"), make_lc(D::CPU, L::DEVICE_CPU_DEFAULT, 0));
    ASSERT_EQ(L::parse("multithread2:0"), make_lc(D::MULTITHREAD, 0, 2));
    ASSERT_EQ(L::parse("multithread1:3"), make_lc(D::MULTITHREAD, 3, 1));
    ASSERT_EQ(L::parse("multithread:default:2"),
              make_lc(D::MULTITHREAD, L::DEVICE_MULTITHREAD_DEFAULT, 2));

    ASSERT_THROW(L::parse("apu"), MegBrainError);
    ASSERT_THROW(L::parse("fpgbx"), MegBrainError);
    ASSERT_THROW(L::parse("cab0"), MegBrainError);
    ASSERT_THROW(L::parse("cpu"), MegBrainError);
    ASSERT_THROW(L::parse("cpu-1"), MegBrainError);
    ASSERT_THROW(L::parse("cpu0:"), MegBrainError);
    ASSERT_THROW(L::parse("cpu0:x"), MegBrainError);
    ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError);
    ASSERT_THROW(L::parse("rcom0"), MegBrainError);
    ASSERT_THROW(L::parse("cmabricon0"), MegBrainError);
    ASSERT_THROW(L::parse("atlast0"), MegBrainError);
    ASSERT_THROW(L::parse("multithread"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:default"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:default:0"), MegBrainError);
}
TEST(TestCompNode, SetDefaultDev) {
    REQUIRE_GPU(3);

    CompNode::finalize();
    using L = CompNode::Locator;
    auto orig_dt = L::parse("xpu").to_physical(),
         orig_gpu = L::parse("gpux").to_physical(),
         orig_cpu = L::parse("cpux").to_physical();
    constexpr auto CUDA = CompNode::DeviceType::CUDA;
    constexpr auto CPU = CompNode::DeviceType::CPU;
    L::set_unspec_device_type(CUDA);

    auto run = [](int device) {
        ASSERT_EQ(CompNode::load("xpu").locator(),
                  L::parse("gpu" + std::to_string(device)));
    };
    auto run_cpu = [](int device) {
        ASSERT_EQ(CompNode::load("cpux").locator(),
                  L::parse("cpu" + std::to_string(device)));
    };

    MGB_TRY {
        L::set_device_map(CUDA, -1, 2);
        run(2);
        L::set_device_map(CUDA, -1, 1);
        run(1);
        L::set_device_map(CPU, -1, 2);
        run_cpu(2);
        L::set_device_map(CPU, -1, 1);
        run_cpu(1);
    } MGB_FINALLY({
        L::set_unspec_device_type(orig_dt.type);
        L::set_device_map(CUDA, -1, orig_gpu.device);
        L::set_device_map(CPU, -1, orig_cpu.device);
    });

    CompNode::finalize();
}
TEST(TestCompNode, Load) {
    auto cn0 = CompNode::load("xpux"),
         cn1 = CompNode::load("cpux");
    ASSERT_EQ(CompNode::DeviceType::UNSPEC, cn0.locator_logical().type);
    ASSERT_EQ(CompNode::DeviceType::CPU, cn1.locator_logical().type);
    ASSERT_EQ(CompNode::load("cpux"), cn1);
    ASSERT_EQ(CompNode::load("xpux"), cn0);

    auto cnp = CompNode::load("cpu1"), cnq = CompNode::load("cpu2");
    ASSERT_EQ(CompNode::load("cpu1"), cnp);
    ASSERT_EQ(CompNode::load("cpu2"), cnq);
#if MGB_HAVE_THREAD
    ASSERT_NE(cnp, cnq);
#else
    ASSERT_EQ(cnp, cnq);
#endif

#if MGB_HAVE_THREAD
    auto cn_multi_thread0 = CompNode::load("multithread2:0");
    auto cn_multi_thread1 = CompNode::load("multithread2:1");
    ASSERT_EQ(CompNode::load("multithread2:0"), cn_multi_thread0);
    ASSERT_EQ(CompNode::load("multithread2:1"), cn_multi_thread1);
    ASSERT_NE(CompNode::load("multithread4:0"), cn_multi_thread0);
    ASSERT_NE(CompNode::load("multithread4:1"), cn_multi_thread1);

    auto cn_multi_default0 = CompNode::load("multithread:default:2");
    auto cn_multi_default1 = CompNode::load("multithread:default:4");
    ASSERT_EQ(CompNode::load("multithread:default:2"), cn_multi_default0);
    ASSERT_EQ(CompNode::load("multithread:default:4"), cn_multi_default1);
    ASSERT_NE(cn_multi_thread0, cn_multi_default1);
#endif

    ASSERT_EQ(CompNode::load("cpu1"), cnp);
    ASSERT_EQ(CompNode::load("cpu2"), cnq);

    if (check_gpu_available(2)) {
        auto cn2 = CompNode::load("gpux"),
             cn3 = CompNode::load("gpu1");
        ASSERT_EQ(CompNode::DeviceType::CUDA, cn2.locator_logical().type);
        ASSERT_NE(cn2, cn3);
        ASSERT_EQ(CompNode::load("gpux"), cn2);
        ASSERT_EQ(CompNode::load("gpu1"), cn3);
    }

#if MGB_ATLAS
    auto atlas0 = CompNode::load("atlas0");
    auto atlas1 = CompNode::load("atlas1");
    ASSERT_NE(atlas0, atlas1);
#endif
}
TEST(TestCompNode, FreeAfterFinalize) {
    // memory allocated from a comp node must still be freeable after
    // CompNode::finalize() has been called
    CompNode::finalize();
    for (size_t i = 0; i < CompNode::NR_DEVICE_TYPE; ++i) {
        auto type = static_cast<CompNode::DeviceType>(i);
        if (!check_device_type_avaiable(type) ||
            !CompNode::get_device_count(type))
            continue;
        auto cn = CompNode::load(CompNode::Locator{type, -1, {0}});
        auto ptr = cn.alloc_device(123);
        CompNode::finalize();
        cn.free_device(ptr);
    }
}
TEST(TestCompNode, CPUDispatchSync) {
    REQUIRE_THREAD();
    constexpr int LOOP = 160, tot_threads = 8;
    std::atomic_int started_threads{0};
    auto worker = [&](int* shared_cnt, CompNode dest) {
        int nr_call = 0;
        RNGxorshf rng{next_rand_seed()};
        auto func = [&rng, &nr_call, shared_cnt]() {
            ++nr_call;
            ++*shared_cnt;
            int volatile cnt = 0;
            while (rng() % 20)
                ++cnt;
        };
        auto&& env = CompNodeEnv::from_comp_node(dest).cpu_env();
        ++started_threads;
        while (started_threads.load() != tot_threads);
        for (int i = 0; i < LOOP; ++i) {
            env.dispatch(func);
            dest.sync();
            ASSERT_EQ(i + 1, nr_call);
        }
    };

    auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load("cpu1");
    int cnt0 = 0, cnt1 = 0;
    std::vector<std::thread> wk_threads;
    for (int i = 0; i < tot_threads / 2; ++i) {
        wk_threads.emplace_back(worker, &cnt0, cn0);
        wk_threads.emplace_back(worker, &cnt1, cn1);
    }

    for (auto&& i : wk_threads)
        i.join();

    ASSERT_EQ(LOOP * tot_threads / 2, cnt0);
    ASSERT_EQ(LOOP * tot_threads / 2, cnt1);
}
TEST(TestCompNodeCPU, CoreAffinity) {
    REQUIRE_THREAD();
    std::vector<size_t> data_v(2, 0);
    size_t data0, data1 = 0;
    auto empty_task = []() {};
    auto cn0 = CompNode::load("cpu:default"), cn1 = CompNode::load("cpu0"),
         cn2 = CompNode::load("multithread2:0");

    auto binding0 = [&](size_t) { data0 = 10; };
    CompNodeEnv::from_comp_node(cn0).cpu_env().set_affinity(binding0);
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(empty_task);
    cn0.sync();

    auto binding1 = [&](size_t) { data1 = 20; };
    CompNodeEnv::from_comp_node(cn1).cpu_env().set_affinity(binding1);
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(empty_task);
    cn1.sync();

    auto binding2 = [&](size_t thread_id) { data_v[thread_id] = 30; };
    auto temp_task = [](size_t, size_t) {};
    CompNodeEnv::from_comp_node(cn2).cpu_env().set_affinity(binding2);
    CompNodeEnv::from_comp_node(cn2).cpu_env().dispatch(temp_task, 40u);
    cn2.sync();

    ASSERT_EQ(data0, static_cast<size_t>(10));
    ASSERT_EQ(data1, static_cast<size_t>(20));
    ASSERT_EQ(data_v[0], static_cast<size_t>(30));
    ASSERT_EQ(data_v[1], static_cast<size_t>(30));
}
TEST(TestCompNode, CPU_MULTI_THREAD) {
    REQUIRE_THREAD();
    std::vector<int> source(100), dst0(100), dst1(100);
    for (int i = 0; i < 100; i++) {
        source[i] = i;
        dst0[i] = 0;
        dst1[i] = 0;
    }
    size_t total_task = 20;
    auto worker = [&](std::vector<int>& dst, CompNode dest) {
        auto func = [&](size_t index, size_t) {
            size_t sub_task = 100 / total_task;
            for (size_t i = index * sub_task; i < (index + 1) * sub_task; i++) {
                int sum = 0;
                for (size_t j = 0; j < i; j++) {
                    sum += source[j];
                }
                dst[i] = sum;
            }
        };
        auto&& env = CompNodeEnv::from_comp_node(dest).cpu_env();
        env.dispatch(std::move(func), total_task);
        dest.sync();
    };

    for (auto&& str : std::vector<std::string>{
                 "multithread2:0", "multithread4:0", "multithread:default:4"}) {
        auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load(str);
        std::thread wk_thread0{std::ref(worker), std::ref(dst0), std::ref(cn0)};
        std::thread wk_thread1{std::ref(worker), std::ref(dst1), std::ref(cn1)};
        wk_thread0.join();
        wk_thread1.join();
        for (int i = 0; i < 100; i++) {
            ASSERT_EQ(dst0[i], dst1[i]);
        }
    }
}
TEST(TestCompNodeCuda, MemNode) {
    REQUIRE_GPU(2);

    auto cn00 = CompNode::load("gpu0"),
         cn1 = CompNode::load("gpu1"),
         cn01 = CompNode::load("gpu0:1");
    ASSERT_EQ(cn00, CompNode::load("gpu0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}

TEST(TestCompNodeCuda, Uid) {
    REQUIRE_GPU(2);

    auto cn00 = CompNode::load("gpu0"),
         cn1 = CompNode::load("gpu1"),
         cn01 = CompNode::load("gpu0:0"),
         cn02 = CompNode::load("gpu0:2");
    ASSERT_EQ(cn00, CompNode::load("gpu0"));
    ASSERT_EQ(cn00.get_uid(), cn01.get_uid());
    ASSERT_NE(cn00.get_uid(), cn02.get_uid());
    ASSERT_NE(cn00.get_uid(), cn1.get_uid());
}

TEST(TestCompNodeCuda, set_prealloc_config) {
    CompNode::set_prealloc_config(
            1024, 1024, 256 * 1024 * 1024, 4, CompNode::DeviceType::CUDA);
}
#if MGB_ROCM
TEST(TestCompNodeROCm, MemNode) {
    REQUIRE_AMD_GPU(2);

    auto cn00 = CompNode::load("rocm0"),
         cn1 = CompNode::load("rocm1"),
         cn01 = CompNode::load("rocm0:1");
    ASSERT_EQ(cn00, CompNode::load("rocm0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

#if MGB_CAMBRICON
TEST(TestCompNodeCambricon, MemNode) {
    REQUIRE_CAMBRICON_DEVICE(2);

    auto cn00 = CompNode::load("cambricon0"),
         cn1 = CompNode::load("cambricon1"),
         cn01 = CompNode::load("cambricon0:1");
    ASSERT_EQ(cn00, CompNode::load("cambricon0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

#if MGB_ATLAS
TEST(TestCompNodeAtlas, MemNode) {
    auto cn00 = CompNode::load("atlas0"),
         cn1 = CompNode::load("atlas1"),
         cn01 = CompNode::load("atlas0:1");
    ASSERT_EQ(cn00, CompNode::load("atlas0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif
TEST(TestCompNodeCPU, PhysicalDispatch) {
    constexpr int ID = 0x2a6453e0;
    using L = CompNode::Locator;
    constexpr auto DT = CompNode::DeviceType::CPU;
    L::set_device_map(DT, ID, 0);
    L::set_device_map(DT, ID + 1, 0);
    L::set_device_map(DT, ID + 2, 1);
    auto cn0 = CompNode::load({DT, ID, {0}}),
         cn1 = CompNode::load({DT, ID + 1, {0}}),
         cn2 = CompNode::load({DT, ID + 2, {0}});
#if MGB_HAVE_THREAD
    ASSERT_NE(cn0, cn1);
#else
    ASSERT_EQ(cn0, cn1);
#endif

    std::vector<std::thread::id> tids;
    std::mutex tids_mtx;
    auto get_tid = [&]() {
        MGB_LOCK_GUARD(tids_mtx);
        tids.push_back(std::this_thread::get_id());
    };
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(get_tid);
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(get_tid);
    CompNodeEnv::from_comp_node(cn2).cpu_env().dispatch(get_tid);
    CompNode::sync_all();

    std::unordered_set<std::thread::id> uniq_tids(tids.begin(), tids.end());
    ASSERT_EQ(3u, tids.size());
#if MGB_HAVE_THREAD
    ASSERT_EQ(2u, uniq_tids.size());
#else
    ASSERT_EQ(1u, uniq_tids.size());
#endif
}
TEST(TestCompNodeCPU, EventWait) {
    REQUIRE_THREAD();
    std::atomic_bool start = ATOMIC_VAR_INIT(false);
    auto cn0 = CompNode::load("cpu0"),
         cn1 = CompNode::load("cpu1");
    auto task0 = [&]() {
        while (!start)
            std::this_thread::yield();
    };
    auto event = cn0.create_event();
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(task0);
    event->record();
    cn1.device_wait_event(*event);

    bool succ = false;
    auto task1 = [&]() { succ = start; };
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(task1);

    using namespace std::literals;
    std::this_thread::sleep_for(50ms);
    ASSERT_FALSE(succ);

    start = true;
    CompNode::sync_all();
    ASSERT_TRUE(succ);
}
TEST(TestCompNodeCPU, EventRecOverwrite) {
    REQUIRE_THREAD();
    auto cn = CompNode::load("cpu0");
    auto dispatcher = CompNodeEnv::from_comp_node(cn).cpu_env().dispatcher.get();
    auto dispatch = [&](MegcoreCPUDispatcher::Task&& t) {
        dispatcher->dispatch(std::move(t));
    };
    auto ev = cn.create_event();
    auto wait_atomic = [](std::atomic_bool* var) {
        while (!var->load())
            std::this_thread::yield();
    };
    auto set_atomic = [](std::atomic_bool* var) { var->store(true); };

    std::atomic_bool s0 = ATOMIC_VAR_INIT(false), s1 = ATOMIC_VAR_INIT(false),
                     t0 = ATOMIC_VAR_INIT(false), t1 = ATOMIC_VAR_INIT(false),
                     t2 = ATOMIC_VAR_INIT(false);
    dispatch(std::bind(set_atomic, &t0));
    dispatch(std::bind(wait_atomic, &s0));
    ev->record();
    dispatch(std::bind(set_atomic, &t1));
    dispatch(std::bind(wait_atomic, &s1));
    ev->record();
    dispatch(std::bind(set_atomic, &t2));

    wait_atomic(&t0);
    ASSERT_FALSE(ev->finished());
    set_atomic(&s0);
    wait_atomic(&t1);
    ASSERT_FALSE(ev->finished());
    set_atomic(&s1);
    wait_atomic(&t2);
    ASSERT_TRUE(ev->finished());
}
namespace {
void test_peer_copy_from_device(const char* comp_node) {
    REQUIRE_THREAD();
    auto cn_gpu = CompNode::load(comp_node);
    auto cn_cpu = CompNode::load("cpux");

    HostTensorGenerator<> gen;
    auto a = gen({20, 3, 112, 112});
    auto b = gen({20, 3, 112, 112});
    auto c = gen({20, 3, 112, 112});
    DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu};
    dev_a.copy_from(*a).sync();
    dev_b.copy_from(*b).sync();
    dev_c.copy_from(*c).sync();

    auto wait_event = cn_gpu.create_event();
    opr::Sleep::sleep(cn_gpu, 0.1);
    dev_a.copy_from(dev_c);
    wait_event->record();

    cn_cpu.device_wait_event(*wait_event);
    dev_b.copy_from(dev_a);
    dev_b.sync();

    HostTensorND result;
    result.copy_from(dev_b);
    CompNode::sync_all();
    MGB_ASSERT_TENSOR_EQ(result, *c);
}
}

TEST(TestCompNodeCPU, PeerCopyFromCUDA) {
    REQUIRE_GPU(1);
    test_peer_copy_from_device("gpux");
}

TEST(TestCompNodeCPU, PeerCopyFromROCm) {
    REQUIRE_AMD_GPU(1);
    test_peer_copy_from_device("rocmx");
}
#if MGB_CAMBRICON
TEST(TestCompNodeCPU, PeerCopyFromCambricon) {
    REQUIRE_CAMBRICON_DEVICE(1);
    REQUIRE_THREAD();
    auto cn_gpu = CompNode::load("cambriconx");
    auto cn_cpu = CompNode::load("cpux");

    HostTensorGenerator<> gen;
    auto a = gen({20, 3, 112, 112});
    auto b = gen({20, 3, 112, 112});
    auto c = gen({20, 3, 112, 112});
    DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu};
    dev_a.copy_from(*a).sync();
    dev_b.copy_from(*b).sync();
    dev_c.copy_from(*c).sync();

    auto wait_event = cn_gpu.create_event();
    dev_a.copy_from(dev_c);
    wait_event->record();

    cn_cpu.device_wait_event(*wait_event);
    dev_b.copy_from(dev_a);
    dev_b.sync();

    HostTensorND result;
    result.copy_from(dev_b);
    CompNode::sync_all();
    MGB_ASSERT_TENSOR_EQ(result, *c);
}
#endif
TEST(TestCompNodeSyncManager, HostWait) {
    REQUIRE_THREAD();
    CompNodeSyncManager mgr(CompNode::load("xpu0"));

    auto run_set = [&]() {
        using namespace std::literals;
        std::this_thread::sleep_for(200ms);
        mgr.set_ready();
        mgb_log_debug("set_ready() called");
    };

    for (int run = 0; run < 2; ++run) {
        std::thread th_run_set(run_set);
        RealTimer timer;
        mgr.clear_waiter_record();
        ASSERT_THROW(mgr.busy_wait_set_ready(), MegBrainError);
        mgr.add_waiter_record(false);
        mgr.add_waiter_record(false);
        mgr.busy_wait_set_ready();
        EXPECT_GE(timer.get_secs(), 0.1);
        timer.reset();
        mgr.busy_wait_set_ready();
        EXPECT_LE(timer.get_secs(), 0.001);
        th_run_set.join();
    }
}
TEST(TestCompNodeSyncManager, DeviceWait) {
    REQUIRE_THREAD();
    auto cns = load_multiple_xpus(3);
    auto cn0 = cns[0], cn1 = cns[1], cn2 = cns[2];
    CompNodeSyncManager mgr(cn0);
    using Event = CompNode::Event;
    auto ev_cn1 = cn1.create_event(),
         ev_cn2_begin = cn2.create_event(Event::NEED_TIMER),
         ev_cn2_end = cn2.create_event(Event::NEED_TIMER);

    for (int run = 0; run < 2; ++run) {
        RealTimer timer;
        mgr.clear_waiter_record();
        ASSERT_THROW(mgr.busy_wait_set_ready_and_get_event(), MegBrainError);
        mgr.add_waiter_record(true);
        mgr.add_waiter_record(true);
        opr::Sleep::sleep(cn0, 0.13);
        mgr.set_ready();
        ev_cn2_begin->record();
        cn1.device_wait_event(mgr.busy_wait_set_ready_and_get_event());
        cn2.device_wait_event(mgr.busy_wait_set_ready_and_get_event());
        ev_cn1->record();
        ev_cn2_end->record();
        EXPECT_LE(timer.get_secs(), 0.06);

        ev_cn1->host_wait();
        EXPECT_GE(timer.get_secs(), 0.1);
        ev_cn2_end->host_wait();
        auto ev2_t = ev_cn2_begin->elapsed_time_until(*ev_cn2_end);
        EXPECT_GE(ev2_t, 0.1);
    }
}
TEST(TestCompNodeSyncManager, DeviceWaitCross) {
    REQUIRE_THREAD();
    auto cn0 = CompNode::load("xpu0:0"), cn1 = CompNode::load("xpu0:1");
    auto ev_cn0 = cn0.create_event(),
         ev_cn1 = cn1.create_event();

    RealTimer timer;
    // the cross wait looks like a deadlock, but is guaranteed to work because
    // of the timing enforced by the sleep below
    ev_cn0->record();
    cn1.device_wait_event(*ev_cn0);
    ev_cn1->record();
    opr::Sleep::sleep(cn0, 0.1);
    cn0.device_wait_event(*ev_cn1);

    ev_cn0->record();
    cn1.device_wait_event(*ev_cn0);

    cn0.sync();
    cn1.sync();

    // the CUDA sleep kernel is easily affected by GPU frequency changes, so we
    // only print a warning log instead of asserting; see XPU-226 for details
    auto used = timer.get_secs();
    if (used <= 0.1 || used >= 0.2) {
        mgb_log_warn("expect time between [%f, %f], got %f", 0.1, 0.2, used);
    }
}

#if !MGB_HAVE_THREAD
TEST(TestCompNodeSyncManager, DeviceWaitWithoutThread) {
    auto cn = CompNode::load("cpu:default");
    CompNodeSyncManager mgr(cn);
    mgr.add_waiter_record(true);
    ASSERT_ANY_THROW(mgr.busy_wait_set_ready());
    mgr.set_ready();
    EXPECT_TRUE(mgr.busy_wait_set_ready_and_get_event().finished());
}
#endif
TEST(TestCompNode, MultipleLoad) {
    auto run = [](CompNode cn) {
        HostTensorND a(cn, {23}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (int i = 0; i < 23; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp;
        tmp.copy_from(a);
        b.copy_from(tmp).sync();
        auto pb = b.ptr<int>();
        for (int i = 0; i < 23; ++i) {
            ASSERT_EQ(i, pb[i]);
        }
        CompNode::finalize();
    };
    for (size_t i = 1; i < CompNode::NR_DEVICE_TYPE; ++i) {
        auto dt = static_cast<CompNode::DeviceType>(i);
        if (!check_device_type_avaiable(dt))
            continue;
        if (CompNode::get_device_count(dt)) {
            auto cn = CompNode::load({dt, 0, {0}});
            mgb_log("comp node %s is available", cn.to_string().c_str());
            run(cn);
            cn = CompNode::load({dt, 0, {0}});
            run(cn);
        }
    }
}
#if MGB_CAMBRICON
TEST(TestCompNodeCambricon, D2DCopy) {
    auto run = [](CompNode cn) {
        constexpr size_t size = 100 * 1024 * 1024;
        HostTensorND a(cn, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp, tmp1;
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    run(cn);
    cn = CompNode::load("cambricon1");
    run(cn);
}

// peer copy between different Cambricon devices is not correct yet, so this
// test case is disabled
#if 0
TEST(TestCompNodeCambricon, P2PCopy) {
    auto run_raw = []() {
        int v0 = 0, v1 = 1;
        cnrtDev_t dev0, dev1;
        MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev0, 0));
        MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev1, 1));
        int *dp0, *dp1;
        MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev0));
        MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp0), sizeof(int)));
        MGB_CNRT_CHECK(
                cnrtMemcpy(dp0, &v0, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV));
        MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev1));
        MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp1), sizeof(int)));
        MGB_CNRT_CHECK(
                cnrtMemcpy(dp1, &v1, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV));
        unsigned int can = 0;
        MGB_CNRT_CHECK(cnrtGetPeerAccessibility(&can, 0, 1));
        printf("can = %s\n", can ? "TRUE" : "FALSE");
        if (can) {
            MGB_CNRT_CHECK(cnrtMemcpyPeer(dp1, 1, dp0, 0, sizeof(int)));
            int get;
            MGB_CNRT_CHECK(cnrtMemcpy(&get, dp1, sizeof(int),
                                      CNRT_MEM_TRANS_DIR_DEV2HOST));
            ASSERT_EQ(0, get);
        }
    };
    auto run = [](CompNode cn0, CompNode cn1) {
        constexpr size_t size = 100;
        HostTensorND a(cn0, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp(cn0, {size}, dtype::Int32{}),
                tmp1(cn1, {size}, dtype::Int32{});
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    REQUIRE_CAMBRICON_DEVICE(2);
    auto cn0 = CompNode::load("cambricon0"), cn1 = CompNode::load("cambricon1");
    run_raw();
    run(cn0, cn1);
}
#endif
#endif  // MGB_CAMBRICON
#if MGB_ATLAS
TEST(TestCompNodeAtlas, D2DCopy) {
    auto run = [](CompNode cn) {
        constexpr size_t size = 10 * 1024 * 1024;
        HostTensorND a(cn, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp, tmp1;
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    auto cn = CompNode::load("atlas0");
    run(cn);
}
#endif
namespace {
class CompNodeDepedentObjectInst final : public CompNodeDepedentObject {
    int *m_dst, *m_timer;

    std::shared_ptr<void> on_comp_node_finalize() override {
        EXPECT_EQ(0, *m_dst);
        *m_dst = ++*m_timer;
        return {};
    }

public:
    CompNodeDepedentObjectInst(int* dst, int* timer)
            : m_dst{dst}, m_timer{timer} {}

    void chk() { check_not_finalized(); }
};
} // anonymous namespace

TEST(TestCompNode, DepedentObjectList) {
    CompNode::finalize();
    for (int i = 0; i < 5; ++i) {
        // loop multiple times so that memory problems are exposed more easily
        int ts[4] = {0}, timer = 0;
        auto make = [&](int i) {
            return std::make_unique<CompNodeDepedentObjectInst>(ts + i, &timer);
        };
        auto i0 = make(0), i1 = make(1), i2 = make(2), i3 = make(3);
        ASSERT_NO_THROW(i0->chk());
        ASSERT_NO_THROW(i1->chk());
        i1.reset();
        comp_node_detail::DepedentObjList::invoke_callback_and_clean();
        ASSERT_EQ(1, ts[3]);
        ASSERT_EQ(2, ts[2]);
        ASSERT_EQ(0, ts[1]);
        ASSERT_EQ(3, ts[0]);
        ASSERT_THROW(i0->chk(), InternalError);
    }
}
namespace {
template <typename tag>
class TestCPUCompSeqRec : public ::testing::Test {};
TYPED_TEST_CASE(TestCPUCompSeqRec, comp_node_test::seq_rec::test_types);

TYPED_TEST(TestCPUCompSeqRec, run) {
    comp_node_test::seq_rec::run<TypeParam>(CompNode::load("cpux"));
}

TYPED_TEST(TestCPUCompSeqRec, run_default_cpu) {
    comp_node_test::seq_rec::run<TypeParam>(CompNode::load("cpu:default"));
}

TYPED_TEST(TestCPUCompSeqRec, run_multi_thread) {
    auto cn = CompNode::load("multithread4:0");
    comp_node_test::seq_rec::run<TypeParam>(cn);
}

TYPED_TEST(TestCPUCompSeqRec, run_multi_thread_default) {
    auto cn = CompNode::load("multithread:default:4");
    comp_node_test::seq_rec::run<TypeParam>(cn);
}
} // anonymous namespace

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has GPU hardware and that the driver is installed. If you would like to try deep-learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.
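For illustration only (this snippet is not part of comp_node.cpp or the MegEngine API): a minimal sketch of how calling code can prefer a CUDA comp node when one is present and otherwise fall back to CPU, using only CompNode::get_device_count() and CompNode::load(), both of which the tests above exercise. The helper name pick_comp_node is invented for this example.

#include "megbrain/comp_node.h"

using namespace mgb;

// Hypothetical helper (not part of MegEngine): prefer a CUDA comp node when
// the runtime reports at least one CUDA device, otherwise fall back to CPU.
CompNode pick_comp_node() {
    if (CompNode::get_device_count(CompNode::DeviceType::CUDA) > 0) {
        return CompNode::load("gpux");  // "gpux": any available CUDA device
    }
    return CompNode::load("cpux");  // CPU fallback
}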