
src/core/test/comp_node.cpp

/**
 * \file src/core/test/comp_node.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "./comp_node_helper.h"

#include "megbrain/comp_node_env.h"
#include "megbrain/utils/comp_node_sync_manager.h"
#include "megbrain/utils/timer.h"
#include "megbrain/system.h"
#include "megbrain/test/helper.h"
#include "megbrain/opr/utility.h"

#include <chrono>
#if MGB_HAVE_THREAD
#include <thread>
#endif

using namespace mgb;

TEST(TestCompNode, Parse) {
    using L = CompNode::Locator;
    using D = CompNode::DeviceType;
    auto make_lc = [](D t, int dev, int s) -> L { return {t, dev, {s}}; };

    ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpux:23"), make_lc(D::UNSPEC, -1, 23));
    ASSERT_EQ(L::parse("xpu2:23"), make_lc(D::UNSPEC, 2, 23));
    ASSERT_EQ(L::parse("xpu21:23"), make_lc(D::UNSPEC, 21, 23));
    ASSERT_EQ(L::parse("cpux"), make_lc(D::CPU, -1, 0));
    ASSERT_EQ(L::parse("cpux:23"), make_lc(D::CPU, -1, 23));
    ASSERT_EQ(L::parse("cpu2:23"), make_lc(D::CPU, 2, 23));
    ASSERT_EQ(L::parse("cpu21:23"), make_lc(D::CPU, 21, 23));
    ASSERT_EQ(L::parse("cambriconx"), make_lc(D::CAMBRICON, -1, 0));
    ASSERT_EQ(L::parse("cambricon2"), make_lc(D::CAMBRICON, 2, 0));
    ASSERT_EQ(L::parse("cambricon2:3"), make_lc(D::CAMBRICON, 2, 3));
    ASSERT_EQ(L::parse("atlasx"), make_lc(D::ATLAS, -1, 0));
    ASSERT_EQ(L::parse("atlas2"), make_lc(D::ATLAS, 2, 0));
    ASSERT_EQ(L::parse("atlas2:3"), make_lc(D::ATLAS, 2, 3));
    ASSERT_EQ(L::parse("xpu"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpu23"), make_lc(D::UNSPEC, 23, 0));
    ASSERT_EQ(L::parse("xpu23:1"), make_lc(D::UNSPEC, 23, 1));
    ASSERT_EQ(L::parse("cpu:default"), make_lc(D::CPU, L::DEVICE_CPU_DEFAULT, 0));
    ASSERT_EQ(L::parse("multithread2:0"), make_lc(D::MULTITHREAD, 0, 2));
    ASSERT_EQ(L::parse("multithread1:3"), make_lc(D::MULTITHREAD, 3, 1));
    ASSERT_EQ(L::parse("multithread:default:2"),
              make_lc(D::MULTITHREAD, L::DEVICE_MULTITHREAD_DEFAULT, 2));

    ASSERT_THROW(L::parse("apu"), MegBrainError);
    ASSERT_THROW(L::parse("fpgbx"), MegBrainError);
    ASSERT_THROW(L::parse("cab0"), MegBrainError);
    ASSERT_THROW(L::parse("cpu"), MegBrainError);
    ASSERT_THROW(L::parse("cpu-1"), MegBrainError);
    ASSERT_THROW(L::parse("cpu0:"), MegBrainError);
    ASSERT_THROW(L::parse("cpu0:x"), MegBrainError);
    ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError);
    ASSERT_THROW(L::parse("cmabricon0"), MegBrainError);
    ASSERT_THROW(L::parse("atlast0"), MegBrainError);
    ASSERT_THROW(L::parse("multithread"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:default"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:default:0"), MegBrainError);
}

TEST(TestCompNode, SetDefaultDev) {
    REQUIRE_GPU(3);

    CompNode::finalize();
    using L = CompNode::Locator;
    auto orig_dt = L::parse("xpu").to_physical(),
         orig_gpu = L::parse("gpux").to_physical();
    constexpr auto CUDA = CompNode::DeviceType::CUDA;
    L::set_unspec_device_type(CUDA);
    L::set_device_map(CUDA, -1, 2);
    auto run = []() {
        ASSERT_EQ(CompNode::load("xpu").locator(), L::parse("gpu2"));
    };
    MGB_TRY {
        run();
    } MGB_FINALLY({
        L::set_unspec_device_type(orig_dt.type);
        L::set_device_map(CUDA, -1, orig_gpu.device);
    });
    CompNode::finalize();
}

TEST(TestCompNode, Load) {
    auto cn0 = CompNode::load("xpux"),
         cn1 = CompNode::load("cpux");
    ASSERT_EQ(CompNode::DeviceType::UNSPEC, cn0.locator_logical().type);
    ASSERT_EQ(CompNode::DeviceType::CPU, cn1.locator_logical().type);
    ASSERT_EQ(CompNode::load("cpux"), cn1);
    ASSERT_EQ(CompNode::load("xpux"), cn0);
    auto cnp = CompNode::load("cpu1"), cnq = CompNode::load("cpu2");
    ASSERT_EQ(CompNode::load("cpu1"), cnp);
    ASSERT_EQ(CompNode::load("cpu2"), cnq);
#if MGB_HAVE_THREAD
    ASSERT_NE(cnp, cnq);
#else
    ASSERT_EQ(cnp, cnq);
#endif

#if MGB_HAVE_THREAD
    auto cn_multi_thread0 = CompNode::load("multithread2:0");
    auto cn_multi_thread1 = CompNode::load("multithread2:1");
    ASSERT_EQ(CompNode::load("multithread2:0"), cn_multi_thread0);
    ASSERT_EQ(CompNode::load("multithread2:1"), cn_multi_thread1);
    ASSERT_NE(CompNode::load("multithread4:0"), cn_multi_thread0);
    ASSERT_NE(CompNode::load("multithread4:1"), cn_multi_thread1);

    auto cn_multi_default0 = CompNode::load("multithread:default:2");
    auto cn_multi_default1 = CompNode::load("multithread:default:4");
    ASSERT_EQ(CompNode::load("multithread:default:2"), cn_multi_default0);
    ASSERT_EQ(CompNode::load("multithread:default:4"), cn_multi_default1);
    ASSERT_NE(cn_multi_thread0, cn_multi_default1);
#endif

    ASSERT_EQ(CompNode::load("cpu1"), cnp);
    ASSERT_EQ(CompNode::load("cpu2"), cnq);

    if (check_gpu_available(2)) {
        auto cn2 = CompNode::load("gpux"),
             cn3 = CompNode::load("gpu1");
        ASSERT_EQ(CompNode::DeviceType::CUDA, cn2.locator_logical().type);
        ASSERT_NE(cn2, cn3);
        ASSERT_EQ(CompNode::load("gpux"), cn2);
        ASSERT_EQ(CompNode::load("gpu1"), cn3);
    }

#if MGB_ATLAS
    auto atlas0 = CompNode::load("atlas0");
    auto atlas1 = CompNode::load("atlas1");
    ASSERT_NE(atlas0, atlas1);
#endif
}

TEST(TestCompNode, FreeAfterFinalize) {
    CompNode::finalize();
    for (size_t i = 0; i < CompNode::NR_DEVICE_TYPE; ++i) {
        auto type = static_cast<CompNode::DeviceType>(i);
        if (!check_device_type_avaiable(type) ||
            !CompNode::get_device_count(type))
            continue;
        auto cn = CompNode::load(CompNode::Locator{type, -1, {0}});
        auto ptr = cn.alloc_device(123);
        CompNode::finalize();
        cn.free_device(ptr);
    }
}

TEST(TestCompNode, CPUDispatchSync) {
    REQUIRE_THREAD();
    constexpr int LOOP = 160, tot_threads = 8;
    std::atomic_int started_threads{0};
    auto worker = [&](int *shared_cnt, CompNode dest) {
        int nr_call = 0;
        RNGxorshf rng{next_rand_seed()};
        auto func = [&rng, &nr_call, shared_cnt]() {
            ++ nr_call;
            ++ *shared_cnt;
            int volatile cnt = 0;
            while (rng() % 20)
                ++ cnt;
        };
        auto &&env = CompNodeEnv::from_comp_node(dest).cpu_env();

        ++ started_threads;
        while (started_threads.load() != tot_threads);

        for (int i = 0; i < LOOP; ++ i) {
            env.dispatch(func);
            dest.sync();
            ASSERT_EQ(i + 1, nr_call);
        }
    };

    auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load("cpu1");
    int cnt0 = 0, cnt1 = 0;
    std::vector<std::thread> wk_threads;
    for (int i = 0; i < tot_threads / 2; ++ i) {
        wk_threads.emplace_back(worker, &cnt0, cn0);
        wk_threads.emplace_back(worker, &cnt1, cn1);
    }

    for (auto &&i: wk_threads)
        i.join();

    ASSERT_EQ(LOOP * tot_threads / 2, cnt0);
    ASSERT_EQ(LOOP * tot_threads / 2, cnt1);
}

TEST(TestCompNodeCPU, CoreAffinity) {
    REQUIRE_THREAD();
    std::vector<size_t> data_v(2, 0);
    size_t data0, data1 = 0;
    auto empty_task = []() {};
    auto cn0 = CompNode::load("cpu:default"), cn1 = CompNode::load("cpu0"),
         cn2 = CompNode::load("multithread2:0");
    auto binding0 = [&](size_t) { data0 = 10; };
    CompNodeEnv::from_comp_node(cn0).cpu_env().set_affinity(binding0);
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(empty_task);
    cn0.sync();

    auto binding1 = [&](size_t) { data1 = 20; };
    CompNodeEnv::from_comp_node(cn1).cpu_env().set_affinity(binding1);
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(empty_task);
    cn1.sync();

    auto binding2 = [&](size_t thread_id) { data_v[thread_id] = 30; };
    auto temp_task = [](size_t, size_t) {};
    CompNodeEnv::from_comp_node(cn2).cpu_env().set_affinity(binding2);
    CompNodeEnv::from_comp_node(cn2).cpu_env().dispatch(temp_task, 40u);
    cn2.sync();

    ASSERT_EQ(data0, static_cast<size_t>(10));
    ASSERT_EQ(data1, static_cast<size_t>(20));
    ASSERT_EQ(data_v[0], static_cast<size_t>(30));
    ASSERT_EQ(data_v[1], static_cast<size_t>(30));
}

TEST(TestCompNode, CPU_MULTI_THREAD) {
    REQUIRE_THREAD();
    std::vector<int> source(100), dst0(100), dst1(100);
    for (int i = 0; i < 100; i++) {
        source[i] = i;
        dst0[i] = 0;
        dst1[i] = 0;
    }
    size_t total_task = 20;
    auto worker = [&](std::vector<int>& dst, CompNode dest) {
        auto func = [&](size_t index, size_t) {
            size_t sub_task = 100 / total_task;
            for (size_t i = index * sub_task; i < (index + 1) * sub_task; i++) {
                int sum = 0;
                for (size_t j = 0; j < i; j++) {
                    sum += source[j];
                }
                dst[i] = sum;
            }
        };
        auto&& env = CompNodeEnv::from_comp_node(dest).cpu_env();
        env.dispatch(std::move(func), total_task);
        dest.sync();
    };

    for (auto&& str : std::vector<std::string>{
                 "multithread2:0", "multithread4:0", "multithread:default:4"}) {
        auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load(str);
        std::thread wk_thread0{std::ref(worker), std::ref(dst0), std::ref(cn0)};
        std::thread wk_thread1{std::ref(worker), std::ref(dst1), std::ref(cn1)};
        wk_thread0.join();
        wk_thread1.join();
        for (int i = 0; i < 100; i++) {
            ASSERT_EQ(dst0[i], dst1[i]);
        }
    }
}

TEST(TestCompNodeCuda, MemNode) {
    REQUIRE_GPU(2);

    auto cn00 = CompNode::load("gpu0"),
         cn1 = CompNode::load("gpu1"),
         cn01 = CompNode::load("gpu0:1");
    ASSERT_EQ(cn00, CompNode::load("gpu0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}

TEST(TestCompNodeCuda, Uid) {
    REQUIRE_GPU(2);

    auto cn00 = CompNode::load("gpu0"),
         cn1 = CompNode::load("gpu1"),
         cn01 = CompNode::load("gpu0:0"),
         cn02 = CompNode::load("gpu0:2");
    ASSERT_EQ(cn00, CompNode::load("gpu0"));
    ASSERT_EQ(cn00.get_uid(), cn01.get_uid());
    ASSERT_NE(cn00.get_uid(), cn02.get_uid());
    ASSERT_NE(cn00.get_uid(), cn1.get_uid());
}

#if MGB_CAMBRICON
TEST(TestCompNodeCambricon, MemNode) {
    REQUIRE_CAMBRICON_DEVICE(2);

    auto cn00 = CompNode::load("cambricon0"),
         cn1 = CompNode::load("cambricon1"),
         cn01 = CompNode::load("cambricon0:1");
    ASSERT_EQ(cn00, CompNode::load("cambricon0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

#if MGB_ATLAS
TEST(TestCompNodeAtlas, MemNode) {
    auto cn00 = CompNode::load("atlas0"),
         cn1 = CompNode::load("atlas1"),
         cn01 = CompNode::load("atlas0:1");
    ASSERT_EQ(cn00, CompNode::load("atlas0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

TEST(TestCompNodeCPU, PhysicalDispatch) {
    constexpr int ID = 0x2a6453e0;
    using L = CompNode::Locator;
    constexpr auto DT = CompNode::DeviceType::CPU;
    L::set_device_map(DT, ID, 0);
    L::set_device_map(DT, ID + 1, 0);
    L::set_device_map(DT, ID + 2, 1);
    auto cn0 = CompNode::load({DT, ID, {0}}),
         cn1 = CompNode::load({DT, ID + 1, {0}}),
         cn2 = CompNode::load({DT, ID + 2, {0}});
#if MGB_HAVE_THREAD
    ASSERT_NE(cn0, cn1);
#else
    ASSERT_EQ(cn0, cn1);
#endif
    std::vector<std::thread::id> tids;
    std::mutex tids_mtx;
    auto get_tid = [&]() {
        MGB_LOCK_GUARD(tids_mtx);
        tids.push_back(std::this_thread::get_id());
    };
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(get_tid);
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(get_tid);
    CompNodeEnv::from_comp_node(cn2).cpu_env().dispatch(get_tid);
    CompNode::sync_all();

    std::unordered_set<std::thread::id> uniq_tids(tids.begin(), tids.end());
    ASSERT_EQ(3u, tids.size());
#if MGB_HAVE_THREAD
    ASSERT_EQ(2u, uniq_tids.size());
#else
    ASSERT_EQ(1u, uniq_tids.size());
#endif
}

TEST(TestCompNodeCPU, EventWait) {
    REQUIRE_THREAD();
    std::atomic_bool start = ATOMIC_VAR_INIT(false);
    auto cn0 = CompNode::load("cpu0"),
         cn1 = CompNode::load("cpu1");
    auto task0 = [&]() {
        while (!start)
            std::this_thread::yield();
    };
    auto event = cn0.create_event();
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(task0);
    event->record();
    cn1.device_wait_event(*event);

    bool succ = false;
    auto task1 = [&]() {
        succ = start;
    };
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(task1);

    using namespace std::literals;
    std::this_thread::sleep_for(50ms);
    ASSERT_FALSE(succ);

    start = true;
    CompNode::sync_all();
    ASSERT_TRUE(succ);
}

TEST(TestCompNodeCPU, EventRecOverwrite) {
    REQUIRE_THREAD();
    auto cn = CompNode::load("cpu0");
    auto dispatcher = CompNodeEnv::from_comp_node(cn).
            cpu_env().dispatcher.get();
    auto dispatch = [&](MegcoreCPUDispatcher::Task &&t) {
        dispatcher->dispatch(std::move(t));
    };
    auto ev = cn.create_event();
    auto wait_atomic = [](std::atomic_bool *var) {
        while (!var->load())
            std::this_thread::yield();
    };
    auto set_atomic = [](std::atomic_bool *var) {
        var->store(true);
    };

    std::atomic_bool
        s0 = ATOMIC_VAR_INIT(false),
        s1 = ATOMIC_VAR_INIT(false),
        t0 = ATOMIC_VAR_INIT(false),
        t1 = ATOMIC_VAR_INIT(false),
        t2 = ATOMIC_VAR_INIT(false);

    dispatch(std::bind(set_atomic, &t0));
    dispatch(std::bind(wait_atomic, &s0));
    ev->record();
    dispatch(std::bind(set_atomic, &t1));
    dispatch(std::bind(wait_atomic, &s1));
    ev->record();
    dispatch(std::bind(set_atomic, &t2));

    wait_atomic(&t0);
    ASSERT_FALSE(ev->finished());
    set_atomic(&s0);
    wait_atomic(&t1);
    ASSERT_FALSE(ev->finished());
    set_atomic(&s1);
    wait_atomic(&t2);
    ASSERT_TRUE(ev->finished());
}

namespace {
void test_peer_copy_from_device(const char* comp_node) {
    REQUIRE_THREAD();
    auto cn_gpu = CompNode::load(comp_node);
    auto cn_cpu = CompNode::load("cpux");
    HostTensorGenerator<> gen;
    auto a = gen({20, 3, 112, 112});
    auto b = gen({20, 3, 112, 112});
    auto c = gen({20, 3, 112, 112});
    DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu};
    dev_a.copy_from(*a).sync();
    dev_b.copy_from(*b).sync();
    dev_c.copy_from(*c).sync();

    auto wait_event = cn_gpu.create_event();
    opr::Sleep::sleep(cn_gpu, 0.1);
    dev_a.copy_from(dev_c);
    wait_event->record();

    cn_cpu.device_wait_event(*wait_event);
    dev_b.copy_from(dev_a);
    dev_b.sync();

    HostTensorND result;
    result.copy_from(dev_b);
    CompNode::sync_all();
    MGB_ASSERT_TENSOR_EQ(result, *c);
}
}  // anonymous namespace

TEST(TestCompNodeCPU, PeerCopyFromCUDA) {
    REQUIRE_GPU(1);
    test_peer_copy_from_device("gpux");
}

#if MGB_CAMBRICON
TEST(TestCompNodeCPU, PeerCopyFromCambricon) {
    REQUIRE_CAMBRICON_DEVICE(1);
    REQUIRE_THREAD();
    auto cn_gpu = CompNode::load("cambriconx");
    auto cn_cpu = CompNode::load("cpux");
    HostTensorGenerator<> gen;
    auto a = gen({20, 3, 112, 112});
    auto b = gen({20, 3, 112, 112});
    auto c = gen({20, 3, 112, 112});
    DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu};
    dev_a.copy_from(*a).sync();
    dev_b.copy_from(*b).sync();
    dev_c.copy_from(*c).sync();

    auto wait_event = cn_gpu.create_event();
    dev_a.copy_from(dev_c);
    wait_event->record();

    cn_cpu.device_wait_event(*wait_event);
    dev_b.copy_from(dev_a);
    dev_b.sync();

    HostTensorND result;
    result.copy_from(dev_b);
    CompNode::sync_all();
    MGB_ASSERT_TENSOR_EQ(result, *c);
}
#endif

TEST(TestCompNodeSyncManager, HostWait) {
    REQUIRE_THREAD();
    CompNodeSyncManager mgr(CompNode::load("xpu0"));

    auto run_set = [&]() {
        using namespace std::literals;
        std::this_thread::sleep_for(200ms);
        mgr.set_ready();
        mgb_log_debug("set_ready() called");
    };

    for (int run = 0; run < 2; ++ run) {
        std::thread th_run_set(run_set);
        RealTimer timer;
        mgr.clear_waiter_record();
        ASSERT_THROW(mgr.busy_wait_set_ready(), MegBrainError);
        mgr.add_waiter_record(false);
        mgr.add_waiter_record(false);
        mgr.busy_wait_set_ready();
        EXPECT_GE(timer.get_secs(), 0.1);
        timer.reset();
        mgr.busy_wait_set_ready();
        EXPECT_LE(timer.get_secs(), 0.001);
        th_run_set.join();
    }
}

TEST(TestCompNodeSyncManager, DeviceWait) {
    REQUIRE_THREAD();
    auto cns = load_multiple_xpus(3);
    auto cn0 = cns[0], cn1 = cns[1], cn2 = cns[2];
    CompNodeSyncManager mgr(cn0);
    using Event = CompNode::Event;
    auto ev_cn1 = cn1.create_event(),
         ev_cn2_begin = cn2.create_event(Event::NEED_TIMER),
         ev_cn2_end = cn2.create_event(Event::NEED_TIMER);

    for (int run = 0; run < 2; ++ run) {
        RealTimer timer;
        mgr.clear_waiter_record();
        ASSERT_THROW(mgr.busy_wait_set_ready_and_get_event(), MegBrainError);
        mgr.add_waiter_record(true);
        mgr.add_waiter_record(true);
        opr::Sleep::sleep(cn0, 0.13);
        mgr.set_ready();
        ev_cn2_begin->record();
        cn1.device_wait_event(mgr.busy_wait_set_ready_and_get_event());
        cn2.device_wait_event(mgr.busy_wait_set_ready_and_get_event());
        ev_cn1->record();
        ev_cn2_end->record();
        EXPECT_LE(timer.get_secs(), 0.05);

        ev_cn1->host_wait();
        EXPECT_GE(timer.get_secs(), 0.1);
        ev_cn2_end->host_wait();
        auto ev2_t = ev_cn2_begin->elapsed_time_until(*ev_cn2_end);
        EXPECT_GE(ev2_t, 0.1);
    }
}

TEST(TestCompNodeSyncManager, DeviceWaitCross) {
    REQUIRE_THREAD();
    auto cn0 = CompNode::load("xpu0:0"), cn1 = CompNode::load("xpu0:1");
    auto ev_cn0 = cn0.create_event(),
         ev_cn1 = cn1.create_event();
    RealTimer timer;
    // the cross wait looks like a deadlock, but is guaranteed to work due to
    // good timing
    ev_cn0->record();
    cn1.device_wait_event(*ev_cn0);
    ev_cn1->record();
    opr::Sleep::sleep(cn0, 0.1);
    cn0.device_wait_event(*ev_cn1);
    ev_cn0->record();
    cn1.device_wait_event(*ev_cn0);
    cn0.sync();
    cn1.sync();

    // the sleep kernel in cuda is easily affected by GPU frequency changes,
    // so we just print a warning log instead of asserting; for details see
    // XPU-226
    auto used = timer.get_secs();
    if (used <= 0.1 || used >= 0.2) {
        mgb_log_warn("expect time between [%f, %f], got %f", 0.1, 0.2, used);
    }
}

#if !MGB_HAVE_THREAD
TEST(TestCompNodeSyncManager, DeviceWaitWithoutThread) {
    auto cn = CompNode::load("cpu:default");
    CompNodeSyncManager mgr(cn);
    mgr.add_waiter_record(true);
    ASSERT_ANY_THROW(mgr.busy_wait_set_ready());
    mgr.set_ready();
    EXPECT_TRUE(mgr.busy_wait_set_ready_and_get_event().finished());
}
#endif

TEST(TestCompNode, MultipleLoad) {
    auto run = [](CompNode cn) {
        HostTensorND a(cn, {23}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (int i = 0; i < 23; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp;
        tmp.copy_from(a);
        b.copy_from(tmp).sync();
        auto pb = b.ptr<int>();
        for (int i = 0; i < 23; ++i) {
            ASSERT_EQ(i, pb[i]);
        }
        CompNode::finalize();
    };
    for (size_t i = 1; i < CompNode::NR_DEVICE_TYPE; ++i) {
        auto dt = static_cast<CompNode::DeviceType>(i);
        if (!check_device_type_avaiable(dt))
            continue;
        if (CompNode::get_device_count(dt)) {
            auto cn = CompNode::load({dt, 0, {0}});
            mgb_log("comp node %s is available", cn.to_string().c_str());
            run(cn);
            cn = CompNode::load({dt, 0, {0}});
            run(cn);
        }
    }
}

#if MGB_CAMBRICON
TEST(TestCompNodeCambricon, D2DCopy) {
    auto run = [](CompNode cn) {
        constexpr size_t size = 100 * 1024 * 1024;
        HostTensorND a(cn, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp, tmp1;
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    run(cn);
    cn = CompNode::load("cambricon1");
    run(cn);
}

// peer copy between different cambricon devices is not correct for now, so
// this testcase is disabled
#if 0
TEST(TestCompNodeCambricon, P2PCopy) {
    auto run_raw = []() {
        int v0 = 0, v1 = 1;
        cnrtDev_t dev0, dev1;
        MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev0, 0));
        MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev1, 1));
        int *dp0, *dp1;
        MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev0));
        MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp0), sizeof(int)));
        MGB_CNRT_CHECK(
                cnrtMemcpy(dp0, &v0, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV));
        MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev1));
        MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp1), sizeof(int)));
        MGB_CNRT_CHECK(
                cnrtMemcpy(dp1, &v1, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV));
        unsigned int can = 0;
        MGB_CNRT_CHECK(cnrtGetPeerAccessibility(&can, 0, 1));
        printf("can = %s\n", can ? "TRUE" : "FALSE");
        if (can) {
            MGB_CNRT_CHECK(cnrtMemcpyPeer(dp1, 1, dp0, 0, sizeof(int)));
            int get;
            MGB_CNRT_CHECK(cnrtMemcpy(&get, dp1, sizeof(int),
                                      CNRT_MEM_TRANS_DIR_DEV2HOST));
            ASSERT_EQ(0, get);
        }
    };
    auto run = [](CompNode cn0, CompNode cn1) {
        constexpr size_t size = 100;
        HostTensorND a(cn0, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp(cn0, {size}, dtype::Int32{}),
                tmp1(cn1, {size}, dtype::Int32{});
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    REQUIRE_CAMBRICON_DEVICE(2);
    auto cn0 = CompNode::load("cambricon0"), cn1 = CompNode::load("cambricon1");
    run_raw();
    run(cn0, cn1);
}
#endif
#endif  // MGB_CAMBRICON

#if MGB_ATLAS
TEST(TestCompNodeAtlas, D2DCopy) {
    auto run = [](CompNode cn) {
        constexpr size_t size = 10 * 1024 * 1024;
        HostTensorND a(cn, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp, tmp1;
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    auto cn = CompNode::load("atlas0");
    run(cn);
}
#endif

namespace {
class CompNodeDepedentObjectInst final : public CompNodeDepedentObject {
    int *m_dst, *m_timer;

    std::shared_ptr<void> on_comp_node_finalize() override {
        EXPECT_EQ(0, *m_dst);
        *m_dst = ++*m_timer;
        return {};
    }

public:
    CompNodeDepedentObjectInst(int* dst, int* timer)
            : m_dst{dst}, m_timer{timer} {}

    void chk() { check_not_finalized(); }
};
}  // anonymous namespace

TEST(TestCompNode, DepedentObjectList) {
    CompNode::finalize();
    for (int i = 0; i < 5; ++i) {
        // loop multiple times so that memory problems can be exposed more easily
        int ts[4] = {0}, timer = 0;
        auto make = [&](int i) {
            return std::make_unique<CompNodeDepedentObjectInst>(ts + i, &timer);
        };
        auto i0 = make(0), i1 = make(1), i2 = make(2), i3 = make(3);
        ASSERT_NO_THROW(i0->chk());
        ASSERT_NO_THROW(i1->chk());
        i1.reset();
        comp_node_detail::DepedentObjList::invoke_callback_and_clean();
        ASSERT_EQ(1, ts[3]);
        ASSERT_EQ(2, ts[2]);
        ASSERT_EQ(0, ts[1]);
        ASSERT_EQ(3, ts[0]);
        ASSERT_THROW(i0->chk(), InternalError);
    }
}

namespace {
template <typename tag>
class TestCPUCompSeqRec : public ::testing::Test {};
TYPED_TEST_CASE(TestCPUCompSeqRec, comp_node_test::seq_rec::test_types);

TYPED_TEST(TestCPUCompSeqRec, run) {
    comp_node_test::seq_rec::run<TypeParam>(CompNode::load("cpux"));
}

TYPED_TEST(TestCPUCompSeqRec, run_default_cpu) {
    comp_node_test::seq_rec::run<TypeParam>(CompNode::load("cpu:default"));
}

TYPED_TEST(TestCPUCompSeqRec, run_multi_thread) {
    auto cn = CompNode::load("multithread4:0");
    comp_node_test::seq_rec::run<TypeParam>(cn);
}

TYPED_TEST(TestCPUCompSeqRec, run_multi_thread_default) {
    auto cn = CompNode::load("multithread:default:4");
    comp_node_test::seq_rec::run<TypeParam>(cn);
}
}  // anonymous namespace

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine package ships with the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine actually has a GPU and that the driver is properly installed. If you would like to try deep learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
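
The same "use a GPU if one is present, otherwise fall back to the CPU" choice can be made at runtime from the C++ side, as the tests above do. The following is a minimal sketch in the style of this test file, reusing only helpers that already appear above (check_gpu_available from megbrain/test/helper.h, CompNode::load, and the "gpux"/"cpux" locator syntax); the test name itself is hypothetical and not part of the original file.

#include "megbrain/test/helper.h"

using namespace mgb;

// hypothetical example: pick a CUDA comp node when one is available,
// otherwise fall back to a CPU comp node
TEST(TestCompNodeDoc, GpuOrCpuFallback) {
    CompNode cn = check_gpu_available(1) ? CompNode::load("gpux")
                                         : CompNode::load("cpux");
    mgb_log("using comp node %s", cn.to_string().c_str());

    // fill a small host tensor and copy it onto the selected device
    HostTensorND host(cn, {4}, dtype::Int32{});
    auto ptr = host.ptr<int>();
    for (int i = 0; i < 4; ++i)
        ptr[i] = i;
    DeviceTensorND dev;
    dev.copy_from(host).sync();
}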