
comp_node.cpp 30 kB

/**
 * \file src/core/test/comp_node.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "./comp_node_helper.h"

#include "megbrain/comp_node_env.h"
#include "megbrain/opr/utility.h"
#include "megbrain/system.h"
#include "megbrain/test/helper.h"
#include "megbrain/utils/comp_node_sync_manager.h"
#include "megbrain/utils/timer.h"

#include <chrono>

#if MGB_HAVE_THREAD
#include <thread>
#endif

using namespace mgb;

TEST(TestCompNode, Parse) {
    using L = CompNode::Locator;
    using D = CompNode::DeviceType;
    auto make_lc = [](D t, int dev, int s) -> L { return {t, dev, {s}}; };

    ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpux:23"), make_lc(D::UNSPEC, -1, 23));
    ASSERT_EQ(L::parse("xpu2:23"), make_lc(D::UNSPEC, 2, 23));
    ASSERT_EQ(L::parse("xpu21:23"), make_lc(D::UNSPEC, 21, 23));

    ASSERT_EQ(L::parse("cpux"), make_lc(D::CPU, -1, 0));
    ASSERT_EQ(L::parse("cpux:23"), make_lc(D::CPU, -1, 23));
    ASSERT_EQ(L::parse("cpu2:23"), make_lc(D::CPU, 2, 23));
    ASSERT_EQ(L::parse("cpu21:23"), make_lc(D::CPU, 21, 23));

    ASSERT_EQ(L::parse("rocmx"), make_lc(D::ROCM, -1, 0));
    ASSERT_EQ(L::parse("rocm2"), make_lc(D::ROCM, 2, 0));
    ASSERT_EQ(L::parse("rocm2:3"), make_lc(D::ROCM, 2, 3));

    ASSERT_EQ(L::parse("cambriconx"), make_lc(D::CAMBRICON, -1, 0));
    ASSERT_EQ(L::parse("cambricon2"), make_lc(D::CAMBRICON, 2, 0));
    ASSERT_EQ(L::parse("cambricon2:3"), make_lc(D::CAMBRICON, 2, 3));

    ASSERT_EQ(L::parse("atlasx"), make_lc(D::ATLAS, -1, 0));
    ASSERT_EQ(L::parse("atlas2"), make_lc(D::ATLAS, 2, 0));
    ASSERT_EQ(L::parse("atlas2:3"), make_lc(D::ATLAS, 2, 3));

    ASSERT_EQ(L::parse("xpu"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpu23"), make_lc(D::UNSPEC, 23, 0));
    ASSERT_EQ(L::parse("xpu23:1"), make_lc(D::UNSPEC, 23, 1));

    ASSERT_EQ(L::parse("cpu:default"), make_lc(D::CPU, L::DEVICE_CPU_DEFAULT, 0));
    ASSERT_EQ(L::parse("multithread2:0"), make_lc(D::MULTITHREAD, 0, 2));
    ASSERT_EQ(L::parse("multithread1:3"), make_lc(D::MULTITHREAD, 3, 1));
    ASSERT_EQ(
            L::parse("multithread:default:2"),
            make_lc(D::MULTITHREAD, L::DEVICE_MULTITHREAD_DEFAULT, 2));

    ASSERT_THROW(L::parse("apu"), MegBrainError);
    ASSERT_THROW(L::parse("fpgbx"), MegBrainError);
    ASSERT_THROW(L::parse("cab0"), MegBrainError);
    ASSERT_THROW(L::parse("cpu"), MegBrainError);
    ASSERT_THROW(L::parse("cpu-1"), MegBrainError);
    ASSERT_THROW(L::parse("cpu0:"), MegBrainError);
    ASSERT_THROW(L::parse("cpu0:x"), MegBrainError);
    ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError);
    ASSERT_THROW(L::parse("rcom0"), MegBrainError);
    ASSERT_THROW(L::parse("cmabricon0"), MegBrainError);
    ASSERT_THROW(L::parse("atlast0"), MegBrainError);
    ASSERT_THROW(L::parse("multithread"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:default"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:default:0"), MegBrainError);
}

TEST(TestCompNode, SetDefaultDev) {
    REQUIRE_GPU(3);
    CompNode::finalize();
    using L = CompNode::Locator;
    auto orig_dt = L::parse("xpu").to_physical(),
         orig_gpu = L::parse("gpux").to_physical(),
         orig_cpu = L::parse("cpux").to_physical();
    constexpr auto CUDA = CompNode::DeviceType::CUDA;
    constexpr auto CPU = CompNode::DeviceType::CPU;
    L::set_unspec_device_type(CUDA);
    auto run = [](int device) {
        ASSERT_EQ(
                CompNode::load("xpu").locator(),
                L::parse("gpu" + std::to_string(device)));
    };
    auto run_cpu = [](int device) {
        ASSERT_EQ(
                CompNode::load("cpux").locator(),
                L::parse("cpu" + std::to_string(device)));
    };
    MGB_TRY {
        L::set_device_map(CUDA, -1, 2);
        run(2);
        L::set_device_map(CUDA, -1, 1);
        run(1);
        L::set_device_map(CPU, -1, 2);
        run_cpu(2);
        L::set_device_map(CPU, -1, 1);
        run_cpu(1);
    }
    MGB_FINALLY({
        L::set_unspec_device_type(orig_dt.type);
        L::set_device_map(CUDA, -1, orig_gpu.device);
        L::set_device_map(CPU, -1, orig_cpu.device);
    });
    CompNode::finalize();
}

TEST(TestCompNode, Load) {
    auto cn0 = CompNode::load("xpux"), cn1 = CompNode::load("cpux");
    ASSERT_EQ(CompNode::DeviceType::UNSPEC, cn0.locator_logical().type);
    ASSERT_EQ(CompNode::DeviceType::CPU, cn1.locator_logical().type);
    ASSERT_EQ(CompNode::load("cpux"), cn1);
    ASSERT_EQ(CompNode::load("xpux"), cn0);
    auto cnp = CompNode::load("cpu1"), cnq = CompNode::load("cpu2");
    ASSERT_EQ(CompNode::load("cpu1"), cnp);
    ASSERT_EQ(CompNode::load("cpu2"), cnq);
#if MGB_HAVE_THREAD
    ASSERT_NE(cnp, cnq);
#else
    ASSERT_EQ(cnp, cnq);
#endif
#if MGB_HAVE_THREAD
    auto cn_multi_thread0 = CompNode::load("multithread2:0");
    auto cn_multi_thread1 = CompNode::load("multithread2:1");
    ASSERT_EQ(CompNode::load("multithread2:0"), cn_multi_thread0);
    ASSERT_EQ(CompNode::load("multithread2:1"), cn_multi_thread1);
    ASSERT_NE(CompNode::load("multithread4:0"), cn_multi_thread0);
    ASSERT_NE(CompNode::load("multithread4:1"), cn_multi_thread1);
    auto cn_multi_default0 = CompNode::load("multithread:default:2");
    auto cn_multi_default1 = CompNode::load("multithread:default:4");
    ASSERT_EQ(CompNode::load("multithread:default:2"), cn_multi_default0);
    ASSERT_EQ(CompNode::load("multithread:default:4"), cn_multi_default1);
    ASSERT_NE(cn_multi_thread0, cn_multi_default1);
#endif
    ASSERT_EQ(CompNode::load("cpu1"), cnp);
    ASSERT_EQ(CompNode::load("cpu2"), cnq);
    if (check_gpu_available(2)) {
        auto cn2 = CompNode::load("gpux"), cn3 = CompNode::load("gpu1");
        ASSERT_EQ(CompNode::DeviceType::CUDA, cn2.locator_logical().type);
        ASSERT_NE(cn2, cn3);
        ASSERT_EQ(CompNode::load("gpux"), cn2);
        ASSERT_EQ(CompNode::load("gpu1"), cn3);
    }
#if MGB_ATLAS
    auto atlas0 = CompNode::load("atlas0");
    auto atlas1 = CompNode::load("atlas1");
    ASSERT_NE(atlas0, atlas1);
#endif
}

TEST(TestCompNode, FreeAfterFinalize) {
    CompNode::finalize();
    for (size_t i = 0; i < CompNode::NR_DEVICE_TYPE; ++i) {
        auto type = static_cast<CompNode::DeviceType>(i);
        if (!check_device_type_avaiable(type) || !CompNode::get_device_count(type))
            continue;
        auto cn = CompNode::load(CompNode::Locator{type, -1, {0}});
        auto ptr = cn.alloc_device(123);
        CompNode::finalize();
        cn.free_device(ptr);
    }
}

TEST(TestCompNode, CPUDispatchSync) {
    REQUIRE_THREAD();
    constexpr int LOOP = 160, tot_threads = 8;
    std::atomic_int started_threads{0};
    auto worker = [&](int* shared_cnt, CompNode dest) {
        int nr_call = 0;
        RNGxorshf rng{next_rand_seed()};
        auto func = [&rng, &nr_call, shared_cnt]() {
            ++nr_call;
            ++*shared_cnt;
            int volatile cnt = 0;
            while (rng() % 20)
                ++cnt;
        };
        auto&& env = CompNodeEnv::from_comp_node(dest).cpu_env();
        ++started_threads;
        while (started_threads.load() != tot_threads)
            ;
        for (int i = 0; i < LOOP; ++i) {
            env.dispatch(func);
            dest.sync();
            ASSERT_EQ(i + 1, nr_call);
        }
    };
    auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load("cpu1");
    int cnt0 = 0, cnt1 = 0;
    std::vector<std::thread> wk_threads;
    for (int i = 0; i < tot_threads / 2; ++i) {
        wk_threads.emplace_back(worker, &cnt0, cn0);
        wk_threads.emplace_back(worker, &cnt1, cn1);
    }
    for (auto&& i : wk_threads)
        i.join();
    ASSERT_EQ(LOOP * tot_threads / 2, cnt0);
    ASSERT_EQ(LOOP * tot_threads / 2, cnt1);
}

TEST(TestCompNodeCPU, CoreAffinity) {
    REQUIRE_THREAD();
    std::vector<size_t> data_v(2, 0);
    size_t data0, data1 = 0;
    auto empty_task = []() {};
    auto cn0 = CompNode::load("cpu:default"), cn1 = CompNode::load("cpu0"),
         cn2 = CompNode::load("multithread2:0");
    auto binding0 = [&](size_t) { data0 = 10; };
    CompNodeEnv::from_comp_node(cn0).cpu_env().set_affinity(binding0);
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(empty_task);
    cn0.sync();
    auto binding1 = [&](size_t) { data1 = 20; };
    CompNodeEnv::from_comp_node(cn1).cpu_env().set_affinity(binding1);
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(empty_task);
    cn1.sync();
    auto binding2 = [&](size_t thread_id) { data_v[thread_id] = 30; };
    auto temp_task = [](size_t, size_t) {};
    CompNodeEnv::from_comp_node(cn2).cpu_env().set_affinity(binding2);
    CompNodeEnv::from_comp_node(cn2).cpu_env().dispatch(temp_task, 40u);
    cn2.sync();
    ASSERT_EQ(data0, static_cast<size_t>(10));
    ASSERT_EQ(data1, static_cast<size_t>(20));
    ASSERT_EQ(data_v[0], static_cast<size_t>(30));
    ASSERT_EQ(data_v[1], static_cast<size_t>(30));
}

TEST(TestCompNode, CPU_MULTI_THREAD) {
    REQUIRE_THREAD();
    std::vector<int> source(100), dst0(100), dst1(100);
    for (int i = 0; i < 100; i++) {
        source[i] = i;
        dst0[i] = 0;
        dst1[i] = 0;
    }
    size_t total_task = 20;
    auto worker = [&](std::vector<int>& dst, CompNode dest) {
        auto func = [&](size_t index, size_t) {
            size_t sub_task = 100 / total_task;
            for (size_t i = index * sub_task; i < (index + 1) * sub_task; i++) {
                int sum = 0;
                for (size_t j = 0; j < i; j++) {
                    sum += source[j];
                }
                dst[i] = sum;
            }
        };
        auto&& env = CompNodeEnv::from_comp_node(dest).cpu_env();
        env.dispatch(std::move(func), total_task);
        dest.sync();
    };
    for (auto&& str : std::vector<std::string>{
                 "multithread2:0", "multithread4:0", "multithread:default:4"}) {
        auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load(str);
        std::thread wk_thread0{std::ref(worker), std::ref(dst0), std::ref(cn0)};
        std::thread wk_thread1{std::ref(worker), std::ref(dst1), std::ref(cn1)};
        wk_thread0.join();
        wk_thread1.join();
        for (int i = 0; i < 100; i++) {
            ASSERT_EQ(dst0[i], dst1[i]);
        }
    }
}

TEST(TestCompNodeCuda, MemNode) {
    REQUIRE_GPU(2);
    auto cn00 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1"),
         cn01 = CompNode::load("gpu0:1");
    ASSERT_EQ(cn00, CompNode::load("gpu0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}

TEST(TestCompNodeCuda, Uid) {
    REQUIRE_GPU(2);
    auto cn00 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1"),
         cn01 = CompNode::load("gpu0:0"), cn02 = CompNode::load("gpu0:2");
    ASSERT_EQ(cn00, CompNode::load("gpu0"));
    ASSERT_EQ(cn00.get_uid(), cn01.get_uid());
    ASSERT_NE(cn00.get_uid(), cn02.get_uid());
    ASSERT_NE(cn00.get_uid(), cn1.get_uid());
}

TEST(TestCompNodeCuda, set_prealloc_config) {
    CompNode::set_prealloc_config(
            1024, 1024, 256 * 1024 * 1024, 4, CompNode::DeviceType::CUDA);
}

#if MGB_ROCM
TEST(TestCompNodeROCm, MemNode) {
    REQUIRE_AMD_GPU(2);
    auto cn00 = CompNode::load("rocm0"), cn1 = CompNode::load("rocm1"),
         cn01 = CompNode::load("rocm0:1");
    ASSERT_EQ(cn00, CompNode::load("rocm0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

#if MGB_CAMBRICON
TEST(TestCompNodeCambricon, MemNode) {
    REQUIRE_CAMBRICON_DEVICE(2);
    auto cn00 = CompNode::load("cambricon0"), cn1 = CompNode::load("cambricon1"),
         cn01 = CompNode::load("cambricon0:1");
    ASSERT_EQ(cn00, CompNode::load("cambricon0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

#if MGB_ATLAS
TEST(TestCompNodeAtlas, MemNode) {
    auto cn00 = CompNode::load("atlas0"), cn1 = CompNode::load("atlas1"),
         cn01 = CompNode::load("atlas0:1");
    ASSERT_EQ(cn00, CompNode::load("atlas0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

TEST(TestCompNodeCPU, PhysicalDispatch) {
    constexpr int ID = 0x2a6453e0;
    using L = CompNode::Locator;
    constexpr auto DT = CompNode::DeviceType::CPU;
    L::set_device_map(DT, ID, 0);
    L::set_device_map(DT, ID + 1, 0);
    L::set_device_map(DT, ID + 2, 1);
    auto cn0 = CompNode::load({DT, ID, {0}}), cn1 = CompNode::load({DT, ID + 1, {0}}),
         cn2 = CompNode::load({DT, ID + 2, {0}});
#if MGB_HAVE_THREAD
    ASSERT_NE(cn0, cn1);
#else
    ASSERT_EQ(cn0, cn1);
#endif
    std::vector<std::thread::id> tids;
    std::mutex tids_mtx;
    auto get_tid = [&]() {
        MGB_LOCK_GUARD(tids_mtx);
        tids.push_back(std::this_thread::get_id());
    };
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(get_tid);
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(get_tid);
    CompNodeEnv::from_comp_node(cn2).cpu_env().dispatch(get_tid);
    CompNode::sync_all();
    std::unordered_set<std::thread::id> uniq_tids(tids.begin(), tids.end());
    ASSERT_EQ(3u, tids.size());
#if MGB_HAVE_THREAD
    ASSERT_EQ(2u, uniq_tids.size());
#else
    ASSERT_EQ(1u, uniq_tids.size());
#endif
}

TEST(TestCompNodeCPU, EventWait) {
    REQUIRE_THREAD();
    std::atomic_bool start = ATOMIC_VAR_INIT(false);
    auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load("cpu1");
    auto task0 = [&]() {
        while (!start)
            std::this_thread::yield();
    };
    auto event = cn0.create_event();
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(task0);
    event->record();
    cn1.device_wait_event(*event);
    bool succ = false;
    auto task1 = [&]() { succ = start; };
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(task1);
    using namespace std::literals;
    std::this_thread::sleep_for(50ms);
    ASSERT_FALSE(succ);
    start = true;
    CompNode::sync_all();
    ASSERT_TRUE(succ);
}

TEST(TestCompNodeCPU, EventRecOverwrite) {
    REQUIRE_THREAD();
    auto cn = CompNode::load("cpu0");
    auto dispatcher = CompNodeEnv::from_comp_node(cn).cpu_env().dispatcher.get();
    auto dispatch = [&](MegcoreCPUDispatcher::Task&& t) {
        dispatcher->dispatch(std::move(t));
    };
    auto ev = cn.create_event();
    auto wait_atomic = [](std::atomic_bool* var) {
        while (!var->load())
            std::this_thread::yield();
    };
    auto set_atomic = [](std::atomic_bool* var) { var->store(true); };
    std::atomic_bool s0 = ATOMIC_VAR_INIT(false), s1 = ATOMIC_VAR_INIT(false),
                     t0 = ATOMIC_VAR_INIT(false), t1 = ATOMIC_VAR_INIT(false),
                     t2 = ATOMIC_VAR_INIT(false);
    dispatch(std::bind(set_atomic, &t0));
    dispatch(std::bind(wait_atomic, &s0));
    ev->record();
    dispatch(std::bind(set_atomic, &t1));
    dispatch(std::bind(wait_atomic, &s1));
    ev->record();
    dispatch(std::bind(set_atomic, &t2));
    wait_atomic(&t0);
    ASSERT_FALSE(ev->finished());
    set_atomic(&s0);
    wait_atomic(&t1);
    ASSERT_FALSE(ev->finished());
    set_atomic(&s1);
    wait_atomic(&t2);
    ASSERT_TRUE(ev->finished());
}

namespace {
void test_peer_copy_from_device(const char* comp_node) {
    REQUIRE_THREAD();
    auto cn_gpu = CompNode::load(comp_node);
    auto cn_cpu = CompNode::load("cpux");
    HostTensorGenerator<> gen;
    auto a = gen({20, 3, 112, 112});
    auto b = gen({20, 3, 112, 112});
    auto c = gen({20, 3, 112, 112});
    DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu};
    dev_a.copy_from(*a).sync();
    dev_b.copy_from(*b).sync();
    dev_c.copy_from(*c).sync();
    auto wait_event = cn_gpu.create_event();
    opr::Sleep::sleep(cn_gpu, 0.1);
    dev_a.copy_from(dev_c);
    wait_event->record();
    cn_cpu.device_wait_event(*wait_event);
    dev_b.copy_from(dev_a);
    dev_b.sync();
    HostTensorND result;
    result.copy_from(dev_b);
    CompNode::sync_all();
    MGB_ASSERT_TENSOR_EQ(result, *c);
}
}  // namespace

TEST(TestCompNodeCPU, PeerCopyFromCUDA) {
    REQUIRE_GPU(1);
    test_peer_copy_from_device("gpux");
}

TEST(TestCompNodeCPU, PeerCopyFromROCm) {
    REQUIRE_AMD_GPU(1);
    test_peer_copy_from_device("rocmx");
}

#if MGB_CAMBRICON
TEST(TestCompNodeCPU, PeerCopyFromCambricon) {
    REQUIRE_CAMBRICON_DEVICE(1);
    REQUIRE_THREAD();
    auto cn_gpu = CompNode::load("cambriconx");
    auto cn_cpu = CompNode::load("cpux");
    HostTensorGenerator<> gen;
    auto a = gen({20, 3, 112, 112});
    auto b = gen({20, 3, 112, 112});
    auto c = gen({20, 3, 112, 112});
    DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu};
    dev_a.copy_from(*a).sync();
    dev_b.copy_from(*b).sync();
    dev_c.copy_from(*c).sync();
    auto wait_event = cn_gpu.create_event();
    dev_a.copy_from(dev_c);
    wait_event->record();
    cn_cpu.device_wait_event(*wait_event);
    dev_b.copy_from(dev_a);
    dev_b.sync();
    HostTensorND result;
    result.copy_from(dev_b);
    CompNode::sync_all();
    MGB_ASSERT_TENSOR_EQ(result, *c);
}
#endif

TEST(TestCompNodeSyncManager, HostWait) {
    REQUIRE_THREAD();
    CompNodeSyncManager mgr(CompNode::load("xpu0"));
    auto run_set = [&]() {
        using namespace std::literals;
        std::this_thread::sleep_for(200ms);
        mgr.set_ready();
        mgb_log_debug("set_ready() called");
    };
    for (int run = 0; run < 2; ++run) {
        std::thread th_run_set(run_set);
        RealTimer timer;
        mgr.clear_waiter_record();
        ASSERT_THROW(mgr.busy_wait_set_ready(), MegBrainError);
        mgr.add_waiter_record(false);
        mgr.add_waiter_record(false);
        mgr.busy_wait_set_ready();
        EXPECT_GE(timer.get_secs(), 0.1);
        timer.reset();
        mgr.busy_wait_set_ready();
        EXPECT_LE(timer.get_secs(), 0.001);
        th_run_set.join();
    }
}

TEST(TestCompNodeSyncManager, DeviceWait) {
    REQUIRE_THREAD();
    auto cns = load_multiple_xpus(3);
    auto cn0 = cns[0], cn1 = cns[1], cn2 = cns[2];
    CompNodeSyncManager mgr(cn0);
    using Event = CompNode::Event;
    auto ev_cn1 = cn1.create_event(),
         ev_cn2_begin = cn2.create_event(Event::NEED_TIMER),
         ev_cn2_end = cn2.create_event(Event::NEED_TIMER);
    for (int run = 0; run < 2; ++run) {
        RealTimer timer;
        mgr.clear_waiter_record();
        ASSERT_THROW(mgr.busy_wait_set_ready_and_get_event(), MegBrainError);
        mgr.add_waiter_record(true);
        mgr.add_waiter_record(true);
        opr::Sleep::sleep(cn0, 0.13);
        mgr.set_ready();
        ev_cn2_begin->record();
        cn1.device_wait_event(mgr.busy_wait_set_ready_and_get_event());
        cn2.device_wait_event(mgr.busy_wait_set_ready_and_get_event());
        ev_cn1->record();
        ev_cn2_end->record();
        EXPECT_LE(timer.get_secs(), 0.06);
        ev_cn1->host_wait();
        EXPECT_GE(timer.get_secs(), 0.1);
        ev_cn2_end->host_wait();
        auto ev2_t = ev_cn2_begin->elapsed_time_until(*ev_cn2_end);
        EXPECT_GE(ev2_t, 0.1);
    }
}

TEST(TestCompNodeSyncManager, DeviceWaitCross) {
    REQUIRE_THREAD();
    auto cn0 = CompNode::load("xpu0:0"), cn1 = CompNode::load("xpu0:1");
    auto ev_cn0 = cn0.create_event(), ev_cn1 = cn1.create_event();
    RealTimer timer;
    // the cross wait looks like a deadlock, but it is guaranteed to finish here
    // because of the timing enforced by the sleeps
    ev_cn0->record();
    cn1.device_wait_event(*ev_cn0);
    ev_cn1->record();
    opr::Sleep::sleep(cn0, 0.1);
    cn0.device_wait_event(*ev_cn1);
    ev_cn0->record();
    cn1.device_wait_event(*ev_cn0);
    cn0.sync();
    cn1.sync();
    // the sleep kernel on CUDA is easily affected by GPU frequency changes, so we
    // just print a warning log instead of asserting; see XPU-226 for details
    auto used = timer.get_secs();
    if (used <= 0.1 || used >= 0.2) {
        mgb_log_warn("expect time between [%f, %f], got %f", 0.1, 0.2, used);
    }
}

#if !MGB_HAVE_THREAD
TEST(TestCompNodeSyncManager, DeviceWaitWithoutThread) {
    auto cn = CompNode::load("cpu:default");
    CompNodeSyncManager mgr(cn);
    mgr.add_waiter_record(true);
    ASSERT_ANY_THROW(mgr.busy_wait_set_ready());
    mgr.set_ready();
    EXPECT_TRUE(mgr.busy_wait_set_ready_and_get_event().finished());
}
#endif

TEST(TestCompNode, MultipleLoad) {
    auto run = [](CompNode cn) {
        HostTensorND a(cn, {23}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (int i = 0; i < 23; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp;
        tmp.copy_from(a);
        b.copy_from(tmp).sync();
        auto pb = b.ptr<int>();
        for (int i = 0; i < 23; ++i) {
            ASSERT_EQ(i, pb[i]);
        }
        CompNode::finalize();
    };
    for (size_t i = 1; i < CompNode::NR_DEVICE_TYPE; ++i) {
        auto dt = static_cast<CompNode::DeviceType>(i);
        if (!check_device_type_avaiable(dt))
            continue;
        if (CompNode::get_device_count(dt)) {
            auto cn = CompNode::load({dt, 0, {0}});
            mgb_log("comp node %s is available", cn.to_string().c_str());
            run(cn);
            cn = CompNode::load({dt, 0, {0}});
            run(cn);
        }
    }
}

#if MGB_CAMBRICON
TEST(TestCompNodeCambricon, D2DCopy) {
    auto run = [](CompNode cn) {
        constexpr size_t size = 100 * 1024 * 1024;
        HostTensorND a(cn, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp, tmp1;
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    run(cn);
    cn = CompNode::load("cambricon1");
    run(cn);
}

// peer copy between different cambricon devices is not correct now, so this
// testcase is disabled
#if 0
TEST(TestCompNodeCambricon, P2PCopy) {
    auto run_raw = []() {
        int v0 = 0, v1 = 1;
        cnrtDev_t dev0, dev1;
        MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev0, 0));
        MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev1, 1));
        int *dp0, *dp1;
        MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev0));
        MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp0), sizeof(int)));
        MGB_CNRT_CHECK(
                cnrtMemcpy(dp0, &v0, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV));
        MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev1));
        MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp1), sizeof(int)));
        MGB_CNRT_CHECK(
                cnrtMemcpy(dp1, &v1, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV));
        unsigned int can = 0;
        MGB_CNRT_CHECK(cnrtGetPeerAccessibility(&can, 0, 1));
        printf("can = %s\n", can ? "TRUE" : "FALSE");
        if (can) {
            MGB_CNRT_CHECK(cnrtMemcpyPeer(dp1, 1, dp0, 0, sizeof(int)));
            int get;
            MGB_CNRT_CHECK(cnrtMemcpy(&get, dp1, sizeof(int),
                    CNRT_MEM_TRANS_DIR_DEV2HOST));
            ASSERT_EQ(0, get);
        }
    };
    auto run = [](CompNode cn0, CompNode cn1) {
        constexpr size_t size = 100;
        HostTensorND a(cn0, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp(cn0, {size}, dtype::Int32{}),
                tmp1(cn1, {size}, dtype::Int32{});
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    REQUIRE_CAMBRICON_DEVICE(2);
    auto cn0 = CompNode::load("cambricon0"), cn1 = CompNode::load("cambricon1");
    run_raw();
    run(cn0, cn1);
}
#endif
#endif  // MGB_CAMBRICON

#if MGB_ATLAS
TEST(TestCompNodeAtlas, D2DCopy) {
    auto run = [](CompNode cn) {
        constexpr size_t size = 10 * 1024 * 1024;
        HostTensorND a(cn, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp, tmp1;
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    auto cn = CompNode::load("atlas0");
    run(cn);
}
#endif

namespace {
class CompNodeDepedentObjectInst final : public CompNodeDepedentObject {
    int *m_dst, *m_timer;

    std::shared_ptr<void> on_comp_node_finalize() override {
        EXPECT_EQ(0, *m_dst);
        *m_dst = ++*m_timer;
        return {};
    }

public:
    CompNodeDepedentObjectInst(int* dst, int* timer) : m_dst{dst}, m_timer{timer} {}

    void chk() { check_not_finalized(); }
};
}  // anonymous namespace

TEST(TestCompNode, DepedentObjectList) {
    CompNode::finalize();
    for (int i = 0; i < 5; ++i) {
        // loop multiple times so that memory problems are exposed more easily
        int ts[4] = {0}, timer = 0;
        auto make = [&](int i) {
            return std::make_unique<CompNodeDepedentObjectInst>(ts + i, &timer);
        };
        auto i0 = make(0), i1 = make(1), i2 = make(2), i3 = make(3);
        ASSERT_NO_THROW(i0->chk());
        ASSERT_NO_THROW(i1->chk());
        i1.reset();
        comp_node_detail::DepedentObjList::invoke_callback_and_clean();
        ASSERT_EQ(1, ts[3]);
        ASSERT_EQ(2, ts[2]);
        ASSERT_EQ(0, ts[1]);
        ASSERT_EQ(3, ts[0]);
        ASSERT_THROW(i0->chk(), InternalError);
    }
}

namespace {
template <typename tag>
class TestCPUCompSeqRec : public ::testing::Test {};

TYPED_TEST_CASE(TestCPUCompSeqRec, comp_node_test::seq_rec::test_types);

TYPED_TEST(TestCPUCompSeqRec, run) {
    comp_node_test::seq_rec::run<TypeParam>(CompNode::load("cpux"));
}

TYPED_TEST(TestCPUCompSeqRec, run_default_cpu) {
    comp_node_test::seq_rec::run<TypeParam>(CompNode::load("cpu:default"));
}

TYPED_TEST(TestCPUCompSeqRec, run_multi_thread) {
    auto cn = CompNode::load("multithread4:0");
    comp_node_test::seq_rec::run<TypeParam>(cn);
}

TYPED_TEST(TestCPUCompSeqRec, run_multi_thread_default) {
    auto cn = CompNode::load("multithread:default:4");
    comp_node_test::seq_rec::run<TypeParam>(cn);
}
}  // anonymous namespace

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"

TEST(TestCPUCompSeqRec, run_dyn_ptr) {
    CompNode cn = CompNode::load("cpux");
    HostTensorGenerator<> gen;
    auto host_x0 = gen({4, 1}, cn), host_y0 = gen({4, 1}, cn),
         host_z0 = gen({4, 1}, cn);
    auto host_x1 = gen({4, 1}, cn), host_y1 = gen({4, 1}, cn),
         host_z1 = gen({4, 1}, cn);
    auto dev_x0 = std::make_shared<DeviceTensorND>(cn);
    auto dev_y0 = std::make_shared<DeviceTensorND>(cn);
    auto dev_z0 = std::make_shared<DeviceTensorND>(cn);
    auto dev_x1 = std::make_shared<DeviceTensorND>(cn);
    auto dev_y1 = std::make_shared<DeviceTensorND>(cn);
    auto dev_z1 = std::make_shared<DeviceTensorND>(cn);
    (*dev_x0).comp_node(cn).copy_from(*host_x0).sync();
    (*dev_y0).comp_node(cn).copy_from(*host_y0).sync();
    (*dev_z0).comp_node(cn).copy_from(*host_z0).sync();
    (*dev_x1).comp_node(cn).copy_from(*host_x1).sync();
    (*dev_y1).comp_node(cn).copy_from(*host_y1).sync();
    (*dev_z1).comp_node(cn).copy_from(*host_z1).sync();
    auto check = [&]() {
        HostTensorND ret(CompNode::load("cpux"), host_x0->shape());
        auto px = host_x0->ptr<float>(), py = host_y0->ptr<float>(),
             pz = host_z0->ptr<float>(), pw = ret.ptr<float>();
        auto sz0 = host_x0->shape()[0], sz1 = host_x0->shape()[1];
        for (size_t i = 0; i < sz0; ++i) {
            for (size_t j = 0; j < sz1; ++j) {
                pw[i * sz1 + j] = px[i * sz1 + j] * py[i * sz1 + j] + pz[i * sz1 + j];
            }
        }
        return ret;
    };
    auto graph = ComputingGraph::make();
    // test record on first run
    graph->options().var_sanity_check_first_run = false;
    graph->options().graph_opt_level = 0;
    graph->options().comp_node_seq_record_level = 1;
    graph->options().fake_next_exec = true;
    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dev_x0),
         y = opr::VolatileSharedDeviceTensor::make(*graph, dev_y0),
         z = opr::VolatileSharedDeviceTensor::make(*graph, dev_z0),
         w = opr::Elemwise::make({x, y, z}, opr::Elemwise::Mode::FUSE_MUL_ADD3);
    HostTensorND host_w;
    auto func = graph->compile({{w, [&host_w](DeviceTensorND& d) {
                                     host_w = mgb::HostTensorND::make_proxy(d);
                                 }}});
    func->execute();
    for (int i = 0; i < 4; ++i) {
        if (i == 2) {
            *host_x0 = *host_x1;
            *host_y0 = *host_y1;
            *host_z0 = *host_z1;
            dev_x0->only_reset_raw_storage(dev_x1->storage());
            dev_y0->only_reset_raw_storage(dev_y1->storage());
            dev_z0->only_reset_raw_storage(dev_z1->storage());
        }
        func->execute();
        auto expect = check();
        MGB_ASSERT_TENSOR_EQ(expect, host_w) << "iter " << i;
    }
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine package already bundles the CUDA environment needed to run code on GPUs, so there are no separate CPU and GPU builds. To run GPU programs, make sure the machine has GPU hardware and that the driver is properly installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
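
At the CompNode level exercised by the tests above, the same "run on GPU only if one is present" idea can be expressed by probing the device count before choosing a device. The following is a minimal, illustrative sketch, not part of the test file; it assumes CompNode is declared in "megbrain/comp_node.h" and otherwise uses only calls that already appear in the tests (CompNode::get_device_count and CompNode::load).

// Minimal sketch: pick a CUDA comp node when one is visible, otherwise fall back to CPU.
// Assumption: CompNode is declared in "megbrain/comp_node.h".
#include "megbrain/comp_node.h"

using namespace mgb;

CompNode pick_comp_node() {
    // Ask the runtime how many CUDA devices it can see; 0 means CPU-only.
    if (CompNode::get_device_count(CompNode::DeviceType::CUDA) > 0) {
        return CompNode::load("gpux");  // "gpux" resolves to the default CUDA device
    }
    return CompNode::load("cpux");  // "cpux" resolves to the default CPU device
}

This mirrors how the tests guard GPU-only cases with REQUIRE_GPU / check_gpu_available, so the suite still runs on machines without CUDA devices.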