comp_node.cpp

#include "./comp_node_helper.h"

#include "megbrain/comp_node_env.h"
#include "megbrain/opr/utility.h"
#include "megbrain/system.h"
#include "megbrain/test/helper.h"
#include "megbrain/utils/comp_node_sync_manager.h"
#include "megbrain/utils/timer.h"

#include <chrono>
#if MGB_HAVE_THREAD
#include <thread>
#endif

using namespace mgb;

TEST(TestCompNode, Parse) {
    using L = CompNode::Locator;
    using D = CompNode::DeviceType;
    auto make_lc = [](D t, int dev, int s) -> L { return {t, dev, {s}}; };

    ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpux:23"), make_lc(D::UNSPEC, -1, 23));
    ASSERT_EQ(L::parse("xpu2:23"), make_lc(D::UNSPEC, 2, 23));
    ASSERT_EQ(L::parse("xpu21:23"), make_lc(D::UNSPEC, 21, 23));

    ASSERT_EQ(L::parse("cpux"), make_lc(D::CPU, -1, 0));
    ASSERT_EQ(L::parse("cpux:23"), make_lc(D::CPU, -1, 23));
    ASSERT_EQ(L::parse("cpu2:23"), make_lc(D::CPU, 2, 23));
    ASSERT_EQ(L::parse("cpu21:23"), make_lc(D::CPU, 21, 23));

    ASSERT_EQ(L::parse("rocmx"), make_lc(D::ROCM, -1, 0));
    ASSERT_EQ(L::parse("rocm2"), make_lc(D::ROCM, 2, 0));
    ASSERT_EQ(L::parse("rocm2:3"), make_lc(D::ROCM, 2, 3));

    ASSERT_EQ(L::parse("cambriconx"), make_lc(D::CAMBRICON, -1, 0));
    ASSERT_EQ(L::parse("cambricon2"), make_lc(D::CAMBRICON, 2, 0));
    ASSERT_EQ(L::parse("cambricon2:3"), make_lc(D::CAMBRICON, 2, 3));

    ASSERT_EQ(L::parse("atlasx"), make_lc(D::ATLAS, -1, 0));
    ASSERT_EQ(L::parse("atlas2"), make_lc(D::ATLAS, 2, 0));
    ASSERT_EQ(L::parse("atlas2:3"), make_lc(D::ATLAS, 2, 3));

    ASSERT_EQ(L::parse("xpu"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpux"), make_lc(D::UNSPEC, -1, 0));
    ASSERT_EQ(L::parse("xpu23"), make_lc(D::UNSPEC, 23, 0));
    ASSERT_EQ(L::parse("xpu23:1"), make_lc(D::UNSPEC, 23, 1));

    ASSERT_EQ(L::parse("cpu:default"), make_lc(D::CPU, L::DEVICE_CPU_DEFAULT, 0));
    ASSERT_EQ(L::parse("multithread2:0"), make_lc(D::MULTITHREAD, 0, 2));
    ASSERT_EQ(L::parse("multithread1:3"), make_lc(D::MULTITHREAD, 3, 1));
    ASSERT_EQ(
            L::parse("multithread:default:2"),
            make_lc(D::MULTITHREAD, L::DEVICE_MULTITHREAD_DEFAULT, 2));

    ASSERT_THROW(L::parse("apu"), MegBrainError);
    ASSERT_THROW(L::parse("fpgbx"), MegBrainError);
    ASSERT_THROW(L::parse("cab0"), MegBrainError);
    ASSERT_THROW(L::parse("cpu"), MegBrainError);
    ASSERT_THROW(L::parse("cpu-1"), MegBrainError);
    ASSERT_THROW(L::parse("cpu0:"), MegBrainError);
    ASSERT_THROW(L::parse("cpu0:x"), MegBrainError);
    ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError);
    ASSERT_THROW(L::parse("rcom0"), MegBrainError);
    ASSERT_THROW(L::parse("cmabricon0"), MegBrainError);
    ASSERT_THROW(L::parse("atlast0"), MegBrainError);
    ASSERT_THROW(L::parse("multithread"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:default"), MegBrainError);
    ASSERT_THROW(L::parse("multithread1:default:0"), MegBrainError);
}

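// check that set_unspec_device_type() and set_device_map() redirect the
// default "xpu" / "cpux" locators, and that the original mapping can be
// restored afterwards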
TEST(TestCompNode, SetDefaultDev) {
    REQUIRE_GPU(3);
    CompNode::finalize();

    using L = CompNode::Locator;
    auto orig_dt = L::parse("xpu").to_physical(),
         orig_gpu = L::parse("gpux").to_physical(),
         orig_cpu = L::parse("cpux").to_physical();
    constexpr auto CUDA = CompNode::DeviceType::CUDA;
    constexpr auto CPU = CompNode::DeviceType::CPU;
    L::set_unspec_device_type(CUDA);
    auto run = [](int device) {
        ASSERT_EQ(
                CompNode::load("xpu").locator(),
                L::parse("gpu" + std::to_string(device)));
    };
    auto run_cpu = [](int device) {
        ASSERT_EQ(
                CompNode::load("cpux").locator(),
                L::parse("cpu" + std::to_string(device)));
    };
    MGB_TRY {
        L::set_device_map(CUDA, -1, 2);
        run(2);
        L::set_device_map(CUDA, -1, 1);
        run(1);
        L::set_device_map(CPU, -1, 2);
        run_cpu(2);
        L::set_device_map(CPU, -1, 1);
        run_cpu(1);
    }
    MGB_FINALLY({
        L::set_unspec_device_type(orig_dt.type);
        L::set_device_map(CUDA, -1, orig_gpu.device);
        L::set_device_map(CPU, -1, orig_cpu.device);
    });
    CompNode::finalize();
}

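// loading the same locator twice must return the same comp node, while
// different devices or multithread configurations must compare unequal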
TEST(TestCompNode, Load) {
    auto cn0 = CompNode::load("xpux"), cn1 = CompNode::load("cpux");
    ASSERT_EQ(CompNode::DeviceType::UNSPEC, cn0.locator_logical().type);
    ASSERT_EQ(CompNode::DeviceType::CPU, cn1.locator_logical().type);
    ASSERT_EQ(CompNode::load("cpux"), cn1);
    ASSERT_EQ(CompNode::load("xpux"), cn0);
    auto cnp = CompNode::load("cpu1"), cnq = CompNode::load("cpu2");
    ASSERT_EQ(CompNode::load("cpu1"), cnp);
    ASSERT_EQ(CompNode::load("cpu2"), cnq);
#if MGB_HAVE_THREAD
    ASSERT_NE(cnp, cnq);
#else
    ASSERT_EQ(cnp, cnq);
#endif

#if MGB_HAVE_THREAD
    auto cn_multi_thread0 = CompNode::load("multithread2:0");
    auto cn_multi_thread1 = CompNode::load("multithread2:1");
    ASSERT_EQ(CompNode::load("multithread2:0"), cn_multi_thread0);
    ASSERT_EQ(CompNode::load("multithread2:1"), cn_multi_thread1);
    ASSERT_NE(CompNode::load("multithread4:0"), cn_multi_thread0);
    ASSERT_NE(CompNode::load("multithread4:1"), cn_multi_thread1);

    auto cn_multi_default0 = CompNode::load("multithread:default:2");
    auto cn_multi_default1 = CompNode::load("multithread:default:4");
    ASSERT_EQ(CompNode::load("multithread:default:2"), cn_multi_default0);
    ASSERT_EQ(CompNode::load("multithread:default:4"), cn_multi_default1);
    ASSERT_NE(cn_multi_thread0, cn_multi_default1);
#endif

    ASSERT_EQ(CompNode::load("cpu1"), cnp);
    ASSERT_EQ(CompNode::load("cpu2"), cnq);

    if (check_gpu_available(2)) {
        auto cn2 = CompNode::load("gpux"), cn3 = CompNode::load("gpu1");
        ASSERT_EQ(CompNode::DeviceType::CUDA, cn2.locator_logical().type);
        ASSERT_NE(cn2, cn3);
        ASSERT_EQ(CompNode::load("gpux"), cn2);
        ASSERT_EQ(CompNode::load("gpu1"), cn3);
    }

#if MGB_ATLAS
    auto atlas0 = CompNode::load("atlas0");
    auto atlas1 = CompNode::load("atlas1");
    ASSERT_NE(atlas0, atlas1);
#endif
}

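// device memory allocated before CompNode::finalize() must still be safe to
// free afterwards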
TEST(TestCompNode, FreeAfterFinalize) {
    CompNode::finalize();
    for (size_t i = 0; i < CompNode::NR_DEVICE_TYPE; ++i) {
        auto type = static_cast<CompNode::DeviceType>(i);
        if (!check_device_type_avaiable(type) || !CompNode::get_device_count(type))
            continue;
        auto cn = CompNode::load(CompNode::Locator{type, -1, {0}});
        auto ptr = cn.alloc_device(123);
        CompNode::finalize();
        cn.free_device(ptr);
    }
}

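// dispatch tasks onto two cpu comp nodes from several host threads and check
// that sync() waits for every dispatched task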
TEST(TestCompNode, CPUDispatchSync) {
    REQUIRE_THREAD();
    constexpr int LOOP = 160, tot_threads = 8;
    std::atomic_int started_threads{0};
    auto worker = [&](int* shared_cnt, CompNode dest) {
        int nr_call = 0;
        RNGxorshf rng{next_rand_seed()};
        auto func = [&rng, &nr_call, shared_cnt]() {
            ++nr_call;
            ++*shared_cnt;
            int volatile cnt = 0;
            while (rng() % 20)
                ++cnt;
        };
        auto&& env = CompNodeEnv::from_comp_node(dest).cpu_env();
        ++started_threads;
        while (started_threads.load() != tot_threads)
            ;
        for (int i = 0; i < LOOP; ++i) {
            env.dispatch(func);
            dest.sync();
            ASSERT_EQ(i + 1, nr_call);
        }
    };

    auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load("cpu1");
    int cnt0 = 0, cnt1 = 0;
    std::vector<std::thread> wk_threads;
    for (int i = 0; i < tot_threads / 2; ++i) {
        wk_threads.emplace_back(worker, &cnt0, cn0);
        wk_threads.emplace_back(worker, &cnt1, cn1);
    }
    for (auto&& i : wk_threads)
        i.join();

    ASSERT_EQ(LOOP * tot_threads / 2, cnt0);
    ASSERT_EQ(LOOP * tot_threads / 2, cnt1);
}

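// the callback installed via set_affinity() must be invoked for every worker
// thread of the target comp node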
TEST(TestCompNodeCPU, CoreAffinity) {
    REQUIRE_THREAD();
    std::vector<size_t> data_v(2, 0);
    size_t data0, data1 = 0;
    auto empty_task = []() {};
    auto cn0 = CompNode::load("cpu:default"), cn1 = CompNode::load("cpu0"),
         cn2 = CompNode::load("multithread2:0");
    auto binding0 = [&](size_t) { data0 = 10; };
    CompNodeEnv::from_comp_node(cn0).cpu_env().set_affinity(binding0);
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(empty_task);
    cn0.sync();

    auto binding1 = [&](size_t) { data1 = 20; };
    CompNodeEnv::from_comp_node(cn1).cpu_env().set_affinity(binding1);
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(empty_task);
    cn1.sync();

    auto binding2 = [&](size_t thread_id) { data_v[thread_id] = 30; };
    auto temp_task = [](size_t, size_t) {};
    CompNodeEnv::from_comp_node(cn2).cpu_env().set_affinity(binding2);
    CompNodeEnv::from_comp_node(cn2).cpu_env().dispatch(temp_task, 40u);
    cn2.sync();

    ASSERT_EQ(data0, static_cast<size_t>(10));
    ASSERT_EQ(data1, static_cast<size_t>(20));
    ASSERT_EQ(data_v[0], static_cast<size_t>(30));
    ASSERT_EQ(data_v[1], static_cast<size_t>(30));
}

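// the same prefix-sum workload must give identical results on cpu0 and on
// multithread comp nodes with different thread counts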
TEST(TestCompNode, CPU_MULTI_THREAD) {
    REQUIRE_THREAD();
    std::vector<int> source(100), dst0(100), dst1(100);
    for (int i = 0; i < 100; i++) {
        source[i] = i;
        dst0[i] = 0;
        dst1[i] = 0;
    }
    size_t total_task = 20;
    auto worker = [&](std::vector<int>& dst, CompNode dest) {
        auto func = [&](size_t index, size_t) {
            size_t sub_task = 100 / total_task;
            for (size_t i = index * sub_task; i < (index + 1) * sub_task; i++) {
                int sum = 0;
                for (size_t j = 0; j < i; j++) {
                    sum += source[j];
                }
                dst[i] = sum;
            }
        };
        auto&& env = CompNodeEnv::from_comp_node(dest).cpu_env();
        env.dispatch(std::move(func), total_task);
        dest.sync();
    };

    for (auto&& str : std::vector<std::string>{
                 "multithread2:0", "multithread4:0", "multithread:default:4"}) {
        auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load(str);
        std::thread wk_thread0{std::ref(worker), std::ref(dst0), std::ref(cn0)};
        std::thread wk_thread1{std::ref(worker), std::ref(dst1), std::ref(cn1)};
        wk_thread0.join();
        wk_thread1.join();
        for (int i = 0; i < 100; i++) {
            ASSERT_EQ(dst0[i], dst1[i]);
        }
    }
}

TEST(TestCompNodeCuda, MemNode) {
    REQUIRE_GPU(2);

    auto cn00 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1"),
         cn01 = CompNode::load("gpu0:1");
    ASSERT_EQ(cn00, CompNode::load("gpu0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}

TEST(TestCompNodeCuda, Uid) {
    REQUIRE_GPU(2);

    auto cn00 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1"),
         cn01 = CompNode::load("gpu0:0"), cn02 = CompNode::load("gpu0:2");
    ASSERT_EQ(cn00, CompNode::load("gpu0"));
    ASSERT_EQ(cn00.get_uid(), cn01.get_uid());
    ASSERT_NE(cn00.get_uid(), cn02.get_uid());
    ASSERT_NE(cn00.get_uid(), cn1.get_uid());
}

TEST(TestCompNodeCuda, set_prealloc_config) {
    CompNode::set_prealloc_config(
            1024, 1024, 256 * 1024 * 1024, 4, CompNode::DeviceType::CUDA);
}

#if MGB_ROCM
TEST(TestCompNodeROCm, MemNode) {
    REQUIRE_AMD_GPU(2);

    auto cn00 = CompNode::load("rocm0"), cn1 = CompNode::load("rocm1"),
         cn01 = CompNode::load("rocm0:1");
    ASSERT_EQ(cn00, CompNode::load("rocm0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

#if MGB_CAMBRICON
TEST(TestCompNodeCambricon, MemNode) {
    REQUIRE_CAMBRICON_DEVICE(2);

    auto cn00 = CompNode::load("cambricon0"), cn1 = CompNode::load("cambricon1"),
         cn01 = CompNode::load("cambricon0:1");
    ASSERT_EQ(cn00, CompNode::load("cambricon0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

#if MGB_ATLAS
TEST(TestCompNodeAtlas, MemNode) {
    auto cn00 = CompNode::load("atlas0"), cn1 = CompNode::load("atlas1"),
         cn01 = CompNode::load("atlas0:1");
    ASSERT_EQ(cn00, CompNode::load("atlas0"));
    ASSERT_EQ(cn00.mem_node(), cn01.mem_node());
    ASSERT_NE(cn00.mem_node(), cn1.mem_node());
}
#endif

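// map several logical cpu devices onto two physical devices and check that
// dispatched tasks run on the expected number of distinct worker threads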
TEST(TestCompNodeCPU, PhysicalDispatch) {
    constexpr int ID = 0x2a6453e0;
    using L = CompNode::Locator;
    constexpr auto DT = CompNode::DeviceType::CPU;
    L::set_device_map(DT, ID, 0);
    L::set_device_map(DT, ID + 1, 0);
    L::set_device_map(DT, ID + 2, 1);
    auto cn0 = CompNode::load({DT, ID, {0}}), cn1 = CompNode::load({DT, ID + 1, {0}}),
         cn2 = CompNode::load({DT, ID + 2, {0}});
#if MGB_HAVE_THREAD
    ASSERT_NE(cn0, cn1);
#else
    ASSERT_EQ(cn0, cn1);
#endif

    std::vector<std::thread::id> tids;
    std::mutex tids_mtx;
    auto get_tid = [&]() {
        MGB_LOCK_GUARD(tids_mtx);
        tids.push_back(std::this_thread::get_id());
    };
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(get_tid);
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(get_tid);
    CompNodeEnv::from_comp_node(cn2).cpu_env().dispatch(get_tid);
    CompNode::sync_all();

    std::unordered_set<std::thread::id> uniq_tids(tids.begin(), tids.end());
    ASSERT_EQ(3u, tids.size());
#if MGB_HAVE_THREAD
    ASSERT_EQ(2u, uniq_tids.size());
#else
    ASSERT_EQ(1u, uniq_tids.size());
#endif
}

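// a task dispatched on cn1 after device_wait_event() must not run before the
// event recorded on cn0 has finished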
TEST(TestCompNodeCPU, EventWait) {
    REQUIRE_THREAD();
    std::atomic_bool start = ATOMIC_VAR_INIT(false);
    auto cn0 = CompNode::load("cpu0"), cn1 = CompNode::load("cpu1");
    auto task0 = [&]() {
        while (!start)
            std::this_thread::yield();
    };
    auto event = cn0.create_event();
    CompNodeEnv::from_comp_node(cn0).cpu_env().dispatch(task0);
    event->record();
    cn1.device_wait_event(*event);

    bool succ = false;
    auto task1 = [&]() { succ = start; };
    CompNodeEnv::from_comp_node(cn1).cpu_env().dispatch(task1);

    using namespace std::literals;
    std::this_thread::sleep_for(50ms);
    ASSERT_FALSE(succ);

    start = true;
    CompNode::sync_all();
    ASSERT_TRUE(succ);
}

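// record() may be called repeatedly on one event; finished() must reflect the
// most recent record position in the dispatch queue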
TEST(TestCompNodeCPU, EventRecOverwrite) {
    REQUIRE_THREAD();
    auto cn = CompNode::load("cpu0");
    auto dispatcher = CompNodeEnv::from_comp_node(cn).cpu_env().dispatcher.get();
    auto dispatch = [&](MegcoreCPUDispatcher::Task&& t) {
        dispatcher->dispatch(std::move(t));
    };
    auto ev = cn.create_event();
    auto wait_atomic = [](std::atomic_bool* var) {
        while (!var->load())
            std::this_thread::yield();
    };
    auto set_atomic = [](std::atomic_bool* var) { var->store(true); };

    std::atomic_bool s0 = ATOMIC_VAR_INIT(false), s1 = ATOMIC_VAR_INIT(false),
                     t0 = ATOMIC_VAR_INIT(false), t1 = ATOMIC_VAR_INIT(false),
                     t2 = ATOMIC_VAR_INIT(false);
    dispatch(std::bind(set_atomic, &t0));
    dispatch(std::bind(wait_atomic, &s0));
    ev->record();
    dispatch(std::bind(set_atomic, &t1));
    dispatch(std::bind(wait_atomic, &s1));
    ev->record();
    dispatch(std::bind(set_atomic, &t2));

    wait_atomic(&t0);
    ASSERT_FALSE(ev->finished());
    set_atomic(&s0);
    wait_atomic(&t1);
    ASSERT_FALSE(ev->finished());
    set_atomic(&s1);
    wait_atomic(&t2);
    ASSERT_TRUE(ev->finished());
}

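// copy between a device comp node and a cpu comp node, ordered by
// device_wait_event(), and check that the cpu copy observes the completed
// device-to-device copy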
namespace {
void test_peer_copy_from_device(const char* comp_node) {
    REQUIRE_THREAD();
    auto cn_gpu = CompNode::load(comp_node);
    auto cn_cpu = CompNode::load("cpux");

    HostTensorGenerator<> gen;
    auto a = gen({20, 3, 112, 112});
    auto b = gen({20, 3, 112, 112});
    auto c = gen({20, 3, 112, 112});
    DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu};
    dev_a.copy_from(*a).sync();
    dev_b.copy_from(*b).sync();
    dev_c.copy_from(*c).sync();

    auto wait_event = cn_gpu.create_event();
    opr::Sleep::sleep(cn_gpu, 0.1);
    dev_a.copy_from(dev_c);
    wait_event->record();

    cn_cpu.device_wait_event(*wait_event);
    dev_b.copy_from(dev_a);
    dev_b.sync();

    HostTensorND result;
    result.copy_from(dev_b);
    CompNode::sync_all();
    MGB_ASSERT_TENSOR_EQ(result, *c);
}
}  // namespace

TEST(TestCompNodeCPU, PeerCopyFromCUDA) {
    REQUIRE_GPU(1);
    test_peer_copy_from_device("gpux");
}

TEST(TestCompNodeCPU, PeerCopyFromROCm) {
    REQUIRE_AMD_GPU(1);
    test_peer_copy_from_device("rocmx");
}

#if MGB_CAMBRICON
TEST(TestCompNodeCPU, PeerCopyFromCambricon) {
    REQUIRE_CAMBRICON_DEVICE(1);
    REQUIRE_THREAD();
    auto cn_gpu = CompNode::load("cambriconx");
    auto cn_cpu = CompNode::load("cpux");

    HostTensorGenerator<> gen;
    auto a = gen({20, 3, 112, 112});
    auto b = gen({20, 3, 112, 112});
    auto c = gen({20, 3, 112, 112});
    DeviceTensorND dev_a{cn_gpu}, dev_b{cn_cpu}, dev_c{cn_gpu};
    dev_a.copy_from(*a).sync();
    dev_b.copy_from(*b).sync();
    dev_c.copy_from(*c).sync();

    auto wait_event = cn_gpu.create_event();
    dev_a.copy_from(dev_c);
    wait_event->record();

    cn_cpu.device_wait_event(*wait_event);
    dev_b.copy_from(dev_a);
    dev_b.sync();

    HostTensorND result;
    result.copy_from(dev_b);
    CompNode::sync_all();
    MGB_ASSERT_TENSOR_EQ(result, *c);
}
#endif

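// busy_wait_set_ready() must throw without a waiter record, block until
// set_ready() is called for the first waiter, and return immediately for the
// second one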
TEST(TestCompNodeSyncManager, HostWait) {
    REQUIRE_THREAD();
    CompNodeSyncManager mgr(CompNode::load("xpu0"));

    auto run_set = [&]() {
        using namespace std::literals;
        std::this_thread::sleep_for(200ms);
        mgr.set_ready();
        mgb_log_debug("set_ready() called");
    };

    for (int run = 0; run < 2; ++run) {
        std::thread th_run_set(run_set);
        RealTimer timer;
        mgr.clear_waiter_record();
        ASSERT_THROW(mgr.busy_wait_set_ready(), MegBrainError);
        mgr.add_waiter_record(false);
        mgr.add_waiter_record(false);
        mgr.busy_wait_set_ready();
        EXPECT_GE(timer.get_secs(), 0.1);
        timer.reset();
        mgr.busy_wait_set_ready();
        EXPECT_LE(timer.get_secs(), 0.001);
        th_run_set.join();
    }
}

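// the event returned by busy_wait_set_ready_and_get_event() must delay the
// waiting comp nodes rather than the host thread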
TEST(TestCompNodeSyncManager, DeviceWait) {
    REQUIRE_THREAD();
    auto cns = load_multiple_xpus(3);
    auto cn0 = cns[0], cn1 = cns[1], cn2 = cns[2];
    CompNodeSyncManager mgr(cn0);
    using Event = CompNode::Event;
    auto ev_cn1 = cn1.create_event(),
         ev_cn2_begin = cn2.create_event(Event::NEED_TIMER),
         ev_cn2_end = cn2.create_event(Event::NEED_TIMER);

    for (int run = 0; run < 2; ++run) {
        RealTimer timer;
        mgr.clear_waiter_record();
        ASSERT_THROW(mgr.busy_wait_set_ready_and_get_event(), MegBrainError);
        mgr.add_waiter_record(true);
        mgr.add_waiter_record(true);
        opr::Sleep::sleep(cn0, 0.13);
        mgr.set_ready();
        ev_cn2_begin->record();
        cn1.device_wait_event(mgr.busy_wait_set_ready_and_get_event());
        cn2.device_wait_event(mgr.busy_wait_set_ready_and_get_event());
        ev_cn1->record();
        ev_cn2_end->record();
        EXPECT_LE(timer.get_secs(), 0.06);

        ev_cn1->host_wait();
        EXPECT_GE(timer.get_secs(), 0.1);
        ev_cn2_end->host_wait();
        auto ev2_t = ev_cn2_begin->elapsed_time_until(*ev_cn2_end);
        EXPECT_GE(ev2_t, 0.1);
    }
}

TEST(TestCompNodeSyncManager, DeviceWaitCross) {
    REQUIRE_THREAD();
    auto cn0 = CompNode::load("xpu0:0"), cn1 = CompNode::load("xpu0:1");
    auto ev_cn0 = cn0.create_event(), ev_cn1 = cn1.create_event();
    RealTimer timer;

    // a cross wait that looks like a deadlock, but is guaranteed to finish
    // because of the timing imposed by the sleep below
    ev_cn0->record();
    cn1.device_wait_event(*ev_cn0);
    ev_cn1->record();
    opr::Sleep::sleep(cn0, 0.1);
    cn0.device_wait_event(*ev_cn1);

    ev_cn0->record();
    cn1.device_wait_event(*ev_cn0);

    cn0.sync();
    cn1.sync();

    // the sleep kernel on cuda is easily affected by GPU frequency changes,
    // so we only print a warning instead of asserting; see XPU-226
    auto used = timer.get_secs();
    if (used <= 0.1 || used >= 0.2) {
        mgb_log_warn("expect time between [%f, %f], got %f", 0.1, 0.2, used);
    }
}

#if !MGB_HAVE_THREAD
TEST(TestCompNodeSyncManager, DeviceWaitWithoutThread) {
    auto cn = CompNode::load("cpu:default");
    CompNodeSyncManager mgr(cn);
    mgr.add_waiter_record(true);
    ASSERT_ANY_THROW(mgr.busy_wait_set_ready());
    mgr.set_ready();
    EXPECT_TRUE(mgr.busy_wait_set_ready_and_get_event().finished());
}
#endif

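// comp nodes of every available device type must be loadable and usable again
// after CompNode::finalize()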
TEST(TestCompNode, MultipleLoad) {
    auto run = [](CompNode cn) {
        HostTensorND a(cn, {23}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (int i = 0; i < 23; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp;
        tmp.copy_from(a);
        b.copy_from(tmp).sync();
        auto pb = b.ptr<int>();
        for (int i = 0; i < 23; ++i) {
            ASSERT_EQ(i, pb[i]);
        }
        CompNode::finalize();
    };
    for (size_t i = 1; i < CompNode::NR_DEVICE_TYPE; ++i) {
        auto dt = static_cast<CompNode::DeviceType>(i);
        if (!check_device_type_avaiable(dt))
            continue;
        if (CompNode::get_device_count(dt)) {
            auto cn = CompNode::load({dt, 0, {0}});
            mgb_log("comp node %s is available", cn.to_string().c_str());
            run(cn);
            cn = CompNode::load({dt, 0, {0}});
            run(cn);
        }
    }
}

#if MGB_CAMBRICON
TEST(TestCompNodeCambricon, D2DCopy) {
    auto run = [](CompNode cn) {
        constexpr size_t size = 100 * 1024 * 1024;
        HostTensorND a(cn, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp, tmp1;
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    run(cn);
    REQUIRE_CAMBRICON_DEVICE(2);
    cn = CompNode::load("cambricon1");
    run(cn);
}

// peer copy for cambricon between different devices is not correct now, so
// disable this testcase
#if 0
TEST(TestCompNodeCambricon, P2PCopy) {
    auto run_raw = []() {
        int v0 = 0, v1 = 1;
        cnrtDev_t dev0, dev1;
        MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev0, 0));
        MGB_CNRT_CHECK(cnrtGetDeviceHandle(&dev1, 1));
        int *dp0, *dp1;
        MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev0));
        MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp0), sizeof(int)));
        MGB_CNRT_CHECK(
                cnrtMemcpy(dp0, &v0, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV));
        MGB_CNRT_CHECK(cnrtSetCurrentDevice(dev1));
        MGB_CNRT_CHECK(cnrtMalloc((void**)(&dp1), sizeof(int)));
        MGB_CNRT_CHECK(
                cnrtMemcpy(dp1, &v1, sizeof(int), CNRT_MEM_TRANS_DIR_HOST2DEV));
        unsigned int can = 0;
        MGB_CNRT_CHECK(cnrtGetPeerAccessibility(&can, 0, 1));
        printf("can = %s\n", can ? "TRUE" : "FALSE");
        if (can) {
            MGB_CNRT_CHECK(cnrtMemcpyPeer(dp1, 1, dp0, 0, sizeof(int)));
            int get;
            MGB_CNRT_CHECK(cnrtMemcpy(&get, dp1, sizeof(int),
                                      CNRT_MEM_TRANS_DIR_DEV2HOST));
            ASSERT_EQ(0, get);
        }
    };
    auto run = [](CompNode cn0, CompNode cn1) {
        constexpr size_t size = 100;
        HostTensorND a(cn0, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp(cn0, {size}, dtype::Int32{}),
                tmp1(cn1, {size}, dtype::Int32{});
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    REQUIRE_CAMBRICON_DEVICE(2);
    auto cn0 = CompNode::load("cambricon0"), cn1 = CompNode::load("cambricon1");
    run_raw();
    run(cn0, cn1);
}
#endif
#endif  // MGB_CAMBRICON

#if MGB_ATLAS
TEST(TestCompNodeAtlas, D2DCopy) {
    auto run = [](CompNode cn) {
        constexpr size_t size = 10 * 1024 * 1024;
        HostTensorND a(cn, {size}, dtype::Int32{}), b;
        auto pa = a.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            pa[i] = i;
        }
        DeviceTensorND tmp, tmp1;
        tmp.copy_from(a);
        tmp1.copy_from(tmp);
        b.copy_from(tmp1).sync();
        auto pb = b.ptr<int>();
        for (size_t i = 0; i < size; ++i) {
            ASSERT_EQ(static_cast<int>(i), pb[i]);
        }
        CompNode::finalize();
    };
    auto cn = CompNode::load("atlas0");
    run(cn);
}
#endif

namespace {
class CompNodeDepedentObjectInst final : public CompNodeDepedentObject {
    int *m_dst, *m_timer;

    std::shared_ptr<void> on_comp_node_finalize() override {
        EXPECT_EQ(0, *m_dst);
        *m_dst = ++*m_timer;
        return {};
    }

public:
    CompNodeDepedentObjectInst(int* dst, int* timer) : m_dst{dst}, m_timer{timer} {}

    void chk() { check_not_finalized(); }
};
}  // anonymous namespace

TEST(TestCompNode, DepedentObjectList) {
    CompNode::finalize();
    for (int i = 0; i < 5; ++i) {
        // loop multiple times so that memory problems can be exposed more easily
        int ts[4] = {0}, timer = 0;
        auto make = [&](int i) {
            return std::make_unique<CompNodeDepedentObjectInst>(ts + i, &timer);
        };
        auto i0 = make(0), i1 = make(1), i2 = make(2), i3 = make(3);
        ASSERT_NO_THROW(i0->chk());
        ASSERT_NO_THROW(i1->chk());
        i1.reset();
        comp_node_detail::DepedentObjList::invoke_callback_and_clean();
        ASSERT_EQ(1, ts[3]);
        ASSERT_EQ(2, ts[2]);
        ASSERT_EQ(0, ts[1]);
        ASSERT_EQ(3, ts[0]);
        ASSERT_THROW(i0->chk(), InternalError);
    }
}

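// comp node sequence recorder tests, instantiated for cpux, cpu:default and
// multithread comp nodes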
namespace {
template <typename tag>
class TestCPUCompSeqRec : public ::testing::Test {};
TYPED_TEST_CASE(TestCPUCompSeqRec, comp_node_test::seq_rec::test_types);

TYPED_TEST(TestCPUCompSeqRec, run) {
    comp_node_test::seq_rec::run<TypeParam>(CompNode::load("cpux"));
}

TYPED_TEST(TestCPUCompSeqRec, run_default_cpu) {
    comp_node_test::seq_rec::run<TypeParam>(CompNode::load("cpu:default"));
}

TYPED_TEST(TestCPUCompSeqRec, run_multi_thread) {
    auto cn = CompNode::load("multithread4:0");
    comp_node_test::seq_rec::run<TypeParam>(cn);
}

TYPED_TEST(TestCPUCompSeqRec, run_multi_thread_default) {
    auto cn = CompNode::load("multithread:default:4");
    comp_node_test::seq_rec::run<TypeParam>(cn);
}
}  // anonymous namespace

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"

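// with comp_node_seq_record_level = 1, the recorded sequence must pick up new
// raw storage installed on the VolatileSharedDeviceTensor inputs via
// only_reset_raw_storage()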
TEST(TestCPUCompSeqRec, run_dyn_ptr) {
    CompNode cn = CompNode::load("cpux");
    HostTensorGenerator<> gen;
    auto host_x0 = gen({4, 1}, cn), host_y0 = gen({4, 1}, cn),
         host_z0 = gen({4, 1}, cn);
    auto host_x1 = gen({4, 1}, cn), host_y1 = gen({4, 1}, cn),
         host_z1 = gen({4, 1}, cn);
    auto dev_x0 = std::make_shared<DeviceTensorND>(cn);
    auto dev_y0 = std::make_shared<DeviceTensorND>(cn);
    auto dev_z0 = std::make_shared<DeviceTensorND>(cn);
    auto dev_x1 = std::make_shared<DeviceTensorND>(cn);
    auto dev_y1 = std::make_shared<DeviceTensorND>(cn);
    auto dev_z1 = std::make_shared<DeviceTensorND>(cn);
    (*dev_x0).comp_node(cn).copy_from(*host_x0).sync();
    (*dev_y0).comp_node(cn).copy_from(*host_y0).sync();
    (*dev_z0).comp_node(cn).copy_from(*host_z0).sync();
    (*dev_x1).comp_node(cn).copy_from(*host_x1).sync();
    (*dev_y1).comp_node(cn).copy_from(*host_y1).sync();
    (*dev_z1).comp_node(cn).copy_from(*host_z1).sync();

    auto check = [&]() {
        HostTensorND ret(CompNode::load("cpux"), host_x0->shape());
        auto px = host_x0->ptr<float>(), py = host_y0->ptr<float>(),
             pz = host_z0->ptr<float>(), pw = ret.ptr<float>();
        auto sz0 = host_x0->shape()[0], sz1 = host_x0->shape()[1];
        for (size_t i = 0; i < sz0; ++i) {
            for (size_t j = 0; j < sz1; ++j) {
                pw[i * sz1 + j] = px[i * sz1 + j] * py[i * sz1 + j] + pz[i * sz1 + j];
            }
        }
        return ret;
    };

    auto graph = ComputingGraph::make();
    // test record on first run
    graph->options().var_sanity_check_first_run = false;
    graph->options().graph_opt_level = 0;
    graph->options().comp_node_seq_record_level = 1;
    graph->options().fake_next_exec = true;
    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dev_x0),
         y = opr::VolatileSharedDeviceTensor::make(*graph, dev_y0),
         z = opr::VolatileSharedDeviceTensor::make(*graph, dev_z0),
         w = opr::Elemwise::make({x, y, z}, opr::Elemwise::Mode::FUSE_MUL_ADD3);
    HostTensorND host_w;
    auto func = graph->compile({{w, [&host_w](DeviceTensorND& d) {
                                     host_w = mgb::HostTensorND::make_proxy(d);
                                 }}});
    func->execute();

    for (int i = 0; i < 4; ++i) {
        if (i == 2) {
            *host_x0 = *host_x1;
            *host_y0 = *host_y1;
            *host_z0 = *host_z1;
            dev_x0->only_reset_raw_storage(dev_x1->storage());
            dev_y0->only_reset_raw_storage(dev_y1->storage());
            dev_z0->only_reset_raw_storage(dev_z1->storage());
        }
        func->execute();
        auto expect = check();
        MGB_ASSERT_TENSOR_EQ(expect, host_w) << "iter " << i;
    }
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}