
// comp_node_helper.cpp

#include "./comp_node_helper.h"

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
#include "megbrain/serialization/serializer.h"

using namespace mgb;
using namespace comp_node_test;

namespace {
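// Exercises comp_node_seq_record_level=1 on a grouped convolution: the
// injected callback only runs for the non-recorded executions, and changing
// the input shape mid-run forces the recorder to be reset and rebuilt.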
void run_comp_seq_rec_basic(CompNode cn, bool fake_first) {
    using ConvParam = opr::Convolution::Param;
    ConvParam param;
    param.sparse = ConvParam::Sparse::GROUP;
    HostTensorGenerator<> gen;
    auto host_x = gen({3, 4, 10, 8}, cn), host_y = gen({2, 3, 2, 3, 3}, cn);
    int iter = 0;
    std::vector<int> executed;
    HostTensorND host_z;
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::CallbackInjector::make(
                 opr::Convolution::make(x, y, param),
                 [&](DeviceTensorND& dv) { executed.push_back(iter); });
    graph->options().comp_node_seq_record_level = 1;
    if (fake_first) {
        graph->options().fake_next_exec = true;
        graph->options().var_sanity_check_first_run = false;
    }
    auto func = graph->compile({make_callback_copy(z, host_z)});
    if (fake_first) {
        func->execute();  // first exec
    }
    int change = 5;
    for (; iter < 10; ++iter) {
        if (iter == change) {
            *host_x = *gen({2, 4, 15, 13}, cn);
        }
        host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
        func->execute();
        auto expect = eval_conv_cpu<opr::Convolution>(*host_x, *host_y, param);
        MGB_ASSERT_TENSOR_NEAR(expect, host_z, 1e-3) << "iter " << iter;
    }
    ASSERT_EQ(executed.size(), 4u);
    // if fake_first, both the warmup exec and the exec with recorder happen in
    // iter 0; otherwise the normal exec happens in iter 0 and the exec with
    // recorder in iter 1
    ASSERT_EQ(executed[0], 0);
    ASSERT_EQ(executed[1], fake_first ? 0 : 1);
    // recorder would be reset, normal exec
    ASSERT_EQ(executed[2], change);
    // create new recorder, exec with recorder
    ASSERT_EQ(executed[3], change + 1);
}
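// Same grouped-conv setup at comp_node_seq_record_level=2: the graph is
// released with ComputingGraph::assert_destroy() right after compile() and the
// compiled func must keep producing correct results; the second block checks
// a GetVarShape output under record level 2.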
void run_comp_seq_rec_basic_level2(CompNode cn) {
    using ConvParam = opr::ConvBias::Param;
    ConvParam param;
    param.sparse = ConvParam::Sparse::GROUP;
    HostTensorGenerator<> gen;
    auto host_x = gen({3, 4, 10, 8}, cn), host_y = gen({2, 3, 2, 3, 3}, cn);
    int iter = 0;
    std::vector<int> executed;
    HostTensorND host_z;
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::CallbackInjector::make(
                 opr::ConvBias::make(x, y, param),
                 [&](DeviceTensorND& dv) { executed.push_back(iter); });
    graph->options().comp_node_seq_record_level = 2;
    graph->options().var_sanity_check_first_run = false;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    ComputingGraph::assert_destroy(graph);
    for (; iter < 10; ++iter) {
        host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
        func->execute();
        auto expect = eval_conv_cpu<opr::ConvBias>(*host_x, *host_y, param);
        MGB_ASSERT_TENSOR_NEAR(expect, host_z, 1e-3) << "iter " << iter;
    }
    ASSERT_EQ(executed.size(), 2u);
    //! test default_cpu with record2
    {
        HostTensorND hz;
        graph = ComputingGraph::make();
        x = opr::Host2DeviceCopy::make(*graph, host_x);
        y = opr::Host2DeviceCopy::make(*graph, host_y);
        z = opr::ConvBias::make(x, y, param);
        z = opr::GetVarShape::make(z);
        graph->options().comp_node_seq_record_level = 2;
        graph->options().var_sanity_check_first_run = false;
        auto func = graph->compile({make_callback_copy(z, hz, true)});
        ComputingGraph::assert_destroy(graph);
        func->execute();
        ASSERT_TRUE(hz.comp_node() == cn);
        ASSERT_EQ(hz.ptr<int>()[0], 3);
        ASSERT_EQ(hz.ptr<int>()[1], 6);
        ASSERT_EQ(hz.ptr<int>()[2], 8);
        ASSERT_EQ(hz.ptr<int>()[3], 6);
    }
}
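// Records a FUSE_MUL_ADD3 elemwise whose temporary memory is allocated
// dynamically; results are verified against a host-side reference, including
// after the input shapes change mid-run.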
void run_comp_seq_rec_dyn_elemwise(CompNode cn, bool fake_first) {
    // dynamic memory is allocated in elemwise
    HostTensorGenerator<> gen;
    auto host_x = gen({3, 3}, cn), host_y = gen({1, 3}, cn),
         host_z = gen({3, 1}, cn);
    auto check = [&]() {
        HostTensorND ret(CompNode::load("cpux"), host_x->shape());
        auto px = host_x->ptr<float>(), py = host_y->ptr<float>(),
             pz = host_z->ptr<float>(), pw = ret.ptr<float>();
        auto sz0 = host_x->shape()[0], sz1 = host_x->shape()[1];
        for (size_t i = 0; i < sz0; ++i) {
            for (size_t j = 0; j < sz1; ++j) {
                pw[i * sz1 + j] = px[i * sz1 + j] * py[j] + pz[i];
            }
        }
        return ret;
    };
    auto graph = ComputingGraph::make();
    // test record on first run
    graph->options().var_sanity_check_first_run = false;
    graph->options().graph_opt_level = 0;
    graph->options().comp_node_seq_record_level = 1;
    if (fake_first) {
        graph->options().fake_next_exec = true;
    }
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Host2DeviceCopy::make(*graph, host_z),
         w = opr::Elemwise::make({x, y, z}, opr::Elemwise::Mode::FUSE_MUL_ADD3);
    HostTensorND host_w;
    auto func = graph->compile({make_callback_copy(w, host_w)});
    if (fake_first) {
        func->execute();
    }
    for (int i = 0; i < 10; ++i) {
        if (i == 5) {
            *host_x = *gen({10, 8}, cn);
            *host_y = *gen({1, 8}, cn);
            *host_z = *gen({10, 1}, cn);
        }
        host_x->copy_from(*gen(host_x->shape(), cn));
        func->execute();
        auto expect = check();
        MGB_ASSERT_TENSOR_EQ(expect, host_w) << "iter " << i;
    }
}
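// Builds the same graph with and without record level 2 and compares outputs;
// covers SharedDeviceTensor vs. MultipleDeviceTensorHolder weights,
// ImmutableTensor inputs, static value inference, an unused operator, and a
// final shape change that must make the recorded func throw.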
void run_level2(CompNode cn, bool use_multi_holder) {
    HostTensorGenerator<> gen;
    auto host_x = gen({4, 3, 6, 7}, cn), host_w = gen({2, 3, 2, 3}, cn),
         host_y = gen({1, 25}, cn), host_z = gen({8, 1}, cn),
         host_large = gen({8, 25}, cn);
    auto make_func = [&](bool enable) -> thin_function<const HostTensorND&()> {
        auto graph = ComputingGraph::make();
        graph->options().graph_opt_level = 0;
        if (enable) {
            graph->options().var_sanity_check_first_run = false;
            graph->options().comp_node_seq_record_level = 2;
        }
        auto repeat2 = [](SymbolVar x) { return opr::Concat::make({x, x}, 0); };
        SymbolVar w;
        auto dev_w = std::make_shared<DeviceTensorND>();
        // test shared dev tensor with 1 refcnt
        if (use_multi_holder) {
            dev_w->copy_from(*host_w).sync();
            w = opr::MultipleDeviceTensorHolder::make(*graph, {dev_w})[0];
        } else {
            w = opr::SharedDeviceTensor::make(*graph, *host_w);
        }
        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
             c = opr::Convolution::make(x, w).reshape({8, 25}),
             y = opr::Host2DeviceCopy::make(*graph, host_y),
             large = opr::ImmutableTensor::make(*graph, *host_large),
             z = opr::Host2DeviceCopy::make(*graph, host_z),
             // elemwise with larger tmp storage
             t0 = opr::Elemwise::make(
                          {c, y, z}, opr::Elemwise::Mode::FUSE_MUL_ADD3) +
                  large,
             // t1 shape is {8, 1}
             t1 = opr::reduce_sum(t0, z.symshape()),
             t2 = opr::Elemwise::make(
                     {repeat2(c), y, repeat2(t1)},
                     opr::Elemwise::Mode::FUSE_MUL_ADD3),
             large1 = opr::ImmutableTensor::make(*graph, *host_large);
        t2 * 2;  // unused opr
        // used large static infer
        graph->static_infer_manager().infer_value(large.node());
        // unused large static infer
        graph->static_infer_manager().infer_value(large1.node());
        // static infer value
        graph->static_infer_manager().infer_value((t1.symshape() + 1).node());
        auto result = std::make_shared<HostTensorND>();
        auto func = graph->compile({make_callback_copy(t2, *result)});
        std::shared_ptr<cg::AsyncExecutable> sh_func(func.release());
        if (enable) {
            ComputingGraph::assert_destroy(graph);
        }
        auto exec = [result, sh_func]() -> const HostTensorND& {
            sh_func->execute();
            return *result;
        };
        return exec;
    };
    auto f0 = make_func(false), f1 = make_func(true);
    for (int i = 0; i < 3; ++i) {
        host_x->copy_from(*gen(host_x->shape(), cn));
        host_y->copy_from(*gen(host_y->shape(), cn));
        host_z->copy_from(*gen(host_z->shape(), cn));
        auto&& expect = f0();
        auto&& get = f1();
        MGB_ASSERT_TENSOR_EQ(expect, get);
    }
    host_x->resize({1});
    ASSERT_THROW(f1(), MegBrainError);
}
}  // anonymous namespace

namespace mgb {
namespace comp_node_test {
namespace seq_rec {
template <>
void run<basic>(CompNode cn) {
    run_comp_seq_rec_basic(cn, false);
}
template <>
void run<basic_level2>(CompNode cn) {
    run_comp_seq_rec_basic_level2(cn);
}
template <>
void run<basic_fake_exec>(CompNode cn) {
    run_comp_seq_rec_basic(cn, true);
}
template <>
void run<dyn_elemwise>(CompNode cn) {
    run_comp_seq_rec_dyn_elemwise(cn, false);
}
template <>
void run<dyn_elemwise_fake_exec>(CompNode cn) {
    run_comp_seq_rec_dyn_elemwise(cn, true);
}
template <>
void run<level2>(CompNode cn) {
    run_level2(cn, false);
}
template <>
void run<level2_multi_holder>(CompNode cn) {
    run_level2(cn, true);
}
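// Two graphs sharing device memory, both at record level 2, executed
// alternately; results are compared against the non-recorded equivalents.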
template <>
void run<level2_share_storage>(CompNode cn) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1}, cn), host_y = gen({1}, cn), host_z = gen({10}, cn);
    auto make_func =
            [&](bool enable) -> thin_function<std::array<const HostTensorND*, 2>()> {
        auto g0 = ComputingGraph::make(), g1 = ComputingGraph::make();
        if (enable) {
            g0->options().var_sanity_check_first_run = false;
            g0->options().comp_node_seq_record_level = 2;
            g1->options().var_sanity_check_first_run = false;
            g1->options().comp_node_seq_record_level = 2;
            g0->share_device_memory_with(*g1);
        }
        auto x0 = opr::Host2DeviceCopy::make(*g0, host_x),
             x1 = opr::Host2DeviceCopy::make(*g1, host_x),
             y = opr::Host2DeviceCopy::make(*g0, host_y),
             z = opr::Host2DeviceCopy::make(*g1, host_z);
        auto t0 = x0 + y, t1 = x1 + z;
        auto host_t0 = std::make_shared<HostTensorND>(),
             host_t1 = std::make_shared<HostTensorND>();
        auto f0 = g0->compile({make_callback_copy(t0, *host_t0)});
        auto f1 = g1->compile({make_callback_copy(t1, *host_t1)});
        std::shared_ptr<cg::AsyncExecutable> sh_f0(f0.release()),
                sh_f1(f1.release());
        if (enable) {
            ComputingGraph::assert_destroy(g0);
            ComputingGraph::assert_destroy(g1);
        }
        auto exec = [host_t0, host_t1, sh_f0,
                     sh_f1]() -> std::array<const HostTensorND*, 2> {
            sh_f0->execute();
            sh_f1->execute();
            return {host_t0.get(), host_t1.get()};
        };
        return exec;
    };
    auto f0 = make_func(false), f1 = make_func(true);
    for (int i = 0; i < 3; ++i) {
        host_x->copy_from(*gen(host_x->shape(), cn));
        host_y->copy_from(*gen(host_y->shape(), cn));
        host_z->copy_from(*gen(host_z->shape(), cn));
        auto&& expect = f0();
        auto&& get = f1();
        MGB_ASSERT_TENSOR_EQ(*expect[0], *get[0]);
        MGB_ASSERT_TENSOR_EQ(*expect[1], *get[1]);
    }
}
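// At record level 2 the graph must be released with
// ComputingGraph::assert_destroy() before execution: executing while the graph
// is still alive should throw, and destroying func and then graph without ever
// executing should also be safe.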
template <>
void run<level2_exec_check>(CompNode cn) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1}, cn);
    for (int testcase = 0; testcase < 3; ++testcase) {
        host_x->copy_from(*gen({1}));
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x), y = x * 2;
        HostTensorND host_y;
        graph->options().var_sanity_check_first_run = false;
        graph->options().comp_node_seq_record_level = 2;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        ASSERT_EQ(host_y.shape(), host_x->shape());
        auto expect = host_x->ptr<float>()[0] * 2;
        ASSERT_NE(expect, host_y.ptr<float>()[0]);
        if (testcase == 0) {
            ComputingGraph::assert_destroy(graph);
            func->execute();
            ASSERT_EQ(expect, host_y.ptr<float>()[0]);
        } else if (testcase == 1) {
            ASSERT_THROW(func->execute(), MegBrainError);
        } else {
            // it should be OK to destroy func and then graph
            func.reset();
            graph.reset();
        }
    }
}
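// The callback copies the result from device to host; checks that either an
// explicit sync inside the callback or func->wait() is enough to observe the
// value, at record levels 1 and 2.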
template <>
void run<sync_from_func>(CompNode cn) {
    REQUIRE_THREAD();
    HostTensorGenerator<> gen;
    auto host_x = gen({1}, cn);
    for (int level : {1, 2}) {
        for (bool sync : {false, true}) {
            auto graph = ComputingGraph::make();
            auto x = opr::Host2DeviceCopy::make(*graph, host_x),
                 y = opr::Sleep::make(x, 0.15) * 2;
            HostTensorND host_y;
            graph->options().var_sanity_check_first_run = false;
            graph->options().comp_node_seq_record_level = level;
            auto cb = [&](const DeviceTensorND& dv) {
                host_y.copy_from(dv);
                if (sync) {
                    host_y.sync();
                }
            };
            auto func = graph->compile({{y, cb}});
            if (level == 2) {
                ComputingGraph::assert_destroy(graph);
            }
            for (int i = 0; i < 3; ++i) {
                host_x->ptr<float>()[0] = i + 0.3;
                func->execute();
                if (!sync) {
                    func->wait();
                }
                auto got = host_y.ptr<float>()[0];
                MGB_ASSERT_FLOAT_EQ((i + 0.3) * 2, got)
                        << "level=" << level << " i=" << i;
            }
        }
    }
}
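// The recorded callback receives a non-contiguous tensor (Dimshuffle output);
// checks that copy_from in the callback still yields the transposed result.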
template <>
void run<cb_non_contig>(CompNode cn) {
    REQUIRE_THREAD();
    HostTensorGenerator<> gen;
    auto host_x = gen({4, 5}, cn);
    for (int level : {1, 2}) {
        for (bool sync : {false, true}) {
            auto graph = ComputingGraph::make();
            auto x = opr::Host2DeviceCopy::make(*graph, host_x),
                 y = opr::Dimshuffle::make(x, {1, 0});
            HostTensorND host_y;
            graph->options().var_sanity_check_first_run = false;
            graph->options().comp_node_seq_record_level = level;
            auto cb = [&](const DeviceTensorND& dv) {
                host_y.copy_from(dv);
                if (sync) {
                    host_y.sync();
                }
            };
            auto func = graph->compile({{y, cb}});
            if (level == 2) {
                ComputingGraph::assert_destroy(graph);
            }
            for (int i = 0; i < 3; ++i) {
                host_x->copy_from(*gen(host_x->shape()));
                HostTensorND expect{host_x->comp_node(), {5, 4}};
                auto px = host_x->ptr<float>(), py = expect.ptr<float>();
                for (int i = 0; i < 5; ++i) {
                    for (int j = 0; j < 4; ++j) {
                        py[i * 4 + j] = px[j * 5 + i];
                    }
                }
                func->execute();
                if (!sync) {
                    func->wait();
                }
                MGB_ASSERT_TENSOR_EQ(expect, host_y);
            }
        }
    }
}
template <>
void run<shape_dep_const_shape>(CompNode cn) {
    // load model using const var shape to work around shape dependencies
    using namespace serialization;
    HostTensorGenerator<> gen;
    auto host_x = gen({4, 5}, cn);
    auto fname = output_file("test_comp_node_record_shape_dep_const_shape");
    HostTensorND y_expect;
    {
        // dump graph
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(
                     *graph, host_x, OperatorNodeConfig{"x"}),
             y = x.flatten() +
                 opr::reduce_sum(opr::GetVarShape::make(x), x.make_scalar(1));
        graph->compile({make_callback_copy(y, y_expect)})->execute();
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        dumper->dump({y});
    }
    HostTensorND host_y;
    {
        GraphLoadConfig config;
        config.const_var_shape = true;
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto load_rst = loader->load(config);
        load_rst.graph->options().comp_node_seq_record_level = 2;
        load_rst.graph->options().var_sanity_check_first_run = false;
        auto x_inp = load_rst.tensor_map.at("x");
        auto y = load_rst.output_var_list.at(0);
        auto func = load_rst.graph_compile({make_callback_copy(y, host_y)});
        x_inp->copy_from(*host_x);
        func->execute();
    }
    MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
}
//! single thread, multiple recorders executed interleaved
template <>
void run<multi_recorder_run>(CompNode cn) {
    using ConvParam = opr::Convolution::Param;
    ConvParam param;
    param.sparse = ConvParam::Sparse::GROUP;
    HostTensorGenerator<> gen;
    std::vector<HostTensorND> host_z_v(2, HostTensorND());
    std::vector<std::unique_ptr<mgb::cg::AsyncExecutable>> funcs;
    auto host_x = gen({3, 4, 10, 8}, cn), host_y = gen({2, 3, 2, 3, 3}, cn);
    auto gen_graph =
            [&](int graph_id) -> std::unique_ptr<mgb::cg::AsyncExecutable> {
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
             y = opr::Host2DeviceCopy::make(*graph, host_y),
             z = opr::Convolution::make(x, y, param);
        graph->options().comp_node_seq_record_level = 1;
        return graph->compile({make_callback_copy(z, host_z_v[graph_id])});
    };
    funcs.push_back(gen_graph(0));
    funcs.push_back(gen_graph(1));
    for (int iter = 0; iter < 10; ++iter) {
        host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
        funcs[0]->execute();
        funcs[1]->execute();
        auto expect = eval_conv_cpu<opr::Convolution>(*host_x, *host_y, param);
        MGB_ASSERT_TENSOR_NEAR(expect, host_z_v[0], 1e-3) << "iter " << iter;
        MGB_ASSERT_TENSOR_NEAR(expect, host_z_v[1], 1e-3) << "iter " << iter;
    }
}
template <>
void run<void>(CompNode) {}

}  // namespace seq_rec
}  // namespace comp_node_test
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}