
reformat_manager.cpp

/**
 * \file src/gopt/test/reformat_manager.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./helper.h"
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/plugin/base.h"
#include "megbrain/plugin/profiler.h"

using namespace mgb;
using namespace gopt;
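
// NHWC -> NCHWc64 feature-map reformat: the var built by ReformatManager must
// be numerically identical to a hand-written Reshape + Dimshuffle reference,
// and its subgraph must contain exactly one GetVarShape and one Reshape opr.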
TEST(TestReformatManager, Feature) {
    constexpr size_t N = 16, C = 128, H = 7, W = 7;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NHWC, dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};
    auto reformat = ReformatManager::instance().get(key);

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    auto r = [](VarNode* inp) {
        auto x = SymbolVar(inp);
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make(
                {sub(0), sub(1), sub(2), sub(3) / 64, cv(64)}, 0);
        auto y0 = opr::Reshape::make(x, tshp0);
        auto y1 = opr::Dimshuffle::make(y0, {0, 3, 1, 2, 4});
        return y1;
    };

    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };

    auto x = mkvar("x", {N, H, W, C});
    auto y1 = SymbolVar(reformat({x.node()}));
    auto y2 = r(x.node());

    size_t nr_shapeof = 0;
    size_t nr_reshape = 0;
    cg::DepOprIter{[&nr_shapeof, &nr_reshape](cg::OperatorNodeBase* o) {
        if (o->same_type<opr::GetVarShape>())
            nr_shapeof++;
        if (o->same_type<opr::Reshape>())
            nr_reshape++;
    }}.add(y1.node()->owner_opr());
    ASSERT_EQ(nr_shapeof, 1);
    ASSERT_EQ(nr_reshape, 1);

    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y2, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
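
// GKCRS -> GKCRSk4c4 grouped-weight reformat: compare the ReformatManager
// result against a reference Reshape -> Dimshuffle -> Reshape chain and check
// the opr counts again.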
TEST(TestReformatManager, Weight) {
    constexpr size_t G = 8, K = 128, C = 128, R = 3, S = 3;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::GKCRS,
         dst_format = TensorFormats::GKCRSk4c4;
    ReformatKey key{src_format, dst_format};
    auto reformat = ReformatManager::instance().get(key);

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    auto r = [](VarNode* inp) {
        auto x = SymbolVar(inp);
        auto xshp = opr::GetVarShape::make(x);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        auto sub = [&xshp, &cv](int idx) {
            return opr::IndexAt::make(xshp, {{0, cv(idx)}});
        };
        auto tshp0 = opr::Concat::make(
                     {sub(0), sub(1) / 4, cv(4), sub(2) / 4, cv(4), sub(3),
                      sub(4)},
                     0),
             tshp1 = opr::Concat::make(
                     {sub(0), sub(1) / 4, sub(2) / 4, sub(3), sub(4), cv(4),
                      cv(4)},
                     0);
        auto y0 = opr::Reshape::make(x, tshp0);
        auto y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 5, 6, 2, 4});
        auto y2 = opr::Reshape::make(y1, tshp1);
        return y2;
    };

    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };

    auto w = mkvar("w", {G, K / G, C / G, R, S});
    auto y1 = SymbolVar(reformat({w.node()}));
    auto y2 = r(w.node());

    size_t nr_shapeof = 0;
    size_t nr_reshape = 0;
    cg::DepOprIter{[&nr_shapeof, &nr_reshape](cg::OperatorNodeBase* o) {
        if (o->same_type<opr::GetVarShape>())
            nr_shapeof++;
        if (o->same_type<opr::Reshape>())
            nr_reshape++;
    }}.add(y1.node()->owner_opr());
    ASSERT_EQ(nr_shapeof, 1);
    ASSERT_EQ(nr_reshape, 1);

    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y2, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
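
// Requesting a reformat the manager does not support (IMAGE2D attribute on a
// grouped weight layout) must raise an AssertionError.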
TEST(TestReformatManager, InvalidKey) {
    using ReformatKey = ReformatManager::ReformatKey;
    using Attribute = ReformatKey::Attribute;
    auto src_format = TensorFormats::GKCRS,
         dst_format = TensorFormats::GKCRSk4c4;
    Attribute attribute = Attribute::IMAGE2D;
    ReformatKey key{src_format, dst_format, attribute};
    ASSERT_THROW(ReformatManager::instance().get(key), AssertionError);
}
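
// NCHW -> NCHWc4 with the IC_SMALL attribute: the reformat should match
// RelayoutFormat in NCHW_NCHW4_IC_SMALL mode, which handles inputs whose
// channel count (here C = 3) is smaller than the packing factor.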
TEST(TestReformatManager, InputChannelSmall) {
    constexpr size_t N = 16, C = 3, H = 224, W = 224;
    auto cn = CompNode::load("cpux");
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    using Attribute = ReformatKey::Attribute;
    auto src_format = TensorFormats::NCHW, dst_format = TensorFormats::NCHWc4;
    ReformatKey key{src_format, dst_format, Attribute::IC_SMALL};
    auto reformat = ReformatManager::instance().get(key);

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    auto r = [](VarNode* inp) {
        auto x = SymbolVar(inp);
        auto y = opr::RelayoutFormat::make(
                x, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL);
        return y;
    };

    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
    };

    auto x = mkvar("x", {N, C, H, W});
    auto y1 = SymbolVar(reformat({x.node()}));
    auto y2 = r(x.node());

    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y2, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
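
// The auto-aligned feature builder pads the channel dimension so a NCHWc4
// tensor with C = 22 can be converted to NCHWc32; the compiled func is then
// re-executed with larger and smaller batch sizes to exercise dynamic shapes.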
TEST(TestReformatManager, AutoAlignedFeature) {
    constexpr size_t N = 16, C = 22, H = 55, W = 55;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHWc4,
         dst_format = TensorFormats::NCHWc32;
    ReformatKey key{src_format, dst_format};

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
    auto mkvar = [&](const char* name,
                     const std::shared_ptr<HostTensorND>& host_val) {
        return opr::Host2DeviceCopy::make(*graph, host_val).rename(name);
    };

    auto orig_x = mkvar("orig_x", host_orig_x);
    auto x = mkvar("x", host_x);
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            orig_x.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});

    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
    *host_x = *gen({(N + 5), (C + 3) / 4, H, W, 4});
    func->execute();
    *host_x = *gen({(N - 5), (C + 3) / 4, H, W, 4});
    func->execute();
    auto shp = TensorShape{(N - 5), (C + 31) / 32, H, W, 32};
    ASSERT_TRUE(shp.eq_shape(t.shape()));
}
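
// Same auto-aligned feature reformat, but with 4-bit quantized dtypes
// (Quantized4Asymm) and a NCHWc4 -> NCHWc64 key.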
TEST(TestReformatManager, AutoAlignedFeatureB4) {
    constexpr size_t N = 16, C = 94, H = 55, W = 55;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHWc4,
         dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
    auto mkvar = [&](const char* name,
                     const std::shared_ptr<HostTensorND>& host_val,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, host_val).rename(name),
                dtype);
    };

    auto orig_x = mkvar("orig_x", host_orig_x,
                        dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(8)));
    auto x = mkvar("x", host_x,
                   dtype::Quantized4Asymm(25.f, static_cast<uint8_t>(4)));
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            orig_x.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});

    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
}
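
// auto_aligned_reformat_weight with an AlignmentDesc on dimension N: the
// output-channel dimension of a {K, C, R, S} weight is aligned up to a
// multiple of 64.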
TEST(TestReformatManager, AutoAlignedWeight) {
    constexpr size_t K = 32, C = 32, R = 3, S = 3;
    HostTensorGenerator<> gen;
    using ReformatKey = ReformatManager::ReformatKey;
    auto src_format = TensorFormats::NCHW, dst_format = TensorFormats::NCHWc64;
    ReformatKey key{src_format, dst_format};

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    auto mkvar = [&](const char* name, const TensorShape& shp) {
        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
    };

    auto w = mkvar("w", {K, C, R, S});
    auto builder = ReformatManager::instance().auto_aligned_reformat_weight(
            w.node(), key,
            ReformatManager::AlignmentDesc{megdnn::Dimension::Name::N, 64});
    auto y = builder({w.node()});

    HostTensorND t;
    auto func = graph->compile({make_callback_copy(y, t)});
    func->execute();
}
#if MGB_CUDA
#include "megbrain/comp_node_env.h"

namespace {
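// Plugin that records a CompNode event right before the kernel of a designated
// start opr and right after the kernel of a designated end opr, so the time
// spent in the reformat subgraph between them can be measured.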
class ReformatProfiler : public PluginBase {
    using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;

public:
    class MarkInputContiguous;
    ReformatProfiler(cg::ComputingGraph* graph, cg::OperatorNodeBase* opr_start,
                     cg::OperatorNodeBase* opr_end);
    ~ReformatProfiler() noexcept;
    double duration() const;

private:
    CompNodeEventPtr m_start, m_end;
    cg::OperatorNodeBase *m_opr_start, *m_opr_end;
};
ReformatProfiler::ReformatProfiler(cg::ComputingGraph* graph,
                                   cg::OperatorNodeBase* opr_start,
                                   cg::OperatorNodeBase* opr_end)
        : PluginBase(graph), m_opr_start(opr_start), m_opr_end(opr_end) {
    using namespace cg::event;
    auto on_reformat_start = [this](BeforeKernel const& event) {
        auto opr = event.opr;
        if (opr != m_opr_start)
            return;
        if (m_start == nullptr) {
            m_start = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
        }
        m_start->record();
    };
    auto on_reformat_end = [this](AfterKernel const& event) {
        auto opr = event.opr;
        if (opr != m_opr_end)
            return;
        if (m_end == nullptr) {
            m_end = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
        }
        m_end->record();
    };
    auto&& ev = graph->event();
    add_event_handler(ev.register_receiver<BeforeKernel>(on_reformat_start));
    add_event_handler(ev.register_receiver<AfterKernel>(on_reformat_end));
}

ReformatProfiler::~ReformatProfiler() noexcept {
    if (m_start)
        m_start->host_wait();
    if (m_end)
        m_end->host_wait();
}

double ReformatProfiler::duration() const {
    mgb_assert(m_end);
    m_end->host_wait();
    return m_start->elapsed_time_until(*m_end) -
           m_start->elapsed_time_until(*m_start);
}
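
// Helper opr whose only job is to require a contiguous input: attaching it to
// the reformat output forces the relayout kernel to actually run instead of
// being forwarded as a non-contiguous view.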
MGB_DEFINE_OPR_CLASS(ReformatProfiler::MarkInputContiguous,
                     cg::SingleCNOperatorNodeBase) // {
    void scn_do_execute() override {}
    void init_output_static_infer_desc() override;
    void add_input_layout_constraint() override;

public:
    MarkInputContiguous(VarNode* node, const OperatorNodeConfig& config);
    static SymbolVar make(SymbolVar node, const OperatorNodeConfig& config = {});
};

MGB_DYN_TYPE_OBJ_FINAL_IMPL(ReformatProfiler::MarkInputContiguous);

ReformatProfiler::MarkInputContiguous::MarkInputContiguous(
        VarNode* node, const OperatorNodeConfig& config)
        : Super(node->owner_graph(), config, "mark_contiguous", {node}) {
    add_input({node});
    add_output(None);
}

SymbolVar ReformatProfiler::MarkInputContiguous::make(
        SymbolVar node, const OperatorNodeConfig& config) {
    return node.insert_single_output_opr<MarkInputContiguous>(node.node(),
                                                              config);
}

void ReformatProfiler::MarkInputContiguous::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& mgr = owner_graph()->static_infer_manager();
    mgr.register_shape_infer(output(0),
                             ShapeInferDesc::make_identity(input(0)));
}

void ReformatProfiler::MarkInputContiguous::add_input_layout_constraint() {
    input(0)->add_layout_constraint_contiguous();
}
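
// Thin wrapper over a pair of cudaEvent_t handles that times the interval
// between start() and stop() on a given CUDA stream, in microseconds.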
class CUTimer {
public:
    CUTimer(cudaStream_t& stream, cudaEvent_t& evt0, cudaEvent_t& evt1)
            : m_stream{stream}, m_evt0{evt0}, m_evt1{evt1} {
        reset();
    }

    void reset() {
        m_started = false;
        m_stopped = false;
    }
    void start() {
        mgb_assert(!m_started);
        mgb_assert(!m_stopped);
        m_started = true;
        cudaEventRecord(m_evt0, m_stream);
    }
    void stop() {
        mgb_assert(m_started);
        mgb_assert(!m_stopped);
        m_stopped = true;
        cudaEventRecord(m_evt1, m_stream);
    }
    size_t get_time_in_us() const {
        cudaStreamSynchronize(m_stream);
        float t = -1;
        cudaEventElapsedTime(&t, m_evt0, m_evt1);
        return static_cast<size_t>(t * 1e3);
    }

private:
    bool m_started, m_stopped;
    cudaStream_t& m_stream;
    cudaEvent_t &m_evt0, &m_evt1;
};
} // namespace
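
// End-to-end check of ReformatProfiler on GPU: time 100 executions of a
// NCHWc64 -> NCHW reformat with raw CUDA events and compare against the
// duration reported by the profiler plugin.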
TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpux");
    using ReformatKey = ReformatManager::ReformatKey;
    auto dtype = dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(4));
    HostTensorND hval(cn, dtype);
    constexpr size_t N = 16, C = 18, H = 55, W = 55;
    hval.resize({N, (C + 63) / 64, H, W, 64});
    std::shared_ptr<DeviceTensorND> dval =
            std::make_shared<DeviceTensorND>(cn, dtype);
    dval->copy_from(hval).sync();
    std::shared_ptr<DeviceTensorND> dprime =
            std::make_shared<DeviceTensorND>(cn, dtype);
    dprime->resize({N, C, H, W});

    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;

    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dval);
    auto xprime = opr::VolatileSharedDeviceTensor::make(*graph, dprime);
    ReformatKey key{TensorFormats::NCHWc64, TensorFormats::NCHW};
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            xprime.node(), TensorFormats::NCHW, key);
    auto y = builder({x.node()});

    auto mark = ReformatProfiler::MarkInputContiguous::make(SymbolVar(y));
    auto cb = [](DeviceTensorND& d) { MGB_MARK_USED_VAR(d); };
    auto output_spec = std::make_pair(mark, cb);
    auto func = graph->compile({output_spec});

    static constexpr size_t RUNS = 100;
    cn.activate();
    auto stream = CompNodeEnv::from_comp_node(cn).cuda_env().stream;
    cudaEvent_t evt0;
    cudaEvent_t evt1;
    MGB_CUDA_CHECK(cudaEventCreate(&evt0));
    MGB_CUDA_CHECK(cudaEventCreate(&evt1));
    CUTimer timer(stream, evt0, evt1);
    timer.start();
    for (size_t i = 0; i < RUNS; ++i)
        func->execute();
    timer.stop();
    double time_cuda_evt = timer.get_time_in_us() / static_cast<double>(RUNS);

    OperatorNodeBase* start = x.node()->owner_opr();
    OperatorNodeBase* end = y->owner_opr();
    std::unique_ptr<ReformatProfiler> profiler =
            std::make_unique<ReformatProfiler>(graph.get(), start, end);
    ASSERT_TRUE(y->shape().eq_shape(TensorShape{N, C, H, W}));
    for (size_t i = 0; i < RUNS; ++i)
        func->execute();
    double time_profiler = profiler->duration() * 1e6;
    MGB_CUDA_CHECK(cudaEventDestroy(evt0));
    MGB_CUDA_CHECK(cudaEventDestroy(evt1));
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose from. To run GPU programs, make sure the machine has GPU hardware and that the driver is properly installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit MegStudio.