profiler.cpp
/**
 * \file src/gopt/test/profiler.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./helper.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/serialization/serializer.h"

using namespace mgb;
using namespace gopt;
using namespace serialization;
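
// Test-local helper that bundles the operator whitelist, the candidate
// tensor formats and the per-operator format dispatchers needed to build a
// layout-transform Problem for the profiler tests below.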
namespace {
class LayoutTransformContext : public NonCopyableObj {
public:
    using OprList = SubGraphExtractor::OprList;
    using OprFormat = Problem::OprFormat;
    using OprConfigTrait = Problem::OprConfigTrait;

    LayoutTransformContext() = delete;
    LayoutTransformContext(OprList opr_list,
                           SmallVector<TensorFormats> available_tensor_formats,
                           OprConfigTrait opr_configs)
            : m_opr_list{std::move(opr_list)},
              m_available_tensor_formats{std::move(available_tensor_formats)},
              m_opr_configs{std::move(opr_configs)} {}

    const OprList& opr_list() const { return m_opr_list; }
    const SmallVector<TensorFormats>& available_tensor_formats() const {
        return m_available_tensor_formats;
    }
    const OprConfigTrait& opr_configs() const { return m_opr_configs; }
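
    // Assembles the default context used by every test in this file: the
    // full operator whitelist, a dispatcher table per operator type (filled
    // in via the local cb() macro), and the tensor formats to search over.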
    static std::unique_ptr<LayoutTransformContext> make() {
        OprList opr_list = {
                opr::ConvBiasForward::typeinfo(),
                opr::ConvolutionForward::typeinfo(),
                opr::ConvolutionBackwardData::typeinfo(),
                opr::ElemwiseMultiType::typeinfo(),
                opr::Elemwise::typeinfo(),
                opr::TypeCvt::typeinfo(),
                opr::PoolingForward::typeinfo(),
                opr::WarpPerspectiveForward::typeinfo(),
        };
        OprConfigTrait opr_configs;
        {
            auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()];
#define cb(_fmt)                                                            \
    dispatchers[OprFormat::_fmt] =                                          \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format(  \
                    opr::ConvBias::typeinfo(), OprFormat::_fmt);
            cb(NCHW4);
            cb(NCHW32);
            cb(NHWC);
            cb(NCHW64);
            cb(CHWN4);
#undef cb
        }
        {
            auto& dispatchers =
                    opr_configs[opr::ConvolutionBackwardData::typeinfo()];
#define cb(_fmt)                                                            \
    dispatchers[OprFormat::_fmt] =                                          \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format(  \
                    opr::ConvolutionBackwardData::typeinfo(),               \
                    OprFormat::_fmt);
            cb(NCHW4);
#undef cb
        }
        {
            auto& dispatchers =
                    opr_configs[opr::ConvolutionForward::typeinfo()];
#define cb(_fmt)                                                            \
    dispatchers[OprFormat::_fmt] =                                          \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format(  \
                    opr::ConvolutionForward::typeinfo(), OprFormat::_fmt);
            cb(NCHW4);
#undef cb
        }
        {
            auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()];
#define cb(_fmt)                                                            \
    dispatchers[OprFormat::_fmt] =                                          \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format(  \
                    opr::PoolingForward::typeinfo(), OprFormat::_fmt);
            cb(NCHW4);
            cb(NCHW32);
            cb(NHWC);
            cb(NCHW64);
            cb(CHWN4);
#undef cb
        }
        {
            auto& dispatchers =
                    opr_configs[opr::WarpPerspectiveForward::typeinfo()];
#define cb(_fmt)                                                            \
    dispatchers[OprFormat::_fmt] =                                          \
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format(  \
                    opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt);
            cb(NHWC);
            cb(NCHW4);
            cb(NCHW64);
#undef cb
        }
        SmallVector<TensorFormats> available_tensor_formats = {
                TensorFormats::NHWC, TensorFormats::NCHWc4,
                TensorFormats::NCHWc32, TensorFormats::NCHWc64};
        return std::make_unique<LayoutTransformContext>(
                std::move(opr_list), std::move(available_tensor_formats),
                std::move(opr_configs));
    }

private:
    OprList m_opr_list;
    SmallVector<TensorFormats> m_available_tensor_formats;
    OprConfigTrait m_opr_configs;
};
}  // namespace
#if MGB_CUDA
#if CUDA_VERSION >= 10020
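// Profiles a two-layer quantized ConvBias chain (q4 conv followed by a q8
// conv) and checks that every operator in the chain is recorded while the
// constant weights and biases stay out of the var record.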
TEST(TestProfiler, Conv) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {64, 48, 14, 14},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto c1 = opr::ConvBias::make(x, w1, b1, param, {},
                                  OperatorNodeConfig(dtype::Quantized4Asymm(
                                          12.345f, static_cast<uint8_t>(5))));
    x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f));
    auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f));
    auto c2 = opr::ConvBias::make(x, w2, b2, param, {},
                                  OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
    EXPECT_TRUE(var_rst.count(w1.node()) == 0);
    EXPECT_TRUE(var_rst.count(b1.node()) == 0);
    EXPECT_TRUE(var_rst.count(w2.node()) == 0);
    EXPECT_TRUE(var_rst.count(b2.node()) == 0);
}
#endif
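
// Same check for ConvolutionBackwardData (deconv): both deconv operators
// must be profiled, while the constant filters w1/w2 must not appear in the
// var record.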
TEST(TestProfiler, Deconv) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f));
    auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    using Param = opr::ConvolutionBackwardData::Param;
    Param param;
    param.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 0;
    auto c1 = opr::ConvolutionBackwardData::make(
            w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
    auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    auto c2 = opr::ConvolutionBackwardData::make(
            w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
    EXPECT_TRUE(var_rst.count(w1.node()) == 0);
    EXPECT_TRUE(var_rst.count(w2.node()) == 0);
}
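
// Profiles WarpPerspective with a per-sample 3x3 projective matrix. The
// image input (input(0)) should be profiled as a var; the matrix
// (input(1)) and the target output shape (input(2)) should not.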
TEST(TestProfiler, Warp) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    constexpr size_t INP_H = 10, INP_W = 10, N = 16;
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {N, 48, INP_H, INP_W},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    float value1 = M_PI, value2 = 0.6;
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp);
    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({w1}, strategy);

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({w1});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0);
    EXPECT_TRUE(var_rst.count(mat.node()) == 0);
    EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0);
    EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0);
}
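
// Profiles two chained Pooling operators separated by a TypeCvt (q4 -> q8)
// and expects all three operators to appear in the record.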
TEST(TestProfiler, Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto x = mkvar("x", {64, 64, 55, 55},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    using Param = opr::Pooling::Param;
    Param param;
    param.format = Param::Format::NCHW;
    auto p1 = opr::Pooling::make(x, param);
    x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f));
    auto p2 = opr::Pooling::make(x, param);
    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({p2}, strategy);

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({p2});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
}
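
// Mixes a float Elemwise (FUSE_ADD_RELU) with quantized ElemwiseMultiType
// operators; all three elementwise oprs should be profiled, and, unlike the
// constant weights in the tests above, the Host2DeviceCopy inputs are
// expected to appear in the var record.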
TEST(TestProfiler, Elemwise) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = LayoutTransformContext::make();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32());
    auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32());
    auto c = opr::Elemwise::make({a, b},
                                 {opr::Elemwise::Param::Mode::FUSE_ADD_RELU});
    auto q4c = opr::TypeCvt::make(
            c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f));
    auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f));
    auto q8d = opr::ElemwiseMultiType::make(
            {q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU},
            OperatorNodeConfig(dtype::QuantizedS8(12.f)));
    auto q4d = opr::TypeCvt::make(
            q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3)));
    auto q4e = opr::ElemwiseMultiType::make(
            {q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig(
                    dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));

    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({q4e});
    ASSERT_EQ(partitions.size(), 1u);
    using Attribute = Problem::Attribute;
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
    Problem problem(partitions[0], ctx->available_tensor_formats(),
                    ctx->opr_configs(), attribute);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0);
    EXPECT_TRUE(var_rst.count(a.node()) > 0);
    EXPECT_TRUE(var_rst.count(b.node()) > 0);
    EXPECT_TRUE(var_rst.count(q8a.node()) > 0);
    EXPECT_TRUE(var_rst.count(q8b.node()) > 0);
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
