
profiler.cpp

/**
 * \file src/gopt/test/profiler.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megbrain/plugin/profiler.h"
#include "./helper.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/serialization/serializer.h"

using namespace mgb;
using namespace gopt;
using namespace serialization;

#if MGB_CUDA
namespace {
// build a LayoutTransformContext listing the operators and tensor formats
// the profiler tests below are allowed to consider on CUDA
std::unique_ptr<LayoutTransformContext> make_ctx() {
    using OprFormat = LayoutTransformContext::OprFormat;
    using OprList = LayoutTransformContext::OprList;
    using Attribute = LayoutTransformContext::Attribute;
    using Target = LayoutTransformContext::Target;
    OprList opr_list = {
            opr::ConvBiasForward::typeinfo(),
            opr::ConvolutionForward::typeinfo(),
            opr::ConvolutionBackwardData::typeinfo(),
            opr::ElemwiseMultiType::typeinfo(),
            opr::Elemwise::typeinfo(),
            opr::TypeCvt::typeinfo(),
            opr::PoolingForward::typeinfo(),
            opr::WarpPerspectiveForward::typeinfo(),
    };
    SmallVector<TensorFormats> available_tensor_formats = {
            TensorFormats::NCHW,    TensorFormats::NHWC,
            TensorFormats::NCHWc4,  TensorFormats::NCHWc32,
            TensorFormats::NCHWc64, TensorFormats::CHWNc4};
    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, Target::CUDA};
    auto ctx = std::make_unique<LayoutTransformContext>(
            std::move(opr_list), std::move(available_tensor_formats),
            attribute);
    ctx->add_opr_config(
               opr::ConvBiasForward::typeinfo(),
               {OprFormat::NCHW, OprFormat::NHWC, OprFormat::NCHW4,
                OprFormat::NCHW32, OprFormat::NCHW64, OprFormat::CHWN4})
            .add_opr_config(opr::ConvolutionForward::typeinfo(),
                            {OprFormat::NCHW, OprFormat::NCHW4})
            .add_opr_config(opr::ConvolutionBackwardData::typeinfo(),
                            {OprFormat::NCHW, OprFormat::NCHW4})
            .add_opr_config(
                    opr::PoolingForward::typeinfo(),
                    {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC,
                     OprFormat::NCHW64, OprFormat::CHWN4})
            .add_opr_config(
                    opr::WarpPerspectiveForward::typeinfo(),
                    {OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64});
    return ctx;
}
}  // namespace

#if CUDA_VERSION >= 10020
TEST(TestProfiler, Conv) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = make_ctx();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {64, 48, 14, 14},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto c1 = opr::ConvBias::make(x, w1, b1, param, {},
                                  OperatorNodeConfig(dtype::Quantized4Asymm(
                                          12.345f, static_cast<uint8_t>(5))));
    x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f));
    auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f));
    auto c2 = opr::ConvBias::make(x, w2, b2, param, {},
                                  OperatorNodeConfig(dtype::QuantizedS8(2.5f)));

    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);
    Problem problem(partitions[0], *ctx);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    // every operator in the partition should be profiled ...
    EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
    // ... while constant weights and biases get no var-level records
    EXPECT_TRUE(var_rst.count(w1.node()) == 0);
    EXPECT_TRUE(var_rst.count(b1.node()) == 0);
    EXPECT_TRUE(var_rst.count(w2.node()) == 0);
    EXPECT_TRUE(var_rst.count(b2.node()) == 0);
}
#endif

TEST(TestProfiler, Deconv) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = make_ctx();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };

    auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f));
    auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    using Param = opr::ConvolutionBackwardData::Param;
    Param param;
    param.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 0;
    auto c1 = opr::ConvolutionBackwardData::make(
            w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
    auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
    auto c2 = opr::ConvolutionBackwardData::make(
            w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));

    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({c2});
    ASSERT_EQ(partitions.size(), 1u);
    Problem problem(partitions[0], *ctx);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
    EXPECT_TRUE(var_rst.count(w1.node()) == 0);
    EXPECT_TRUE(var_rst.count(w2.node()) == 0);
}

TEST(TestProfiler, Warp) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = make_ctx();

    constexpr size_t INP_H = 10, INP_W = 10, N = 16;
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };

    auto x = mkvar("x", {N, 48, INP_H, INP_W},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    float value1 = M_PI, value2 = 0.6;
    // fill one 3x3 perspective matrix per batch entry
    auto gen_mat = [&](HostTensorND& mat) {
        auto ptr = mat.ptr<float>();
        for (size_t i = 0; i < N; ++i) {
            auto rot = value1, scale = value2, sheer = value1, dy = value2,
                 dx = value2, ky = value2, kx = value2, kb = value2;
            ptr[0] = ptr[4] = cos(rot) * scale;
            ptr[1] = -(ptr[3] = sin(rot) * scale);
            ptr[3] *= sheer;
            ptr[4] *= sheer;
            ptr[2] = dx;
            ptr[5] = dy;
            ptr[6] = kx;
            ptr[7] = ky;
            ptr[8] = kb;
            ptr += 9;
        }
        mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
    };
    auto mat_host = std::make_shared<HostTensorND>(
            x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
    gen_mat(*mat_host);
    auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
    TensorShape out_shp{20, 20};
    auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp);

    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({w1}, strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({w1});
    Problem problem(partitions[0], *ctx);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0);
    EXPECT_TRUE(var_rst.count(mat.node()) == 0);
    EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0);
    EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0);
}

TEST(TestProfiler, Pooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
    auto ctx = make_ctx();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };

    auto x = mkvar("x", {64, 64, 55, 55},
                   dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    using Param = opr::Pooling::Param;
    Param param;
    param.format = Param::Format::NCHW;
    auto p1 = opr::Pooling::make(x, param);
    x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f));
    auto p2 = opr::Pooling::make(x, param);

    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::PROFILE;
    gopt::modify_opr_algo_strategy_inplace({p2}, strategy);

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({p2});
    ASSERT_EQ(partitions.size(), 1u);
    Problem problem(partitions[0], *ctx);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
}

TEST(TestProfiler, Elemwise) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto ctx = make_ctx();

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };

    auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32());
    auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32());
    auto c = opr::Elemwise::make({a, b},
                                 {opr::Elemwise::Param::Mode::FUSE_ADD_RELU});
    auto q4c = opr::TypeCvt::make(
            c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
    auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f));
    auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f));
    auto q8d = opr::ElemwiseMultiType::make(
            {q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU},
            OperatorNodeConfig(dtype::QuantizedS8(12.f)));
    auto q4d = opr::TypeCvt::make(
            q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3)));
    auto q4e = opr::ElemwiseMultiType::make(
            {q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD},
            OperatorNodeConfig(
                    dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));

    SubGraphExtractor extractor(ctx->opr_list());
    auto partitions = extractor.extract({q4e});
    ASSERT_EQ(partitions.size(), 1u);
    Problem problem(partitions[0], *ctx);
    auto profiler = ProfilerBase::make_profiler();
    auto rst = profiler->profile(problem);
    const auto& opr_rst = rst.opr_record;
    const auto& var_rst = rst.var_record;
    EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0);
    EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0);
    EXPECT_TRUE(var_rst.count(a.node()) > 0);
    EXPECT_TRUE(var_rst.count(b.node()) > 0);
    EXPECT_TRUE(var_rst.count(q8a.node()) > 0);
    EXPECT_TRUE(var_rst.count(q8b.node()) > 0);
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has GPU hardware and that the driver is installed. If you would like to try deep-learning development on a cloud GPU compute platform, you are welcome to visit MegStudio.
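
A quick way to verify that the bundled CUDA environment can actually see a GPU is a runtime check from Python. The following is a minimal sketch, assuming MegEngine's `is_cuda_available()` and `set_default_device()` helpers are present in your installed version:

    import megengine as mge

    # Assumption: is_cuda_available() reports whether the bundled CUDA
    # runtime finds a usable GPU (hardware present, driver installed).
    if mge.is_cuda_available():
        # mirrors CompNode::load("gpu0") in the C++ tests above
        mge.set_default_device("gpu0")
        print("running on GPU")
    else:
        mge.set_default_device("cpu0")
        print("no usable GPU found, falling back to CPU")

Since the same wheel serves both CPU and GPU machines, a check like this lets scripts degrade gracefully instead of failing when no driver is present.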