You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

conv_test_utils.cpp 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. /**
  2. * \file dnn/test/cuda/conv_test_utils.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/oprs/nn.h"
  13. #include "src/common/utils.h"
  14. #include "src/cuda/cudnn_with_check.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/conv_bias.h"
  17. #include "test/common/tensor.h"
  18. #include "test/common/workspace_wrapper.h"
  19. #include "test/cuda/benchmark.h"
  20. #include "test/cuda/conv_test_utils.h"
  21. #include "test/cuda/fixture.h"
  22. #include "test/cuda/utils.h"
  23. #define V1(x) #x
  24. #define V(x) V1(x)
  25. namespace megdnn {
  26. namespace test {
  27. namespace conv {
  28. #if MEGDNN_WITH_BENCHMARK
  29. std::vector<BenchArgs> get_resnet50_bench_args(size_t batch) {
  30. std::vector<BenchArgs> args;
  31. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  32. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
  33. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
  34. args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
  35. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
  36. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
  37. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
  38. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
  39. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
  40. args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
  41. args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
  42. args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
  43. args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
  44. args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
  45. args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
  46. args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
  47. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
  48. args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
  49. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
  50. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});
  51. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
  52. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
  53. args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
  54. args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
  55. args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
  56. return args;
  57. }
  58. std::vector<BenchArgs> get_detection_bench_args(size_t batch) {
  59. std::vector<BenchArgs> args;
  60. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
  61. args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
  62. args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
  63. args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
  64. args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
  65. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
  66. args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
  67. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
  68. args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
  69. args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
  70. args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
  71. args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
  72. args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
  73. args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
  74. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
  75. args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
  76. args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
  77. args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
  78. args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
  79. args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
  80. args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
  81. args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
  82. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
  83. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
  84. args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
  85. return args;
  86. }
  87. std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
  88. std::vector<BenchArgs> args;
  89. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 16, 3, 2});
  90. args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
  91. return args;
  92. }
//! Benchmark a target conv_bias algorithm against the cuDNN
//! IMPLICIT_PRECOMP_GEMM baseline for every layer configuration in \p args,
//! printing time, effective TFLOPS and the speedup ratio to stdout.
//! Each layer is measured twice: without and with a fused z (residual) input.
//! \param handle  megdnn handle to benchmark on
//! \param args    layer configurations to measure
//! \param algo    name of the target algorithm; if nullptr, the proxy's own
//!                algorithm selection is timed instead of a named algorithm
//! \param format  tensor format used for the target algorithm (the cuDNN
//!                baseline always runs in NCHW4)
void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args,
                           DType src_dtype, DType filter_dtype,
                           DType bias_dtype, DType dst_dtype, const char* algo,
                           param::ConvBias::Format format) {
    // src and filter must share the same dtype category for this benchmark
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
// NOTE: intentionally not #undef'ed here — the sibling
// benchmark_target_algo_with_cudnn_tsc() below reuses this macro and
// #undef's it there.
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
    // pin the cuDNN benchmarker to a fixed, well-known baseline algorithm
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
                    "ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
    // dtype slots: 0=src, 1=filter, 2=bias, 3=z, 4=dst
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format: reinterpret an NCHW shape as the
    // requested blocked layout (NCHW4 / CHWN4); any other format yields an
    // empty TensorShape
    auto get_tensor_shape = [](TensorShape shape,
                               Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
    for (auto&& arg : args) {
        Param param;
        // pad = f/2 keeps the spatial size unchanged at stride 1
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            // no named algorithm: clear any cached choice so the proxy
            // re-runs algorithm selection for this layer
            benchmarker.proxy()->target_execution_policy.algo.reset();
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        float time_in_ms = 0.f;
        // --- round 1: no z tensor ---
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             {},
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            {},
                                            {}}) /
                         RUNS;
        }
        // cuDNN baseline always runs in NCHW4
        Format format_cudnn = Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        auto time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                RUNS;
        // 2 * N * Co * Ho * Wo * Ci * Fh * Fw ops, scaled to tera-ops
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        // --- round 2: same layer with a fused z (residual add) input ---
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             get_tensor_shape(z, format),
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            get_tensor_shape(z, format),
                                            {}}) /
                         RUNS;
        }
        time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         get_tensor_shape(z, format_cudnn),
                                         {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
//! Benchmark a target conv_bias algorithm against cuDNN with tensor-core
//! support, printing time, effective TFLOPS and the speedup ratio to stdout.
//! Each layer is measured twice: without and with a fused z (residual) input.
//! \param handle  megdnn handle to benchmark on
//! \param args    layer configurations; entries whose channel counts cannot
//!                satisfy the chosen format's tensor-core constraints are
//!                skipped
//! \param algo    name of the target algorithm; if nullptr, the proxy's own
//!                algorithm selection is timed instead
//! \param format  tensor format used for the target algorithm
//! \param with_cudnn         if false, the cuDNN side is not run and its
//!                           time is reported as 0
//! \param change_cudnn_algo  optional cuDNN algorithm name; when set, the
//!                           cuDNN side uses this algorithm together with the
//!                           change_cudnn_* format/dtypes instead of the
//!                           default IMPLICIT_PRECOMP_GEMM baseline
void benchmark_target_algo_with_cudnn_tsc(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype, const char* algo,
        param::ConvBias::Format format, bool with_cudnn,
        const char* change_cudnn_algo,
        param::ConvBias::Format change_cudnn_format,
        DType change_cudnn_src_dtype, DType change_cudnn_filter_dtype,
        DType change_cudnn_bias_dtype, DType change_cudnn_dst_dtype) {
    // src/filter dtypes must match, except for the uint4 x int4 combination
    megdnn_assert((src_dtype.enumv() == filter_dtype.enumv()) ||
                  (src_dtype.enumv() == DTypeEnum::Quantized4Asymm &&
                   filter_dtype.enumv() == DTypeEnum::QuantizedS4));
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 200;
    benchmarker.set_display(false).set_times(RUNS);
    // dtype slots: 0=src, 1=filter, 2=bias, 3=z, 4=dst
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
            new OprProxy<ConvBiasForward>{true}};
    if (!algo) {
        // no named algorithm: let the proxy pick one per execution
        benchmarker.set_proxy(proxy);
    }
    if (change_cudnn_algo) {
        // caller overrides the cuDNN side's dtypes (algorithm is selected
        // later via algo_benchmark with change_cudnn_algo)
        benchmarker_cudnn.set_dtype(0, change_cudnn_src_dtype)
                .set_dtype(1, change_cudnn_filter_dtype)
                .set_dtype(2, change_cudnn_bias_dtype)
                .set_dtype(3, change_cudnn_dst_dtype)
                .set_dtype(4, change_cudnn_dst_dtype);
    } else {
        // default: same dtypes as the target, pinned to the
        // IMPLICIT_PRECOMP_GEMM baseline
        benchmarker_cudnn.set_dtype(0, src_dtype)
                .set_dtype(1, filter_dtype)
                .set_dtype(2, bias_dtype)
                .set_dtype(3, dst_dtype)
                .set_dtype(4, dst_dtype);
        benchmarker_cudnn.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                        "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_"
                        "FWD_"
                        "ALGO_IMPLICIT_PRECOMP_GEMM" CUDNN_VERSION_STRING));
    }
// defined in benchmark_target_algo() above; no longer needed past this point
#undef CUDNN_VERSION_STRING
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format: reinterpret an NCHW shape as the
    // requested blocked layout (NCHW4 / NCHW32 / NCHW64 / CHWN4); any other
    // format yields an empty TensorShape
    auto get_tensor_shape = [](TensorShape shape, DType dtype,
                               Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::NCHW32) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 32, 32, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::NCHW64) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 64, 64, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
    for (auto&& arg : args) {
        Param param;
        // pad = f/2 keeps the spatial size unchanged at stride 1
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            // clear any cached algorithm so selection re-runs per layer
            benchmarker.proxy()->target_execution_policy.algo.reset();
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        // skip testcase which cannot enable nchw32 tensorcore
        if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
            continue;
        // skip testcase which cannot enable nchw64 tensorcore
        if (format == Format::NCHW64 && (arg.co % 64 != 0 || arg.ci % 64 != 0))
            continue;
        // skip testcase which cannot enable nchw4/chwn4 tensorcore
        if ((format == Format::CHWN4 || format == Format::NCHW4) &&
            (arg.ci % 16 != 0))
            continue;
        // cuDNN side: prefer NCHW32 when channels allow it, else NCHW4
        Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
                                      ? Format::NCHW32
                                      : Format::NCHW4;
        if (change_cudnn_algo) {
            format_cudnn = change_cudnn_format;
        }
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        float time_in_ms = 0.f;
        // --- round 1: no z tensor ---
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(
                            benchmarker,
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             {},
                             {}},
                            algo) /
                    RUNS;
        } else {
            time_in_ms =
                    benchmarker.execs(
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             {},
                             {}}) /
                    RUNS;
        }
        // stays 0 when with_cudnn is false (ratio prints as inf/nan then)
        float time_in_ms_cudnn = 0;
        if (with_cudnn) {
            if (change_cudnn_algo) {
                time_in_ms_cudnn =
                        algo_benchmark<ConvBiasForward,
                                       OprProxy<ConvBiasForward>, CUTimer>(
                                benchmarker_cudnn,
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(bias, bias_dtype,
                                                  format_cudnn),
                                 {},
                                 {}},
                                change_cudnn_algo) /
                        RUNS;
            } else {
                time_in_ms_cudnn =
                        benchmarker_cudnn.execs(
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(bias, bias_dtype,
                                                  format_cudnn),
                                 {},
                                 {}}) /
                        RUNS;
            }
        }
        // 2 * N * Co * Ho * Wo * Ci * Fh * Fw ops, scaled to tera-ops
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        // --- round 2: same layer with a fused z (residual add) input ---
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(
                            benchmarker,
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             get_tensor_shape(z, src_dtype, format),
                             {}},
                            algo) /
                    RUNS;
        } else {
            time_in_ms =
                    benchmarker.execs(
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             get_tensor_shape(z, src_dtype, format),
                             {}}) /
                    RUNS;
        }
        time_in_ms_cudnn = 0;
        if (with_cudnn) {
            if (change_cudnn_algo) {
                time_in_ms_cudnn =
                        algo_benchmark<ConvBiasForward,
                                       OprProxy<ConvBiasForward>, CUTimer>(
                                benchmarker_cudnn,
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(bias, bias_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(z, src_dtype, format_cudnn),
                                 {}},
                                change_cudnn_algo) /
                        RUNS;
            } else {
                time_in_ms_cudnn =
                        benchmarker_cudnn.execs(
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(bias, bias_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(z, src_dtype, format_cudnn),
                                 {}}) /
                        RUNS;
            }
        }
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
  466. #endif
  467. } // namespace conv
  468. } // namespace test
  469. } // namespace megdnn
  470. #undef V1
  471. #undef V

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台