/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "megdnn/opr_param_defs.h"
#include "test/cuda/fixture.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/rng.h"
#include "test/cuda/benchmark.h"
#include "src/cuda/utils.h"

#define V1(x) #x
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

namespace megdnn {
namespace test {
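
// INT8x8x32 convolution relies on the dp4a instruction, which requires
// compute capability >= 6.1, so the test below is skipped on older devices.
// Dense, dilated and channel-wise cases are all converted to NHWC layout.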
TEST_F(CUDA, CONVOLUTION_8X8X32)
{
    if (!cuda::is_compute_capability_required(6, 1)) {
        printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device "
               "doesn't support it\n");
        return;
    }

    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
    UniformIntRNG rng(-4, 4);
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}
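
// Forward convolution is checked in four precision configurations: fp32,
// true fp16, fp16 I/O with fp32 accumulation (ComputeMode::FLOAT32), and
// bf16; the epsilon is relaxed from 1e-3 to 1e-1 for the 16-bit runs.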
TEST_F(CUDA, CONVOLUTION_FORWARD)
{
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale =
                1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
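
// Quantized int8 forward convolution in NCHW4 layout; the before-exec
// callback pins the conv-bias "MATMUL8X8X32" algorithm under the DEFAULT
// dispatcher, so no other implementation can be silently selected.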
TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    if (!cuda::is_compute_capability_required(6, 1))
        return;
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}
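
// A 1x1 convolution effectively reduces to a GEMM over spatial positions;
// only the fp32 path is checked here.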
TEST_F(CUDA, CONVOLUTION_1X1_FORWARD)
{
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale =
                1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD)
{
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale =
                1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
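
// For ConvolutionBackwardData the checker operands are ordered
// {filter, diff(dst), grad(src)}: the operator computes the gradient with
// respect to the input from the output gradient and the filter.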
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA)
{
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale =
                64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!cuda::is_compute_capability_required(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
        checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
                ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
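
// The same backward-data cases, but pinned to the MATMUL algorithm with its
// underlying matrix multiply forced onto CUBLAS.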
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL)
{
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale =
                64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5)
{
    // BRAIN-481: failed on compute capability 7.0; remove the following if
    // statement once cuDNN fixes the problem.
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale =
                128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}
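
// For ConvolutionBackwardFilter the operands are {src, diff(dst), grad(filter)}.
// Reductions over large f16 tensors accumulate too much error, so after the
// first f16 case only outputs with fewer than 1000 elements are re-checked
// in f16.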
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER)
{
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reducing over a large f16 array may introduce significant error
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
                ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}
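
// As above, but pinned to the CUBLAS-backed MATMUL backward-filter algorithm.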
TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL)
{
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
}
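
// Sweeps dtype/format/algorithm combinations via the shared helper for three
// values of its first size parameter (2, 3 and 5); f16 runs and
// WINOGRAD_NONFUSED get larger tolerances since both are numerically less
// precise.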
TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
    convolution::test_conv_config_combinations(2, handle_cuda(), false, true,
                                               true, eps_getter, true);
    convolution::test_conv_config_combinations(3, handle_cuda(), false, true,
                                               true, eps_getter, true);
    convolution::test_conv_config_combinations(5, handle_cuda(), false, true,
                                               true, eps_getter, true);
}
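
// Pins cuDNN's BWD_DATA_ALGO_1 (the algorithm name carries the cuDNN version
// suffix built above) on a strided fp16 case with fp32 accumulation.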
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    if (cuda::is_compute_capability_required(7, 0))
        return;
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2},
                s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_epsilon(0.2)
            .set_param(param)
            .exec(TensorLayoutArray{filter, dst, src});
}

#if MEGDNN_WITH_BENCHMARK
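
// The benchmarks below derive FLOPs as 2 * N * OC * IC * OH * OW * FH * FW
// (one multiply plus one add per multiply-accumulate); times are averaged
// over RUNS executions and reported together with the achieved TFLOPS.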
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1,
                   size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode =
                    ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time = benchmarker.execs(
                {{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}
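
// Compares fp32, true fp16 and pseudo fp16 (fp16 I/O with fp32 compute) on a
// set of ResNet-style layer shapes, clearing the previously selected
// algorithm between dtype switches so each configuration is profiled
// independently.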
TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
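
// The same shape sweep for backward data; note the execl operand order
// {filter, dst, src} matching ConvolutionBackwardData.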
TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used,
               (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
               "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fTFLOPS\n",
               time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
               (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
               (flo / (time_ms_pseudo_fp16 * 1e9)));
        printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
               time_ms_fp32 / time_ms_true_fp16,
               time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
#endif

#undef CUDNN_VERSION_STRING
#undef V
#undef V1

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has GPU hardware and a properly installed driver. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit MegStudio.