You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

convolution.cpp 34 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827
  1. /**
  2. * \file dnn/test/cuda/convolution.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/dtype.h"
  13. #include "megdnn/oprs.h"
  14. #include "megdnn/opr_param_defs.h"
  15. #include "test/cuda/fixture.h"
  16. #include "test/common/tensor.h"
  17. #include "test/common/workspace_wrapper.h"
  18. #include "test/common/checker.h"
  19. #include "test/common/convolution.h"
  20. #include "test/common/rng.h"
  21. #include "test/cuda/benchmark.h"
  22. #include "src/cuda/utils.h"
  // Two-step stringification: V() lets its argument be macro-expanded first,
  // then V1() turns the expanded text into a string literal.
  #define V1(token) #token
  #define V(token) V1(token)
  // Literal of the form "v<major>.<minor>.<patch>" assembled from the
  // CUDNN_MAJOR / CUDNN_MINOR / CUDNN_PATCHLEVEL macros.
  #define CUDNN_VERSION_STRING \
      "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  27. namespace megdnn {
  28. namespace test {
  29. TEST_F(CUDA, CONVOLUTION_8X8X32) {
  30. if (!cuda::is_compute_capability_required(6, 1)) {
  31. printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device"
  32. "doesn't support\n");
  33. return;
  34. }
  35. using namespace convolution;
  36. std::vector<TestArg> args;
  37. {
  38. auto v = get_args();
  39. for (auto&& a : v) {
  40. args.push_back(std::move(a));
  41. }
  42. }
  43. {
  44. auto v = get_dilated_args();
  45. for (auto&& a : v) {
  46. args.push_back(std::move(a));
  47. }
  48. }
  49. {
  50. auto v = get_chanwise_args();
  51. for (auto&& a : v) {
  52. args.push_back(std::move(a));
  53. }
  54. }
  55. Checker<ConvolutionForward> checker(handle_cuda());
  56. UniformIntRNG rng(-4, 4);
  57. for (auto arg : args) {
  58. arg.param.format = param::Convolution::Format::NHWC;
  59. arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
  60. arg.filter = cvt_filter_nchw2nhwc(arg.filter);
  61. checker.set_dtype(0, dtype::Int8())
  62. .set_dtype(1, dtype::Int8())
  63. .set_dtype(2, dtype::Int32())
  64. .set_param(arg.param)
  65. .set_rng(0, &rng)
  66. .set_rng(1, &rng)
  67. .execs({arg.src, arg.filter, {}});
  68. }
  69. }
  70. TEST_F(CUDA, CONVOLUTION_FORWARD) {
  71. using namespace convolution;
  72. std::vector<TestArg> args = get_args();
  73. Checker<ConvolutionForward> checker(handle_cuda());
  74. NormalRNG default_rng;
  75. for (auto&& arg : args) {
  76. float scale =
  77. 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
  78. UniformFloatRNG rng(scale, 2 * scale);
  79. checker.set_dtype(0, dtype::Float32())
  80. .set_dtype(1, dtype::Float32())
  81. .set_dtype(2, dtype::Float32())
  82. .set_rng(0, &default_rng)
  83. .set_rng(1, &default_rng)
  84. .set_epsilon(1e-3)
  85. .set_param(arg.param)
  86. .execs({arg.src, arg.filter, {}});
  87. checker.set_dtype(0, dtype::Float16())
  88. .set_dtype(1, dtype::Float16())
  89. .set_dtype(2, dtype::Float16())
  90. .set_rng(0, &rng)
  91. .set_rng(1, &rng)
  92. .set_epsilon(1e-1)
  93. .set_param(arg.param)
  94. .execs({arg.src, arg.filter, {}});
  95. arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  96. checker.set_dtype(0, dtype::Float16())
  97. .set_dtype(1, dtype::Float16())
  98. .set_dtype(2, dtype::Float16())
  99. .set_rng(0, &rng)
  100. .set_rng(1, &rng)
  101. .set_epsilon(1e-1)
  102. .set_param(arg.param)
  103. .execs({arg.src, arg.filter, {}});
  104. checker.set_dtype(0, dtype::BFloat16())
  105. .set_dtype(1, dtype::BFloat16())
  106. .set_dtype(2, dtype::BFloat16())
  107. .set_epsilon(1e-1)
  108. .set_param(arg.param)
  109. .execs({arg.src, arg.filter, {}});
  110. }
  111. }
  112. TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
  113. if (!cuda::is_compute_capability_required(6, 1))
  114. return;
  115. using namespace convolution;
  116. Checker<Convolution> checker(handle_cuda());
  117. UniformIntRNG int_rng{-127, 127};
  118. Convolution::Param param;
  119. param.format = Convolution::Param::Format::NCHW4;
  120. checker.set_dtype(0, dtype::QuantizedS8(0.132f))
  121. .set_dtype(1, dtype::QuantizedS8(0.0239f))
  122. .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
  123. .set_rng(0, &int_rng)
  124. .set_rng(1, &int_rng)
  125. .set_param(param);
  126. checker.set_before_exec_callback(
  127. AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
  128. "DEFAULT",
  129. {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
  130. "MATMUL8X8X32", {})
  131. .c_str(),
  132. {}}}}));
  133. param.sparse = Convolution::Param::Sparse::DENSE;
  134. param.pad_h = param.pad_w = 1;
  135. param.stride_h = param.stride_w = 1;
  136. checker.set_param(param);
  137. checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
  138. checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
  139. checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
  140. }
  141. TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
  142. using namespace convolution;
  143. std::vector<TestArg> args = get_1x1_args();
  144. Checker<ConvolutionForward> checker(handle_cuda());
  145. NormalRNG default_rng;
  146. for (auto&& arg : args) {
  147. float scale =
  148. 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
  149. UniformFloatRNG rng(scale, 2 * scale);
  150. checker.set_dtype(0, dtype::Float32())
  151. .set_dtype(1, dtype::Float32())
  152. .set_rng(0, &default_rng)
  153. .set_rng(1, &default_rng)
  154. .set_epsilon(1e-3)
  155. .set_param(arg.param)
  156. .execs({arg.src, arg.filter, {}});
  157. }
  158. }
  159. TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
  160. using namespace convolution;
  161. std::vector<TestArg> args = get_1x1_args();
  162. Benchmarker<ConvolutionForward> marker(handle_cuda());
  163. NormalRNG default_rng;
  164. for (auto&& arg : args) {
  165. float scale =
  166. 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
  167. UniformFloatRNG rng(scale, 2 * scale);
  168. marker.set_dtype(0, dtype::Float32())
  169. .set_dtype(1, dtype::Float32())
  170. .set_rng(0, &default_rng)
  171. .set_rng(1, &default_rng)
  172. .set_param(arg.param)
  173. .execs({arg.src, arg.filter, {}});
  174. }
  175. }
  176. TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
  177. using namespace convolution;
  178. std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
  179. Checker<ConvolutionBackwardData> checker(handle_cuda());
  180. NormalRNG default_rng;
  181. for (auto&& arg : args) {
  182. float scale =
  183. 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
  184. UniformFloatRNG rng(scale, 2 * scale);
  185. auto src = TensorLayout(arg.src, dtype::Float32());
  186. auto filter = TensorLayout(arg.filter, dtype::Float32());
  187. TensorLayout dst;
  188. {
  189. auto opr = handle_cuda()->create_operator<Convolution>();
  190. opr->param() = arg.param;
  191. opr->deduce_layout(src, filter, dst);
  192. }
  193. src.dtype = dst.dtype = filter.dtype = dtype::Float32();
  194. checker.set_rng(0, &default_rng)
  195. .set_rng(1, &default_rng)
  196. .set_epsilon(1e-3)
  197. .set_param(arg.param)
  198. .exec(TensorLayoutArray{filter, dst, src});
  199. if (!cuda::is_compute_capability_required(6, 0)) {
  200. src.dtype = dst.dtype = filter.dtype = dtype::Float16();
  201. checker.set_rng(0, &rng)
  202. .set_rng(1, &rng)
  203. .set_epsilon(1e-1)
  204. .set_param(arg.param)
  205. .exec(TensorLayoutArray{filter, dst, src});
  206. arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  207. checker.set_rng(0, &rng)
  208. .set_rng(1, &rng)
  209. .set_epsilon(1e-1)
  210. .set_param(arg.param)
  211. .exec(TensorLayoutArray{filter, dst, src});
  212. }
  213. checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
  214. ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16",
  215. {{"MATMUL", {{"CUBLAS", {}}}}}}));
  216. src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
  217. arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  218. checker.set_rng(0, &rng)
  219. .set_rng(1, &rng)
  220. .set_epsilon(1e-1)
  221. .set_param(arg.param)
  222. .exec(TensorLayoutArray{filter, dst, src});
  223. checker.reset_before_exec_callback();
  224. checker.opr()->execution_policy() = {};
  225. }
  226. }
  227. TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
  228. using namespace convolution;
  229. std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
  230. Checker<ConvolutionBackwardData> checker(handle_cuda());
  231. checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
  232. ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
  233. NormalRNG default_rng;
  234. for (auto&& arg : args) {
  235. float scale =
  236. 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
  237. UniformFloatRNG rng(scale, 2 * scale);
  238. auto src = TensorLayout(arg.src, dtype::Float32());
  239. auto filter = TensorLayout(arg.filter, dtype::Float32());
  240. TensorLayout dst;
  241. {
  242. auto opr = handle_cuda()->create_operator<Convolution>();
  243. opr->param() = arg.param;
  244. opr->deduce_layout(src, filter, dst);
  245. }
  246. src.dtype = dst.dtype = filter.dtype = dtype::Float32();
  247. checker.set_rng(0, &default_rng)
  248. .set_rng(1, &default_rng)
  249. .set_epsilon(1e-3)
  250. .set_param(arg.param)
  251. .exec(TensorLayoutArray{filter, dst, src});
  252. }
  253. }
  254. TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_DP4A) {
  255. if (!cuda::is_compute_capability_required(6, 1)) {
  256. printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_DP4A test as current "
  257. "device doesn't support\n");
  258. return;
  259. }
  260. using namespace convolution;
  261. std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
  262. Checker<ConvolutionBackwardData> checker(handle_cuda());
  263. checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
  264. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  265. checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
  266. for (auto&& arg : args) {
  267. UniformIntRNG rng(-3, 3);
  268. auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
  269. auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
  270. TensorLayout dst;
  271. dst.dtype = dtype::QuantizedS8{1.2f};
  272. {
  273. auto opr = handle_cuda()->create_operator<Convolution>();
  274. opr->param() = arg.param;
  275. opr->deduce_layout(src, filter, dst);
  276. }
  277. checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
  278. TensorLayoutArray{filter, dst, src});
  279. }
  280. }
  281. TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
  282. // BRAIN-481 failed on architectures 7.0, remove the following if statement,
  283. // when cudnn fixed the problem.
  284. if (cuda::is_compute_capability_required(7, 0))
  285. return;
  286. using namespace convolution;
  287. std::vector<TestArg> args = get_args_cudnn_7_5_failures();
  288. Checker<ConvolutionBackwardData> checker(handle_cuda());
  289. NormalRNG default_rng;
  290. for (auto&& arg : args) {
  291. float scale =
  292. 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
  293. scale = std::max(scale, 1.f);
  294. UniformFloatRNG rng(scale, 2 * scale);
  295. auto src = TensorLayout(arg.src, dtype::Float32());
  296. auto filter = TensorLayout(arg.filter, dtype::Float32());
  297. TensorLayout dst;
  298. {
  299. auto opr = handle_cuda()->create_operator<Convolution>();
  300. opr->param() = arg.param;
  301. opr->deduce_layout(src, filter, dst);
  302. }
  303. src.dtype = dst.dtype = filter.dtype = dtype::Float32();
  304. checker.set_rng(0, &default_rng)
  305. .set_rng(1, &default_rng)
  306. .set_epsilon(1e-3)
  307. .set_param(arg.param)
  308. .exec(TensorLayoutArray{filter, dst, src});
  309. src.dtype = dst.dtype = filter.dtype = dtype::Float16();
  310. checker.set_rng(0, &rng)
  311. .set_rng(1, &rng)
  312. .set_epsilon(1e-1)
  313. .set_param(arg.param)
  314. .exec(TensorLayoutArray{filter, dst, src});
  315. arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  316. checker.set_rng(0, &rng)
  317. .set_rng(1, &rng)
  318. .set_epsilon(1e-1)
  319. .set_param(arg.param)
  320. .exec(TensorLayoutArray{filter, dst, src});
  321. }
  322. }
  323. TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
  324. using namespace convolution;
  325. std::vector<TestArg> args = get_args();
  326. Checker<ConvolutionBackwardFilter> checker(handle_cuda());
  327. bool f16_checked = false;
  328. for (auto&& arg : args) {
  329. auto src = TensorLayout(arg.src, dtype::Float32());
  330. auto filter = TensorLayout(arg.filter, dtype::Float32());
  331. TensorLayout dst;
  332. {
  333. auto opr = handle_cuda()->create_operator<Convolution>();
  334. opr->param() = arg.param;
  335. opr->deduce_layout(src, filter, dst);
  336. }
  337. float scale = 1.0f / sqrt(dst[2] * dst[3]);
  338. UniformFloatRNG rng(scale, 2 * scale);
  339. src.dtype = dst.dtype = filter.dtype = dtype::Float32();
  340. checker.set_rng(0, &rng)
  341. .set_rng(1, &rng)
  342. .set_epsilon(1e-3)
  343. .set_param(arg.param)
  344. .exec(TensorLayoutArray{src, dst, filter});
  345. // reduce on large f16 array may introduce significant error
  346. if (dst.total_nr_elems() >= 1000 && f16_checked)
  347. continue;
  348. f16_checked = true;
  349. src.dtype = dst.dtype = filter.dtype = dtype::Float16();
  350. checker.set_rng(0, &rng)
  351. .set_rng(1, &rng)
  352. .set_epsilon(1e-1)
  353. .set_param(arg.param)
  354. .exec(TensorLayoutArray{src, dst, filter});
  355. arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  356. checker.set_rng(0, &rng)
  357. .set_rng(1, &rng)
  358. .set_epsilon(1e-1)
  359. .set_param(arg.param)
  360. .exec(TensorLayoutArray{src, dst, filter});
  361. checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
  362. ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
  363. {{"MATMUL", {{"CUBLAS", {}}}}}}));
  364. src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
  365. checker.set_rng(0, &rng)
  366. .set_rng(1, &rng)
  367. .set_epsilon(1e-1)
  368. .set_param(arg.param)
  369. .exec(TensorLayoutArray{src, dst, filter});
  370. checker.reset_before_exec_callback();
  371. checker.opr()->execution_policy() = {};
  372. }
  373. }
  374. TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
  375. using namespace convolution;
  376. std::vector<TestArg> args = get_args();
  377. Checker<ConvolutionBackwardFilter> checker(handle_cuda());
  378. checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
  379. ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
  380. for (auto&& arg : args) {
  381. auto src = TensorLayout(arg.src, dtype::Float32());
  382. auto filter = TensorLayout(arg.filter, dtype::Float32());
  383. TensorLayout dst;
  384. {
  385. auto opr = handle_cuda()->create_operator<Convolution>();
  386. opr->param() = arg.param;
  387. opr->deduce_layout(src, filter, dst);
  388. }
  389. float scale = 1.0f / sqrt(dst[2] * dst[3]);
  390. UniformFloatRNG rng(scale, 2 * scale);
  391. src.dtype = dst.dtype = filter.dtype = dtype::Float32();
  392. checker.set_rng(0, &rng)
  393. .set_rng(1, &rng)
  394. .set_epsilon(1e-3)
  395. .set_param(arg.param)
  396. .exec(TensorLayoutArray{src, dst, filter});
  397. }
  398. }
  399. TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
  400. auto eps_getter = [](bool f16, int stage, const char* name) -> float {
  401. if (f16) {
  402. return stage == 2 ? 0.5 : 0.2;
  403. }
  404. if (strstr(name, "WINOGRAD_NONFUSED"))
  405. return 0.3;
  406. return 1e-3;
  407. };
  408. convolution::test_conv_config_combinations(2, handle_cuda(), false, true,
  409. true, eps_getter, true);
  410. convolution::test_conv_config_combinations(3, handle_cuda(), false, true,
  411. true, eps_getter, true);
  412. convolution::test_conv_config_combinations(5, handle_cuda(), false, true,
  413. true, eps_getter, true);
  414. }
  415. TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
  416. if (cuda::is_compute_capability_required(7, 0))
  417. return;
  418. using namespace convolution;
  419. Checker<ConvolutionBackwardData> checker(handle_cuda());
  420. checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
  421. "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
  422. NormalRNG default_rng;
  423. TensorShape s_filter = TensorShape{8, 8, 2, 2},
  424. s_src = TensorShape{2, 8, 18, 18};
  425. float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
  426. UniformFloatRNG rng(scale, 2 * scale);
  427. auto src = TensorLayout(s_src, dtype::Float16());
  428. auto filter = TensorLayout(s_filter, dtype::Float16());
  429. TensorLayout dst;
  430. param::Convolution param;
  431. param.pad_h = param.pad_w = 2;
  432. param.stride_h = param.stride_w = 2;
  433. {
  434. auto opr = handle_cuda()->create_operator<Convolution>();
  435. opr->param() = param;
  436. opr->deduce_layout(src, filter, dst);
  437. }
  438. src.dtype = dst.dtype = filter.dtype = dtype::Float16();
  439. param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  440. checker.set_rng(0, &rng)
  441. .set_rng(1, &rng)
  442. .set_epsilon(0.2)
  443. .set_param(param)
  444. .exec(TensorLayoutArray{filter, dst, src});
  445. }
  446. #if MEGDNN_WITH_BENCHMARK
  447. TEST_F(CUDA, CONV_FWD_BENCHMARK) {
  448. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
  449. size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1,
  450. size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) {
  451. auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
  452. benchmarker.set_dtype(0, dtype::Float16())
  453. .set_dtype(1, dtype::Float16())
  454. .set_dtype(2, dtype::Float16());
  455. ConvolutionForward::Param param;
  456. param.stride_h = SH;
  457. param.stride_w = SW;
  458. param.pad_h = PH;
  459. param.pad_w = PW;
  460. if (fp16io_c32) {
  461. param.compute_mode =
  462. ConvolutionForward::Param::ComputeMode::FLOAT32;
  463. }
  464. benchmarker.set_param(param);
  465. std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
  466. new OprProxy<ConvolutionForward>{true}};
  467. benchmarker.set_proxy(proxy);
  468. size_t OH = (IH - FH + 2 * PH) / SH + 1;
  469. size_t OW = (IW - FW + 2 * PW) / SW + 1;
  470. auto time = benchmarker.execs(
  471. {{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
  472. time /= 1000.0 * 10.0;
  473. auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
  474. auto flops = flo / time / 1e12;
  475. printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
  476. printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
  477. };
  478. run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
  479. run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
  480. }
  481. TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
  482. CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
  483. std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
  484. new OprProxy<ConvolutionForward>{true}};
  485. size_t RUNS = 10;
  486. bench.set_proxy(proxy).set_times(RUNS);
  487. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
  488. size_t FH, size_t SH, size_t PH) {
  489. bench.set_dtype(0, dtype::Float32())
  490. .set_dtype(1, dtype::Float32())
  491. .set_dtype(2, dtype::Float32());
  492. param::Convolution param;
  493. param.stride_h = param.stride_w = SH;
  494. param.pad_h = param.pad_w = PH;
  495. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  496. bench.set_param(param);
  497. bench.proxy()->target_execution_policy.algo.reset();
  498. TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
  499. filter{{OC, IC, FH, FH}, dtype::Float32()};
  500. TensorLayout dst;
  501. {
  502. auto&& opr = handle_cuda()->create_operator<Convolution>();
  503. opr->param() = param;
  504. opr->deduce_layout(src, filter, dst);
  505. }
  506. auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
  507. src.dtype = filter.dtype = dst.dtype = dtype::Float16();
  508. bench.proxy()->target_execution_policy.algo.reset();
  509. bench.set_dtype(0, dtype::Float16())
  510. .set_dtype(1, dtype::Float16())
  511. .set_dtype(2, dtype::Float16());
  512. auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
  513. param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  514. bench.proxy()->target_execution_policy.algo.reset();
  515. bench.set_param(param);
  516. auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
  517. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  518. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  519. filter.to_string().c_str(), dst.to_string().c_str());
  520. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  521. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  522. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  523. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  524. (flo / (time_ms_pseudo_fp16 * 1e9)));
  525. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  526. time_ms_fp32 / time_ms_true_fp16,
  527. time_ms_pseudo_fp16 / time_ms_true_fp16);
  528. };
  529. run(32, 64, 3, 224, 224, 7, 2, 3);
  530. run(32, 128, 128, 28, 28, 3, 1, 1);
  531. run(32, 256, 256, 14, 14, 3, 1, 1);
  532. run(32, 512, 512, 7, 7, 3, 1, 1);
  533. run(32, 64, 64, 56, 56, 3, 1, 1);
  534. run(32, 512, 256, 56, 56, 1, 2, 0);
  535. run(32, 1024, 512, 28, 28, 1, 2, 0);
  536. run(32, 2048, 1024, 14, 14, 1, 2, 0);
  537. run(32, 512, 128, 28, 28, 1, 1, 0);
  538. run(32, 128, 512, 28, 28, 1, 1, 0);
  539. run(32, 1024, 256, 14, 14, 1, 1, 0);
  540. run(32, 256, 1024, 14, 14, 1, 1, 0);
  541. run(32, 2048, 512, 7, 7, 1, 1, 0);
  542. run(32, 512, 2048, 7, 7, 1, 1, 0);
  543. run(32, 256, 64, 56, 56, 1, 1, 0);
  544. run(32, 64, 256, 56, 56, 1, 1, 0);
  545. run(32, 128, 256, 56, 56, 1, 2, 0);
  546. run(32, 256, 512, 28, 28, 1, 2, 0);
  547. run(32, 512, 1024, 14, 14, 1, 2, 0);
  548. run(32, 64, 64, 56, 56, 1, 1, 0);
  549. }
  550. TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
  551. CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
  552. std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
  553. new OprProxy<ConvolutionBackwardData>{true}};
  554. size_t RUNS = 10;
  555. bench.set_proxy(proxy).set_times(RUNS);
  556. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
  557. size_t FH, size_t SH, size_t PH) {
  558. bench.set_dtype(0, dtype::Float32())
  559. .set_dtype(1, dtype::Float32())
  560. .set_dtype(2, dtype::Float32());
  561. param::Convolution param;
  562. param.stride_h = param.stride_w = SH;
  563. param.pad_h = param.pad_w = PH;
  564. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  565. bench.set_param(param);
  566. bench.proxy()->target_execution_policy.algo.reset();
  567. TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
  568. filter{{OC, IC, FH, FH}, dtype::Float32()};
  569. TensorLayout dst;
  570. {
  571. auto&& opr = handle_cuda()->create_operator<Convolution>();
  572. opr->param() = param;
  573. opr->deduce_layout(src, filter, dst);
  574. }
  575. auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
  576. src.dtype = filter.dtype = dst.dtype = dtype::Float16();
  577. bench.proxy()->target_execution_policy.algo.reset();
  578. bench.set_dtype(0, dtype::Float16())
  579. .set_dtype(1, dtype::Float16())
  580. .set_dtype(2, dtype::Float16());
  581. auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
  582. param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  583. bench.proxy()->target_execution_policy.algo.reset();
  584. bench.set_param(param);
  585. auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
  586. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  587. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  588. filter.to_string().c_str(), dst.to_string().c_str());
  589. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  590. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  591. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  592. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  593. (flo / (time_ms_pseudo_fp16 * 1e9)));
  594. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  595. time_ms_fp32 / time_ms_true_fp16,
  596. time_ms_pseudo_fp16 / time_ms_true_fp16);
  597. };
  598. run(32, 64, 3, 224, 224, 7, 2, 3);
  599. run(32, 128, 128, 28, 28, 3, 1, 1);
  600. run(32, 256, 256, 14, 14, 3, 1, 1);
  601. run(32, 512, 512, 7, 7, 3, 1, 1);
  602. run(32, 64, 64, 56, 56, 3, 1, 1);
  603. run(32, 512, 256, 56, 56, 1, 2, 0);
  604. run(32, 1024, 512, 28, 28, 1, 2, 0);
  605. run(32, 2048, 1024, 14, 14, 1, 2, 0);
  606. run(32, 512, 128, 28, 28, 1, 1, 0);
  607. run(32, 128, 512, 28, 28, 1, 1, 0);
  608. run(32, 1024, 256, 14, 14, 1, 1, 0);
  609. run(32, 256, 1024, 14, 14, 1, 1, 0);
  610. run(32, 2048, 512, 7, 7, 1, 1, 0);
  611. run(32, 512, 2048, 7, 7, 1, 1, 0);
  612. run(32, 256, 64, 56, 56, 1, 1, 0);
  613. run(32, 64, 256, 56, 56, 1, 1, 0);
  614. run(32, 128, 256, 56, 56, 1, 2, 0);
  615. run(32, 256, 512, 28, 28, 1, 2, 0);
  616. run(32, 512, 1024, 14, 14, 1, 2, 0);
  617. run(32, 64, 64, 56, 56, 1, 1, 0);
  618. }
  619. TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
  620. CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
  621. std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
  622. new OprProxy<ConvolutionBackwardData>{true}};
  623. size_t RUNS = 10;
  624. bench.set_proxy(proxy).set_times(RUNS);
  625. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
  626. size_t FH, size_t SH, size_t PH) {
  627. bench.set_dtype(0, dtype::BFloat16())
  628. .set_dtype(1, dtype::BFloat16())
  629. .set_dtype(2, dtype::BFloat16());
  630. param::Convolution param;
  631. param.stride_h = param.stride_w = SH;
  632. param.pad_h = param.pad_w = PH;
  633. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  634. bench.set_param(param);
  635. bench.proxy()->target_execution_policy = {};
  636. TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
  637. filter{{OC, IC, FH, FH}, dtype::BFloat16()};
  638. TensorLayout dst;
  639. {
  640. auto&& opr = handle_cuda()->create_operator<Convolution>();
  641. opr->param() = param;
  642. opr->deduce_layout(src, filter, dst);
  643. }
  644. auto used = bench.execl({filter, dst, src}) / RUNS;
  645. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  646. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  647. filter.to_string().c_str(), dst.to_string().c_str());
  648. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", used,
  649. (flo / (used * 1e9)));
  650. };
  651. run(32, 64, 3, 224, 224, 7, 2, 3);
  652. run(32, 128, 128, 28, 28, 3, 1, 1);
  653. run(32, 256, 256, 14, 14, 3, 1, 1);
  654. run(32, 512, 512, 7, 7, 3, 1, 1);
  655. run(32, 64, 64, 56, 56, 3, 1, 1);
  656. run(32, 512, 256, 56, 56, 1, 2, 0);
  657. run(32, 1024, 512, 28, 28, 1, 2, 0);
  658. run(32, 2048, 1024, 14, 14, 1, 2, 0);
  659. run(32, 512, 128, 28, 28, 1, 1, 0);
  660. run(32, 128, 512, 28, 28, 1, 1, 0);
  661. run(32, 1024, 256, 14, 14, 1, 1, 0);
  662. run(32, 256, 1024, 14, 14, 1, 1, 0);
  663. run(32, 2048, 512, 7, 7, 1, 1, 0);
  664. run(32, 512, 2048, 7, 7, 1, 1, 0);
  665. run(32, 256, 64, 56, 56, 1, 1, 0);
  666. run(32, 64, 256, 56, 56, 1, 1, 0);
  667. run(32, 128, 256, 56, 56, 1, 2, 0);
  668. run(32, 256, 512, 28, 28, 1, 2, 0);
  669. run(32, 512, 1024, 14, 14, 1, 2, 0);
  670. run(32, 64, 64, 56, 56, 1, 1, 0);
  671. }
  672. TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
  673. CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
  674. std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
  675. new OprProxy<ConvolutionBackwardData>{true}};
  676. size_t RUNS = 10;
  677. bench.set_proxy(proxy).set_times(RUNS);
  678. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
  679. size_t FH, size_t SH, size_t PH) {
  680. bench.set_dtype(0, dtype::QuantizedS8{1.0f})
  681. .set_dtype(1, dtype::QuantizedS8{1.0f})
  682. .set_dtype(2, dtype::QuantizedS8{1.0f});
  683. param::Convolution param;
  684. param.format = param::Convolution::Format::NCHW4;
  685. param.stride_h = param.stride_w = SH;
  686. param.pad_h = param.pad_w = PH;
  687. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  688. bench.set_param(param);
  689. bench.proxy()->target_execution_policy = {};
  690. TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
  691. filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
  692. TensorLayout dst;
  693. dst.dtype = dtype::QuantizedS8{1.0f};
  694. {
  695. auto&& opr = handle_cuda()->create_operator<Convolution>();
  696. opr->param() = param;
  697. opr->deduce_layout(src, filter, dst);
  698. }
  699. auto used = bench.execl({filter, dst, src}) / RUNS;
  700. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  701. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  702. filter.to_string().c_str(), dst.to_string().c_str());
  703. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", used,
  704. (flo / (used * 1e9)));
  705. };
  706. run(64, 32, 32, 92, 180, 4, 2, 2);
  707. run(64, 32, 32, 46, 80, 4, 2, 2);
  708. run(16, 16, 16, 92, 180, 4, 2, 2);
  709. run(16, 16, 16, 46, 80, 4, 2, 2);
  710. }
  711. TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
  712. CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
  713. std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
  714. new OprProxy<ConvolutionBackwardFilter>{true}};
  715. size_t RUNS = 10;
  716. bench.set_proxy(proxy).set_times(RUNS);
  717. auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
  718. size_t FH, size_t SH, size_t PH) {
  719. bench.set_dtype(0, dtype::Float32())
  720. .set_dtype(1, dtype::Float32())
  721. .set_dtype(2, dtype::Float32());
  722. param::Convolution param;
  723. param.stride_h = param.stride_w = SH;
  724. param.pad_h = param.pad_w = PH;
  725. param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
  726. bench.set_param(param);
  727. bench.proxy()->target_execution_policy.algo.reset();
  728. TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
  729. filter{{OC, IC, FH, FH}, dtype::Float32()};
  730. TensorLayout dst;
  731. {
  732. auto&& opr = handle_cuda()->create_operator<Convolution>();
  733. opr->param() = param;
  734. opr->deduce_layout(src, filter, dst);
  735. }
  736. auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
  737. src.dtype = filter.dtype = dst.dtype = dtype::Float16();
  738. bench.proxy()->target_execution_policy.algo.reset();
  739. bench.set_dtype(0, dtype::Float16())
  740. .set_dtype(1, dtype::Float16())
  741. .set_dtype(2, dtype::Float16());
  742. auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
  743. param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
  744. bench.proxy()->target_execution_policy.algo.reset();
  745. bench.set_param(param);
  746. auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
  747. float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
  748. printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
  749. filter.to_string().c_str(), dst.to_string().c_str());
  750. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  751. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  752. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  753. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  754. (flo / (time_ms_pseudo_fp16 * 1e9)));
  755. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  756. time_ms_fp32 / time_ms_true_fp16,
  757. time_ms_pseudo_fp16 / time_ms_true_fp16);
  758. };
  759. run(32, 64, 3, 224, 224, 7, 2, 3);
  760. run(32, 128, 128, 28, 28, 3, 1, 1);
  761. run(32, 256, 256, 14, 14, 3, 1, 1);
  762. run(32, 512, 512, 7, 7, 3, 1, 1);
  763. run(32, 64, 64, 56, 56, 3, 1, 1);
  764. run(32, 512, 256, 56, 56, 1, 2, 0);
  765. run(32, 1024, 512, 28, 28, 1, 2, 0);
  766. run(32, 2048, 1024, 14, 14, 1, 2, 0);
  767. run(32, 512, 128, 28, 28, 1, 1, 0);
  768. run(32, 128, 512, 28, 28, 1, 1, 0);
  769. run(32, 1024, 256, 14, 14, 1, 1, 0);
  770. run(32, 256, 1024, 14, 14, 1, 1, 0);
  771. run(32, 2048, 512, 7, 7, 1, 1, 0);
  772. run(32, 512, 2048, 7, 7, 1, 1, 0);
  773. run(32, 256, 64, 56, 56, 1, 1, 0);
  774. run(32, 64, 256, 56, 56, 1, 1, 0);
  775. run(32, 128, 256, 56, 56, 1, 2, 0);
  776. run(32, 256, 512, 28, 28, 1, 2, 0);
  777. run(32, 512, 1024, 14, 14, 1, 2, 0);
  778. run(32, 64, 64, 56, 56, 1, 1, 0);
  779. }
  780. #endif
  781. #undef CUDNN_VERSION_STRING
  782. #undef V
  783. #undef V1
  784. } // namespace test
  785. } // namespace megdnn
  786. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台