/**
 * \file dnn/test/cuda/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/common/convolution.h"
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "test/common/accuracy_shake_checker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
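
// V1/V implement two-level macro stringization: V(x) expands its argument
// before quoting it, so CUDNN_VERSION_STRING evaluates to a literal such as
// "v8.0.4" built from the numeric cuDNN version macros.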
#define V1(x) #x
#define V(x)  V1(x)
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

namespace megdnn {
namespace test {

TEST_F(CUDA, CONVOLUTION_8X8X32) {
    require_compute_capability(6, 1);
    using namespace convolution;
    std::vector<TestArg> args;
    {
        auto v = get_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_dilated_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    {
        auto v = get_chanwise_args();
        for (auto&& a : v) {
            args.push_back(std::move(a));
        }
    }
    Checker<ConvolutionForward> checker(handle_cuda());
    UniformIntRNG rng(-4, 4);
    for (auto arg : args) {
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        checker.set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_param(arg.param)
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
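        // Scale the filter RNG by 1/sqrt(fan-in) so the accumulated output
        // stays O(1); this keeps low-precision rounding error within the
        // epsilons set below.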
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
        checker.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16())
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) {
    require_compute_capability(6, 1);
    using namespace convolution;
    Checker<Convolution> checker(handle_cuda());
    UniformIntRNG int_rng{-127, 127};
    Convolution::Param param;
    param.format = Convolution::Param::Format::NCHW4;
    checker.set_dtype(0, dtype::QuantizedS8(0.132f))
            .set_dtype(1, dtype::QuantizedS8(0.0239f))
            .set_dtype(2, dtype::QuantizedS32(0.132f * 0.0239f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
                    "DEFAULT",
                    {{ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                              "MATMUL8X8X32", {})
                              .c_str(),
                      {}}}}));
    param.sparse = Convolution::Param::Sparse::DENSE;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    checker.set_param(param);
    checker.exec({{8, 4, 10, 10, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{1, 4, 2, 2, 4}, {16, 4, 3, 3, 4}, {}});
    checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}});
}

TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Checker<ConvolutionForward> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
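        // Deduce the forward output layout; backward-data then takes
        // (filter, dst-as-diff, src-as-grad), i.e. dst plays the role of the
        // incoming gradient and src is the layout to be reconstructed.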
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        if (!check_compute_capability(6, 0)) {
            src.dtype = dst.dtype = filter.dtype = dtype::Float16();
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
            arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_epsilon(1e-1)
                    .set_param(arg.param)
                    .exec(TensorLayoutArray{filter, dst, src});
        }
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_DATD_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FP16_CUDNN7_5) {
    // Algo CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 with TensorCore operations
    // produces incorrect results; nvidia may have fixed this issue since.
    // The incorrect case is exercised by:
    // inp={2x8x18x18}, kern={8x8x2x2}, pad_h=pad_w=2, stride_h=stride_w=2,
    // dtype=float16
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_5_1_backward();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_NHWC) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        arg.param.format = param::Convolution::Format::NHWC;
        arg.src = cvt_src_or_dst_nchw2nhwc(arg.src);
        arg.filter = cvt_filter_nchw2nhwc(arg.filter);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        arg.param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_CUDNN) {
    require_compute_capability(7, 0);
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
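        // The {2,16,7,7} tensors use a batch stride of 1568 rather than the
        // contiguous 784 (= 16*7*7), i.e. they view every other batch of a
        // larger buffer, which makes these layouts noncontiguous.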
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
        });
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    require_compute_capability(6, 1);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw4_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
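        // Encodes the threadblock/warp tile shapes and pipeline stage count
        // as the suffix of a registered algorithm name, e.g.
        // "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8_2stage".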
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
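        // Epsilon slightly above 1 admits differences of about one
        // quantization step in the int8 output, while set_max_avg_error
        // keeps the aggregate error small.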
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW_DP4A) {
    require_compute_capability(6, 1);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nchw_conv_bwd_data();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("INT8_NCHW_DOTPROD_IMPLICIT_GEMM"));
    checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
    for (auto&& arg : args) {
        UniformIntRNG rng(-3, 3);
        auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
        auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.2f};
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                TensorLayoutArray{filter, dst, src});
    }
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NHWC_IMMA) {
    require_compute_capability(7, 5);
    using namespace convolution;
    std::vector<TestArg> args = get_args_int8_nhwc_conv_bwd_data();
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
        }
    };
    std::vector<AlgoParam> all_params;
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 4});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 8});
    all_params.emplace_back(AlgoParam{64, 16, 32, 64, 16, 32, 2, 16});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 4});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 8});
    all_params.emplace_back(AlgoParam{128, 32, 32, 64, 32, 32, 1, 16});
    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());
        std::string algo_name(ssprintf(
                "INT8_NHWC_IMMA_IMPLICIT_GEMM%s", algo_param.to_string().c_str()));
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardData>(algo_name.c_str()));
        checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1);
        for (auto&& arg : args) {
            UniformIntRNG rng(-3, 3);
            auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f});
            auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f});
            TensorLayout dst;
            dst.dtype = dtype::QuantizedS8{1.2f};
            {
                auto opr = handle_cuda()->create_operator<Convolution>();
                opr->param() = arg.param;
                opr->deduce_layout(src, filter, dst);
            }
            checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec(
                    TensorLayoutArray{filter, dst, src});
        }
    }
}
#endif

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
    // BRAIN-481: failed on architecture 7.0; remove the following check
    // when cudnn fixes the problem.
    require_compute_capability(7, 0);
    using namespace convolution;
    std::vector<TestArg> args = get_args_cudnn_7_5_failures();
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
        scale = std::max(scale, 1.f);
        UniformFloatRNG rng(scale, 2 * scale);
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{filter, dst, src});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    bool f16_checked = false;
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        // reduction over a large f16 array may introduce significant error,
        // so run at most one large f16 case
        if (dst.total_nr_elems() >= 1000 && f16_checked)
            continue;
        f16_checked = true;
        src.dtype = dst.dtype = filter.dtype = dtype::Float16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.set_before_exec_callback(
                AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{
                        "CONVOLUTION_BACKWARD_FILTER_BFLOAT16",
                        {{"MATMUL", {{"CUBLAS", {}}}}}}));
        src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-1)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
        checker.reset_before_exec_callback();
        checker.opr()->execution_policy() = {};
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) {
    using namespace convolution;
    std::vector<TestArg> args = get_args();
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
    for (auto&& arg : args) {
        auto src = TensorLayout(arg.src, dtype::Float32());
        auto filter = TensorLayout(arg.filter, dtype::Float32());
        TensorLayout dst;
        {
            auto opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = arg.param;
            opr->deduce_layout(src, filter, dst);
        }
        float scale = 1.0f / sqrt(dst[2] * dst[3]);
        UniformFloatRNG rng(scale, 2 * scale);
        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-3)
                .set_param(arg.param)
                .exec(TensorLayoutArray{src, dst, filter});
    }
    //! noncontiguous case
    {
        NormalRNG default_rng;
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(param)
                .execl(TensorLayoutArray{
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                        {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_CUDNN) {
    require_compute_capability(7, 0);
    using namespace convolution;
    Checker<ConvolutionBackwardFilter> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardFilter>("CUDNN_CONVOLUTION"));
    //! noncontiguous case
    {
        param::Convolution param;
        param.pad_h = param.pad_w = 1;
        checker.set_param(param).execl(TensorLayoutArray{
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{2, 16, 7, 7}, {1568, 49, 7, 1}, dtype::Float32()},
                {{16, 16, 3, 3}, {144, 9, 3, 1}, dtype::Float32()}});
    }
}

TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) {
    auto eps_getter = [](bool f16, int stage, const char* name) -> float {
        if (f16) {
            return stage == 2 ? 0.5 : 0.2;
        }
        if (strstr(name, "WINOGRAD_NONFUSED"))
            return 0.3;
        return 1e-3;
    };
    convolution::test_conv_config_combinations(
            2, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            3, handle_cuda(), false, true, true, eps_getter, true);
    convolution::test_conv_config_combinations(
            5, handle_cuda(), false, true, true, eps_getter, true);
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
    require_compute_capability(7, 0);
    using namespace convolution;
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
            "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
    NormalRNG default_rng;
    TensorShape s_filter = TensorShape{8, 8, 2, 2}, s_src = TensorShape{2, 8, 18, 18};
    float scale = 1.0f / sqrt(s_filter[0] * s_filter[2] * s_filter[3]);
    UniformFloatRNG rng(scale, 2 * scale);
    auto src = TensorLayout(s_src, dtype::Float16());
    auto filter = TensorLayout(s_filter, dtype::Float16());
    TensorLayout dst;
    param::Convolution param;
    param.pad_h = param.pad_w = 2;
    param.stride_h = param.stride_w = 2;
    {
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(src, filter, dst);
    }
    src.dtype = dst.dtype = filter.dtype = dtype::Float16();
    param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
    checker.set_rng(0, &rng).set_rng(1, &rng).set_epsilon(0.2).set_param(param).exec(
            TensorLayoutArray{filter, dst, src});
}

TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    for (auto dtype : std::vector<DType>{
                 dtype::Float32(),
#if CUDA_VERSION >= 9000
                 dtype::Float16()
#endif
         }) {
        auto run = [&checker, &dtype](
                           size_t n, size_t g, size_t h, size_t fh, size_t padding,
                           size_t stride) {
            param::Convolution param;
            param.stride_h = param.stride_w = stride;
            param.pad_h = param.pad_w = padding;
            param.mode = Convolution::Mode::CROSS_CORRELATION;
            param.sparse = param::Convolution::Sparse::GROUP;
            checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, scale * 2);
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng);
            if (dtype.enumv() == DTypeEnum::Float16)
                checker.set_epsilon(1e-1);
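            // execs() for backward data takes {filter, diff, grad}; the diff
            // spatial size is the forward conv output size for an h x h input.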
            checker.set_param(param).execs(
                    {{g, 1, 1, fh, fh},
                     {n, g, (h + 2 * padding - fh + 1) / stride,
                      (h + 2 * padding - fh + 1) / stride},
                     {n, g, h, h}});
        };
        run(4, 8, 32, 5, 5 / 2, 1);
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
        run(4, 8, 64, 5, 5 / 2, 2);
        run(4, 8, 64, 7, 7 / 3, 2);
        run(4, 8, 64, 9, 9 / 3, 2);
        run(4, 8, 64, 11, 11 / 3, 2);
        run(4, 8, 64, 13, 13 / 3, 2);
        run(4, 8, 64, 15, 15 / 3, 2);
        run(4, 8, 64, 17, 17 / 3, 2);
        run(4, 8, 64, 19, 19 / 3, 2);
        run(4, 8, 64, 21, 21 / 3, 2);
        run(4, 8, 64, 23, 23 / 3, 2);
        run(4, 8, 64, 25, 25 / 3, 2);
        run(4, 8, 64, 27, 27 / 3, 2);
        run(4, 8, 64, 29, 29 / 3, 2);
        run(4, 8, 64, 31, 31 / 3, 2);
        run(1, 2, 128, 31, 31 / 3, 2);
        run(1, 2, 256, 31, 31 / 3, 2);
    }
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
                   size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
                   size_t PW = 0, bool fp16io_c32 = false) {
        auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        ConvolutionForward::Param param;
        param.stride_h = SH;
        param.stride_w = SW;
        param.pad_h = PH;
        param.pad_w = PW;
        if (fp16io_c32) {
            param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
        }
        benchmarker.set_param(param);
        std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
                new OprProxy<ConvolutionForward>{true}};
        benchmarker.set_proxy(proxy);
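        // Standard convolution output size: OH = (IH + 2*PH - FH) / SH + 1,
        // and likewise for OW.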
        size_t OH = (IH - FH + 2 * PH) / SH + 1;
        size_t OW = (IW - FW + 2 * PW) / SW + 1;
        auto time =
                benchmarker.execs({{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
        time /= 1000.0 * 10.0;
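        // Count 2 FLOPs (one multiply, one add) per filter tap per output
        // element: 2 * N * OC * IC * OH * OW * FH * FW.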
        auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
        auto flops = flo / time / 1e12;
        printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
        printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
    };
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
    run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
}

TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
    CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
  793. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  794. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  795. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  796. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  797. (flo / (time_ms_pseudo_fp16 * 1e9)));
  798. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  799. time_ms_fp32 / time_ms_true_fp16,
  800. time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
  862. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  863. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  864. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  865. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  866. (flo / (time_ms_pseudo_fp16 * 1e9)));
  867. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  868. time_ms_fp32 / time_ms_true_fp16,
  869. time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP32) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bencher.execl({filter, dst, src}) / nr_times;
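        // Depthwise: each group has a single input/output channel, so the
        // FLOP count is 2 * N * g * OH * OW * FH * FH.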
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP16) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float16()},
                filter{{g, 1, 1, FH, FH}, dtype::Float16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp16 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp16=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp16,
               (flo / (time_ms_fp16 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::QuantizedS8{1.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS8{1.0f});
        param::Convolution param;
        param.format = param::Convolution::Format::NCHW4;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}},
                filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}};
        TensorLayout dst;
        dst.dtype = dtype::QuantizedS8{1.0f};
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used, (flo / (used * 1e9)));
    };
    run(64, 32, 32, 92, 180, 4, 2, 2);
    run(64, 32, 32, 46, 80, 4, 2, 2);
    run(16, 16, 16, 92, 180, 4, 2, 2);
    run(16, 16, 16, 46, 80, 4, 2, 2);
}

TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
                filter{{OC, IC, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        src.dtype = filter.dtype = dst.dtype = dtype::Float16();
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bench.proxy()->target_execution_policy.algo.reset();
        bench.set_param(param);
        auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
  1115. printf("time_fp32=%.2fms, flops=%.3fTFLOPS\ntime_true_fp16=%.2fms, "
  1116. "flops=%.3fTFLOPS\ntime_pseudo_fp16=%.2fms, flops=%.3fFLOPS\n",
  1117. time_ms_fp32, (flo / (time_ms_fp32 * 1e9)), time_ms_true_fp16,
  1118. (flo / (time_ms_true_fp16 * 1e9)), time_ms_pseudo_fp16,
  1119. (flo / (time_ms_pseudo_fp16 * 1e9)));
  1120. printf("speedup (true_fp16/fp32)=%.2f, (true_fp16/pseudo_fp16)=%.2f\n",
  1121. time_ms_fp32 / time_ms_true_fp16,
  1122. time_ms_pseudo_fp16 / time_ms_true_fp16);
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}

TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_DEPTHWISE_LARGE_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);
    bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3"));
    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 31, 1, 15);
}
#endif

#undef CUDNN_VERSION_STRING
#undef V
#undef V1

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen