You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

matrix_mul.cpp 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. /**
  2. * \file dnn/test/armv7/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/armv7/fixture.h"
  12. #include "test/common/benchmarker.h"
  13. #include "test/common/checker.h"
  14. #include "test/common/matrix_mul.h"
  15. #include "test/common/rng.h"
  16. using namespace megdnn;
  17. using namespace test;
  18. TEST_F(ARMV7, MATRIX_MUL) {
  19. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  20. dtype::Float32{}, handle(), "ARMV7_F32");
  21. }
  22. TEST_F(ARMV7, MATRIX_MUL_MK4) {
  23. matrix_mul::check_matrix_mul(
  24. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  25. "ARMV7_F32_MK4_4x8", param::MatrixMul::Format::MK4, 4);
  26. }
  27. TEST_F(ARMV7, MATRIX_MUL_MK4_INT8) {
  28. std::vector<matrix_mul::TestArg> args;
  29. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  30. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  31. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  32. args.emplace_back(m, n, k, 0);
  33. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  34. handle(), "ARMV7_INT8X8X32_MK4_4X2X16",
  35. param::MatrixMul::Format::MK4, 1, 1e-3,
  36. std::move(args));
  37. }
  38. TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_K4x8x8) {
  39. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  40. handle(), "ARMV7_INT8X8X16_K4X8X8");
  41. }
  42. TEST_F(ARMV7, MATRIX_MUL_INT16x16x32) {
  43. matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  44. handle(),"ARMV7_INT16X16X32_K12X4X1");
  45. }
  46. TEST_F(ARMV7, MATRIX_MUL_INT16x16x32_MK8) {
  47. matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  48. handle(), "ARMV7_INT16X16X32_MK8_4X8",
  49. param::MatrixMul::Format::MK8, 4);
  50. }
  51. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  52. TEST_F(ARMV7, MATRIX_MUL_FP16) {
  53. matrix_mul::check_matrix_mul(dtype::Float16{}, dtype::Float16{},
  54. dtype::Float16{}, handle(),
  55. "AARCH32_F16_K4X16X1");
  56. }
  57. TEST_F(ARMV7, MATRIX_MUL_F16_MK8) {
  58. matrix_mul::check_matrix_mul(
  59. dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
  60. "AARCH32_F16_MK8_4X8", param::MatrixMul::Format::MK8, 4);
  61. }
  62. #endif
  63. #if __ARM_FEATURE_DOTPROD
  64. TEST_F(ARMV7, MATRIX_MUL_SDOT) {
  65. matrix_mul::check_matrix_mul(dtype::Int8(), dtype::Int8(), dtype::Int32(),
  66. handle(), "AARCH32_INT8_K6X8X4");
  67. }
  68. TEST_F(ARMV7, MATRIX_MUL_UDOT) {
  69. matrix_mul::check_matrix_mul(
  70. dtype::Quantized8Asymm(4.0f, static_cast<uint8_t>(10)), dtype::Quantized8Asymm(3.0f, static_cast<uint8_t>(54)),
  71. dtype::QuantizedS32(12.0f), handle(), "AARCH32_QUINT8_K4X8X4");
  72. }
  73. #endif
  74. #if MEGDNN_WITH_BENCHMARK
  75. namespace {
  76. void run_8x8x16_benchmark(const char* algo, Handle* handle) {
  77. constexpr size_t RUNS = 50;
  78. param::MatrixMul param;
  79. Benchmarker<MatrixMul> benchmarker_int(handle);
  80. Benchmarker<MatrixMul> benchmarker_int_kern_4x2x16(handle);
  81. benchmarker_int.set_before_exec_callback(
  82. AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X16"));
  83. benchmarker_int.set_times(RUNS)
  84. .set_dtype(0, dtype::Int8{})
  85. .set_dtype(1, dtype::Int8{})
  86. .set_dtype(2, dtype::Int16{})
  87. .set_param(param)
  88. .set_display(false);
  89. benchmarker_int_kern_4x2x16.set_before_exec_callback(
  90. AlgoChecker<MatrixMul>(algo));
  91. benchmarker_int_kern_4x2x16.set_times(RUNS)
  92. .set_dtype(0, dtype::Int8{})
  93. .set_dtype(1, dtype::Int8{})
  94. .set_dtype(2, dtype::Int16{})
  95. .set_param(param)
  96. .set_display(false);
  97. Benchmarker<MatrixMul> benchmarker_float(handle);
  98. benchmarker_float.set_display(false).set_times(RUNS);
  99. auto run = [&](size_t M, size_t N, size_t K) {
  100. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  101. auto int_kern_used =
  102. benchmarker_int_kern_4x2x16.exec({{M, K}, {K, N}, {}}) / RUNS;
  103. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  104. float computations = 2.f * M * K * N * 1e-6;
  105. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f "
  106. "ms "
  107. "%f Gflops %s: %f ms %f Gflops "
  108. "speedup(%s/arm_common, %s/float): %f "
  109. "%f\n",
  110. M, K, N, float_used, computations / float_used, int_used,
  111. computations / int_used, algo, int_kern_used,
  112. computations / int_kern_used, algo, algo,
  113. int_used / int_kern_used, float_used / int_kern_used);
  114. };
  115. run(256, 12 * 24, 256);
  116. //////////////////////// gemv //////////////////////////
  117. for (size_t M : {8, 64, 112, 256}) {
  118. for (size_t K : {8, 64, 112, 256}) {
  119. run(M, 1, K);
  120. }
  121. }
  122. //////////////////////// gemm //////////////////////////
  123. for (size_t M : {8, 64, 112, 256}) {
  124. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  125. for (size_t N : {8, 64, 112, 256}) {
  126. run(M, N, K);
  127. }
  128. }
  129. }
  130. }
  131. void run_16x16x32_benchmark(const char* algo, Handle* handle) {
  132. constexpr size_t RUNS = 50;
  133. param::MatrixMul param;
  134. Benchmarker<MatrixMul> benchmarker_int(handle);
  135. benchmarker_int.set_before_exec_callback(
  136. AlgoChecker<MatrixMul>("ARMV7_INT16X16X32_K12X4X1"));
  137. benchmarker_int.set_times(RUNS)
  138. .set_dtype(0, dtype::Int16{})
  139. .set_dtype(1, dtype::Int16{})
  140. .set_dtype(2, dtype::Int32{})
  141. .set_param(param)
  142. .set_display(false);
  143. Benchmarker<MatrixMul> benchmarker_float(handle);
  144. benchmarker_float.set_display(false).set_times(RUNS);
  145. auto run = [&](size_t M, size_t N, size_t K) {
  146. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  147. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  148. float computations = 2.f * M * K * N * 1e-6;
  149. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
  150. "int: %f ms %f Gflops %s: \n"
  151. "speedup(%s/arm_common, %s/float): %f\n",
  152. M, K, N, float_used, computations / float_used, int_used,
  153. computations / int_used,algo,algo,algo,float_used / int_used);
  154. };
  155. run(256, 12 * 24, 256);
  156. //////////////////////// gemv //////////////////////////
  157. for (size_t M : {8, 64, 112, 256}) {
  158. for (size_t K : {8, 64, 112, 256}) {
  159. run(M, 1, K);
  160. }
  161. }
  162. //////////////////////// gemm //////////////////////////
  163. for (size_t M : {8, 64, 112, 256}) {
  164. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  165. for (size_t N :
  166. {1, 2, 3, 4, 8, 64, 112, 113, 114, 115, 256, 257, 258, 259}) {
  167. run(M, N, K);
  168. }
  169. }
  170. }
  171. }
  172. #if __ARM_FEATURE_DOTPROD
  173. void run_8x8x32_benchmark(const char* algo, Handle* handle) {
  174. constexpr size_t RUNS = 50;
  175. param::MatrixMul param;
  176. Benchmarker<MatrixMul> benchmarker_int8(handle);
  177. benchmarker_int8.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
  178. benchmarker_int8.set_times(RUNS)
  179. .set_dtype(0, dtype::Int8{})
  180. .set_dtype(1, dtype::Int8{})
  181. .set_dtype(2, dtype::Int32{})
  182. .set_param(param)
  183. .set_display(false);
  184. Benchmarker<MatrixMul> benchmarker_float(handle);
  185. benchmarker_float.set_display(false).set_times(RUNS);
  186. auto run = [&](size_t M, size_t N, size_t K) {
  187. auto int_used = benchmarker_int8.exec({{M, K}, {K, N}, {}}) / RUNS;
  188. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  189. float computations = 2.f * M * K * N * 1e-6;
  190. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
  191. "int: %f ms %f Gflops %s: \n"
  192. "speedup(%s/arm_common, %s/float): %f\n",
  193. M, K, N, float_used, computations / float_used, int_used,
  194. computations / int_used,algo,algo,algo,float_used / int_used);
  195. };
  196. run(256, 12 * 24, 256);
  197. //////////////////////// gemm //////////////////////////
  198. for (size_t M : {8, 64, 112, 256}) {
  199. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  200. for (size_t N : {113, 114, 115, 256, 1024}) {
  201. run(M, N, K);
  202. }
  203. }
  204. }
  205. }
  206. void run_8x8x32_quint_benchmark(Handle* handle) {
  207. constexpr size_t RUNS = 50;
  208. param::MatrixMul param;
  209. Benchmarker<MatrixMul> benchmarker_quint8_dot(handle);
  210. benchmarker_quint8_dot.set_before_exec_callback(
  211. AlgoChecker<MatrixMul>("AARCH32_QUINT8_K4X8X4"));
  212. benchmarker_quint8_dot.set_times(RUNS)
  213. .set_dtype(0, dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
  214. .set_dtype(1, dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
  215. .set_dtype(2, dtype::QuantizedS32(2.3f*3.1f))
  216. .set_param(param)
  217. .set_display(false);
  218. Benchmarker<MatrixMul> benchmarker_quint8(handle);
  219. benchmarker_quint8.set_before_exec_callback(
  220. AlgoChecker<MatrixMul>("ARMV7_QUINT8_K4X8X8"));
  221. benchmarker_quint8.set_times(RUNS)
  222. .set_dtype(0, dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
  223. .set_dtype(1, dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
  224. .set_dtype(2, dtype::QuantizedS32(2.3f*3.1f))
  225. .set_param(param)
  226. .set_display(false);
  227. auto run = [&](size_t M, size_t N, size_t K) {
  228. auto dot_used = benchmarker_quint8_dot.exec({{M, K}, {K, N}, {}}) / RUNS;
  229. auto normal_used = benchmarker_quint8.exec({{M, K}, {K, N}, {}}) / RUNS;
  230. float computations = 2.f * M * K * N * 1e-6;
  231. printf("run: {%zu{M} %zu{K} %zu{N}} dot: %f ms %f Gflops \n"
  232. "normal: %f ms %f Gflops.speedup: %f\n",
  233. M, K, N, dot_used, computations / dot_used, normal_used,
  234. computations / normal_used, normal_used / dot_used);
  235. };
  236. run(256, 12 * 24, 256);
  237. //////////////////////// gemm //////////////////////////
  238. for (size_t M : {8, 64, 112, 256}) {
  239. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  240. for (size_t N : {113, 114, 115, 256, 1024}) {
  241. run(M, N, K);
  242. }
  243. }
  244. }
  245. }
  246. #endif
  247. } // namespace
  248. #if __ARM_FEATURE_DOTPROD
  249. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x32_K6x8x4) {
  250. run_8x8x32_benchmark("AARCH32_INT8_K6X8X4", handle());
  251. }
  252. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_QUINT8x8x32_K4x8x4) {
  253. run_8x8x32_quint_benchmark(handle());
  254. }
  255. #endif
  256. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x2x16) {
  257. run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X2X16", handle());
  258. }
  259. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8) {
  260. run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X8X8", handle());
  261. }
  262. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_K12x4x1) {
  263. run_16x16x32_benchmark("ARMV7_INT16X16X32_K12X4X1", handle());
  264. }
  265. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  266. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_FP16) {
  267. constexpr size_t RUNS = 50;
  268. param::MatrixMul param;
  269. Benchmarker<MatrixMul> benchmarker_fp16(handle());
  270. benchmarker_fp16.set_times(RUNS)
  271. .set_dtype(0, dtype::Float16())
  272. .set_dtype(1, dtype::Float16())
  273. .set_dtype(2, dtype::Float16())
  274. .set_param(param)
  275. .set_display(false);
  276. Benchmarker<MatrixMul> benchmarker_float(handle());
  277. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  278. auto run = [&](size_t M, size_t N, size_t K) {
  279. auto fp16_used = benchmarker_fp16.exec({{M, K}, {K, N}, {}}) / RUNS;
  280. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  281. float computations = 2.f * M * K * N * 1e-6;
  282. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops fp16: %f ms "
  283. "%f Gflops speedup: %f\n",
  284. M, K, N, float_used, computations / float_used, fp16_used,
  285. computations / fp16_used, float_used / fp16_used);
  286. };
  287. run(256, 12 * 24, 256);
  288. for (size_t M : {8, 64, 112, 256}) {
  289. for (size_t K : {8, 64, 112, 256}) {
  290. for (size_t N : {8, 64, 112, 256}) {
  291. run(M, N, K);
  292. }
  293. }
  294. }
  295. }
  296. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_F16_MK8) {
  297. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
  298. matrix_mul::benchmark_with_contrast(
  299. handle(), args, dtype::Float16{}, dtype::Float16{},
  300. dtype::Float16{}, "AARCH32_F16_MK8_4X8",
  301. param::MatrixMul::Format::MK8, dtype::Float16{}, dtype::Float16{},
  302. dtype::Float16{}, "AARCH32_F16_K4X16X1");
  303. }
  304. #endif
  305. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_MK4) {
  306. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  307. matrix_mul::benchmark_with_contrast(
  308. handle(), args, dtype::Float32{}, dtype::Float32{},
  309. dtype::Float32{}, "ARMV7_F32_MK4_4x8",
  310. param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
  311. dtype::Float32{});
  312. }
  313. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
  314. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
  315. matrix_mul::benchmark_with_contrast(
  316. handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  317. "ARMV7_INT16X16X32_MK8_4X8", param::MatrixMul::Format::MK8,
  318. dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
  319. }
  320. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT32_MK_4X2X16) {
  321. constexpr size_t RUNS = 50;
  322. param::MatrixMul param;
  323. param.transposeA = false;
  324. param.transposeB = false;
  325. Benchmarker<MatrixMul> benchmarker(handle());
  326. Benchmarker<MatrixMul> benchmarker_mk4(handle());
  327. benchmarker.set_times(RUNS)
  328. .set_dtype(0, dtype::Int8{})
  329. .set_dtype(1, dtype::Int8{})
  330. .set_dtype(2, dtype::Int32{})
  331. .set_param(param)
  332. .set_display(false);
  333. benchmarker.set_before_exec_callback(
  334. AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_K4X2X16"));
  335. param.format = MatrixMul::Param::Format::MK4;
  336. benchmarker_mk4.set_before_exec_callback(
  337. AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_MK4_4X2X16"));
  338. benchmarker_mk4.set_times(RUNS)
  339. .set_dtype(0, dtype::Int8{})
  340. .set_dtype(1, dtype::Int8{})
  341. .set_dtype(2, dtype::Int32{})
  342. .set_param(param)
  343. .set_display(false);
  344. auto run = [&](size_t M, size_t N, size_t K) {
  345. auto mk_used = benchmarker_mk4.exec(
  346. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  347. RUNS;
  348. auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
  349. float computations = 2.f * M * K * N * 1e-6;
  350. printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
  351. "%f Gflops speedup_vs_normal: %f\n",
  352. M, K, N, default_used, computations / default_used, mk_used,
  353. computations / mk_used, default_used / mk_used);
  354. };
  355. run(256, 256, 128);
  356. for (size_t k = 4; k <= 512; k *= 2) {
  357. for (size_t m = 4; m <= 512; m *= 2) {
  358. for (size_t n = 4; n <= 512; n *= 2) {
  359. run(m, n, k);
  360. }
  361. }
  362. std::cout << std::endl;
  363. }
  364. }
  365. #endif
  366. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台