You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

elemwise.cpp 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537
  1. /**
  2. * \file dnn/test/arm_common/elemwise.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/common/elemwise.h"
  13. #include "test/arm_common/fixture.h"
  14. #include "test/common/benchmarker.h"
  15. #include "test/common/checker.h"
  16. #include "megdnn/opr_param_defs.h"
  17. #include "megdnn/oprs/general.h"
  18. using namespace megdnn;
  19. using namespace test;
  20. template <typename tag>
  21. class ARM_ELEMWISE : public ARM_COMMON {};
  22. TYPED_TEST_CASE(ARM_ELEMWISE, elemwise::test_types);
  23. TYPED_TEST(ARM_ELEMWISE, run) {
  24. elemwise::run_test<TypeParam>(this->handle());
  25. }
  26. template <typename tag>
  27. class ARM_ELEMWISE_MULTI_THREADS : public ARM_COMMON_MULTI_THREADS {};
  28. TYPED_TEST_CASE(ARM_ELEMWISE_MULTI_THREADS, elemwise::test_types);
  29. TYPED_TEST(ARM_ELEMWISE_MULTI_THREADS, run) {
  30. elemwise::run_test<TypeParam>(this->handle());
  31. }
  32. TEST_F(ARM_COMMON, ELEMWISE_FORWARD_TERNARY) {
  33. using Mode = ElemwiseForward::Param::Mode;
  34. Checker<ElemwiseForward> checker(handle());
  35. checker.set_param(Mode::FUSE_MUL_ADD3);
  36. auto run = [&] {
  37. //! nchw44
  38. checker.execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  39. checker.execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  40. checker.execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  41. checker.execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  42. checker.execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  43. //! nchw44
  44. checker.execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  45. checker.execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  46. checker.execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  47. checker.execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  48. checker.execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  49. //! nchw88
  50. checker.execs({{1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  51. checker.execs({{1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  52. checker.execs({{1, 8, 1, 1, 8}, {3, 8, 5, 3, 8}, {1, 8, 1, 1, 8}, {}});
  53. checker.execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  54. checker.execs({{1, 2, 1, 1, 8}, {1, 2, 5, 7, 8}, {1, 2, 1, 1, 8}, {}});
  55. //! nchw88
  56. checker.execs({{1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {}});
  57. checker.execs({{2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {}});
  58. checker.execs({{3, 8, 5, 3, 8}, {1, 8, 1, 1, 8}, {3, 8, 5, 3, 8}, {}});
  59. checker.execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  60. checker.execs({{1, 2, 5, 7, 8}, {1, 2, 1, 1, 8}, {1, 2, 5, 7, 8}, {}});
  61. checker.execs({{3, 4, 7}, {3, 4, 7}, {3, 4, 7}, {}});
  62. checker.execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}, {}});
  63. checker.execs({{1, 4, 1}, {3, 4, 7}, {1, 4, 1}, {}});
  64. checker.execs({{3, 4, 5, 7}, {3, 4, 5, 7}, {1, 1, 1, 1}, {}});
  65. checker.execs({{1, 7}, {1, 7}, {1, 7}, {}});
  66. checker.execs({{1, 2, 1}, {1, 2, 2}, {1, 2, 1}, {}});
  67. checker.execs({{1, 2, 2}, {1, 2, 2}, {1, 1, 1}, {}});
  68. checker.execs({{3, 4, 1}, {3, 4, 1}, {3, 4, 1}, {}});
  69. checker.execs({{3, 4, 5}, {1}, {1}, {}});
  70. checker.execs({{1}, {3, 4, 5}, {1}, {}});
  71. };
  72. // case int
  73. checker.set_dtype(0, dtype::Int8());
  74. checker.set_dtype(1, dtype::Int8());
  75. checker.set_dtype(2, dtype::Int8());
  76. run();
  77. checker.set_dtype(0, dtype::Int16());
  78. checker.set_dtype(1, dtype::Int16());
  79. checker.set_dtype(2, dtype::Int16());
  80. run();
  81. checker.set_dtype(0, dtype::Int32());
  82. checker.set_dtype(1, dtype::Int32());
  83. checker.set_dtype(2, dtype::Int32());
  84. run();
  85. // case float
  86. UniformFloatRNG rng(1e-5, 7e1);
  87. checker.set_rng(0, &rng);
  88. checker.set_epsilon(1e-5);
  89. checker.set_dtype(0, dtype::Float32());
  90. checker.set_dtype(1, dtype::Float32());
  91. checker.set_dtype(2, dtype::Float32());
  92. run();
  93. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  94. // case half
  95. UniformFloatRNG rng_float16(1, 10);
  96. checker.set_rng(0, &rng_float16);
  97. checker.set_epsilon(1e-2);
  98. checker.set_dtype(0, dtype::Float16());
  99. checker.set_dtype(1, dtype::Float16());
  100. checker.set_dtype(2, dtype::Float16());
  101. run();
  102. #endif
  103. }
  104. TEST_F(ARM_COMMON, ELEMWISE_FORWARD_NCHW44_INT8_INT16_INT32) {
  105. using Mode = ElemwiseForward::Param::Mode;
  106. Checker<ElemwiseForward> checker(handle());
  107. auto run = [&]() {
  108. // VEC_BCAST101x not PowOp
  109. checker.set_param(Mode::ADD).execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  110. checker.set_param(Mode::ADD).execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  111. checker.set_param(Mode::ADD).execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  112. checker.set_param(Mode::ADD).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  113. checker.set_param(Mode::ADD).execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  114. checker.set_param(Mode::RMULH).execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  115. checker.set_param(Mode::RMULH).execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  116. checker.set_param(Mode::RMULH).execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  117. checker.set_param(Mode::RMULH).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  118. checker.set_param(Mode::RMULH).execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  119. checker.set_param(Mode::FUSE_ADD_RELU)
  120. .execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  121. checker.set_param(Mode::FUSE_ADD_RELU)
  122. .execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  123. checker.set_param(Mode::FUSE_ADD_RELU)
  124. .execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  125. checker.set_param(Mode::FUSE_ADD_RELU)
  126. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  127. checker.set_param(Mode::FUSE_ADD_RELU)
  128. .execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  129. // BCAST101x_VEC not PowOp
  130. checker.set_param(Mode::ADD).execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  131. checker.set_param(Mode::ADD).execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  132. checker.set_param(Mode::ADD).execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  133. checker.set_param(Mode::ADD).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  134. checker.set_param(Mode::ADD).execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  135. checker.set_param(Mode::FUSE_ADD_RELU)
  136. .execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  137. checker.set_param(Mode::FUSE_ADD_RELU)
  138. .execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  139. checker.set_param(Mode::FUSE_ADD_RELU)
  140. .execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  141. checker.set_param(Mode::FUSE_ADD_RELU)
  142. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  143. checker.set_param(Mode::FUSE_ADD_RELU)
  144. .execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  145. };
  146. checker.set_dtype(0, dtype::Int8());
  147. checker.set_dtype(1, dtype::Int8());
  148. run();
  149. checker.set_dtype(0, dtype::Int16());
  150. checker.set_dtype(1, dtype::Int16());
  151. run();
  152. checker.set_dtype(0, dtype::Int32());
  153. checker.set_dtype(1, dtype::Int32());
  154. run();
  155. }
  156. TEST_F(ARM_COMMON, ELEMWISE_FORWARD_NCHW44_FP32) {
  157. using Mode = ElemwiseForward::Param::Mode;
  158. Checker<ElemwiseForward> checker(handle());
  159. UniformFloatRNG rng(1e-5, 7e1);
  160. checker.set_rng(0, &rng);
  161. checker.set_epsilon(1e-5);
  162. checker.set_dtype(0, dtype::Float32());
  163. checker.set_dtype(1, dtype::Float32());
  164. checker.set_param(Mode::FUSE_ADD_RELU)
  165. .execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  166. checker.set_param(Mode::FUSE_ADD_RELU)
  167. .execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  168. checker.set_param(Mode::FUSE_ADD_RELU)
  169. .execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  170. checker.set_param(Mode::FUSE_ADD_RELU)
  171. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  172. checker.set_param(Mode::FUSE_ADD_RELU)
  173. .execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  174. checker.set_param(Mode::FUSE_ADD_RELU)
  175. .execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  176. checker.set_param(Mode::FUSE_ADD_RELU)
  177. .execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  178. checker.set_param(Mode::FUSE_ADD_RELU)
  179. .execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  180. checker.set_param(Mode::FUSE_ADD_RELU)
  181. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  182. checker.set_param(Mode::FUSE_ADD_RELU)
  183. .execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  184. auto run = [&](Mode mode) {
  185. // VEC_BCAST101x
  186. checker.set_param(mode).execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  187. checker.set_param(mode).execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  188. checker.set_param(mode).execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  189. checker.set_param(mode).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  190. checker.set_param(mode).execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  191. // BCAST101x_VEC not powOp
  192. checker.set_param(mode).execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  193. checker.set_param(mode).execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  194. checker.set_param(mode).execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  195. checker.set_param(mode).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  196. checker.set_param(mode).execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  197. };
  198. run(Mode::ADD);
  199. run(Mode::FUSE_ADD_H_SWISH);
  200. run(Mode::FUSE_ADD_RELU);
  201. run(Mode::MAX);
  202. run(Mode::MIN);
  203. run(Mode::MUL);
  204. run(Mode::SUB);
  205. run(Mode::TRUE_DIV);
  206. run(Mode::POW);
  207. }
  208. TEST_F(ARM_COMMON, ELEMWISE_FORWARD_NCHW88_FP) {
  209. using Mode = ElemwiseForward::Param::Mode;
  210. Checker<ElemwiseForward> checker(handle());
  211. checker.set_param(Mode::FUSE_ADD_RELU)
  212. .execs({{1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {}});
  213. checker.set_param(Mode::FUSE_ADD_RELU)
  214. .execs({{1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {}});
  215. checker.set_param(Mode::FUSE_ADD_RELU)
  216. .execs({{1, 8, 1, 1, 8}, {3, 8, 5, 3, 8}, {}});
  217. checker.set_param(Mode::FUSE_ADD_RELU)
  218. .execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  219. checker.set_param(Mode::FUSE_ADD_RELU)
  220. .execs({{1, 2, 1, 1, 8}, {1, 2, 5, 7, 8}, {}});
  221. checker.set_param(Mode::FUSE_ADD_RELU)
  222. .execs({{1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  223. checker.set_param(Mode::FUSE_ADD_RELU)
  224. .execs({{2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  225. checker.set_param(Mode::FUSE_ADD_RELU)
  226. .execs({{3, 8, 5, 3, 8}, {1, 8, 1, 1, 8}, {}});
  227. checker.set_param(Mode::FUSE_ADD_RELU)
  228. .execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  229. checker.set_param(Mode::FUSE_ADD_RELU)
  230. .execs({{1, 2, 5, 7, 8}, {1, 2, 1, 1, 8}, {}});
  231. auto run = [&](Mode mode) {
  232. // VEC_BCAST101x
  233. checker.set_param(mode).execs({{1, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  234. checker.set_param(mode).execs({{2, 3, 2, 2, 8}, {1, 3, 1, 1, 8}, {}});
  235. checker.set_param(mode).execs({{3, 8, 5, 3, 8}, {1, 8, 1, 1, 8}, {}});
  236. checker.set_param(mode).execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  237. checker.set_param(mode).execs({{1, 2, 5, 7, 8}, {1, 2, 1, 1, 8}, {}});
  238. // BCAST101x_VEC not powOp
  239. checker.set_param(mode).execs({{1, 3, 1, 1, 8}, {1, 3, 2, 2, 8}, {}});
  240. checker.set_param(mode).execs({{1, 3, 1, 1, 8}, {2, 3, 2, 2, 8}, {}});
  241. checker.set_param(mode).execs({{1, 8, 1, 1, 8}, {3, 8, 5, 3, 8}, {}});
  242. checker.set_param(mode).execs({{3, 4, 5, 7, 8}, {3, 4, 5, 7, 8}, {}});
  243. checker.set_param(mode).execs({{1, 2, 1, 1, 8}, {1, 2, 5, 7, 8}, {}});
  244. };
  245. auto run_all = [&]() {
  246. run(Mode::ADD);
  247. run(Mode::FUSE_ADD_H_SWISH);
  248. run(Mode::FUSE_ADD_RELU);
  249. run(Mode::MAX);
  250. run(Mode::MIN);
  251. run(Mode::MUL);
  252. run(Mode::SUB);
  253. run(Mode::TRUE_DIV);
  254. run(Mode::POW);
  255. };
  256. {
  257. UniformFloatRNG rng(1e-5, 7e1);
  258. checker.set_rng(0, &rng);
  259. checker.set_epsilon(1e-5);
  260. checker.set_dtype(0, dtype::Float32());
  261. checker.set_dtype(1, dtype::Float32());
  262. run_all();
  263. }
  264. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  265. {
  266. UniformFloatRNG rng(1, 2);
  267. checker.set_rng(0, &rng);
  268. checker.set_epsilon(3e-3);
  269. checker.set_dtype(0, dtype::Float16());
  270. checker.set_dtype(1, dtype::Float16());
  271. run_all();
  272. }
  273. #endif
  274. }
  275. TEST_F(ARM_COMMON, ELEMWISE_FORWARD_NHWC_FP32_BCAST) {
  276. using Mode = ElemwiseForward::Param::Mode;
  277. Checker<ElemwiseForward> checker(handle());
  278. UniformFloatRNG rng(1e-5, 7e1);
  279. checker.set_rng(0, &rng);
  280. checker.set_epsilon(1e-5);
  281. checker.set_dtype(0, dtype::Float32());
  282. checker.set_dtype(1, dtype::Float32());
  283. //! 2 dim
  284. auto run = [&](Mode mode) {
  285. // VEC_BCAST111C
  286. checker.set_param(mode).execs({{1, 2, 2, 12}, {1, 1, 1, 12}, {}});
  287. checker.set_param(mode).execs({{2, 5, 3, 28}, {1, 1, 1, 28}, {}});
  288. checker.set_param(mode).execs({{3, 5, 8, 32}, {1, 1, 1, 32}, {}});
  289. // BCAST111C_VEC
  290. checker.set_param(mode).execs({{1, 1, 1, 12}, {1, 2, 2, 12}, {}});
  291. checker.set_param(mode).execs({{1, 1, 1, 28}, {2, 5, 3, 28}, {}});
  292. checker.set_param(mode).execs({{1, 1, 1, 32}, {3, 5, 8, 32}, {}});
  293. };
  294. run(Mode::ADD);
  295. run(Mode::MUL);
  296. run(Mode::SUB);
  297. //! 3 dim contig
  298. auto run_3d_contig = [&](Mode mode) {
  299. // BCAST111C_VEC_BCAST111C
  300. checker.set_param(mode).execs(
  301. {{1, 1, 1, 12}, {1, 2, 2, 12}, {1, 1, 1, 12}, {}});
  302. checker.set_param(mode).execs(
  303. {{1, 1, 1, 28}, {2, 5, 3, 28}, {1, 1, 1, 28}, {}});
  304. checker.set_param(mode).execs(
  305. {{1, 1, 1, 32}, {3, 5, 8, 32}, {1, 1, 1, 32}, {}});
  306. // VEC_BCAST111C_VEC
  307. checker.set_param(mode).execs(
  308. {{1, 2, 2, 12}, {1, 1, 1, 12}, {1, 2, 2, 12}, {}});
  309. checker.set_param(mode).execs(
  310. {{2, 5, 3, 28}, {1, 1, 1, 28}, {2, 5, 3, 28}, {}});
  311. checker.set_param(mode).execs(
  312. {{3, 5, 8, 32}, {1, 1, 1, 32}, {3, 5, 8, 32}, {}});
  313. };
  314. run_3d_contig(Mode::FUSE_MUL_ADD3);
  315. //! 3 dim incontig
  316. auto run_3d_incontig = [&](Mode mode) {
  317. megdnn::TensorLayout src0({1, 1, 1, 12}, dtype::Float32());
  318. megdnn::TensorLayout src1({1, 2, 2, 12}, {80, 40, 20, 1}, dtype::Float32());
  319. // BCAST111C_VEC_BCAST111C
  320. checker.set_param(mode).execl({src0, src1, src0, {}});
  321. // VEC_BCAST111C_VEC
  322. checker.set_param(mode).execl({src1, src0, src1, {}});
  323. };
  324. run_3d_incontig(Mode::FUSE_MUL_ADD3);
  325. }
  326. #if MEGDNN_WITH_BENCHMARK
  327. namespace {
  328. void run_elemwise_benchmark(
  329. const TensorShapeArray& shapes, param::Elemwise::Mode mode,
  330. const char* mode_str, DType type, Handle* handle_bench) {
  331. auto handle_fallback = create_cpu_handle(1);
  332. Benchmarker<Elemwise> benchmarker_bench(handle_bench);
  333. Benchmarker<Elemwise> benchmarker_fallback(handle_fallback.get());
  334. float throughput = 0;
  335. SmallVector<TensorLayout> layouts;
  336. std::string src_strs;
  337. for (size_t i = 0; i < shapes.size(); i++) {
  338. layouts.emplace_back(shapes[i], type);
  339. throughput += layouts.back().span().dist_byte();
  340. src_strs += layouts.back().to_string();
  341. if (i != shapes.size() - 1) {
  342. src_strs += ",";
  343. }
  344. }
  345. constexpr size_t RUN = 50;
  346. benchmarker_fallback.set_times(RUN).set_display(false);
  347. benchmarker_bench.set_times(RUN).set_display(false);
  348. benchmarker_fallback.set_param(mode);
  349. benchmarker_bench.set_param(mode);
  350. TensorLayout dst_layout;
  351. auto opr = handle_bench->create_operator<Elemwise>();
  352. opr->param() = mode;
  353. opr->deduce_layout(layouts, dst_layout);
  354. float computations =
  355. dst_layout.total_nr_elems() * (std::max<size_t>(shapes.size(), 2) - 1);
  356. throughput += dst_layout.span().dist_byte();
  357. computations *= (1e3 / (1024.0 * 1024));
  358. throughput *= (1e3 / (1024.0 * 1024));
  359. layouts.emplace_back(dst_layout);
  360. auto fallback_time = benchmarker_fallback.execl(layouts) / RUN;
  361. auto bench_time = benchmarker_bench.execl(layouts) / RUN;
  362. float fallback_flops = computations / fallback_time;
  363. float bench_flops = computations / bench_time;
  364. float fallback_thr = throughput / fallback_time;
  365. float bench_thr = throughput / bench_time;
  366. printf("%s = %s (type: %s, mode: %s) cpu=%fMFLOPS %fMB/s, bench=%fMFLOPS "
  367. "%fMB/s "
  368. "computations: %fx, throughput: %fx\n",
  369. src_strs.c_str(), dst_layout.to_string().c_str(), type.name(), mode_str,
  370. fallback_flops, fallback_thr, bench_flops, bench_thr,
  371. bench_flops / fallback_flops, bench_thr / fallback_thr);
  372. }
  373. } // namespace
  374. TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NHWC) {
  375. Benchmarker<Elemwise> benchmarker(handle());
  376. constexpr size_t RUN = 50;
  377. benchmarker.set_times(RUN).set_display(false);
  378. auto run = [&](size_t N, size_t C, size_t H, size_t W, param::Elemwise::Mode mode,
  379. const char* mode_name) {
  380. megdnn::param::Elemwise param;
  381. param.mode = mode;
  382. benchmarker.set_param(param);
  383. megdnn::TensorShape nhwc_src0{N, H, W, C};
  384. megdnn::TensorShape nhwc_src1{1, 1, 1, C};
  385. megdnn::TensorShape nchw_src0{N, C, H, W};
  386. megdnn::TensorShape nchw_src1{1, C, 1, 1};
  387. float computations = N * C * H * W;
  388. auto nhwc_time = benchmarker.execs({nhwc_src1, nhwc_src0, {}}) / RUN;
  389. auto nchw_time = benchmarker.execs({nchw_src1, nchw_src0, {}}) / RUN;
  390. auto perf_nhwc = computations / nhwc_time / 1e6;
  391. auto perf_nchw = computations / nchw_time / 1e6;
  392. printf("Elemwise Mode : %s\nNHWC : %fms %fGflops\nNCHW : %fms "
  393. "%fGflops\n",
  394. mode_name, nhwc_time, perf_nhwc, nchw_time, perf_nchw);
  395. };
  396. run(1, 120, 16, 24, param::Elemwise::Mode::ADD, "ADD");
  397. run(1, 120, 16, 24, param::Elemwise::Mode::MUL, "MUL");
  398. run(1, 120, 32, 48, param::Elemwise::Mode::ADD, "ADD");
  399. run(1, 120, 32, 48, param::Elemwise::Mode::MUL, "MUL");
  400. run(1, 120, 64, 96, param::Elemwise::Mode::ADD, "ADD");
  401. run(1, 120, 64, 96, param::Elemwise::Mode::MUL, "MUL");
  402. }
  //! Benchmark `mode` on `shape` for Int8, Int16 and Int32 in turn.
  //! Wrapped in do/while(0) so the expansion is a single statement and stays
  //! correct under an unbraced if/else or loop; call sites supply the ';'.
  #define INT_RUN(shape, mode)                                                   \
      do {                                                                       \
          run_elemwise_benchmark(shape, mode, #mode, dtype::Int8{}, handle());   \
          run_elemwise_benchmark(shape, mode, #mode, dtype::Int16{}, handle());  \
          run_elemwise_benchmark(shape, mode, #mode, dtype::Int32{}, handle());  \
      } while (0)

  //! Benchmark `mode` on `shape` for Float32 and Float16 in turn.
  #define FLOAT_RUN(shape, mode)                                                  \
      do {                                                                        \
          run_elemwise_benchmark(shape, mode, #mode, dtype::Float32{}, handle()); \
          run_elemwise_benchmark(shape, mode, #mode, dtype::Float16{}, handle()); \
      } while (0)

  //! Runs the integer cases followed by the float cases. Each benchmark test
  //! defines INT_BENCHMARK_CASES / FLOAT_BENCHMARK_CASES before using this.
  //! The extra ';' after each inner macro is harmless whether or not the
  //! inner definition already ends in one.
  #define BENCHMARK_CASES(shape)            \
      do {                                  \
          INT_BENCHMARK_CASES(shape);       \
          FLOAT_BENCHMARK_CASES(shape);     \
      } while (0)
  413. TEST_F(ARM_COMMON, BENCHMARK_UNARY) {
  414. #define INT_BENCHMARK_CASES(shape) \
  415. INT_RUN(shape, Mode::RELU); \
  416. INT_RUN(shape, Mode::ABS);
  417. #define FLOAT_BENCHMARK_CASES(shape) \
  418. FLOAT_RUN(shape, Mode::RELU); \
  419. FLOAT_RUN(shape, Mode::ABS); \
  420. FLOAT_RUN(shape, Mode::SIGMOID); \
  421. FLOAT_RUN(shape, Mode::EXP); \
  422. FLOAT_RUN(shape, Mode::TANH); \
  423. FLOAT_RUN(shape, Mode::FAST_TANH);
  424. using Mode = param::Elemwise::Mode;
  425. BENCHMARK_CASES({{10000}});
  426. BENCHMARK_CASES({{50000}});
  427. #undef INT_BENCHMARK_CASES
  428. #undef FLOAT_BENCHMARK_CASES
  429. }
  430. TEST_F(ARM_COMMON, BENCHMARK_BINARY) {
  431. #define INT_BENCHMARK_CASES(shape) \
  432. INT_RUN(shape, Mode::MIN); \
  433. INT_RUN(shape, Mode::MAX); \
  434. INT_RUN(shape, Mode::ADD); \
  435. INT_RUN(shape, Mode::SUB); \
  436. INT_RUN(shape, Mode::MUL); \
  437. INT_RUN(shape, Mode::RMULH); \
  438. INT_RUN(shape, Mode::FUSE_ADD_RELU);
  439. #define FLOAT_BENCHMARK_CASES(shape) \
  440. FLOAT_RUN(shape, Mode::MIN); \
  441. FLOAT_RUN(shape, Mode::MAX); \
  442. FLOAT_RUN(shape, Mode::ADD); \
  443. FLOAT_RUN(shape, Mode::SUB); \
  444. FLOAT_RUN(shape, Mode::MUL); \
  445. FLOAT_RUN(shape, Mode::POW); \
  446. FLOAT_RUN(shape, Mode::TRUE_DIV); \
  447. FLOAT_RUN(shape, Mode::FUSE_ADD_RELU);
  448. using Mode = param::Elemwise::Mode;
  449. TensorShapeArray shapes = {{1, 112, 28, 28}, {1, 112, 28, 28}};
  450. BENCHMARK_CASES(shapes);
  451. shapes = {{1, 16, 1, 1}, {1, 16, 112, 112}};
  452. BENCHMARK_CASES(shapes);
  453. shapes = {{1, 448, 7, 7}, {1, 448, 7, 7}};
  454. BENCHMARK_CASES(shapes);
  455. #undef INT_BENCHMARK_CASES
  456. #undef FLOAT_BENCHMARK_CASES
  457. }
  458. TEST_F(ARM_COMMON, BENCHMARK_TERNARY_FMA3) {
  459. #define INT_BENCHMARK_CASES(shape) INT_RUN(shape, Mode::FUSE_MUL_ADD3);
  460. #define FLOAT_BENCHMARK_CASES(shape) FLOAT_RUN(shape, Mode::FUSE_MUL_ADD3);
  461. using Mode = param::Elemwise::Mode;
  462. TensorShapeArray shapes = {{30, 40, 70}, {30, 40, 70}, {30, 40, 70}};
  463. BENCHMARK_CASES(shapes);
  464. shapes = {{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}};
  465. BENCHMARK_CASES(shapes);
  466. shapes = {{3, 4, 5, 7}, {3, 4, 5, 7}, {1, 1, 1, 1}};
  467. BENCHMARK_CASES(shapes);
  468. #undef INT_BENCHMARK_CASES
  469. #undef FLOAT_BENCHMARK_CASES
  470. }
  //! Drop the benchmark helper macros so they do not leak past this section.
  #undef FLOAT_RUN
  #undef INT_RUN
  #undef BENCHMARK_CASES
  474. #endif
  475. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台