algos.cpp

#include "src/fallback/convolution/algos.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/convolution/col2img_helper.h"
#include "src/fallback/convolution/run_conv.h"

#include "midout.h"

using namespace megdnn;
using namespace fallback;

MIDOUT_DECL(megdnn_fallback_conv)
MIDOUT_DECL(megdnn_fallback_deconv)

namespace {

template <typename T>
void incr_ptr(T*& dst, ptrdiff_t delta) {
    dst = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(dst) + delta);
}

using NCBKernSizeParam = ConvolutionBackwardDataImpl::NCBKernSizeParam;
using NCBKernParam = ConvolutionBackwardDataImpl::NCBKernParam;

Relayout* get_relayout_opr() {
    static CpuOprDelegationStorage<> storage;
    return storage.get<Relayout>();
}
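
// get_matmul_opr caches up to three MatrixMul delegates in a static storage:
// slot 0 uses MK4 format for NCHW44 filters, slot 1 uses default parameters,
// and slot 2 forces a FLOAT32 compute mode.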
MatrixMul* get_matmul_opr(const NCBKernSizeParam& param) {
    using ConvCM = param::Convolution::ComputeMode;
    using MmCM = param::MatrixMul::ComputeMode;
    static CpuOprDelegationStorage<3> storage;
    if (param.filter_meta.format == param::Convolution::Format::NCHW44) {
        MatrixMul::Param p;
        p.format = param::MatrixMul::Format::MK4;
        return storage.get<MatrixMul, 0>(p);
    }
    switch (param.compute_mode) {
        default:
            return storage.get<MatrixMul, 1>({});
        case ConvCM::FLOAT32: {
            MatrixMul::Param p;
            p.compute_mode = MmCM::FLOAT32;
            return storage.get<MatrixMul, 2>(p);
        }
    }
}

WorkspaceBundle get_bundle(const NCBKernSizeParam& param) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    MEGDNN_MARK_USED_VAR(N);
    MEGDNN_MARK_USED_VAR(OH);
    MEGDNN_MARK_USED_VAR(OW);
    bool can_matrix_mul_direct =
            (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);
    // part0: temp space to store the unrolled (col) matrix
    // part1: workspace for the matrix mul opr
    // part2: workspace for the relayout opr (holds the transposed filter)
    size_t part0, part1, part2;
    if (can_matrix_mul_direct) {
        part0 = 0;
    } else {
        part0 = (IC * FH * FW * IH * IW) * param.grad_type.size();
    }
    part2 = (OC * IC * FH * FW) * param.filter_type.size();
    if (param.filter_meta.format == param::Convolution::Format::NCHW44) {
        TensorLayout A_, B_, C_;
        A_ = TensorLayout({IC / 4 * FH * FW, OC / 4, 4, 4}, param.filter_type);
        B_ = TensorLayout({OC / 4, IH * IW, 4}, param.diff_type);
        C_ = TensorLayout({IC / 4 * FH * FW, IH * IW, 4}, param.grad_type);
        auto matmul_algo = get_matmul_opr(param);
        part1 = matmul_algo->get_workspace_in_bytes(A_, B_, C_);
    } else {
        TensorLayout A_, B_, C_;
        A_ = TensorLayout({IC * FH * FW, OC}, param.filter_type);
        B_ = TensorLayout({OC, IH * IW}, param.diff_type);
        C_ = TensorLayout({IC * FH * FW, IH * IW}, param.grad_type);
        part1 = get_matmul_opr(param)->get_workspace_in_bytes(A_, B_, C_);
    }
    return {nullptr, {part0, part1, part2}};
}
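
// Matrix-mul based backward data: grad = col2img(filter^T * diff). The filter
// is first relayouted into an (IC*FH*FW) x OC matrix, multiplied with diff
// (OC x IH*IW), and the resulting columns are scattered back into grad
// (IC x OH x OW) by col2img; the 1x1, stride-1, zero-padding case writes the
// matmul result into grad directly.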
template <typename ftype, typename dtype, typename gtype>
void kern_matmul(const NCBKernParam& param) {
    bool is_xcorr = !param.filter_meta.should_flip;
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    auto bundle = get_bundle(param);
    bundle.set(param.workspace_ptr);
    bool is1X1 = (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);
    typedef void (*Func1)(const gtype*, gtype*, int, int, int, int, int, int, int);
    typedef void (*Func2)(
            const gtype*, gtype*, int, int, int, int, int, int, int, int, int, int,
            int);
    Func1 f1 = nullptr;
    Func2 f2 = nullptr;
    if (is_xcorr) {
        f1 = col2img<true>;
        f2 = col2img_stride_padding<true>;
    } else {
        f1 = col2img<false>;
        f2 = col2img_stride_padding<false>;
    }
    ftype* filter = const_cast<ftype*>(param.filter<ftype>());
    TensorND A_src, A_dst;
    {
        A_src.layout = TensorLayout(
                {IC * FH * FW, OC},
                {static_cast<std::ptrdiff_t>(1),
                 static_cast<std::ptrdiff_t>(IC * FH * FW)},
                param.filter_type);
        A_src.reset_ptr(static_cast<void*>(filter));
        A_dst.layout = TensorLayout({IC * FH * FW, OC}, param.filter_type);
        A_dst.reset_ptr(static_cast<void*>(bundle.get(2)));
        // TODO: should be removed once armv8 convolution supports transpose.
        get_relayout_opr()->exec(A_src, A_dst, inplace_cpu_handle().get());
    }
    TensorND B_, C_;
    for (size_t n = 0; n < N; ++n) {
        gtype *C_src, *C_dst;
        dtype* diff = const_cast<dtype*>(param.diff<dtype>() + n * param.inp_bs);
        gtype* grad = param.grad<gtype>() + n * param.out_bs;
        if (is1X1) {
            C_src = grad;
        } else {
            C_src = static_cast<gtype*>(bundle.get(0));
        }
        {
            B_.layout = TensorLayout({OC, IH * IW}, param.diff_type);
            B_.reset_ptr(static_cast<void*>(diff));
            C_.layout = TensorLayout({IC * FH * FW, IH * IW}, param.grad_type);
            C_.reset_ptr(C_src);
            Workspace workspace(
                    static_cast<dt_byte*>(bundle.get(1)), bundle.get_size(1));
            get_matmul_opr(param)->exec(A_dst, B_, C_, workspace);
        }
        if (!is1X1) {
            C_dst = grad;
            std::memset(C_dst, 0, param.grad_type.size() * IC * OH * OW);
            if (PH == 0 && PW == 0 && SH == 1 && SW == 1) {
                f1(C_src, C_dst, OH, OW, IC, IH, IW, FH, FW);
            } else {
                f2(C_src, C_dst, OH, OW, IC, IH, IW, FH, FW, SH, SW, PH, PW);
            }
        }
    }
}

void kern_direct(const NCBKernParam& param) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    auto diff = param.diff<float>(), filter = param.filter<float>();
    auto grad = param.grad<float>();
    for (size_t n = 0; n < N; ++n) {
        convolution::run_conv_backward_data(
                diff + n * param.inp_bs, filter, grad + n * param.out_bs,
                param.workspace_ptr, IH, IW, IC, FH, FW, OH, OW, OC, PH, PW, SH, SW,
                !param.filter_meta.should_flip);
    }
}

}  // namespace

/* ===================== fallback algo ===================== */
bool ConvolutionImpl::AlgoFallback::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    auto&& fm = param.filter_meta;
    return fm.format == param::Convolution::Format::NCHW &&
           param.src_type.enumv() == DTypeEnum::Float32 &&
           param.filter_type.enumv() == DTypeEnum::Float32 &&
           param.dst_type.enumv() == DTypeEnum::Float32 && fm.spatial_ndim == 2 &&
           fm.dilation[0] == 1 && fm.dilation[1] == 1;
}

size_t ConvolutionImpl::AlgoFallback::get_workspace(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("AlgoFallback::get_workspace"_hash)) {
        auto FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1];
        size_t nr_threads = param.nr_threads;
        if (param.filter_meta.should_flip) {
            // need transpose filter
            return WorkspaceBundle{nullptr, {FH * FW * sizeof(float)}}
                           .total_size_in_bytes() *
                   nr_threads;
        } else {
            return 0;
        }
    }
    MIDOUT_END();
    return 0;
}

SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoFallback::dispatch_kern(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("AlgoFallback::dispatch_kern"_hash)) {
        size_t group = param.filter_meta.group;
        size_t N = param.n;
        size_t nr_threads = param.nr_threads;
        size_t workspace_per_thread = get_workspace(param) / nr_threads;
        auto kern_fallback = [workspace_per_thread](
                                     const NCBKernParam& p,
                                     const NCBKernIndex& ncb_index) {
            UNPACK_CONV_F32_NCB_KERN_SIZES(p);
            size_t batch_id = ncb_index.ndrange_id[1];
            size_t group_id = ncb_index.ndrange_id[0];
            MEGDNN_MARK_USED_VAR(N);
            auto src = p.src<float>(batch_id, group_id),
                 filter = p.filter<float>(group_id);
            auto dst = p.dst<float>(batch_id, group_id);
            size_t thread_id = ncb_index.thread_id;
            void* workspace_ptr = reinterpret_cast<void*>(
                    reinterpret_cast<ptrdiff_t>(p.workspace_ptr) +
                    workspace_per_thread * thread_id);
            convolution::run_conv(
                    src, filter, dst, workspace_ptr, IH, IW, IC, FH, FW, OH, OW, OC,
                    PH, PW, SH, SW, !p.filter_meta.should_flip);
        };
        return {{kern_fallback, {group, N, 1_z}}};
    }
    MIDOUT_END();
}

/* ===================== naive algo ===================== */
bool ConvolutionImpl::AlgoNaive::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    bool ret = false;
#define cb(dt) ret |= (param.src_type.enumv() == DTypeTrait<dt>::enumv);
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
#undef cb
#define cb(dt_src, dt_dst)                                             \
    ret |= (param.src_type.enumv() == DTypeTrait<dt_src>::enumv &&     \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv &&  \
            param.dst_type.enumv() == DTypeTrait<dt_dst>::enumv)
    cb(dtype::Int8, dtype::Int16);
    cb(dtype::Int8, dtype::Int32);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32);
    cb(dtype::QuantizedS8, dtype::QuantizedS32);
#undef cb
    ret = ret && (param.filter_meta.format == param::Convolution::Format::NCHW ||
                  param.filter_meta.format == param::Convolution::Format::NHWC);
    return ret;
}

SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoNaive::dispatch_kern(
        const NCBKernSizeParam& param) const {
    size_t N = param.n;
    size_t group = param.filter_meta.group;
#define cb(dt, cmode, compute_type)                                       \
    do {                                                                  \
        if (param.src_type.enumv() == DTypeTrait<dt>::enumv &&            \
            param.compute_mode == param::ConvBias::ComputeMode::cmode) {  \
            using ctype = DTypeTrait<dt>::ctype;                          \
            using comp_type = DTypeTrait<compute_type>::ctype;            \
            MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv(1)) {            \
                return {                                                  \
                        {kern_naive_forward<ctype, ctype, comp_type>,     \
                         {group, N, 1_z}}};                               \
            }                                                             \
            MIDOUT_END();                                                 \
        }                                                                 \
    } while (0)
    cb(dtype::Float32, DEFAULT, dtype::Float32);
#if !MEGDNN_DISABLE_FLOAT16
    cb(dtype::Float16, DEFAULT, dtype::Float16);
    cb(dtype::Float16, FLOAT32, dtype::Float32);
#endif
#undef cb
#define cb(dt_src, dt_dst)                                                          \
    do {                                                                            \
        if (param.src_type.enumv() == DTypeTrait<dt_src>::enumv &&                  \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv &&               \
            param.dst_type.enumv() == DTypeTrait<dt_dst>::enumv) {                  \
            MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv(2)) {                      \
                return {                                                            \
                        {kern_naive_forward<                                        \
                                 DTypeTrait<dt_src>::ctype,                         \
                                 DTypeTrait<dt_dst>::ctype,                         \
                                 DTypeTrait<dt_dst>::ctype>,                        \
                         {group, N, 1_z}}};                                         \
            }                                                                       \
            MIDOUT_END();                                                           \
        }                                                                           \
    } while (0)
    cb(dtype::Int8, dtype::Int16);
    cb(dtype::Int8, dtype::Int32);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32);
    cb(dtype::QuantizedS8, dtype::QuantizedS32);
    megdnn_throw("unknown convolution data type");
#undef cb
}

/* ===================== default algo ===================== */
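// AlgoDefault wraps a ConvBiasImpl algorithm: the convolution is executed as a
// conv_bias operator with BiasMode::NO_BIAS and NonlineMode::IDENTITY (see
// init_conv_bias_param below), so conv_bias kernels are reused for plain
// convolution.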
ConvolutionImpl::AlgoDefault::AlgoDefault(ConvBiasImpl::AlgoBase* algorithm)
        : m_algorithm(algorithm) {
    megdnn_assert_internal(algorithm);
    m_name = ssprintf("CONVOLUTION_DEFAULT_%s", m_algorithm->name());
}

ConvBiasImpl::NCBKernSizeParam ConvolutionImpl::AlgoDefault::init_conv_bias_param(
        const NCBKernSizeParam& param) {
    DType bias_type = param.dst_type;
    if (bias_type.category() == DTypeCategory::QUANTIZED) {
        bias_type = dtype::QuantizedS32(mul_scale(param.src_type, param.filter_type));
    }
    return {param, bias_type, 0, BiasMode::NO_BIAS,
            param::ConvBias::NonlineMode::IDENTITY};
}

bool ConvolutionImpl::AlgoDefault::is_preferred(const NCBKernSizeParam& param) const {
    ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = init_conv_bias_param(param);
    return m_algorithm->is_preferred(conv_bias_param);
}

bool ConvolutionImpl::AlgoDefault::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy algo_selection_strategy) const {
    ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = init_conv_bias_param(param);
    return m_algorithm->usable(
            conv_bias_param,
            static_cast<ConvBiasImpl::AlgoSelectionStrategy>(algo_selection_strategy));
}

WorkspaceBundle ConvolutionImpl::AlgoDefault::get_bundle(
        const NCBKernSizeParam& param) const {
    ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = init_conv_bias_param(param);
    return WorkspaceBundle(nullptr, {m_algorithm->get_workspace(conv_bias_param)});
}

size_t ConvolutionImpl::AlgoDefault::get_workspace(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("AlgoDefault::get_workspace"_hash)) {
        return get_bundle(param).total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}

size_t ConvolutionImpl::AlgoDefault::get_preprocess_workspace(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_conv,
            midout_iv("AlgoDefault::get_preprocess_workspace"_hash)) {
        ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = init_conv_bias_param(param);
        return m_algorithm->get_preprocess_workspace(conv_bias_param);
    }
    MIDOUT_END();
}

SmallVector<TensorLayout> ConvolutionImpl::AlgoDefault::
        deduce_preprocessed_filter_layout(const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_conv,
            midout_iv("AlgoDefault::deduce_preprocessed_filter_layout"_hash)) {
        ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = init_conv_bias_param(param);
        return m_algorithm->deduce_preprocessed_filter_layout(conv_bias_param);
    }
    MIDOUT_END();
}

//! Return the preprocess kernel implementation
SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::
        get_preprocess_kimpl(
                ConvBiasImpl::AlgoBase* algo, const NCBKernSizeParam& param) {
    MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("get_preprocess_kimpl"_hash)) {
        // construct the conv_bias kern param
        ::ConvBiasImpl::NCBKernParam conv_bias_param;
        static_cast<::ConvBiasImpl::NCBKernSizeParam&>(conv_bias_param) =
                init_conv_bias_param(param);
        auto conv_bias_preprocess_kerns =
                algo->dispatch_preprocess_kerns(conv_bias_param);
        SmallVector<ConvolutionImpl::NCBKern> convolution_preprocess_kerns;
        for (size_t i = 0; i < conv_bias_preprocess_kerns.size(); i++) {
            auto kernel = conv_bias_preprocess_kerns[i];
            //! Wrap the conv_bias kernel, forwarding the convolution pointers so
            //! it can run batch parallel
            auto run = [conv_bias_param, kernel](
                               const NCBKernParam& p, const NCBKernIndex& ncb_index) {
                auto param = conv_bias_param;
                param.filter_ptr = p.filter_ptr;
                param.workspace_ptr = p.workspace_ptr;
                param.workspace_size = p.workspace_size;
                kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id});
            };
            convolution_preprocess_kerns.push_back({run, kernel.global_size});
        }
        return convolution_preprocess_kerns;
    }
    MIDOUT_END();
}

//! Return the kernel implementation
SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl(
        ConvBiasImpl::AlgoBase* algo, const NCBKernSizeParam& param) {
    MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv(0)) {
        // construct the conv_bias kern param
        ::ConvBiasImpl::NCBKernParam conv_bias_param;
        static_cast<::ConvBiasImpl::NCBKernSizeParam&>(conv_bias_param) =
                init_conv_bias_param(param);
        auto&& conv_bias_kerns = algo->dispatch_kerns(conv_bias_param);
        SmallVector<ConvolutionImpl::NCBKern> convolution_kerns;
        for (size_t i = 0; i < conv_bias_kerns.size(); i++) {
            auto&& kernel = conv_bias_kerns[i];
            //! Wrap the conv_bias kernel, forwarding the convolution pointers so
            //! it can run batch parallel
            auto run = [conv_bias_param, kernel](
                               const NCBKernParam& p, const NCBKernIndex& ncb_index) {
                auto param = conv_bias_param;
                param.src_ptr = p.src_ptr;
                param.filter_ptr = p.filter_ptr;
                param.dst_ptr = p.dst_ptr;
                param.workspace_ptr = p.workspace_ptr;
                param.workspace_size = p.workspace_size;
                kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id});
            };
            convolution_kerns.push_back({run, kernel.global_size});
        }
        return convolution_kerns;
    }
    MIDOUT_END();
}

/////////////////////////// ConvolutionBackwardData /////////////////////
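// Backward-data algorithms return a single ncb_kern_t: AlgoNaive dispatches
// kern_naive by dtype, AlgoDirect uses kern_direct (run_conv_backward_data),
// and AlgoMatrixMul / AlgoMatrixMulNCHW44 use the matmul + col2img kernels
// defined above.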
/* ===================== naive algo ===================== */
bool ConvolutionBackwardDataImpl::AlgoNaive::usable(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    bool ret = false;
#define cb(dt) ret |= (param.diff_type.enumv() == DTypeTrait<dt>::enumv);
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
#undef cb
#define cb(dt_src, dt_dst)                                             \
    ret |= (param.diff_type.enumv() == DTypeTrait<dt_src>::enumv &&    \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv &&  \
            param.grad_type.enumv() == DTypeTrait<dt_dst>::enumv)
    cb(dtype::Int8, dtype::Int32);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32);
    cb(dtype::QuantizedS8, dtype::QuantizedS32);
#undef cb
    return ret;
}

size_t ConvolutionBackwardDataImpl::AlgoNaive::get_workspace(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam&) const {
    return 0;
}

ConvolutionBackwardDataImpl::ncb_kern_t ConvolutionBackwardDataImpl::AlgoNaive::
        dispatch_kern(
                ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
#define cb(_dt)                                                                       \
    do {                                                                              \
        if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) {                    \
            MIDOUT_BEGIN(megdnn_fallback_deconv, midout_iv(DTypeTrait<_dt>::enumv)) { \
                using ctype = DTypeTrait<_dt>::ctype;                                 \
                return kern_naive<ctype, ctype, ctype>;                               \
            }                                                                         \
            MIDOUT_END();                                                             \
        }                                                                             \
    } while (0);
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb);
#undef cb
#define cb(dt_src, dt_dst)                                                          \
    do {                                                                            \
        if (param.diff_type.enumv() == DTypeTrait<dt_src>::enumv &&                 \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv &&               \
            param.grad_type.enumv() == DTypeTrait<dt_dst>::enumv) {                 \
            MIDOUT_BEGIN(                                                           \
                    megdnn_fallback_deconv, midout_iv(DTypeTrait<dt_src>::enumv)) { \
                return kern_naive<                                                  \
                        DTypeTrait<dt_src>::ctype, DTypeTrait<dt_src>::ctype,       \
                        DTypeTrait<dt_dst>::ctype>;                                 \
            }                                                                       \
            MIDOUT_END();                                                           \
        }                                                                           \
    } while (0)
    cb(dtype::Int8, dtype::Int32);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32);
    cb(dtype::QuantizedS8, dtype::QuantizedS32);
    megdnn_throw("unsupported data type on ConvolutionBackwardData");
#undef cb
}

/* ===================== direct algo ===================== */
bool ConvolutionBackwardDataImpl::AlgoDirect::usable(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    auto&& fm = param.filter_meta;
    return fm.format == param::Convolution::Format::NCHW &&
           param.diff_type.enumv() == DTypeEnum::Float32 &&
           param.filter_type.enumv() == DTypeEnum::Float32 &&
           param.grad_type.enumv() == DTypeEnum::Float32 && fm.spatial_ndim == 2 &&
           fm.group == 1 && fm.dilation[0] == 1 && fm.dilation[1] == 1;
}

size_t ConvolutionBackwardDataImpl::AlgoDirect::get_workspace(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_fallback_deconv, midout_iv("AlgoDirect::get_workspace"_hash)) {
        auto FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1];
        if (param.filter_meta.should_flip) {
            // need transpose filter
            return FH * FW * sizeof(float);
        } else {
            return 0;
        }
    }
    MIDOUT_END();
    return 0;
}

ConvolutionBackwardDataImpl::ncb_kern_t ConvolutionBackwardDataImpl::AlgoDirect::
        dispatch_kern(ConvolutionBackwardDataImpl*, const NCBKernSizeParam&) const {
    MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("AlgoDirect::dispatch_kern"_hash)) {
        return kern_direct;
    }
    MIDOUT_END();
}

/* ===================== Matrix mul algo ===================== */
bool ConvolutionBackwardDataImpl::AlgoMatrixMul::usable(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    auto&& fm = param.filter_meta;
    return fm.format == param::Convolution::Format::NCHW && fm.spatial_ndim == 2 &&
           fm.group == 1 && fm.dilation[0] == 1 && fm.dilation[1] == 1;
}

size_t ConvolutionBackwardDataImpl::AlgoMatrixMul::get_workspace(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_deconv, midout_iv("AlgoMatrixMul::get_workspace"_hash)) {
        return get_bundle(param).total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}

ConvolutionBackwardDataImpl::ncb_kern_t ConvolutionBackwardDataImpl::AlgoMatrixMul::
        dispatch_kern(
                ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
#define cb(dt, midout_tag)                                                    \
    do {                                                                      \
        if (param.filter_type.enumv() == DTypeTrait<dt>::enumv) {             \
            MIDOUT_BEGIN(megdnn_fallback_deconv, midout_iv(midout_tag)) {     \
                using ctype = DTypeTrait<dt>::ctype;                          \
                return kern_matmul<ctype, ctype, ctype>;                      \
            }                                                                 \
            MIDOUT_END();                                                     \
        }                                                                     \
    } while (0);
    cb(dtype::Float32, "FLOAT"_hash);
    DNN_INC_FLOAT16(cb(dtype::Float16, "FLOAT16"_hash));
    DNN_INC_FLOAT16(cb(dtype::BFloat16, "BFLOAT16"_hash));
#undef cb
#define cb(dt_src, dt_dst, midout_tag)                                        \
    do {                                                                      \
        if (param.diff_type.enumv() == DTypeTrait<dt_src>::enumv &&           \
            param.filter_type.enumv() == DTypeTrait<dt_src>::enumv &&         \
            param.grad_type.enumv() == DTypeTrait<dt_dst>::enumv) {           \
            MIDOUT_BEGIN(megdnn_fallback_deconv, midout_iv(midout_tag)) {     \
                return kern_matmul<                                           \
                        DTypeTrait<dt_src>::ctype, DTypeTrait<dt_src>::ctype, \
                        DTypeTrait<dt_dst>::ctype>;                           \
            }                                                                 \
            MIDOUT_END();                                                     \
        }                                                                     \
    } while (0)
    cb(dtype::Int8, dtype::Int32, "INT8x8x32"_hash);
    cb(dtype::QuantizedS8, dtype::QuantizedS32, "QINT8x8x32"_hash);
    cb(dtype::Quantized8Asymm, dtype::QuantizedS32, "QUINT8x8x32"_hash);
    megdnn_throw("unsupported data type on matrix mul");
#undef cb
}

bool ConvolutionBackwardDataImpl::AlgoMatrixMul::is_preferred(
        const NCBKernSizeParam& param) const {
    return is_matrix_mul_preferred(param);
}

/* ===================== Matrix mul nchw44 algo ===================== */
namespace {
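// NCHW44 variant: the filter is stored in 4x4 blocks and the matmul runs in
// MK4 format (see get_matmul_opr), so all layouts below carry a trailing
// dimension of 4; otherwise the flow is identical to kern_matmul above.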
void kern_matmul_nchw44(const NCBKernParam& param) {
    bool is_xcorr = !param.filter_meta.should_flip;
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    auto bundle = get_bundle(param);
    bundle.set(param.workspace_ptr);
    bool is1X1 = (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);
    typedef void (*Func1)(const float*, float*, int, int, int, int, int, int, int);
    typedef void (*Func2)(
            const float*, float*, int, int, int, int, int, int, int, int, int, int,
            int);
    Func1 f1 = nullptr;
    Func2 f2 = nullptr;
    if (is_xcorr) {
        f1 = col2img_nchw44<true>;
        f2 = col2img_stride_padding_nchw44<true>;
    } else {
        f1 = col2img_nchw44<false>;
        f2 = col2img_stride_padding_nchw44<false>;
    }
    float* filter = const_cast<float*>(param.filter<float>());
    TensorND A_src, A_dst;
    {
        A_src.layout = TensorLayout(
                {IC / 4 * FH * FW, OC / 4, 4, 4},
                {
                        static_cast<std::ptrdiff_t>(16),
                        static_cast<std::ptrdiff_t>(IC * FH * FW * 4),
                        static_cast<std::ptrdiff_t>(1),
                        static_cast<std::ptrdiff_t>(4),
                },
                param.filter_type);
        A_src.reset_ptr(static_cast<void*>(filter));
        A_dst.layout =
                TensorLayout({IC / 4 * FH * FW, OC / 4, 4, 4}, param.filter_type);
        A_dst.reset_ptr(static_cast<void*>(bundle.get(2)));
        // TODO: should be removed once armv8 convolution supports transpose.
        get_relayout_opr()->exec(A_src, A_dst, inplace_cpu_handle().get());
    }
    TensorND B_, C_;
    for (size_t n = 0; n < N; ++n) {
        float *C_src, *C_dst;
        float* diff = const_cast<float*>(param.diff<float>() + n * param.inp_bs);
        float* grad = param.grad<float>() + n * param.out_bs;
        if (is1X1) {
            C_src = grad;
        } else {
            C_src = static_cast<float*>(bundle.get(0));
        }
        {
            B_.layout = TensorLayout({OC / 4, IH * IW, 4}, param.diff_type);
            B_.reset_ptr(static_cast<void*>(diff));
            C_.layout = TensorLayout({IC / 4 * FH * FW, IH * IW, 4}, param.grad_type);
            C_.reset_ptr(C_src);
            Workspace workspace(
                    static_cast<dt_byte*>(bundle.get(1)), bundle.get_size(1));
            auto matmul_opr = get_matmul_opr(param);
            matmul_opr->exec(A_dst, B_, C_, workspace);
        }
        if (!is1X1) {
            C_dst = grad;
            std::memset(C_dst, 0, param.grad_type.size() * IC * OH * OW);
            if (PH == 0 && PW == 0 && SH == 1 && SW == 1) {
                f1(C_src, C_dst, OH, OW, IC, IH, IW, FH, FW);
            } else {
                f2(C_src, C_dst, OH, OW, IC, IH, IW, FH, FW, SH, SW, PH, PW);
            }
        }
    }
}

}  // namespace

bool ConvolutionBackwardDataImpl::AlgoMatrixMulNCHW44::usable(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    auto&& fm = param.filter_meta;
    return fm.format == param::Convolution::Format::NCHW44 &&
           param.diff_type.enumv() == DTypeTrait<dtype::Float32>::enumv &&
           param.filter_type.enumv() == DTypeTrait<dtype::Float32>::enumv &&
           param.grad_type.enumv() == DTypeTrait<dtype::Float32>::enumv &&
           fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 &&
           fm.dilation[1] == 1 && fm.icpg % 4 == 0 && fm.ocpg % 4 == 0;
}

size_t ConvolutionBackwardDataImpl::AlgoMatrixMulNCHW44::get_workspace(
        ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_deconv,
            midout_iv("AlgoMatrixMulNCHW44::get_workspace"_hash)) {
        return get_bundle(param).total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}

ConvolutionBackwardDataImpl::ncb_kern_t ConvolutionBackwardDataImpl::
        AlgoMatrixMulNCHW44::dispatch_kern(
                ConvolutionBackwardDataImpl*, const NCBKernSizeParam& param) const {
    if (param.filter_type.enumv() == DTypeTrait<dtype::Float32>::enumv) {
        MIDOUT_BEGIN(megdnn_fallback_deconv, midout_iv("FLOAT_NCHW44"_hash)) {
            return kern_matmul_nchw44;
        }
        MIDOUT_END();
    }
    megdnn_throw("unsupported data type on matrix mul");
}

bool ConvolutionBackwardDataImpl::AlgoMatrixMulNCHW44::is_preferred(
        const NCBKernSizeParam& param) const {
    return is_matrix_mul_preferred(param);
}

// vim: syntax=cpp.doxygen