You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

convolution_arm.cpp 43 kB

7 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolution_arm.h"
  15. #include "benchmark.h"
  16. #include "layer_type.h"
  17. #if __ARM_NEON
  18. #include <arm_neon.h>
  19. #include "neon_mathfun.h"
  20. #endif // __ARM_NEON
  21. namespace ncnn {
  22. #include "convolution_1x1.h"
  23. #include "convolution_2x2.h"
  24. #include "convolution_3x3.h"
  25. #include "convolution_4x4.h"
  26. #include "convolution_5x5.h"
  27. #include "convolution_7x7.h"
  28. #include "convolution_sgemm.h"
  29. #include "convolution_sgemm_int8.h"
  30. #include "convolution_1x1_int8.h"
  31. #include "convolution_3x3_int8.h"
  32. #include "convolution_5x5_int8.h"
  33. #include "convolution_7x7_int8.h"
  34. #if __ARM_NEON
  35. #include "convolution_1x1_pack4.h"
  36. #include "convolution_3x3_pack4.h"
  37. #include "convolution_3x3_pack1to4.h"
  38. #endif // __ARM_NEON
  39. DEFINE_LAYER_CREATOR(Convolution_arm)
  40. Convolution_arm::Convolution_arm()
  41. {
  42. #if __ARM_NEON
  43. support_packing = true;
  44. #endif // __ARM_NEON
  45. activation = 0;
  46. }
  47. int Convolution_arm::create_pipeline(const Option& opt)
  48. {
  49. if (activation_type == 1)
  50. {
  51. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  52. ncnn::ParamDict pd;
  53. activation->load_param(pd);
  54. }
  55. else if (activation_type == 2)
  56. {
  57. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  58. ncnn::ParamDict pd;
  59. pd.set(0, activation_params[0]);// slope
  60. activation->load_param(pd);
  61. }
  62. else if (activation_type == 3)
  63. {
  64. activation = ncnn::create_layer(ncnn::LayerType::Clip);
  65. ncnn::ParamDict pd;
  66. pd.set(0, activation_params[0]);// min
  67. pd.set(1, activation_params[1]);// max
  68. activation->load_param(pd);
  69. }
  70. else if (activation_type == 4)
  71. {
  72. activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
  73. ncnn::ParamDict pd;
  74. activation->load_param(pd);
  75. }
  76. if (activation)
  77. {
  78. Option opt_cpu = opt;
  79. opt_cpu.use_vulkan_compute = false;
  80. activation->create_pipeline(opt_cpu);
  81. }
  82. const int maxk = kernel_w * kernel_h;
  83. int num_input = weight_data_size / maxk / num_output;
  84. #if __ARM_NEON
  85. if (opt.use_packing_layout)
  86. {
  87. // pack4
  88. if (num_input % 4 == 0 && num_output % 4 == 0)
  89. {
  90. // src = kw-kh-inch-outch
  91. // dst = 4b-4a-kw-kh-inch/4a-outch/4b
  92. {
  93. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  94. weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16);
  95. for (int q=0; q+3<num_output; q+=4)
  96. {
  97. const Mat k0 = weight_data_r2.channel(q);
  98. const Mat k1 = weight_data_r2.channel(q+1);
  99. const Mat k2 = weight_data_r2.channel(q+2);
  100. const Mat k3 = weight_data_r2.channel(q+3);
  101. Mat g0 = weight_data_pack4.channel(q/4);
  102. for (int p=0; p+3<num_input; p+=4)
  103. {
  104. const float* k00 = k0.row(p);
  105. const float* k01 = k0.row(p+1);
  106. const float* k02 = k0.row(p+2);
  107. const float* k03 = k0.row(p+3);
  108. const float* k10 = k1.row(p);
  109. const float* k11 = k1.row(p+1);
  110. const float* k12 = k1.row(p+2);
  111. const float* k13 = k1.row(p+3);
  112. const float* k20 = k2.row(p);
  113. const float* k21 = k2.row(p+1);
  114. const float* k22 = k2.row(p+2);
  115. const float* k23 = k2.row(p+3);
  116. const float* k30 = k3.row(p);
  117. const float* k31 = k3.row(p+1);
  118. const float* k32 = k3.row(p+2);
  119. const float* k33 = k3.row(p+3);
  120. float* g00 = g0.row(p/4);
  121. for (int k=0; k<maxk; k++)
  122. {
  123. g00[0] = k00[k];
  124. g00[1] = k10[k];
  125. g00[2] = k20[k];
  126. g00[3] = k30[k];
  127. g00[4] = k01[k];
  128. g00[5] = k11[k];
  129. g00[6] = k21[k];
  130. g00[7] = k31[k];
  131. g00[8] = k02[k];
  132. g00[9] = k12[k];
  133. g00[10] = k22[k];
  134. g00[11] = k32[k];
  135. g00[12] = k03[k];
  136. g00[13] = k13[k];
  137. g00[14] = k23[k];
  138. g00[15] = k33[k];
  139. g00 += 16;
  140. }
  141. }
  142. }
  143. }
  144. if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
  145. {
  146. conv3x3s1_winograd64_transform_kernel_pack4_neon(weight_data, weight_3x3_winograd64_data_pack4, num_input, num_output);
  147. }
  148. }
  149. // pack1to4
  150. if (num_input % 4 != 0 && num_output % 4 == 0)
  151. {
  152. // src = kw-kh-inch-outch
  153. // dst = 4b-kw-kh-inch-outch/4b
  154. {
  155. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  156. weight_data_pack1to4.create(maxk, num_input, num_output/4, (size_t)4*4, 4);
  157. for (int q=0; q+3<num_output; q+=4)
  158. {
  159. const Mat k0 = weight_data_r2.channel(q);
  160. const Mat k1 = weight_data_r2.channel(q+1);
  161. const Mat k2 = weight_data_r2.channel(q+2);
  162. const Mat k3 = weight_data_r2.channel(q+3);
  163. Mat g0 = weight_data_pack1to4.channel(q/4);
  164. for (int p=0; p<num_input; p++)
  165. {
  166. const float* k00 = k0.row(p);
  167. const float* k10 = k1.row(p);
  168. const float* k20 = k2.row(p);
  169. const float* k30 = k3.row(p);
  170. float* g00 = g0.row(p);
  171. for (int k=0; k<maxk; k++)
  172. {
  173. g00[0] = k00[k];
  174. g00[1] = k10[k];
  175. g00[2] = k20[k];
  176. g00[3] = k30[k];
  177. g00 += 4;
  178. }
  179. }
  180. }
  181. }
  182. }
  183. // pack4to1
  184. if (num_input % 4 == 0 && num_output % 4 != 0)
  185. {
  186. // src = kw-kh-inch-outch
  187. // dst = 4a-kw-kh-inch/4a-outch
  188. {
  189. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  190. weight_data_pack4to1.create(maxk, num_input/4, num_output, (size_t)4*4, 4);
  191. for (int q=0; q<num_output; q++)
  192. {
  193. const Mat k0 = weight_data_r2.channel(q);
  194. Mat g0 = weight_data_pack4to1.channel(q);
  195. for (int p=0; p+3<num_input; p+=4)
  196. {
  197. const float* k00 = k0.row(p);
  198. const float* k01 = k0.row(p+1);
  199. const float* k02 = k0.row(p+2);
  200. const float* k03 = k0.row(p+3);
  201. float* g00 = g0.row(p/4);
  202. for (int k=0; k<maxk; k++)
  203. {
  204. g00[0] = k00[k];
  205. g00[1] = k01[k];
  206. g00[2] = k02[k];
  207. g00[3] = k03[k];
  208. g00 += 4;
  209. }
  210. }
  211. }
  212. }
  213. }
  214. } // opt.use_packing_layout
  215. #endif // __ARM_NEON
  216. use_winograd3x3 = false;
  217. use_sgemm1x1 = false;
  218. if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  219. {
  220. // winograd is slow on small channel count
  221. if (num_input >= 16 && num_output >= 16)
  222. use_winograd3x3 = true;
  223. if (use_int8_inference)
  224. use_winograd3x3 = true;
  225. }
  226. // TODO assume more proper condition
  227. if (opt.use_sgemm_convolution && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  228. {
  229. if (num_input >= 64 && num_output >= 64)
  230. use_sgemm1x1 = true;
  231. }
  232. if (use_int8_inference)
  233. {
  234. if (use_winograd3x3)
  235. {
  236. // conv3x3s1_winograd23_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_int8_data, num_input, num_output);
  237. conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_int8_data, num_input, num_output);
  238. }
  239. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  240. {
  241. conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_int8_data, num_input, num_output);
  242. }
  243. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  244. {
  245. conv1x1s1_sgemm_transform_kernel_int8_neon(weight_data, weight_1x1s1_sgemm_int8_data, num_input, num_output);
  246. use_sgemm1x1 = true;
  247. }
  248. else
  249. {
  250. conv_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_int8_data, num_input, num_output, maxk);
  251. }
  252. return 0;
  253. }
  254. if (impl_type > 0)
  255. {
  256. switch(impl_type)
  257. {
  258. case 1:
  259. // winograd
  260. conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  261. break;
  262. case 2:
  263. // pointwise
  264. conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
  265. break;
  266. case 3:
  267. // im2col
  268. conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, maxk);
  269. break;
  270. case 4:
  271. // direct
  272. break;
  273. case 5:
  274. // conv3x3s2
  275. conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
  276. break;
  277. default:
  278. return -1;
  279. }
  280. return 0;
  281. }
  282. if (use_winograd3x3)
  283. {
  284. // conv3x3s1_winograd64_transform_kernel_neon(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  285. conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  286. }
  287. if (use_sgemm1x1)
  288. {
  289. conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
  290. }
  291. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  292. {
  293. conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
  294. }
  295. {
  296. conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, maxk);
  297. }
  298. return 0;
  299. }
  300. int Convolution_arm::destroy_pipeline(const Option& opt)
  301. {
  302. if (activation)
  303. {
  304. Option opt_cpu = opt;
  305. opt_cpu.use_vulkan_compute = false;
  306. activation->destroy_pipeline(opt_cpu);
  307. delete activation;
  308. activation = 0;
  309. }
  310. return 0;
  311. }
  312. int Convolution_arm::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
  313. {
  314. int w = bottom_blob.w;
  315. int h = bottom_blob.h;
  316. size_t elemsize = bottom_blob.elemsize;
  317. const int kernel_size = kernel_w;
  318. const int stride = stride_w;
  319. const int dilation = dilation_w;
  320. const int kernel_extent = dilation * (kernel_size - 1) + 1;
  321. Mat bottom_blob_bordered = bottom_blob;
  322. if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
  323. {
  324. Option opt_b = opt;
  325. opt_b.blob_allocator = opt.workspace_allocator;
  326. copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
  327. if (bottom_blob_bordered.empty())
  328. return -100;
  329. w = bottom_blob_bordered.w;
  330. h = bottom_blob_bordered.h;
  331. }
  332. else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
  333. {
  334. int wpad = kernel_extent + (w - 1) / stride * stride - w;
  335. int hpad = kernel_extent + (h - 1) / stride * stride - h;
  336. if (wpad > 0 || hpad > 0)
  337. {
  338. Option opt_b = opt;
  339. opt_b.blob_allocator = opt.workspace_allocator;
  340. copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
  341. if (bottom_blob_bordered.empty())
  342. return -100;
  343. }
  344. w = bottom_blob_bordered.w;
  345. h = bottom_blob_bordered.h;
  346. }
  347. else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
  348. {
  349. int wpad = kernel_extent + (w - 1) / stride * stride - w;
  350. int hpad = kernel_extent + (h - 1) / stride * stride - h;
  351. if (wpad > 0 || hpad > 0)
  352. {
  353. Option opt_b = opt;
  354. opt_b.blob_allocator = opt.workspace_allocator;
  355. copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
  356. if (bottom_blob_bordered.empty())
  357. return -100;
  358. }
  359. w = bottom_blob_bordered.w;
  360. h = bottom_blob_bordered.h;
  361. }
  362. int outw = (w - kernel_extent) / stride + 1;
  363. int outh = (h - kernel_extent) / stride + 1;
  364. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  365. if (top_blob.empty())
  366. return -100;
  367. // Make (dilation * dilation) batches
  368. Mat inner_bottom_blob;
  369. Mat inner_top_blob;
  370. for (int x = 0; x < dilation; x ++)
  371. {
  372. for (int y = 0; y < dilation; y ++)
  373. {
  374. int inner_w = (w - y + dilation - 1) / dilation;
  375. int inner_h = (h - x + dilation - 1) / dilation;
  376. int inner_outw = (inner_w - kernel_size) / stride + 1;
  377. int inner_outh = (inner_h - kernel_size) / stride + 1;
  378. inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
  379. if (inner_bottom_blob.empty())
  380. return -100;
  381. inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
  382. if (inner_top_blob.empty())
  383. return -100;
  384. #pragma omp parallel for num_threads(opt.num_threads)
  385. for (int c = 0; c < bottom_blob.c; c ++)
  386. {
  387. float *outptr = inner_bottom_blob.channel(c);
  388. for (int i = 0; i < inner_h; i ++)
  389. {
  390. const float *ptr = (const float *) bottom_blob_bordered.channel(c) + dilation * i * w + x * w + y;
  391. for (int j = 0; j < inner_w; j ++)
  392. {
  393. outptr[j] = ptr[j*dilation];
  394. }
  395. outptr += inner_w;
  396. }
  397. }
  398. ncnn::Option opt_g = opt;
  399. opt_g.blob_allocator = inner_top_blob.allocator;
  400. conv(inner_bottom_blob, inner_top_blob, weight_data, bias_data, opt_g);
  401. #pragma omp parallel for num_threads(opt.num_threads)
  402. for (int c = 0; c < num_output; c ++)
  403. {
  404. float *outptr = (float *) top_blob.channel(c) + x * outw + y;
  405. for (int i = 0; i < inner_outh; i ++)
  406. {
  407. const float *ptr = (const float *) inner_top_blob.channel(c) + i * inner_outw;
  408. for (int j = 0; j < inner_outw; j ++)
  409. {
  410. outptr[j*dilation] = ptr[j];
  411. }
  412. outptr += dilation * outw;
  413. }
  414. }
  415. }
  416. }
  417. return 0;
  418. }
  419. int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  420. {
  421. // convolv with NxN kernel
  422. // value = value + bias
  423. #if __ARM_NEON
  424. if (opt.use_packing_layout)
  425. {
  426. int w = bottom_blob.w;
  427. int h = bottom_blob.h;
  428. int channels = bottom_blob.c;
  429. size_t elemsize = bottom_blob.elemsize;
  430. int elempack = bottom_blob.elempack;
  431. // fprintf(stderr, "Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  432. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  433. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  434. Mat bottom_blob_bordered = bottom_blob;
  435. if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
  436. {
  437. Option opt_b = opt;
  438. opt_b.blob_allocator = opt.workspace_allocator;
  439. copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
  440. }
  441. else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
  442. {
  443. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  444. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  445. if (wpad > 0 || hpad > 0)
  446. {
  447. Option opt_b = opt;
  448. opt_b.blob_allocator = opt.workspace_allocator;
  449. copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
  450. }
  451. }
  452. else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
  453. {
  454. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  455. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  456. if (wpad > 0 || hpad > 0)
  457. {
  458. Option opt_b = opt;
  459. opt_b.blob_allocator = opt.workspace_allocator;
  460. copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
  461. }
  462. }
  463. if (bottom_blob_bordered.empty())
  464. return -100;
  465. w = bottom_blob_bordered.w;
  466. h = bottom_blob_bordered.h;
  467. int outw = (w - kernel_extent_w) / stride_w + 1;
  468. int outh = (h - kernel_extent_h) / stride_h + 1;
  469. int out_elempack = num_output % 4 == 0 ? 4 : 1;
  470. size_t out_elemsize = elemsize / elempack * out_elempack;
  471. const int maxk = kernel_w * kernel_h;
  472. // kernel offsets
  473. std::vector<int> _space_ofs(maxk);
  474. int* space_ofs = &_space_ofs[0];
  475. {
  476. int p1 = 0;
  477. int p2 = 0;
  478. int gap = w * dilation_h - kernel_w * dilation_w;
  479. for (int i = 0; i < kernel_h; i++)
  480. {
  481. for (int j = 0; j < kernel_w; j++)
  482. {
  483. space_ofs[p1] = p2;
  484. p1++;
  485. p2 += dilation_w;
  486. }
  487. p2 += gap;
  488. }
  489. }
  490. // float32
  491. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  492. if (top_blob.empty())
  493. return -100;
  494. if (elempack == 4 && out_elempack == 4)
  495. {
  496. if (kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
  497. {
  498. conv1x1s1_sgemm_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  499. if (activation)
  500. {
  501. activation->forward_inplace(top_blob, opt);
  502. }
  503. return 0;
  504. }
  505. if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
  506. {
  507. conv3x3s1_winograd64_pack4_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data_pack4, bias_data, opt);
  508. if (activation)
  509. {
  510. activation->forward_inplace(top_blob, opt);
  511. }
  512. return 0;
  513. }
  514. // num_output
  515. #pragma omp parallel for num_threads(opt.num_threads)
  516. for (int p=0; p<num_output / out_elempack; p++)
  517. {
  518. float* outptr = top_blob.channel(p);
  519. for (int i = 0; i < outh; i++)
  520. {
  521. for (int j = 0; j < outw; j++)
  522. {
  523. float32x4_t _sum = vdupq_n_f32(0.f);
  524. if (bias_term)
  525. {
  526. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  527. }
  528. const float* kptr = (const float*)weight_data_pack4 + maxk * channels * p * 16;
  529. // channels
  530. for (int q=0; q<channels; q++)
  531. {
  532. const Mat m = bottom_blob_bordered.channel(q);
  533. const float* sptr = m.row(i*stride_h) + j*stride_w * 4;
  534. for (int k = 0; k < maxk; k++) // 29.23
  535. {
  536. float32x4_t _val = vld1q_f32( sptr + space_ofs[k] * 4 );
  537. float32x4_t _w0 = vld1q_f32( kptr );
  538. float32x4_t _w1 = vld1q_f32( kptr + 4 );
  539. float32x4_t _w2 = vld1q_f32( kptr + 8 );
  540. float32x4_t _w3 = vld1q_f32( kptr + 12 );
  541. #if __aarch64__
  542. _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
  543. _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
  544. _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
  545. _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
  546. #else
  547. _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
  548. _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
  549. _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
  550. _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
  551. #endif
  552. kptr += 16;
  553. }
  554. }
  555. if (activation_type == 1)
  556. {
  557. float32x4_t _zero = vdupq_n_f32(0.f);
  558. _sum = vmaxq_f32(_sum, _zero);
  559. }
  560. else if (activation_type == 2)
  561. {
  562. float32x4_t _zero = vdupq_n_f32(0.f);
  563. float32x4_t _slope = vdupq_n_f32(activation_params[0]);
  564. uint32x4_t _lemask = vcleq_f32(_sum, _zero);
  565. float32x4_t _ps = vmulq_f32(_sum, _slope);
  566. _sum = vbslq_f32(_lemask, _ps, _sum);
  567. }
  568. else if (activation_type == 3)
  569. {
  570. float32x4_t _min = vdupq_n_f32(activation_params[0]);
  571. float32x4_t _max = vdupq_n_f32(activation_params[1]);
  572. _sum = vmaxq_f32(_sum, _min);
  573. _sum = vminq_f32(_sum, _max);
  574. }
  575. else if (activation_type == 4)
  576. {
  577. float32x4_t _one = vdupq_n_f32(1.f);
  578. _sum = vnegq_f32(_sum);
  579. _sum = exp_ps(_sum);
  580. _sum = vaddq_f32(_sum, _one);
  581. float32x4_t _outp = vrecpeq_f32(_sum);
  582. _outp = vmulq_f32(vrecpsq_f32(_sum, _outp), _outp);
  583. // _outp = vmulq_f32(vrecpsq_f32(_sum, _outp), _outp);
  584. _sum = _outp;
  585. }
  586. vst1q_f32(outptr + j * 4, _sum);
  587. }
  588. outptr += outw * 4;
  589. }
  590. }
  591. return 0;
  592. }
  593. if (elempack == 1 && out_elempack == 4)
  594. {
  595. if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
  596. {
  597. conv3x3s1_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);
  598. if (activation)
  599. {
  600. activation->forward_inplace(top_blob, opt);
  601. }
  602. return 0;
  603. }
  604. if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
  605. {
  606. conv3x3s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);
  607. if (activation)
  608. {
  609. activation->forward_inplace(top_blob, opt);
  610. }
  611. return 0;
  612. }
  613. // num_output
  614. #pragma omp parallel for num_threads(opt.num_threads)
  615. for (int p=0; p<num_output / out_elempack; p++)
  616. {
  617. float* outptr = top_blob.channel(p);
  618. for (int i = 0; i < outh; i++)
  619. {
  620. for (int j = 0; j < outw; j++)
  621. {
  622. float32x4_t _sum = vdupq_n_f32(0.f);
  623. if (bias_term)
  624. {
  625. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  626. }
  627. const float* kptr = (const float*)weight_data_pack1to4 + maxk * channels * p * 4;
  628. // channels
  629. for (int q=0; q<channels; q++)
  630. {
  631. const Mat m = bottom_blob_bordered.channel(q);
  632. const float* sptr = m.row(i*stride_h) + j*stride_w;
  633. for (int k = 0; k < maxk; k++) // 29.23
  634. {
  635. float32x4_t _val = vdupq_n_f32( sptr[ space_ofs[k] ] );
  636. float32x4_t _w = vld1q_f32( kptr );
  637. _sum = vmlaq_f32(_sum, _val, _w);
  638. kptr += 4;
  639. }
  640. }
  641. if (activation_type == 1)
  642. {
  643. float32x4_t _zero = vdupq_n_f32(0.f);
  644. _sum = vmaxq_f32(_sum, _zero);
  645. }
  646. else if (activation_type == 2)
  647. {
  648. float32x4_t _zero = vdupq_n_f32(0.f);
  649. float32x4_t _slope = vdupq_n_f32(activation_params[0]);
  650. uint32x4_t _lemask = vcleq_f32(_sum, _zero);
  651. float32x4_t _ps = vmulq_f32(_sum, _slope);
  652. _sum = vbslq_f32(_lemask, _ps, _sum);
  653. }
  654. else if (activation_type == 3)
  655. {
  656. float32x4_t _min = vdupq_n_f32(activation_params[0]);
  657. float32x4_t _max = vdupq_n_f32(activation_params[1]);
  658. _sum = vmaxq_f32(_sum, _min);
  659. _sum = vminq_f32(_sum, _max);
  660. }
  661. else if (activation_type == 4)
  662. {
  663. float32x4_t _one = vdupq_n_f32(1.f);
  664. _sum = vnegq_f32(_sum);
  665. _sum = exp_ps(_sum);
  666. _sum = vaddq_f32(_sum, _one);
  667. float32x4_t _outp = vrecpeq_f32(_sum);
  668. _outp = vmulq_f32(vrecpsq_f32(_sum, _outp), _outp);
  669. // _outp = vmulq_f32(vrecpsq_f32(_sum, _outp), _outp);
  670. _sum = _outp;
  671. }
  672. vst1q_f32(outptr + j * 4, _sum);
  673. }
  674. outptr += outw * 4;
  675. }
  676. }
  677. return 0;
  678. }
  679. if (elempack == 4 && out_elempack == 1)
  680. {
  681. // num_output
  682. #pragma omp parallel for num_threads(opt.num_threads)
  683. for (int p=0; p<num_output; p++)
  684. {
  685. float* outptr = top_blob.channel(p);
  686. for (int i = 0; i < outh; i++)
  687. {
  688. for (int j = 0; j < outw; j++)
  689. {
  690. float sum = 0.f;
  691. if (bias_term)
  692. {
  693. sum = bias_data[p];
  694. }
  695. const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4;
  696. // channels
  697. for (int q=0; q<channels; q++)
  698. {
  699. const Mat m = bottom_blob_bordered.channel(q);
  700. const float* sptr = m.row(i*stride_h) + j*stride_w * 4;
  701. for (int k = 0; k < maxk; k++) // 29.23
  702. {
  703. float32x4_t _val = vld1q_f32( sptr + space_ofs[k] * 4 );
  704. float32x4_t _w = vld1q_f32( kptr );
  705. float32x4_t _s4 = vmulq_f32(_val, _w);
  706. #if __aarch64__
  707. sum += vaddvq_f32(_s4); // dot
  708. #else
  709. float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
  710. _ss = vpadd_f32(_ss, _ss);
  711. sum += vget_lane_f32(_ss, 0);
  712. #endif
  713. kptr += 4;
  714. }
  715. }
  716. if (activation_type == 1)
  717. {
  718. sum = std::max(sum, 0.f);
  719. }
  720. else if (activation_type == 2)
  721. {
  722. float slope = activation_params[0];
  723. sum = sum > 0.f ? sum : sum * slope;
  724. }
  725. else if (activation_type == 3)
  726. {
  727. float min = activation_params[0];
  728. float max = activation_params[1];
  729. if (sum < min)
  730. sum = min;
  731. if (sum > max)
  732. sum = max;
  733. }
  734. else if (activation_type == 4)
  735. {
  736. sum = 1.f / (1.f + exp(-sum));
  737. }
  738. outptr[j] = sum;
  739. }
  740. outptr += outw;
  741. }
  742. }
  743. return 0;
  744. }
  745. } // opt.use_packed_layout
  746. #endif // __ARM_NEON
  747. if (bottom_blob.dims != 3)
  748. {
  749. return Convolution::forward(bottom_blob, top_blob, opt);
  750. }
  751. if (kernel_w != kernel_h || stride_w != stride_h)
  752. {
  753. return Convolution::forward(bottom_blob, top_blob, opt);
  754. }
  755. const int kernel_size = kernel_w;
  756. //const int stride = stride_w;
  757. int stride = stride_w;
  758. if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h)
  759. {
  760. return Convolution::forward(bottom_blob, top_blob, opt);
  761. }
  762. typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);
  763. // kernel_size x stride
  764. conv_func conv_func_table[7][4] =
  765. {
  766. {
  767. conv1x1s1_neon,
  768. conv1x1s2_neon,
  769. 0,
  770. 0
  771. }, // kernel_size = 1
  772. {
  773. conv2x2s1_neon,
  774. 0,
  775. 0,
  776. 0
  777. }, // kernel_size = 2
  778. {
  779. conv3x3s1_neon,
  780. conv3x3s2_neon,
  781. 0,
  782. 0
  783. }, // kernel_size = 3
  784. {
  785. 0,
  786. 0,
  787. 0,
  788. conv4x4s4_neon
  789. }, // kernel_size = 4
  790. {
  791. conv5x5s1_neon,
  792. conv5x5s2_neon,
  793. 0,
  794. 0
  795. }, // kernel_size = 5
  796. {
  797. 0,
  798. 0,
  799. 0,
  800. 0
  801. }, // kernel_size = 6
  802. {
  803. conv7x7s1_neon,
  804. conv7x7s2_neon,
  805. 0,
  806. 0
  807. } // kernel_size = 7
  808. };
  809. typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&);
  810. // kernel_size x stride
  811. conv_int8_func conv_int8_func_table[7][4] =
  812. {
  813. {
  814. conv1x1s1_int8_neon,
  815. conv1x1s2_int8_neon,
  816. 0,
  817. 0
  818. }, // kernel_size = 1
  819. {
  820. 0,
  821. 0,
  822. 0,
  823. 0
  824. }, // kernel_size = 2
  825. {
  826. conv3x3s1_int8_neon,
  827. conv3x3s2_int8_neon,
  828. 0,
  829. 0
  830. }, // kernel_size = 3
  831. {
  832. 0,
  833. 0,
  834. 0,
  835. 0
  836. }, // kernel_size = 4
  837. {
  838. conv5x5s1_int8_neon,
  839. conv5x5s2_int8_neon,
  840. 0,
  841. 0
  842. }, // kernel_size = 5
  843. {
  844. 0,
  845. 0,
  846. 0,
  847. 0
  848. }, // kernel_size = 6
  849. {
  850. conv7x7s1_int8_neon,
  851. conv7x7s2_int8_neon,
  852. 0,
  853. 0
  854. } // kernel_size = 7
  855. };
  856. conv_func conv = 0;
  857. conv_int8_func conv_int8 = 0;
  858. if (use_int8_inference)
  859. {
  860. conv_int8 = conv_int8_func_table[kernel_size-1][stride-1];
  861. if (!conv_int8)
  862. {
  863. return Convolution::forward(bottom_blob, top_blob, opt);
  864. }
  865. }
  866. else
  867. {
  868. conv = conv_func_table[kernel_size-1][stride-1];
  869. if (!conv)
  870. {
  871. return Convolution::forward(bottom_blob, top_blob, opt);
  872. }
  873. if (dilation_w != 1)
  874. {
  875. if (stride != 1)
  876. return Convolution::forward(bottom_blob, top_blob, opt);
  877. return forwardDilation(bottom_blob, top_blob, conv, opt);
  878. }
  879. }
  880. int w = bottom_blob.w;
  881. int h = bottom_blob.h;
  882. int channels = bottom_blob.c;
  883. size_t elemsize = bottom_blob.elemsize;
  884. Mat bottom_blob_unbordered = bottom_blob;
  885. if (use_int8_inference && elemsize != 1)
  886. {
  887. Mat bottom_blob_int8;
  888. bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  889. if (bottom_blob_int8.empty())
  890. return -100;
  891. // quantize, scale and round to nearest
  892. {
  893. ncnn::Option opt_g = opt;
  894. opt_g.blob_allocator = bottom_blob_int8.allocator;
  895. quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
  896. }
  897. bottom_blob_unbordered = bottom_blob_int8;
  898. }
  899. Mat bottom_blob_bordered = bottom_blob_unbordered;
  900. if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
  901. {
  902. Option opt_b = opt;
  903. opt_b.blob_allocator = opt.workspace_allocator;
  904. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
  905. }
  906. else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
  907. {
  908. int wpad = kernel_size + (w - 1) / stride * stride - w;
  909. int hpad = kernel_size + (h - 1) / stride * stride - h;
  910. if (wpad > 0 || hpad > 0)
  911. {
  912. Option opt_b = opt;
  913. opt_b.blob_allocator = opt.workspace_allocator;
  914. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
  915. }
  916. }
  917. else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
  918. {
  919. int wpad = kernel_size + (w - 1) / stride * stride - w;
  920. int hpad = kernel_size + (h - 1) / stride * stride - h;
  921. if (wpad > 0 || hpad > 0)
  922. {
  923. Option opt_b = opt;
  924. opt_b.blob_allocator = opt.workspace_allocator;
  925. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
  926. }
  927. }
  928. if (bottom_blob_bordered.empty())
  929. return -100;
  930. w = bottom_blob_bordered.w;
  931. h = bottom_blob_bordered.h;
  932. int outw = (w - kernel_size) / stride + 1;
  933. int outh = (h - kernel_size) / stride + 1;
  934. // int8
  935. if (use_int8_inference)
  936. {
  937. if (use_int8_requantize == true)
  938. {
  939. Mat top_blob_tm;
  940. top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
  941. if (top_blob_tm.empty())
  942. return -100;
  943. top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
  944. if (top_blob.empty())
  945. return -100;
  946. if (use_sgemm1x1)
  947. {
  948. conv1x1s1_sgemm_int8_requant_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, bias_data, requantize_scales, opt);
  949. if (activation)
  950. {
  951. activation->forward_inplace(top_blob, opt);
  952. }
  953. return 0;
  954. }
  955. else if (use_winograd3x3)
  956. {
  957. // conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_int8_data, opt);
  958. conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_int8_data, opt);
  959. }
  960. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  961. {
  962. conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s2_int8_data, opt);
  963. }
  964. else
  965. {
  966. conv_int8(bottom_blob_bordered, top_blob_tm, weight_sgemm_int8_data, opt);
  967. }
  968. // requantize, reverse scale inplace
  969. #pragma omp parallel for num_threads(opt.num_threads)
  970. for (int p=0; p<num_output; p++)
  971. {
  972. ncnn::Option opt_g = opt;
  973. opt_g.num_threads = 1;
  974. opt_g.blob_allocator = top_blob.allocator;
  975. Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
  976. Mat top_blob_g = top_blob.channel_range(p, 1);
  977. requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
  978. }
  979. }
  980. else
  981. {
  982. top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
  983. if (top_blob.empty())
  984. return -100;
  985. if (use_sgemm1x1)
  986. {
  987. conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt);
  988. }
  989. else if (use_winograd3x3)
  990. {
  991. // conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_int8_data, opt);
  992. // conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_int8_data, opt);
  993. conv3x3s1_winograd43_dequant_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_int8_data, bias_data, dequantize_scales, opt);
  994. if (activation)
  995. {
  996. activation->forward_inplace(top_blob, opt);
  997. }
  998. return 0;
  999. }
  1000. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1001. {
  1002. conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt);
  1003. }
  1004. else
  1005. {
  1006. conv_int8(bottom_blob_bordered, top_blob, weight_sgemm_int8_data, opt);
  1007. }
  1008. // dequantize, reverse scale inplace
  1009. #pragma omp parallel for num_threads(opt.num_threads)
  1010. for (int p=0; p<num_output; p++)
  1011. {
  1012. ncnn::Option opt_g = opt;
  1013. opt_g.num_threads = 1;
  1014. opt_g.blob_allocator = top_blob.allocator;
  1015. Mat top_blob_g = top_blob.channel_range(p, 1);
  1016. dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
  1017. }
  1018. }
  1019. if (activation)
  1020. {
  1021. activation->forward_inplace(top_blob, opt);
  1022. }
  1023. return 0;
  1024. }
  1025. // float32
  1026. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  1027. if (top_blob.empty())
  1028. return -100;
  1029. if (impl_type > 0)
  1030. {
  1031. // engineering is magic.
  1032. switch(impl_type)
  1033. {
  1034. case 1:
  1035. conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  1036. break;
  1037. case 2:
  1038. conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
  1039. break;
  1040. case 3:
  1041. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  1042. break;
  1043. case 4:
  1044. conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  1045. break;
  1046. case 5:
  1047. conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
  1048. break;
  1049. default:
  1050. return -1;
  1051. }
  1052. } else
  1053. {
  1054. if (use_winograd3x3 && w <= 120 && h <= 120)
  1055. {
  1056. // conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  1057. conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  1058. }
  1059. else if (use_sgemm1x1)
  1060. {
  1061. conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
  1062. }
  1063. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1064. {
  1065. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  1066. }
  1067. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1068. {
  1069. if (outw >=8 && outh >=8)
  1070. conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
  1071. else
  1072. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  1073. }
  1074. else
  1075. conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  1076. }
  1077. if (activation)
  1078. {
  1079. activation->forward_inplace(top_blob, opt);
  1080. }
  1081. return 0;
  1082. }
  1083. } // namespace ncnn