You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

convolution_arm.cpp 71 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolution_arm.h"
  15. #include "benchmark.h"
  16. #include "cpu.h"
  17. #include "layer_type.h"
  18. #if __ARM_NEON
  19. #include <arm_neon.h>
  20. #include "neon_mathfun.h"
  21. #endif // __ARM_NEON
  22. #include "neon_activation.h"
  23. namespace ncnn {
  24. #include "convolution_sgemm.h"
  25. #include "convolution_sgemm_int8.h"
  26. #include "convolution_1x1.h"
  27. #include "convolution_1x1_bf16s.h"
  28. #include "convolution_1x1_int8.h"
  29. #include "convolution_2x2.h"
  30. #include "convolution_3x3.h"
  31. #include "convolution_3x3_int8.h"
  32. #include "convolution_4x4.h"
  33. #include "convolution_5x5.h"
  34. #include "convolution_7x7.h"
  35. #if __ARM_NEON
  36. #include "convolution_1x1_pack4.h"
  37. #include "convolution_1x1_pack4_bf16s.h"
  38. #include "convolution_1x1_pack4to1.h"
  39. #include "convolution_1x1_pack4to1_bf16s.h"
  40. #include "convolution_3x3_pack1to4.h"
  41. #include "convolution_3x3_pack1to4_bf16s.h"
  42. #include "convolution_3x3_pack4.h"
  43. #include "convolution_3x3_pack4_bf16s.h"
  44. #include "convolution_3x3_pack4to1.h"
  45. #include "convolution_3x3_pack4to1_bf16s.h"
  46. #include "convolution_5x5_pack4.h"
  47. #include "convolution_5x5_pack4_bf16s.h"
  48. #include "convolution_7x7_pack1to4.h"
  49. #include "convolution_7x7_pack1to4_bf16s.h"
  50. #endif // __ARM_NEON
  51. Convolution_arm::Convolution_arm()
  52. {
  53. #if __ARM_NEON
  54. support_packing = true;
  55. #endif // __ARM_NEON
  56. support_bf16_storage = true;
  57. activation = 0;
  58. convolution_dilation1 = 0;
  59. }
  60. int Convolution_arm::create_pipeline(const Option& opt)
  61. {
  62. if (activation_type == 1)
  63. {
  64. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  65. ncnn::ParamDict pd;
  66. activation->load_param(pd);
  67. }
  68. else if (activation_type == 2)
  69. {
  70. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  71. ncnn::ParamDict pd;
  72. pd.set(0, activation_params[0]); // slope
  73. activation->load_param(pd);
  74. }
  75. else if (activation_type == 3)
  76. {
  77. activation = ncnn::create_layer(ncnn::LayerType::Clip);
  78. ncnn::ParamDict pd;
  79. pd.set(0, activation_params[0]); // min
  80. pd.set(1, activation_params[1]); // max
  81. activation->load_param(pd);
  82. }
  83. else if (activation_type == 4)
  84. {
  85. activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
  86. ncnn::ParamDict pd;
  87. activation->load_param(pd);
  88. }
  89. else if (activation_type == 5)
  90. {
  91. activation = ncnn::create_layer(ncnn::LayerType::Mish);
  92. ncnn::ParamDict pd;
  93. activation->load_param(pd);
  94. }
  95. if (activation)
  96. {
  97. activation->create_pipeline(opt);
  98. }
  99. if (opt.use_bf16_storage)
  100. {
  101. return create_pipeline_bf16s(opt);
  102. }
  103. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  104. {
  105. support_packing = false;
  106. return create_pipeline_int8_arm(opt);
  107. }
  108. if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  109. {
  110. convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution);
  111. // set param
  112. ncnn::ParamDict pd;
  113. pd.set(0, num_output); // num_output
  114. pd.set(1, kernel_w);
  115. pd.set(11, kernel_h);
  116. pd.set(2, 1);
  117. pd.set(12, 1);
  118. pd.set(3, 1); // stride_w
  119. pd.set(13, 1); // stride_h
  120. pd.set(4, 0); // pad_w
  121. pd.set(14, 0); // pad_h
  122. pd.set(5, bias_term);
  123. pd.set(6, weight_data_size);
  124. convolution_dilation1->load_param(pd);
  125. // set weights
  126. if (bias_term)
  127. {
  128. ncnn::Mat weights[2];
  129. weights[0] = weight_data;
  130. weights[1] = bias_data;
  131. convolution_dilation1->load_model(ModelBinFromMatArray(weights));
  132. }
  133. else
  134. {
  135. ncnn::Mat weights[1];
  136. weights[0] = weight_data;
  137. convolution_dilation1->load_model(ModelBinFromMatArray(weights));
  138. }
  139. convolution_dilation1->create_pipeline(opt);
  140. return 0;
  141. }
  142. const int maxk = kernel_w * kernel_h;
  143. const int num_input = weight_data_size / maxk / num_output;
  144. int elempack = (support_packing && opt.use_packing_layout && num_input % 4 == 0) ? 4 : 1;
  145. int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
  146. #if __ARM_NEON
  147. // pack4
  148. if (elempack == 4 && out_elempack == 4)
  149. {
  150. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  151. {
  152. conv1x1s1_sgemm_transform_kernel_pack4_neon(weight_data, weight_data_pack4, num_input, num_output);
  153. }
  154. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  155. {
  156. conv1x1s1_sgemm_transform_kernel_pack4_neon(weight_data, weight_data_pack4, num_input, num_output);
  157. }
  158. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  159. {
  160. conv3x3s1_winograd64_transform_kernel_pack4_neon(weight_data, weight_data_pack4, num_input, num_output);
  161. }
  162. else
  163. {
  164. // src = kw-kh-inch-outch
  165. // dst = 4b-4a-kw-kh-inch/4a-outch/4b
  166. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  167. weight_data_pack4.create(maxk, num_input / 4, num_output / 4, (size_t)4 * 16, 16);
  168. for (int q = 0; q + 3 < num_output; q += 4)
  169. {
  170. const Mat k0 = weight_data_r2.channel(q);
  171. const Mat k1 = weight_data_r2.channel(q + 1);
  172. const Mat k2 = weight_data_r2.channel(q + 2);
  173. const Mat k3 = weight_data_r2.channel(q + 3);
  174. Mat g0 = weight_data_pack4.channel(q / 4);
  175. for (int p = 0; p + 3 < num_input; p += 4)
  176. {
  177. const float* k00 = k0.row(p);
  178. const float* k01 = k0.row(p + 1);
  179. const float* k02 = k0.row(p + 2);
  180. const float* k03 = k0.row(p + 3);
  181. const float* k10 = k1.row(p);
  182. const float* k11 = k1.row(p + 1);
  183. const float* k12 = k1.row(p + 2);
  184. const float* k13 = k1.row(p + 3);
  185. const float* k20 = k2.row(p);
  186. const float* k21 = k2.row(p + 1);
  187. const float* k22 = k2.row(p + 2);
  188. const float* k23 = k2.row(p + 3);
  189. const float* k30 = k3.row(p);
  190. const float* k31 = k3.row(p + 1);
  191. const float* k32 = k3.row(p + 2);
  192. const float* k33 = k3.row(p + 3);
  193. float* g00 = g0.row(p / 4);
  194. for (int k = 0; k < maxk; k++)
  195. {
  196. g00[0] = k00[k];
  197. g00[1] = k10[k];
  198. g00[2] = k20[k];
  199. g00[3] = k30[k];
  200. g00[4] = k01[k];
  201. g00[5] = k11[k];
  202. g00[6] = k21[k];
  203. g00[7] = k31[k];
  204. g00[8] = k02[k];
  205. g00[9] = k12[k];
  206. g00[10] = k22[k];
  207. g00[11] = k32[k];
  208. g00[12] = k03[k];
  209. g00[13] = k13[k];
  210. g00[14] = k23[k];
  211. g00[15] = k33[k];
  212. g00 += 16;
  213. }
  214. }
  215. }
  216. }
  217. }
  218. // pack1to4
  219. if (elempack == 1 && out_elempack == 4)
  220. {
  221. // src = kw-kh-inch-outch
  222. // dst = 4b-kw-kh-inch-outch/4b
  223. {
  224. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  225. weight_data_pack1to4.create(maxk, num_input, num_output / 4, (size_t)4 * 4, 4);
  226. for (int q = 0; q + 3 < num_output; q += 4)
  227. {
  228. const Mat k0 = weight_data_r2.channel(q);
  229. const Mat k1 = weight_data_r2.channel(q + 1);
  230. const Mat k2 = weight_data_r2.channel(q + 2);
  231. const Mat k3 = weight_data_r2.channel(q + 3);
  232. Mat g0 = weight_data_pack1to4.channel(q / 4);
  233. for (int p = 0; p < num_input; p++)
  234. {
  235. const float* k00 = k0.row(p);
  236. const float* k10 = k1.row(p);
  237. const float* k20 = k2.row(p);
  238. const float* k30 = k3.row(p);
  239. float* g00 = g0.row(p);
  240. for (int k = 0; k < maxk; k++)
  241. {
  242. g00[0] = k00[k];
  243. g00[1] = k10[k];
  244. g00[2] = k20[k];
  245. g00[3] = k30[k];
  246. g00 += 4;
  247. }
  248. }
  249. }
  250. }
  251. }
  252. // pack4to1
  253. if (elempack == 4 && out_elempack == 1)
  254. {
  255. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  256. {
  257. conv1x1s1_sgemm_transform_kernel_pack4to1_neon(weight_data, weight_data_pack4to1, num_input, num_output);
  258. }
  259. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  260. {
  261. conv1x1s1_sgemm_transform_kernel_pack4to1_neon(weight_data, weight_data_pack4to1, num_input, num_output);
  262. }
  263. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  264. {
  265. conv3x3s1_winograd64_transform_kernel_pack4to1_neon(weight_data, weight_data_pack4to1, num_input, num_output);
  266. }
  267. else
  268. {
  269. // src = kw-kh-inch-outch
  270. // dst = 4a-kw-kh-inch/4a-outch
  271. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  272. weight_data_pack4to1.create(maxk, num_input / 4, num_output, (size_t)4 * 4, 4);
  273. for (int q = 0; q < num_output; q++)
  274. {
  275. const Mat k0 = weight_data_r2.channel(q);
  276. Mat g0 = weight_data_pack4to1.channel(q);
  277. for (int p = 0; p + 3 < num_input; p += 4)
  278. {
  279. const float* k00 = k0.row(p);
  280. const float* k01 = k0.row(p + 1);
  281. const float* k02 = k0.row(p + 2);
  282. const float* k03 = k0.row(p + 3);
  283. float* g00 = g0.row(p / 4);
  284. for (int k = 0; k < maxk; k++)
  285. {
  286. g00[0] = k00[k];
  287. g00[1] = k01[k];
  288. g00[2] = k02[k];
  289. g00[3] = k03[k];
  290. g00 += 4;
  291. }
  292. }
  293. }
  294. }
  295. }
  296. #endif // __ARM_NEON
  297. // pack1
  298. if (elempack == 1 && out_elempack == 1)
  299. {
  300. use_winograd3x3 = false;
  301. use_sgemm1x1 = false;
  302. if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  303. {
  304. // winograd is slow on small channel count
  305. if (num_input >= 16 && num_output >= 16)
  306. use_winograd3x3 = true;
  307. if (use_winograd3x3)
  308. {
  309. // conv3x3s1_winograd64_transform_kernel_neon(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  310. conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  311. }
  312. }
  313. // TODO assume more proper condition
  314. if (opt.use_sgemm_convolution && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  315. {
  316. if (num_input >= 64 && num_output >= 64)
  317. use_sgemm1x1 = true;
  318. if (use_sgemm1x1)
  319. {
  320. conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
  321. }
  322. }
  323. if (impl_type > 0 && impl_type < 6 && impl_type != 4)
  324. {
  325. switch (impl_type)
  326. {
  327. case 1:
  328. // winograd
  329. conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  330. break;
  331. case 2:
  332. // pointwise
  333. conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
  334. break;
  335. case 3:
  336. // im2col
  337. conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, maxk);
  338. break;
  339. // case 4:
  340. // // direct
  341. // break;
  342. case 5:
  343. // conv3x3s2
  344. conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
  345. break;
  346. }
  347. }
  348. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  349. {
  350. conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
  351. }
  352. if (opt.use_sgemm_convolution && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  353. {
  354. conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, maxk);
  355. }
  356. if (opt.use_sgemm_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  357. {
  358. conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, maxk);
  359. }
  360. }
  361. return 0;
  362. }
  363. int Convolution_arm::destroy_pipeline(const Option& opt)
  364. {
  365. if (activation)
  366. {
  367. activation->destroy_pipeline(opt);
  368. delete activation;
  369. activation = 0;
  370. }
  371. if (convolution_dilation1)
  372. {
  373. convolution_dilation1->destroy_pipeline(opt);
  374. delete convolution_dilation1;
  375. convolution_dilation1 = 0;
  376. }
  377. return 0;
  378. }
  379. int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  380. {
  381. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  382. {
  383. return forward_int8_arm(bottom_blob, top_blob, opt);
  384. }
  385. if (bottom_blob.dims != 3)
  386. {
  387. return Convolution::forward(bottom_blob, top_blob, opt);
  388. }
  389. if (opt.use_bf16_storage)
  390. return forward_bf16s(bottom_blob, top_blob, opt);
  391. int w = bottom_blob.w;
  392. int h = bottom_blob.h;
  393. int channels = bottom_blob.c;
  394. size_t elemsize = bottom_blob.elemsize;
  395. int elempack = bottom_blob.elempack;
  396. // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  397. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  398. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  399. Mat bottom_blob_bordered;
  400. make_padding(bottom_blob, bottom_blob_bordered, opt);
  401. if (bottom_blob_bordered.empty())
  402. return -100;
  403. w = bottom_blob_bordered.w;
  404. h = bottom_blob_bordered.h;
  405. int outw = (w - kernel_extent_w) / stride_w + 1;
  406. int outh = (h - kernel_extent_h) / stride_h + 1;
  407. int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
  408. size_t out_elemsize = elemsize / elempack * out_elempack;
  409. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  410. if (top_blob.empty())
  411. return -100;
  412. if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  413. {
  414. if (outw >= dilation_w && outh >= dilation_h)
  415. {
  416. return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
  417. }
  418. }
  419. const int maxk = kernel_w * kernel_h;
  420. // kernel offsets
  421. std::vector<int> _space_ofs(maxk);
  422. int* space_ofs = &_space_ofs[0];
  423. {
  424. int p1 = 0;
  425. int p2 = 0;
  426. int gap = w * dilation_h - kernel_w * dilation_w;
  427. for (int i = 0; i < kernel_h; i++)
  428. {
  429. for (int j = 0; j < kernel_w; j++)
  430. {
  431. space_ofs[p1] = p2;
  432. p1++;
  433. p2 += dilation_w;
  434. }
  435. p2 += gap;
  436. }
  437. }
  438. #if __ARM_NEON
  439. if (elempack == 4 && out_elempack == 4)
  440. {
  441. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  442. {
  443. conv1x1s1_sgemm_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  444. if (activation)
  445. {
  446. activation->forward_inplace(top_blob, opt);
  447. }
  448. }
  449. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  450. {
  451. conv1x1s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  452. if (activation)
  453. {
  454. activation->forward_inplace(top_blob, opt);
  455. }
  456. }
  457. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  458. {
  459. conv3x3s1_winograd64_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  460. if (activation)
  461. {
  462. activation->forward_inplace(top_blob, opt);
  463. }
  464. }
  465. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  466. {
  467. conv3x3s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  468. if (activation)
  469. {
  470. activation->forward_inplace(top_blob, opt);
  471. }
  472. }
  473. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  474. {
  475. conv5x5s1_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  476. if (activation)
  477. {
  478. activation->forward_inplace(top_blob, opt);
  479. }
  480. }
  481. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  482. {
  483. conv5x5s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  484. if (activation)
  485. {
  486. activation->forward_inplace(top_blob, opt);
  487. }
  488. }
  489. else
  490. {
  491. // num_output
  492. #pragma omp parallel for num_threads(opt.num_threads)
  493. for (int p = 0; p < num_output / out_elempack; p++)
  494. {
  495. float* outptr = top_blob.channel(p);
  496. for (int i = 0; i < outh; i++)
  497. {
  498. for (int j = 0; j < outw; j++)
  499. {
  500. float32x4_t _sum = vdupq_n_f32(0.f);
  501. if (bias_term)
  502. {
  503. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  504. }
  505. const float* kptr = (const float*)weight_data_pack4 + maxk * channels * p * 16;
  506. // channels
  507. for (int q = 0; q < channels; q++)
  508. {
  509. const Mat m = bottom_blob_bordered.channel(q);
  510. const float* sptr = m.row(i * stride_h) + j * stride_w * 4;
  511. for (int k = 0; k < maxk; k++) // 29.23
  512. {
  513. float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);
  514. float32x4_t _w0 = vld1q_f32(kptr);
  515. float32x4_t _w1 = vld1q_f32(kptr + 4);
  516. float32x4_t _w2 = vld1q_f32(kptr + 8);
  517. float32x4_t _w3 = vld1q_f32(kptr + 12);
  518. #if __aarch64__
  519. _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
  520. _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
  521. _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
  522. _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
  523. #else
  524. _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
  525. _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
  526. _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
  527. _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
  528. #endif
  529. kptr += 16;
  530. }
  531. }
  532. _sum = activation_ps(_sum, activation_type, activation_params);
  533. vst1q_f32(outptr + j * 4, _sum);
  534. }
  535. outptr += outw * 4;
  536. }
  537. }
  538. }
  539. }
  540. if (elempack == 1 && out_elempack == 4)
  541. {
  542. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  543. {
  544. conv3x3s1_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);
  545. if (activation)
  546. {
  547. activation->forward_inplace(top_blob, opt);
  548. }
  549. }
  550. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  551. {
  552. conv3x3s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);
  553. if (activation)
  554. {
  555. activation->forward_inplace(top_blob, opt);
  556. }
  557. }
  558. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  559. {
  560. conv7x7s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);
  561. if (activation)
  562. {
  563. activation->forward_inplace(top_blob, opt);
  564. }
  565. }
  566. else
  567. {
  568. // num_output
  569. #pragma omp parallel for num_threads(opt.num_threads)
  570. for (int p = 0; p < num_output / out_elempack; p++)
  571. {
  572. float* outptr = top_blob.channel(p);
  573. for (int i = 0; i < outh; i++)
  574. {
  575. for (int j = 0; j < outw; j++)
  576. {
  577. float32x4_t _sum = vdupq_n_f32(0.f);
  578. if (bias_term)
  579. {
  580. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  581. }
  582. const float* kptr = (const float*)weight_data_pack1to4 + maxk * channels * p * 4;
  583. // channels
  584. for (int q = 0; q < channels; q++)
  585. {
  586. const Mat m = bottom_blob_bordered.channel(q);
  587. const float* sptr = m.row(i * stride_h) + j * stride_w;
  588. for (int k = 0; k < maxk; k++) // 29.23
  589. {
  590. float32x4_t _val = vdupq_n_f32(sptr[space_ofs[k]]);
  591. float32x4_t _w = vld1q_f32(kptr);
  592. _sum = vmlaq_f32(_sum, _val, _w);
  593. kptr += 4;
  594. }
  595. }
  596. _sum = activation_ps(_sum, activation_type, activation_params);
  597. vst1q_f32(outptr + j * 4, _sum);
  598. }
  599. outptr += outw * 4;
  600. }
  601. }
  602. }
  603. }
  604. if (elempack == 4 && out_elempack == 1)
  605. {
  606. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  607. {
  608. conv1x1s1_sgemm_pack4to1_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
  609. if (activation)
  610. {
  611. activation->forward_inplace(top_blob, opt);
  612. }
  613. }
  614. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  615. {
  616. conv1x1s2_pack4to1_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
  617. if (activation)
  618. {
  619. activation->forward_inplace(top_blob, opt);
  620. }
  621. }
  622. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  623. {
  624. // TODO more proper condition
  625. conv3x3s1_winograd64_pack4to1_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
  626. // conv3x3s1_pack4to1_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
  627. if (activation)
  628. {
  629. activation->forward_inplace(top_blob, opt);
  630. }
  631. }
  632. else
  633. {
  634. // num_output
  635. #pragma omp parallel for num_threads(opt.num_threads)
  636. for (int p = 0; p < num_output; p++)
  637. {
  638. float* outptr = top_blob.channel(p);
  639. for (int i = 0; i < outh; i++)
  640. {
  641. for (int j = 0; j < outw; j++)
  642. {
  643. float sum = 0.f;
  644. if (bias_term)
  645. {
  646. sum = bias_data[p];
  647. }
  648. const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4;
  649. // channels
  650. for (int q = 0; q < channels; q++)
  651. {
  652. const Mat m = bottom_blob_bordered.channel(q);
  653. const float* sptr = m.row(i * stride_h) + j * stride_w * 4;
  654. for (int k = 0; k < maxk; k++) // 29.23
  655. {
  656. float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);
  657. float32x4_t _w = vld1q_f32(kptr);
  658. float32x4_t _s4 = vmulq_f32(_val, _w);
  659. #if __aarch64__
  660. sum += vaddvq_f32(_s4); // dot
  661. #else
  662. float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
  663. _ss = vpadd_f32(_ss, _ss);
  664. sum += vget_lane_f32(_ss, 0);
  665. #endif
  666. kptr += 4;
  667. }
  668. }
  669. sum = activation_ss(sum, activation_type, activation_params);
  670. outptr[j] = sum;
  671. }
  672. outptr += outw;
  673. }
  674. }
  675. }
  676. }
  677. #endif // __ARM_NEON
  678. if (elempack == 1 && out_elempack == 1)
  679. {
  680. if (impl_type > 0 && impl_type < 6 && impl_type != 4)
  681. {
  682. // engineering is magic.
  683. switch (impl_type)
  684. {
  685. case 1:
  686. conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  687. break;
  688. case 2:
  689. conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
  690. break;
  691. case 3:
  692. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  693. break;
  694. // case 4: FIXME fallback to auto path
  695. // conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  696. // break;
  697. case 5:
  698. conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
  699. break;
  700. }
  701. if (activation)
  702. {
  703. activation->forward_inplace(top_blob, opt);
  704. }
  705. }
  706. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  707. {
  708. if (use_sgemm1x1)
  709. {
  710. conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
  711. }
  712. else
  713. {
  714. conv1x1s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  715. }
  716. if (activation)
  717. {
  718. activation->forward_inplace(top_blob, opt);
  719. }
  720. }
  721. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  722. {
  723. if (opt.use_sgemm_convolution)
  724. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  725. else
  726. conv1x1s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  727. if (activation)
  728. {
  729. activation->forward_inplace(top_blob, opt);
  730. }
  731. }
  732. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  733. {
  734. if (use_winograd3x3 && w <= 120 && h <= 120)
  735. {
  736. // conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  737. conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  738. }
  739. else
  740. {
  741. conv3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  742. }
  743. if (activation)
  744. {
  745. activation->forward_inplace(top_blob, opt);
  746. }
  747. }
  748. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  749. {
  750. if (opt.use_sgemm_convolution && !(outw >= 8 && outh >= 8))
  751. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  752. else
  753. conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
  754. if (activation)
  755. {
  756. activation->forward_inplace(top_blob, opt);
  757. }
  758. }
  759. else if (kernel_w == 4 && kernel_h == 4 && dilation_w == 1 && dilation_h == 1 && stride_w == 4 && stride_h == 4)
  760. {
  761. conv4x4s4_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  762. if (activation)
  763. {
  764. activation->forward_inplace(top_blob, opt);
  765. }
  766. }
  767. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  768. {
  769. conv5x5s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  770. if (activation)
  771. {
  772. activation->forward_inplace(top_blob, opt);
  773. }
  774. }
  775. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  776. {
  777. conv5x5s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  778. if (activation)
  779. {
  780. activation->forward_inplace(top_blob, opt);
  781. }
  782. }
  783. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  784. {
  785. conv7x7s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  786. if (activation)
  787. {
  788. activation->forward_inplace(top_blob, opt);
  789. }
  790. }
  791. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  792. {
  793. conv7x7s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  794. if (activation)
  795. {
  796. activation->forward_inplace(top_blob, opt);
  797. }
  798. }
  799. else
  800. {
  801. // num_output
  802. #pragma omp parallel for num_threads(opt.num_threads)
  803. for (int p = 0; p < num_output; p++)
  804. {
  805. float* outptr = top_blob.channel(p);
  806. for (int i = 0; i < outh; i++)
  807. {
  808. for (int j = 0; j < outw; j++)
  809. {
  810. float sum = 0.f;
  811. if (bias_term)
  812. {
  813. sum = bias_data[p];
  814. }
  815. const float* kptr = (const float*)weight_data + maxk * channels * p;
  816. // channels
  817. for (int q = 0; q < channels; q++)
  818. {
  819. const Mat m = bottom_blob_bordered.channel(q);
  820. const float* sptr = m.row(i * stride_h) + j * stride_w;
  821. for (int k = 0; k < maxk; k++)
  822. {
  823. float val = sptr[space_ofs[k]];
  824. float w = kptr[k];
  825. sum += val * w;
  826. }
  827. kptr += maxk;
  828. }
  829. if (activation_type == 1)
  830. {
  831. sum = std::max(sum, 0.f);
  832. }
  833. else if (activation_type == 2)
  834. {
  835. float slope = activation_params[0];
  836. sum = sum > 0.f ? sum : sum * slope;
  837. }
  838. else if (activation_type == 3)
  839. {
  840. float min = activation_params[0];
  841. float max = activation_params[1];
  842. if (sum < min)
  843. sum = min;
  844. if (sum > max)
  845. sum = max;
  846. }
  847. else if (activation_type == 4)
  848. {
  849. sum = static_cast<float>(1.f / (1.f + exp(-sum)));
  850. }
  851. else if (activation_type == 5)
  852. {
  853. sum = static_cast<float>(sum * tanh(log(exp(sum) + 1.f)));
  854. }
  855. outptr[j] = sum;
  856. }
  857. outptr += outw;
  858. }
  859. }
  860. }
  861. }
  862. return 0;
  863. }
  864. int Convolution_arm::create_pipeline_bf16s(const Option& opt)
  865. {
  866. const int maxk = kernel_w * kernel_h;
  867. const int num_input = weight_data_size / maxk / num_output;
  868. int elempack = (support_packing && opt.use_packing_layout && num_input % 4 == 0) ? 4 : 1;
  869. int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
  870. #if __ARM_NEON
  871. // pack4
  872. if (elempack == 4 && out_elempack == 4)
  873. {
  874. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  875. {
  876. conv1x1s1_sgemm_transform_kernel_pack4_bf16s_neon(weight_data, weight_data_pack4_bf16, num_input, num_output);
  877. }
  878. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  879. {
  880. conv1x1s1_sgemm_transform_kernel_pack4_bf16s_neon(weight_data, weight_data_pack4_bf16, num_input, num_output);
  881. }
  882. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  883. {
  884. conv3x3s1_winograd64_transform_kernel_pack4_neon(weight_data, weight_data_pack4_bf16, num_input, num_output);
  885. }
  886. else
  887. {
  888. // src = kw-kh-inch-outch
  889. // dst = 4b-4a-kw-kh-inch/4a-outch/4b
  890. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  891. weight_data_pack4_bf16.create(maxk, num_input / 4, num_output / 4, (size_t)2 * 16, 16);
  892. for (int q = 0; q + 3 < num_output; q += 4)
  893. {
  894. const Mat k0 = weight_data_r2.channel(q);
  895. const Mat k1 = weight_data_r2.channel(q + 1);
  896. const Mat k2 = weight_data_r2.channel(q + 2);
  897. const Mat k3 = weight_data_r2.channel(q + 3);
  898. Mat g0 = weight_data_pack4_bf16.channel(q / 4);
  899. for (int p = 0; p + 3 < num_input; p += 4)
  900. {
  901. const float* k00 = k0.row(p);
  902. const float* k01 = k0.row(p + 1);
  903. const float* k02 = k0.row(p + 2);
  904. const float* k03 = k0.row(p + 3);
  905. const float* k10 = k1.row(p);
  906. const float* k11 = k1.row(p + 1);
  907. const float* k12 = k1.row(p + 2);
  908. const float* k13 = k1.row(p + 3);
  909. const float* k20 = k2.row(p);
  910. const float* k21 = k2.row(p + 1);
  911. const float* k22 = k2.row(p + 2);
  912. const float* k23 = k2.row(p + 3);
  913. const float* k30 = k3.row(p);
  914. const float* k31 = k3.row(p + 1);
  915. const float* k32 = k3.row(p + 2);
  916. const float* k33 = k3.row(p + 3);
  917. unsigned short* g00 = g0.row<unsigned short>(p / 4);
  918. for (int k = 0; k < maxk; k++)
  919. {
  920. g00[0] = float32_to_bfloat16(k00[k]);
  921. g00[1] = float32_to_bfloat16(k10[k]);
  922. g00[2] = float32_to_bfloat16(k20[k]);
  923. g00[3] = float32_to_bfloat16(k30[k]);
  924. g00[4] = float32_to_bfloat16(k01[k]);
  925. g00[5] = float32_to_bfloat16(k11[k]);
  926. g00[6] = float32_to_bfloat16(k21[k]);
  927. g00[7] = float32_to_bfloat16(k31[k]);
  928. g00[8] = float32_to_bfloat16(k02[k]);
  929. g00[9] = float32_to_bfloat16(k12[k]);
  930. g00[10] = float32_to_bfloat16(k22[k]);
  931. g00[11] = float32_to_bfloat16(k32[k]);
  932. g00[12] = float32_to_bfloat16(k03[k]);
  933. g00[13] = float32_to_bfloat16(k13[k]);
  934. g00[14] = float32_to_bfloat16(k23[k]);
  935. g00[15] = float32_to_bfloat16(k33[k]);
  936. g00 += 16;
  937. }
  938. }
  939. }
  940. }
  941. }
  942. // pack1to4
  943. if (elempack == 1 && out_elempack == 4)
  944. {
  945. // src = kw-kh-inch-outch
  946. // dst = 4b-kw-kh-inch-outch/4b
  947. {
  948. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  949. weight_data_pack1to4_bf16.create(maxk, num_input, num_output / 4, (size_t)2 * 4, 4);
  950. for (int q = 0; q + 3 < num_output; q += 4)
  951. {
  952. const Mat k0 = weight_data_r2.channel(q);
  953. const Mat k1 = weight_data_r2.channel(q + 1);
  954. const Mat k2 = weight_data_r2.channel(q + 2);
  955. const Mat k3 = weight_data_r2.channel(q + 3);
  956. Mat g0 = weight_data_pack1to4_bf16.channel(q / 4);
  957. for (int p = 0; p < num_input; p++)
  958. {
  959. const float* k00 = k0.row(p);
  960. const float* k10 = k1.row(p);
  961. const float* k20 = k2.row(p);
  962. const float* k30 = k3.row(p);
  963. unsigned short* g00 = g0.row<unsigned short>(p);
  964. for (int k = 0; k < maxk; k++)
  965. {
  966. g00[0] = float32_to_bfloat16(k00[k]);
  967. g00[1] = float32_to_bfloat16(k10[k]);
  968. g00[2] = float32_to_bfloat16(k20[k]);
  969. g00[3] = float32_to_bfloat16(k30[k]);
  970. g00 += 4;
  971. }
  972. }
  973. }
  974. }
  975. }
  976. // pack4to1
  977. if (elempack == 4 && out_elempack == 1)
  978. {
  979. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  980. {
  981. conv1x1s1_sgemm_transform_kernel_pack4to1_bf16s_neon(weight_data, weight_data_pack4to1_bf16, num_input, num_output);
  982. }
  983. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  984. {
  985. conv1x1s1_sgemm_transform_kernel_pack4to1_bf16s_neon(weight_data, weight_data_pack4to1_bf16, num_input, num_output);
  986. }
  987. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  988. {
  989. conv3x3s1_winograd64_transform_kernel_pack4to1_neon(weight_data, weight_data_pack4to1_bf16, num_input, num_output);
  990. }
  991. else
  992. {
  993. // src = kw-kh-inch-outch
  994. // dst = 4a-kw-kh-inch/4a-outch
  995. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  996. weight_data_pack4to1_bf16.create(maxk, num_input / 4, num_output, (size_t)2 * 4, 4);
  997. for (int q = 0; q < num_output; q++)
  998. {
  999. const Mat k0 = weight_data_r2.channel(q);
  1000. Mat g0 = weight_data_pack4to1_bf16.channel(q);
  1001. for (int p = 0; p + 3 < num_input; p += 4)
  1002. {
  1003. const float* k00 = k0.row(p);
  1004. const float* k01 = k0.row(p + 1);
  1005. const float* k02 = k0.row(p + 2);
  1006. const float* k03 = k0.row(p + 3);
  1007. unsigned short* g00 = g0.row<unsigned short>(p / 4);
  1008. for (int k = 0; k < maxk; k++)
  1009. {
  1010. g00[0] = float32_to_bfloat16(k00[k]);
  1011. g00[1] = float32_to_bfloat16(k01[k]);
  1012. g00[2] = float32_to_bfloat16(k02[k]);
  1013. g00[3] = float32_to_bfloat16(k03[k]);
  1014. g00 += 4;
  1015. }
  1016. }
  1017. }
  1018. }
  1019. }
  1020. #endif // __ARM_NEON
  1021. // pack1
  1022. if (elempack == 1 && out_elempack == 1)
  1023. {
  1024. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1025. {
  1026. conv1x1s1_sgemm_transform_kernel_bf16s_neon(weight_data, weight_data_bf16, num_input, num_output);
  1027. }
  1028. else
  1029. {
  1030. ncnn::cast_float32_to_bfloat16(weight_data, weight_data_bf16, opt);
  1031. }
  1032. }
  1033. return 0;
  1034. }
  1035. int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1036. {
  1037. int w = bottom_blob.w;
  1038. int h = bottom_blob.h;
  1039. int channels = bottom_blob.c;
  1040. size_t elemsize = bottom_blob.elemsize;
  1041. int elempack = bottom_blob.elempack;
  1042. // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  1043. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  1044. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  1045. Mat bottom_blob_bordered;
  1046. make_padding(bottom_blob, bottom_blob_bordered, opt);
  1047. if (bottom_blob_bordered.empty())
  1048. return -100;
  1049. w = bottom_blob_bordered.w;
  1050. h = bottom_blob_bordered.h;
  1051. int outw = (w - kernel_extent_w) / stride_w + 1;
  1052. int outh = (h - kernel_extent_h) / stride_h + 1;
  1053. int out_elempack = (support_packing && opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
  1054. size_t out_elemsize = elemsize / elempack * out_elempack;
  1055. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  1056. if (top_blob.empty())
  1057. return -100;
  1058. // FIXME
  1059. // if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  1060. // {
  1061. // return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
  1062. // }
  1063. const int maxk = kernel_w * kernel_h;
  1064. // kernel offsets
  1065. std::vector<int> _space_ofs(maxk);
  1066. int* space_ofs = &_space_ofs[0];
  1067. {
  1068. int p1 = 0;
  1069. int p2 = 0;
  1070. int gap = w * dilation_h - kernel_w * dilation_w;
  1071. for (int i = 0; i < kernel_h; i++)
  1072. {
  1073. for (int j = 0; j < kernel_w; j++)
  1074. {
  1075. space_ofs[p1] = p2;
  1076. p1++;
  1077. p2 += dilation_w;
  1078. }
  1079. p2 += gap;
  1080. }
  1081. }
  1082. #if __ARM_NEON
  1083. if (elempack == 4 && out_elempack == 4)
  1084. {
  1085. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1086. {
  1087. conv1x1s1_sgemm_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1088. if (activation)
  1089. {
  1090. activation->forward_inplace(top_blob, opt);
  1091. }
  1092. }
  1093. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1094. {
  1095. conv1x1s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1096. if (activation)
  1097. {
  1098. activation->forward_inplace(top_blob, opt);
  1099. }
  1100. }
  1101. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1102. {
  1103. conv3x3s1_winograd64_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1104. if (activation)
  1105. {
  1106. activation->forward_inplace(top_blob, opt);
  1107. }
  1108. }
  1109. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1110. {
  1111. conv3x3s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1112. if (activation)
  1113. {
  1114. activation->forward_inplace(top_blob, opt);
  1115. }
  1116. }
  1117. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1118. {
  1119. conv5x5s1_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1120. if (activation)
  1121. {
  1122. activation->forward_inplace(top_blob, opt);
  1123. }
  1124. }
  1125. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1126. {
  1127. conv5x5s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1128. if (activation)
  1129. {
  1130. activation->forward_inplace(top_blob, opt);
  1131. }
  1132. }
  1133. else
  1134. {
  1135. // num_output
  1136. #pragma omp parallel for num_threads(opt.num_threads)
  1137. for (int p = 0; p < num_output / out_elempack; p++)
  1138. {
  1139. unsigned short* outptr = top_blob.channel(p);
  1140. for (int i = 0; i < outh; i++)
  1141. {
  1142. for (int j = 0; j < outw; j++)
  1143. {
  1144. float32x4_t _sum = vdupq_n_f32(0.f);
  1145. if (bias_term)
  1146. {
  1147. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  1148. }
  1149. const unsigned short* kptr = weight_data_pack4_bf16.channel(p);
  1150. // channels
  1151. for (int q = 0; q < channels; q++)
  1152. {
  1153. const Mat m = bottom_blob_bordered.channel(q);
  1154. const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;
  1155. for (int k = 0; k < maxk; k++)
  1156. {
  1157. float32x4_t _val = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(sptr + space_ofs[k] * 4), 16));
  1158. float32x4_t _w0 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(kptr), 16));
  1159. float32x4_t _w1 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(kptr + 4), 16));
  1160. float32x4_t _w2 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(kptr + 8), 16));
  1161. float32x4_t _w3 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(kptr + 12), 16));
  1162. #if __aarch64__
  1163. _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
  1164. _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
  1165. _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
  1166. _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
  1167. #else
  1168. _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
  1169. _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
  1170. _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
  1171. _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
  1172. #endif
  1173. kptr += 16;
  1174. }
  1175. }
  1176. _sum = activation_ps(_sum, activation_type, activation_params);
  1177. vst1_u16(outptr + j * 4, vshrn_n_u32(vreinterpretq_u32_f32(_sum), 16));
  1178. }
  1179. outptr += outw * 4;
  1180. }
  1181. }
  1182. }
  1183. }
  1184. if (elempack == 1 && out_elempack == 4)
  1185. {
  1186. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1187. {
  1188. conv3x3s1_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4_bf16, bias_data, opt);
  1189. if (activation)
  1190. {
  1191. activation->forward_inplace(top_blob, opt);
  1192. }
  1193. }
  1194. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1195. {
  1196. conv3x3s2_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4_bf16, bias_data, opt);
  1197. if (activation)
  1198. {
  1199. activation->forward_inplace(top_blob, opt);
  1200. }
  1201. }
  1202. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1203. {
  1204. conv7x7s2_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4_bf16, bias_data, opt);
  1205. if (activation)
  1206. {
  1207. activation->forward_inplace(top_blob, opt);
  1208. }
  1209. }
  1210. else
  1211. {
  1212. // num_output
  1213. #pragma omp parallel for num_threads(opt.num_threads)
  1214. for (int p = 0; p < num_output / out_elempack; p++)
  1215. {
  1216. unsigned short* outptr = top_blob.channel(p);
  1217. for (int i = 0; i < outh; i++)
  1218. {
  1219. for (int j = 0; j < outw; j++)
  1220. {
  1221. float32x4_t _sum = vdupq_n_f32(0.f);
  1222. if (bias_term)
  1223. {
  1224. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  1225. }
  1226. const unsigned short* kptr = weight_data_pack1to4_bf16.channel(p);
  1227. // channels
  1228. for (int q = 0; q < channels; q++)
  1229. {
  1230. const Mat m = bottom_blob_bordered.channel(q);
  1231. const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w;
  1232. for (int k = 0; k < maxk; k++)
  1233. {
  1234. float32x4_t _val = vdupq_n_f32(bfloat16_to_float32(sptr[space_ofs[k]]));
  1235. float32x4_t _w = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(kptr), 16));
  1236. _sum = vmlaq_f32(_sum, _val, _w);
  1237. kptr += 4;
  1238. }
  1239. }
  1240. _sum = activation_ps(_sum, activation_type, activation_params);
  1241. vst1_u16(outptr + j * 4, vshrn_n_u32(vreinterpretq_u32_f32(_sum), 16));
  1242. }
  1243. outptr += outw * 4;
  1244. }
  1245. }
  1246. }
  1247. }
  1248. if (elempack == 4 && out_elempack == 1)
  1249. {
  1250. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1251. {
  1252. conv1x1s1_sgemm_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);
  1253. if (activation)
  1254. {
  1255. activation->forward_inplace(top_blob, opt);
  1256. }
  1257. }
  1258. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1259. {
  1260. conv1x1s2_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);
  1261. if (activation)
  1262. {
  1263. activation->forward_inplace(top_blob, opt);
  1264. }
  1265. }
  1266. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1267. {
  1268. // TODO more proper condition
  1269. conv3x3s1_winograd64_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);
  1270. // conv3x3s1_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);
  1271. if (activation)
  1272. {
  1273. activation->forward_inplace(top_blob, opt);
  1274. }
  1275. }
  1276. else
  1277. {
  1278. // num_output
  1279. #pragma omp parallel for num_threads(opt.num_threads)
  1280. for (int p = 0; p < num_output; p++)
  1281. {
  1282. unsigned short* outptr = top_blob.channel(p);
  1283. for (int i = 0; i < outh; i++)
  1284. {
  1285. for (int j = 0; j < outw; j++)
  1286. {
  1287. float sum = 0.f;
  1288. if (bias_term)
  1289. {
  1290. sum = bias_data[p];
  1291. }
  1292. const unsigned short* kptr = weight_data_pack4to1_bf16.channel(p);
  1293. // channels
  1294. for (int q = 0; q < channels; q++)
  1295. {
  1296. const Mat m = bottom_blob_bordered.channel(q);
  1297. const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;
  1298. for (int k = 0; k < maxk; k++)
  1299. {
  1300. float32x4_t _val = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(sptr + space_ofs[k] * 4), 16));
  1301. float32x4_t _w = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(kptr), 16));
  1302. float32x4_t _s4 = vmulq_f32(_val, _w);
  1303. #if __aarch64__
  1304. sum += vaddvq_f32(_s4); // dot
  1305. #else
  1306. float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
  1307. _ss = vpadd_f32(_ss, _ss);
  1308. sum += vget_lane_f32(_ss, 0);
  1309. #endif
  1310. kptr += 4;
  1311. }
  1312. }
  1313. sum = activation_ss(sum, activation_type, activation_params);
  1314. outptr[j] = float32_to_bfloat16(sum);
  1315. }
  1316. outptr += outw;
  1317. }
  1318. }
  1319. }
  1320. }
  1321. #endif // __ARM_NEON
  1322. if (elempack == 1 && out_elempack == 1)
  1323. {
  1324. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1325. {
  1326. conv1x1s1_sgemm_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_bf16, bias_data, opt);
  1327. if (activation)
  1328. {
  1329. activation->forward_inplace(top_blob, opt);
  1330. }
  1331. }
  1332. else
  1333. {
  1334. // num_output
  1335. #pragma omp parallel for num_threads(opt.num_threads)
  1336. for (int p = 0; p < num_output; p++)
  1337. {
  1338. unsigned short* outptr = top_blob.channel(p);
  1339. for (int i = 0; i < outh; i++)
  1340. {
  1341. for (int j = 0; j < outw; j++)
  1342. {
  1343. float sum = 0.f;
  1344. if (bias_term)
  1345. {
  1346. sum = bias_data[p];
  1347. }
  1348. const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * channels * p;
  1349. // channels
  1350. for (int q = 0; q < channels; q++)
  1351. {
  1352. const Mat m = bottom_blob_bordered.channel(q);
  1353. const unsigned short* sptr = m.row<unsigned short>(i * stride_h) + j * stride_w;
  1354. for (int k = 0; k < maxk; k++)
  1355. {
  1356. float val = bfloat16_to_float32(sptr[space_ofs[k]]);
  1357. float w = bfloat16_to_float32(kptr[k]);
  1358. sum += val * w;
  1359. }
  1360. kptr += maxk;
  1361. }
  1362. if (activation_type == 1)
  1363. {
  1364. sum = std::max(sum, 0.f);
  1365. }
  1366. else if (activation_type == 2)
  1367. {
  1368. float slope = activation_params[0];
  1369. sum = sum > 0.f ? sum : sum * slope;
  1370. }
  1371. else if (activation_type == 3)
  1372. {
  1373. float min = activation_params[0];
  1374. float max = activation_params[1];
  1375. if (sum < min)
  1376. sum = min;
  1377. if (sum > max)
  1378. sum = max;
  1379. }
  1380. else if (activation_type == 4)
  1381. {
  1382. sum = static_cast<float>(1.f / (1.f + exp(-sum)));
  1383. }
  1384. else if (activation_type == 5)
  1385. {
  1386. sum = static_cast<float>(sum * tanh(log(exp(sum) + 1.f)));
  1387. }
  1388. outptr[j] = float32_to_bfloat16(sum);
  1389. }
  1390. outptr += outw;
  1391. }
  1392. }
  1393. }
  1394. }
  1395. return 0;
  1396. }
  1397. int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
  1398. {
  1399. const int maxk = kernel_w * kernel_h;
  1400. const int num_input = weight_data_size / maxk / num_output;
  1401. use_winograd3x3_int8 = false;
  1402. use_sgemm1x1_int8 = false;
  1403. if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1404. {
  1405. use_winograd3x3_int8 = true;
  1406. // conv3x3s1_winograd23_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_data_int8, num_input, num_output);
  1407. conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_data_int8, num_input, num_output);
  1408. }
  1409. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1410. {
  1411. conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_data_int8, num_input, num_output);
  1412. }
  1413. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1414. {
  1415. use_sgemm1x1_int8 = true;
  1416. conv1x1s1_sgemm_transform_kernel_int8_neon(weight_data, weight_1x1s1_sgemm_data_int8, num_input, num_output);
  1417. }
  1418. else
  1419. {
  1420. conv_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data_int8, num_input, num_output, maxk);
  1421. }
  1422. return 0;
  1423. }
  1424. int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1425. {
  1426. if (dilation_w > 1 || dilation_h > 1)
  1427. {
  1428. return Convolution::forward(bottom_blob, top_blob, opt);
  1429. }
  1430. int w = bottom_blob.w;
  1431. int h = bottom_blob.h;
  1432. // int channels = bottom_blob.c;
  1433. size_t elemsize = bottom_blob.elemsize;
  1434. // NCNN_LOGE("Convolution_arm input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);
  1435. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  1436. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  1437. Mat bottom_blob_unbordered = bottom_blob;
  1438. if (elemsize != 1)
  1439. {
  1440. Option opt_g = opt;
  1441. opt_g.blob_allocator = opt.workspace_allocator;
  1442. quantize_float32_to_int8(bottom_blob, bottom_blob_unbordered, bottom_blob_int8_scale, opt_g);
  1443. }
  1444. Mat bottom_blob_bordered;
  1445. make_padding(bottom_blob_unbordered, bottom_blob_bordered, opt);
  1446. if (bottom_blob_bordered.empty())
  1447. return -100;
  1448. w = bottom_blob_bordered.w;
  1449. h = bottom_blob_bordered.h;
  1450. int outw = (w - kernel_extent_w) / stride_w + 1;
  1451. int outh = (h - kernel_extent_h) / stride_h + 1;
  1452. // int8
  1453. size_t out_elemsize = use_int8_requantize ? 1u : 4u;
  1454. top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
  1455. if (top_blob.empty())
  1456. return -100;
  1457. // int8
  1458. if (use_int8_requantize == true)
  1459. {
  1460. Mat top_blob_tm;
  1461. top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
  1462. if (top_blob_tm.empty())
  1463. return -100;
  1464. if (use_sgemm1x1_int8)
  1465. {
  1466. std::vector<float> requantize_scales;
  1467. for (int p = 0; p < num_output; p++)
  1468. {
  1469. float scale_in;
  1470. if (weight_data_int8_scales[p] == 0)
  1471. scale_in = 0;
  1472. else
  1473. scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);
  1474. float scale_out = top_blob_int8_scale;
  1475. requantize_scales.push_back(scale_in);
  1476. requantize_scales.push_back(scale_out);
  1477. }
  1478. conv1x1s1_sgemm_int8_requant_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_data_int8, bias_data, requantize_scales, opt);
  1479. if (activation)
  1480. {
  1481. activation->forward_inplace(top_blob, opt);
  1482. }
  1483. return 0;
  1484. }
  1485. else if (use_winograd3x3_int8)
  1486. {
  1487. // conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
  1488. conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
  1489. }
  1490. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1491. {
  1492. conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s2_data_int8, opt);
  1493. }
  1494. else
  1495. {
  1496. conv_im2col_sgemm_int8_neon(bottom_blob_bordered, top_blob_tm, weight_sgemm_data_int8, kernel_w, kernel_h, stride_w, stride_h, opt);
  1497. }
  1498. // requantize, reverse scale inplace
  1499. #pragma omp parallel for num_threads(opt.num_threads)
  1500. for (int p = 0; p < num_output; p++)
  1501. {
  1502. Option opt_g = opt;
  1503. opt_g.num_threads = 1;
  1504. opt_g.blob_allocator = top_blob.allocator;
  1505. Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
  1506. Mat top_blob_g = top_blob.channel_range(p, 1);
  1507. // requantize and relu
  1508. float scale_in;
  1509. if (weight_data_int8_scales[p] == 0)
  1510. scale_in = 0;
  1511. else
  1512. scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);
  1513. float scale_out = top_blob_int8_scale; //FIXME load param
  1514. requantize_int8_to_int8(top_blob_tm_g, top_blob_g, scale_in, scale_out, bias_term ? (const float*)bias_data + p : 0, bias_term ? 1 : 0, 0, opt_g);
  1515. }
  1516. }
  1517. else
  1518. {
  1519. if (use_sgemm1x1_int8)
  1520. {
  1521. conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_data_int8, opt);
  1522. }
  1523. else if (use_winograd3x3_int8)
  1524. {
  1525. // conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
  1526. conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
  1527. }
  1528. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1529. {
  1530. conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data_int8, opt);
  1531. }
  1532. else
  1533. {
  1534. conv_im2col_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_sgemm_data_int8, kernel_w, kernel_h, stride_w, stride_h, opt);
  1535. }
  1536. // dequantize, reverse scale inplace
  1537. #pragma omp parallel for num_threads(opt.num_threads)
  1538. for (int p = 0; p < num_output; p++)
  1539. {
  1540. Option opt_g = opt;
  1541. opt_g.num_threads = 1;
  1542. opt_g.blob_allocator = top_blob.allocator;
  1543. Mat top_blob_g = top_blob.channel_range(p, 1);
  1544. // dequantize
  1545. float scale_in;
  1546. if (weight_data_int8_scales[p] == 0)
  1547. scale_in = 0;
  1548. else
  1549. scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);
  1550. dequantize_int32_to_float32(top_blob_g, scale_in, bias_term ? (const float*)bias_data + p : 0, bias_term ? 1 : 0, opt_g);
  1551. }
  1552. }
  1553. if (activation)
  1554. {
  1555. activation->forward_inplace(top_blob, opt);
  1556. }
  1557. return 0;
  1558. }
  1559. int Convolution_arm::forwardDilation_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1560. {
  1561. int w = bottom_blob.w;
  1562. int h = bottom_blob.h;
  1563. size_t elemsize = bottom_blob.elemsize;
  1564. const int kernel_size = kernel_w;
  1565. const int stride = stride_w;
  1566. const int dilation = dilation_w;
  1567. const int kernel_extent = dilation * (kernel_size - 1) + 1;
  1568. int outw = (w - kernel_extent) / stride + 1;
  1569. int outh = (h - kernel_extent) / stride + 1;
  1570. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  1571. if (top_blob.empty())
  1572. return -100;
  1573. // Make (dilation * dilation) batches
  1574. Mat inner_bottom_blob;
  1575. Mat inner_top_blob;
  1576. for (int x = 0; x < dilation; x++)
  1577. {
  1578. for (int y = 0; y < dilation; y++)
  1579. {
  1580. int inner_w = (w - y + dilation - 1) / dilation;
  1581. int inner_h = (h - x + dilation - 1) / dilation;
  1582. int inner_outw = (inner_w - kernel_size) / stride + 1;
  1583. int inner_outh = (inner_h - kernel_size) / stride + 1;
  1584. inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
  1585. if (inner_bottom_blob.empty())
  1586. return -100;
  1587. inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
  1588. if (inner_top_blob.empty())
  1589. return -100;
  1590. #pragma omp parallel for num_threads(opt.num_threads)
  1591. for (int c = 0; c < bottom_blob.c; c++)
  1592. {
  1593. float* outptr = inner_bottom_blob.channel(c);
  1594. for (int i = 0; i < inner_h; i++)
  1595. {
  1596. const float* ptr = (const float*)bottom_blob.channel(c) + dilation * i * w + x * w + y;
  1597. for (int j = 0; j < inner_w; j++)
  1598. {
  1599. outptr[j] = ptr[j * dilation];
  1600. }
  1601. outptr += inner_w;
  1602. }
  1603. }
  1604. Option opt_g = opt;
  1605. opt_g.blob_allocator = inner_top_blob.allocator;
  1606. convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
  1607. #pragma omp parallel for num_threads(opt.num_threads)
  1608. for (int c = 0; c < num_output; c++)
  1609. {
  1610. float* outptr = (float*)top_blob.channel(c) + x * outw + y;
  1611. for (int i = 0; i < inner_outh; i++)
  1612. {
  1613. const float* ptr = (const float*)inner_top_blob.channel(c) + i * inner_outw;
  1614. for (int j = 0; j < inner_outw; j++)
  1615. {
  1616. outptr[j * dilation] = ptr[j];
  1617. }
  1618. outptr += dilation * outw;
  1619. }
  1620. }
  1621. }
  1622. }
  1623. if (activation)
  1624. {
  1625. activation->forward_inplace(top_blob, opt);
  1626. }
  1627. return 0;
  1628. }
  1629. } // namespace ncnn