You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

convolution_arm.cpp 70 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolution_arm.h"
  15. #include "benchmark.h"
  16. #include "cpu.h"
  17. #include "layer_type.h"
  18. #if __ARM_NEON
  19. #include <arm_neon.h>
  20. #include "neon_mathfun.h"
  21. #include "neon_activation.h"
  22. #endif // __ARM_NEON
  23. namespace ncnn {
  24. #include "convolution_1x1.h"
  25. #include "convolution_2x2.h"
  26. #include "convolution_3x3.h"
  27. #include "convolution_4x4.h"
  28. #include "convolution_5x5.h"
  29. #include "convolution_7x7.h"
  30. #include "convolution_sgemm.h"
  31. #include "convolution_sgemm_int8.h"
  32. #include "convolution_1x1_int8.h"
  33. #include "convolution_3x3_int8.h"
  34. #include "convolution_1x1_bf16s.h"
  35. #if __ARM_NEON
  36. #include "convolution_1x1_pack4.h"
  37. #include "convolution_1x1_pack4to1.h"
  38. #include "convolution_3x3_pack4.h"
  39. #include "convolution_3x3_pack1to4.h"
  40. #include "convolution_3x3_pack4to1.h"
  41. #include "convolution_5x5_pack4.h"
  42. #include "convolution_7x7_pack1to4.h"
  43. #include "convolution_1x1_pack4_bf16s.h"
  44. #include "convolution_1x1_pack4to1_bf16s.h"
  45. #include "convolution_3x3_pack4_bf16s.h"
  46. #include "convolution_3x3_pack1to4_bf16s.h"
  47. #include "convolution_3x3_pack4to1_bf16s.h"
  48. #include "convolution_5x5_pack4_bf16s.h"
  49. #include "convolution_7x7_pack1to4_bf16s.h"
  50. #endif // __ARM_NEON
  // NOTE(review): DEFINE_LAYER_CREATOR is defined outside this file; presumably it emits the
  // creator function used to instantiate Convolution_arm -- confirm against its definition.
  51. DEFINE_LAYER_CREATOR(Convolution_arm)
  52. Convolution_arm::Convolution_arm()
  53. {
  54. #if __ARM_NEON
  55. support_packing = true;
  56. #endif // __ARM_NEON
  57. support_bf16_storage = true;
  58. activation = 0;
  59. convolution_dilation1 = 0;
  60. }
  61. int Convolution_arm::create_pipeline(const Option& opt)
  62. {
  63. if (activation_type == 1)
  64. {
  65. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  66. ncnn::ParamDict pd;
  67. activation->load_param(pd);
  68. }
  69. else if (activation_type == 2)
  70. {
  71. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  72. ncnn::ParamDict pd;
  73. pd.set(0, activation_params[0]);// slope
  74. activation->load_param(pd);
  75. }
  76. else if (activation_type == 3)
  77. {
  78. activation = ncnn::create_layer(ncnn::LayerType::Clip);
  79. ncnn::ParamDict pd;
  80. pd.set(0, activation_params[0]);// min
  81. pd.set(1, activation_params[1]);// max
  82. activation->load_param(pd);
  83. }
  84. else if (activation_type == 4)
  85. {
  86. activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
  87. ncnn::ParamDict pd;
  88. activation->load_param(pd);
  89. }
  90. if (activation)
  91. {
  92. activation->create_pipeline(opt);
  93. }
  94. if (opt.use_bf16_storage)
  95. {
  96. return create_pipeline_bf16s(opt);
  97. }
  98. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  99. {
  100. support_packing = false;
  101. return create_pipeline_int8_arm(opt);
  102. }
  103. if (opt.use_packing_layout == false && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  104. {
  105. convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution);
  106. // set param
  107. ncnn::ParamDict pd;
  108. pd.set(0, num_output);// num_output
  109. pd.set(1, kernel_w);
  110. pd.set(11, kernel_h);
  111. pd.set(2, 1);
  112. pd.set(12, 1);
  113. pd.set(3, 1);// stride_w
  114. pd.set(13, 1);// stride_h
  115. pd.set(4, 0);// pad_w
  116. pd.set(14, 0);// pad_h
  117. pd.set(5, bias_term);
  118. pd.set(6, weight_data_size);
  119. convolution_dilation1->load_param(pd);
  120. // set weights
  121. if (bias_term)
  122. {
  123. ncnn::Mat weights[2];
  124. weights[0] = weight_data;
  125. weights[1] = bias_data;
  126. convolution_dilation1->load_model(ModelBinFromMatArray(weights));
  127. }
  128. else
  129. {
  130. ncnn::Mat weights[1];
  131. weights[0] = weight_data;
  132. convolution_dilation1->load_model(ModelBinFromMatArray(weights));
  133. }
  134. convolution_dilation1->create_pipeline(opt);
  135. return 0;
  136. }
  137. const int maxk = kernel_w * kernel_h;
  138. const int num_input = weight_data_size / maxk / num_output;
  139. int elempack = (opt.use_packing_layout && num_input % 4 == 0) ? 4 : 1;
  140. int out_elempack = (opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
  141. #if __ARM_NEON
  142. // pack4
  143. if (elempack == 4 && out_elempack == 4)
  144. {
  145. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  146. {
  147. conv1x1s1_sgemm_transform_kernel_pack4_neon(weight_data, weight_data_pack4, num_input, num_output);
  148. }
  149. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  150. {
  151. conv1x1s1_sgemm_transform_kernel_pack4_neon(weight_data, weight_data_pack4, num_input, num_output);
  152. }
  153. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  154. {
  155. conv3x3s1_winograd64_transform_kernel_pack4_neon(weight_data, weight_data_pack4, num_input, num_output);
  156. }
  157. else
  158. {
  159. // src = kw-kh-inch-outch
  160. // dst = 4b-4a-kw-kh-inch/4a-outch/4b
  161. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  162. weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16);
  163. for (int q=0; q+3<num_output; q+=4)
  164. {
  165. const Mat k0 = weight_data_r2.channel(q);
  166. const Mat k1 = weight_data_r2.channel(q+1);
  167. const Mat k2 = weight_data_r2.channel(q+2);
  168. const Mat k3 = weight_data_r2.channel(q+3);
  169. Mat g0 = weight_data_pack4.channel(q/4);
  170. for (int p=0; p+3<num_input; p+=4)
  171. {
  172. const float* k00 = k0.row(p);
  173. const float* k01 = k0.row(p+1);
  174. const float* k02 = k0.row(p+2);
  175. const float* k03 = k0.row(p+3);
  176. const float* k10 = k1.row(p);
  177. const float* k11 = k1.row(p+1);
  178. const float* k12 = k1.row(p+2);
  179. const float* k13 = k1.row(p+3);
  180. const float* k20 = k2.row(p);
  181. const float* k21 = k2.row(p+1);
  182. const float* k22 = k2.row(p+2);
  183. const float* k23 = k2.row(p+3);
  184. const float* k30 = k3.row(p);
  185. const float* k31 = k3.row(p+1);
  186. const float* k32 = k3.row(p+2);
  187. const float* k33 = k3.row(p+3);
  188. float* g00 = g0.row(p/4);
  189. for (int k=0; k<maxk; k++)
  190. {
  191. g00[0] = k00[k];
  192. g00[1] = k10[k];
  193. g00[2] = k20[k];
  194. g00[3] = k30[k];
  195. g00[4] = k01[k];
  196. g00[5] = k11[k];
  197. g00[6] = k21[k];
  198. g00[7] = k31[k];
  199. g00[8] = k02[k];
  200. g00[9] = k12[k];
  201. g00[10] = k22[k];
  202. g00[11] = k32[k];
  203. g00[12] = k03[k];
  204. g00[13] = k13[k];
  205. g00[14] = k23[k];
  206. g00[15] = k33[k];
  207. g00 += 16;
  208. }
  209. }
  210. }
  211. }
  212. }
  213. // pack1to4
  214. if (elempack == 1 && out_elempack == 4)
  215. {
  216. // src = kw-kh-inch-outch
  217. // dst = 4b-kw-kh-inch-outch/4b
  218. {
  219. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  220. weight_data_pack1to4.create(maxk, num_input, num_output/4, (size_t)4*4, 4);
  221. for (int q=0; q+3<num_output; q+=4)
  222. {
  223. const Mat k0 = weight_data_r2.channel(q);
  224. const Mat k1 = weight_data_r2.channel(q+1);
  225. const Mat k2 = weight_data_r2.channel(q+2);
  226. const Mat k3 = weight_data_r2.channel(q+3);
  227. Mat g0 = weight_data_pack1to4.channel(q/4);
  228. for (int p=0; p<num_input; p++)
  229. {
  230. const float* k00 = k0.row(p);
  231. const float* k10 = k1.row(p);
  232. const float* k20 = k2.row(p);
  233. const float* k30 = k3.row(p);
  234. float* g00 = g0.row(p);
  235. for (int k=0; k<maxk; k++)
  236. {
  237. g00[0] = k00[k];
  238. g00[1] = k10[k];
  239. g00[2] = k20[k];
  240. g00[3] = k30[k];
  241. g00 += 4;
  242. }
  243. }
  244. }
  245. }
  246. }
  247. // pack4to1
  248. if (elempack == 4 && out_elempack == 1)
  249. {
  250. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  251. {
  252. conv1x1s1_sgemm_transform_kernel_pack4to1_neon(weight_data, weight_data_pack4to1, num_input, num_output);
  253. }
  254. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  255. {
  256. conv1x1s1_sgemm_transform_kernel_pack4to1_neon(weight_data, weight_data_pack4to1, num_input, num_output);
  257. }
  258. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  259. {
  260. conv3x3s1_winograd64_transform_kernel_pack4to1_neon(weight_data, weight_data_pack4to1, num_input, num_output);
  261. }
  262. else
  263. {
  264. // src = kw-kh-inch-outch
  265. // dst = 4a-kw-kh-inch/4a-outch
  266. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  267. weight_data_pack4to1.create(maxk, num_input/4, num_output, (size_t)4*4, 4);
  268. for (int q=0; q<num_output; q++)
  269. {
  270. const Mat k0 = weight_data_r2.channel(q);
  271. Mat g0 = weight_data_pack4to1.channel(q);
  272. for (int p=0; p+3<num_input; p+=4)
  273. {
  274. const float* k00 = k0.row(p);
  275. const float* k01 = k0.row(p+1);
  276. const float* k02 = k0.row(p+2);
  277. const float* k03 = k0.row(p+3);
  278. float* g00 = g0.row(p/4);
  279. for (int k=0; k<maxk; k++)
  280. {
  281. g00[0] = k00[k];
  282. g00[1] = k01[k];
  283. g00[2] = k02[k];
  284. g00[3] = k03[k];
  285. g00 += 4;
  286. }
  287. }
  288. }
  289. }
  290. }
  291. #endif // __ARM_NEON
  292. // pack1
  293. if (elempack == 1 && out_elempack == 1)
  294. {
  295. use_winograd3x3 = false;
  296. use_sgemm1x1 = false;
  297. if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  298. {
  299. // winograd is slow on small channel count
  300. if (num_input >= 16 && num_output >= 16)
  301. use_winograd3x3 = true;
  302. if (use_winograd3x3)
  303. {
  304. // conv3x3s1_winograd64_transform_kernel_neon(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  305. conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  306. }
  307. }
  308. // TODO assume more proper condition
  309. if (opt.use_sgemm_convolution && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  310. {
  311. if (num_input >= 64 && num_output >= 64)
  312. use_sgemm1x1 = true;
  313. if (use_sgemm1x1)
  314. {
  315. conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
  316. }
  317. }
  318. if (impl_type > 0 && impl_type < 6 && impl_type != 4)
  319. {
  320. switch (impl_type)
  321. {
  322. case 1:
  323. // winograd
  324. conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
  325. break;
  326. case 2:
  327. // pointwise
  328. conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
  329. break;
  330. case 3:
  331. // im2col
  332. conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, maxk);
  333. break;
  334. // case 4:
  335. // // direct
  336. // break;
  337. case 5:
  338. // conv3x3s2
  339. conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
  340. break;
  341. }
  342. }
  343. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  344. {
  345. conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
  346. }
  347. if (opt.use_sgemm_convolution && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  348. {
  349. conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, maxk);
  350. }
  351. if (opt.use_sgemm_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  352. {
  353. conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, maxk);
  354. }
  355. }
  356. return 0;
  357. }
  358. int Convolution_arm::destroy_pipeline(const Option& opt)
  359. {
  360. if (activation)
  361. {
  362. activation->destroy_pipeline(opt);
  363. delete activation;
  364. activation = 0;
  365. }
  366. if (convolution_dilation1)
  367. {
  368. convolution_dilation1->destroy_pipeline(opt);
  369. delete convolution_dilation1;
  370. convolution_dilation1 = 0;
  371. }
  372. return 0;
  373. }
  374. int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  375. {
  376. if (bottom_blob.dims != 3)
  377. {
  378. return Convolution::forward(bottom_blob, top_blob, opt);
  379. }
  380. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  381. {
  382. return forward_int8_arm(bottom_blob, top_blob, opt);
  383. }
  384. if (opt.use_bf16_storage)
  385. return forward_bf16s(bottom_blob, top_blob, opt);
  386. int w = bottom_blob.w;
  387. int h = bottom_blob.h;
  388. int channels = bottom_blob.c;
  389. size_t elemsize = bottom_blob.elemsize;
  390. int elempack = bottom_blob.elempack;
  391. // fprintf(stderr, "Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  392. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  393. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  394. Mat bottom_blob_bordered;
  395. make_padding(bottom_blob, bottom_blob_bordered, opt);
  396. if (bottom_blob_bordered.empty())
  397. return -100;
  398. w = bottom_blob_bordered.w;
  399. h = bottom_blob_bordered.h;
  400. int outw = (w - kernel_extent_w) / stride_w + 1;
  401. int outh = (h - kernel_extent_h) / stride_h + 1;
  402. int out_elempack = (opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
  403. size_t out_elemsize = elemsize / elempack * out_elempack;
  404. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  405. if (top_blob.empty())
  406. return -100;
  407. if (opt.use_packing_layout == false && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  408. {
  409. return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
  410. }
  411. const int maxk = kernel_w * kernel_h;
  412. // kernel offsets
  413. std::vector<int> _space_ofs(maxk);
  414. int* space_ofs = &_space_ofs[0];
  415. {
  416. int p1 = 0;
  417. int p2 = 0;
  418. int gap = w * dilation_h - kernel_w * dilation_w;
  419. for (int i = 0; i < kernel_h; i++)
  420. {
  421. for (int j = 0; j < kernel_w; j++)
  422. {
  423. space_ofs[p1] = p2;
  424. p1++;
  425. p2 += dilation_w;
  426. }
  427. p2 += gap;
  428. }
  429. }
  430. #if __ARM_NEON
  431. if (elempack == 4 && out_elempack == 4)
  432. {
  433. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  434. {
  435. conv1x1s1_sgemm_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  436. if (activation)
  437. {
  438. activation->forward_inplace(top_blob, opt);
  439. }
  440. }
  441. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  442. {
  443. conv1x1s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  444. if (activation)
  445. {
  446. activation->forward_inplace(top_blob, opt);
  447. }
  448. }
  449. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  450. {
  451. conv3x3s1_winograd64_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  452. if (activation)
  453. {
  454. activation->forward_inplace(top_blob, opt);
  455. }
  456. }
  457. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  458. {
  459. conv3x3s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  460. if (activation)
  461. {
  462. activation->forward_inplace(top_blob, opt);
  463. }
  464. }
  465. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  466. {
  467. conv5x5s1_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  468. if (activation)
  469. {
  470. activation->forward_inplace(top_blob, opt);
  471. }
  472. }
  473. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  474. {
  475. conv5x5s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_pack4, bias_data, opt);
  476. if (activation)
  477. {
  478. activation->forward_inplace(top_blob, opt);
  479. }
  480. }
  481. else
  482. {
  483. // num_output
  484. #pragma omp parallel for num_threads(opt.num_threads)
  485. for (int p=0; p<num_output / out_elempack; p++)
  486. {
  487. float* outptr = top_blob.channel(p);
  488. for (int i = 0; i < outh; i++)
  489. {
  490. for (int j = 0; j < outw; j++)
  491. {
  492. float32x4_t _sum = vdupq_n_f32(0.f);
  493. if (bias_term)
  494. {
  495. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  496. }
  497. const float* kptr = (const float*)weight_data_pack4 + maxk * channels * p * 16;
  498. // channels
  499. for (int q=0; q<channels; q++)
  500. {
  501. const Mat m = bottom_blob_bordered.channel(q);
  502. const float* sptr = m.row(i*stride_h) + j*stride_w * 4;
  503. for (int k = 0; k < maxk; k++) // 29.23
  504. {
  505. float32x4_t _val = vld1q_f32( sptr + space_ofs[k] * 4 );
  506. float32x4_t _w0 = vld1q_f32( kptr );
  507. float32x4_t _w1 = vld1q_f32( kptr + 4 );
  508. float32x4_t _w2 = vld1q_f32( kptr + 8 );
  509. float32x4_t _w3 = vld1q_f32( kptr + 12 );
  510. #if __aarch64__
  511. _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
  512. _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
  513. _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
  514. _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
  515. #else
  516. _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
  517. _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
  518. _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
  519. _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
  520. #endif
  521. kptr += 16;
  522. }
  523. }
  524. _sum = activation_ps(_sum, activation_type, activation_params);
  525. vst1q_f32(outptr + j * 4, _sum);
  526. }
  527. outptr += outw * 4;
  528. }
  529. }
  530. }
  531. }
  532. if (elempack == 1 && out_elempack == 4)
  533. {
  534. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  535. {
  536. conv3x3s1_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);
  537. if (activation)
  538. {
  539. activation->forward_inplace(top_blob, opt);
  540. }
  541. }
  542. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  543. {
  544. conv3x3s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);
  545. if (activation)
  546. {
  547. activation->forward_inplace(top_blob, opt);
  548. }
  549. }
  550. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  551. {
  552. conv7x7s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);
  553. if (activation)
  554. {
  555. activation->forward_inplace(top_blob, opt);
  556. }
  557. }
  558. else
  559. {
  560. // num_output
  561. #pragma omp parallel for num_threads(opt.num_threads)
  562. for (int p=0; p<num_output / out_elempack; p++)
  563. {
  564. float* outptr = top_blob.channel(p);
  565. for (int i = 0; i < outh; i++)
  566. {
  567. for (int j = 0; j < outw; j++)
  568. {
  569. float32x4_t _sum = vdupq_n_f32(0.f);
  570. if (bias_term)
  571. {
  572. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  573. }
  574. const float* kptr = (const float*)weight_data_pack1to4 + maxk * channels * p * 4;
  575. // channels
  576. for (int q=0; q<channels; q++)
  577. {
  578. const Mat m = bottom_blob_bordered.channel(q);
  579. const float* sptr = m.row(i*stride_h) + j*stride_w;
  580. for (int k = 0; k < maxk; k++) // 29.23
  581. {
  582. float32x4_t _val = vdupq_n_f32( sptr[ space_ofs[k] ] );
  583. float32x4_t _w = vld1q_f32( kptr );
  584. _sum = vmlaq_f32(_sum, _val, _w);
  585. kptr += 4;
  586. }
  587. }
  588. _sum = activation_ps(_sum, activation_type, activation_params);
  589. vst1q_f32(outptr + j * 4, _sum);
  590. }
  591. outptr += outw * 4;
  592. }
  593. }
  594. }
  595. }
  596. if (elempack == 4 && out_elempack == 1)
  597. {
  598. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  599. {
  600. conv1x1s1_sgemm_pack4to1_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
  601. if (activation)
  602. {
  603. activation->forward_inplace(top_blob, opt);
  604. }
  605. }
  606. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  607. {
  608. conv1x1s2_pack4to1_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
  609. if (activation)
  610. {
  611. activation->forward_inplace(top_blob, opt);
  612. }
  613. }
  614. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  615. {
  616. // TODO more proper condition
  617. conv3x3s1_winograd64_pack4to1_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
  618. // conv3x3s1_pack4to1_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1, bias_data, opt);
  619. if (activation)
  620. {
  621. activation->forward_inplace(top_blob, opt);
  622. }
  623. }
  624. else
  625. {
  626. // num_output
  627. #pragma omp parallel for num_threads(opt.num_threads)
  628. for (int p=0; p<num_output; p++)
  629. {
  630. float* outptr = top_blob.channel(p);
  631. for (int i = 0; i < outh; i++)
  632. {
  633. for (int j = 0; j < outw; j++)
  634. {
  635. float sum = 0.f;
  636. if (bias_term)
  637. {
  638. sum = bias_data[p];
  639. }
  640. const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4;
  641. // channels
  642. for (int q=0; q<channels; q++)
  643. {
  644. const Mat m = bottom_blob_bordered.channel(q);
  645. const float* sptr = m.row(i*stride_h) + j*stride_w * 4;
  646. for (int k = 0; k < maxk; k++) // 29.23
  647. {
  648. float32x4_t _val = vld1q_f32( sptr + space_ofs[k] * 4 );
  649. float32x4_t _w = vld1q_f32( kptr );
  650. float32x4_t _s4 = vmulq_f32(_val, _w);
  651. #if __aarch64__
  652. sum += vaddvq_f32(_s4); // dot
  653. #else
  654. float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
  655. _ss = vpadd_f32(_ss, _ss);
  656. sum += vget_lane_f32(_ss, 0);
  657. #endif
  658. kptr += 4;
  659. }
  660. }
  661. sum = activation_ss(sum, activation_type, activation_params);
  662. outptr[j] = sum;
  663. }
  664. outptr += outw;
  665. }
  666. }
  667. }
  668. }
  669. #endif // __ARM_NEON
  670. if (elempack == 1 && out_elempack == 1)
  671. {
  672. if (impl_type > 0 && impl_type < 6 && impl_type != 4)
  673. {
  674. // engineering is magic.
  675. switch (impl_type)
  676. {
  677. case 1:
  678. conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  679. break;
  680. case 2:
  681. conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
  682. break;
  683. case 3:
  684. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  685. break;
  686. // case 4: FIXME fallback to auto path
  687. // conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  688. // break;
  689. case 5:
  690. conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
  691. break;
  692. }
  693. if (activation)
  694. {
  695. activation->forward_inplace(top_blob, opt);
  696. }
  697. }
  698. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  699. {
  700. if (use_sgemm1x1)
  701. {
  702. conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
  703. }
  704. else
  705. {
  706. conv1x1s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  707. }
  708. if (activation)
  709. {
  710. activation->forward_inplace(top_blob, opt);
  711. }
  712. }
  713. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  714. {
  715. if (opt.use_sgemm_convolution)
  716. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  717. else
  718. conv1x1s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  719. if (activation)
  720. {
  721. activation->forward_inplace(top_blob, opt);
  722. }
  723. }
  724. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  725. {
  726. if (use_winograd3x3 && w <= 120 && h <= 120)
  727. {
  728. // conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  729. conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
  730. }
  731. else
  732. {
  733. conv3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  734. }
  735. if (activation)
  736. {
  737. activation->forward_inplace(top_blob, opt);
  738. }
  739. }
  740. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  741. {
  742. if (opt.use_sgemm_convolution && !(outw >=8 && outh >=8))
  743. conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
  744. else
  745. conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
  746. if (activation)
  747. {
  748. activation->forward_inplace(top_blob, opt);
  749. }
  750. }
  751. else if (kernel_w == 4 && kernel_h == 4 && dilation_w == 1 && dilation_h == 1 && stride_w == 4 && stride_h == 4)
  752. {
  753. conv4x4s4_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  754. if (activation)
  755. {
  756. activation->forward_inplace(top_blob, opt);
  757. }
  758. }
  759. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  760. {
  761. conv5x5s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  762. if (activation)
  763. {
  764. activation->forward_inplace(top_blob, opt);
  765. }
  766. }
  767. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  768. {
  769. conv5x5s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  770. if (activation)
  771. {
  772. activation->forward_inplace(top_blob, opt);
  773. }
  774. }
  775. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  776. {
  777. conv7x7s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  778. if (activation)
  779. {
  780. activation->forward_inplace(top_blob, opt);
  781. }
  782. }
  783. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  784. {
  785. conv7x7s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  786. if (activation)
  787. {
  788. activation->forward_inplace(top_blob, opt);
  789. }
  790. }
  791. else
  792. {
  793. // num_output
  794. #pragma omp parallel for num_threads(opt.num_threads)
  795. for (int p=0; p<num_output; p++)
  796. {
  797. float* outptr = top_blob.channel(p);
  798. for (int i = 0; i < outh; i++)
  799. {
  800. for (int j = 0; j < outw; j++)
  801. {
  802. float sum = 0.f;
  803. if (bias_term)
  804. {
  805. sum = bias_data[p];
  806. }
  807. const float* kptr = (const float*)weight_data + maxk * channels * p;
  808. // channels
  809. for (int q=0; q<channels; q++)
  810. {
  811. const Mat m = bottom_blob_bordered.channel(q);
  812. const float* sptr = m.row(i*stride_h) + j*stride_w;
  813. for (int k = 0; k < maxk; k++)
  814. {
  815. float val = sptr[ space_ofs[k] ];
  816. float w = kptr[ k ];
  817. sum += val * w;
  818. }
  819. kptr += maxk;
  820. }
  821. if (activation_type == 1)
  822. {
  823. sum = std::max(sum, 0.f);
  824. }
  825. else if (activation_type == 2)
  826. {
  827. float slope = activation_params[0];
  828. sum = sum > 0.f ? sum : sum * slope;
  829. }
  830. else if (activation_type == 3)
  831. {
  832. float min = activation_params[0];
  833. float max = activation_params[1];
  834. if (sum < min)
  835. sum = min;
  836. if (sum > max)
  837. sum = max;
  838. }
  839. else if (activation_type == 4)
  840. {
  841. sum = static_cast<float>(1.f / (1.f + exp(-sum)));
  842. }
  843. outptr[j] = sum;
  844. }
  845. outptr += outw;
  846. }
  847. }
  848. }
  849. }
  850. return 0;
  851. }
  852. int Convolution_arm::create_pipeline_bf16s(const Option& opt)
  853. {
  854. const int maxk = kernel_w * kernel_h;
  855. const int num_input = weight_data_size / maxk / num_output;
  856. int elempack = (opt.use_packing_layout && num_input % 4 == 0) ? 4 : 1;
  857. int out_elempack = (opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
  858. #if __ARM_NEON
  859. // pack4
  860. if (elempack == 4 && out_elempack == 4)
  861. {
  862. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  863. {
  864. conv1x1s1_sgemm_transform_kernel_pack4_bf16s_neon(weight_data, weight_data_pack4_bf16, num_input, num_output);
  865. }
  866. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  867. {
  868. conv1x1s1_sgemm_transform_kernel_pack4_bf16s_neon(weight_data, weight_data_pack4_bf16, num_input, num_output);
  869. }
  870. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  871. {
  872. conv3x3s1_winograd64_transform_kernel_pack4_neon(weight_data, weight_data_pack4_bf16, num_input, num_output);
  873. }
  874. else
  875. {
  876. // src = kw-kh-inch-outch
  877. // dst = 4b-4a-kw-kh-inch/4a-outch/4b
  878. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  879. weight_data_pack4_bf16.create(maxk, num_input/4, num_output/4, (size_t)2*16, 16);
  880. for (int q=0; q+3<num_output; q+=4)
  881. {
  882. const Mat k0 = weight_data_r2.channel(q);
  883. const Mat k1 = weight_data_r2.channel(q+1);
  884. const Mat k2 = weight_data_r2.channel(q+2);
  885. const Mat k3 = weight_data_r2.channel(q+3);
  886. Mat g0 = weight_data_pack4_bf16.channel(q/4);
  887. for (int p=0; p+3<num_input; p+=4)
  888. {
  889. const float* k00 = k0.row(p);
  890. const float* k01 = k0.row(p+1);
  891. const float* k02 = k0.row(p+2);
  892. const float* k03 = k0.row(p+3);
  893. const float* k10 = k1.row(p);
  894. const float* k11 = k1.row(p+1);
  895. const float* k12 = k1.row(p+2);
  896. const float* k13 = k1.row(p+3);
  897. const float* k20 = k2.row(p);
  898. const float* k21 = k2.row(p+1);
  899. const float* k22 = k2.row(p+2);
  900. const float* k23 = k2.row(p+3);
  901. const float* k30 = k3.row(p);
  902. const float* k31 = k3.row(p+1);
  903. const float* k32 = k3.row(p+2);
  904. const float* k33 = k3.row(p+3);
  905. unsigned short* g00 = g0.row<unsigned short>(p/4);
  906. for (int k=0; k<maxk; k++)
  907. {
  908. g00[0] = float32_to_bfloat16(k00[k]);
  909. g00[1] = float32_to_bfloat16(k10[k]);
  910. g00[2] = float32_to_bfloat16(k20[k]);
  911. g00[3] = float32_to_bfloat16(k30[k]);
  912. g00[4] = float32_to_bfloat16(k01[k]);
  913. g00[5] = float32_to_bfloat16(k11[k]);
  914. g00[6] = float32_to_bfloat16(k21[k]);
  915. g00[7] = float32_to_bfloat16(k31[k]);
  916. g00[8] = float32_to_bfloat16(k02[k]);
  917. g00[9] = float32_to_bfloat16(k12[k]);
  918. g00[10] = float32_to_bfloat16(k22[k]);
  919. g00[11] = float32_to_bfloat16(k32[k]);
  920. g00[12] = float32_to_bfloat16(k03[k]);
  921. g00[13] = float32_to_bfloat16(k13[k]);
  922. g00[14] = float32_to_bfloat16(k23[k]);
  923. g00[15] = float32_to_bfloat16(k33[k]);
  924. g00 += 16;
  925. }
  926. }
  927. }
  928. }
  929. }
  930. // pack1to4
  931. if (elempack == 1 && out_elempack == 4)
  932. {
  933. // src = kw-kh-inch-outch
  934. // dst = 4b-kw-kh-inch-outch/4b
  935. {
  936. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  937. weight_data_pack1to4_bf16.create(maxk, num_input, num_output/4, (size_t)2*4, 4);
  938. for (int q=0; q+3<num_output; q+=4)
  939. {
  940. const Mat k0 = weight_data_r2.channel(q);
  941. const Mat k1 = weight_data_r2.channel(q+1);
  942. const Mat k2 = weight_data_r2.channel(q+2);
  943. const Mat k3 = weight_data_r2.channel(q+3);
  944. Mat g0 = weight_data_pack1to4_bf16.channel(q/4);
  945. for (int p=0; p<num_input; p++)
  946. {
  947. const float* k00 = k0.row(p);
  948. const float* k10 = k1.row(p);
  949. const float* k20 = k2.row(p);
  950. const float* k30 = k3.row(p);
  951. unsigned short* g00 = g0.row<unsigned short>(p);
  952. for (int k=0; k<maxk; k++)
  953. {
  954. g00[0] = float32_to_bfloat16(k00[k]);
  955. g00[1] = float32_to_bfloat16(k10[k]);
  956. g00[2] = float32_to_bfloat16(k20[k]);
  957. g00[3] = float32_to_bfloat16(k30[k]);
  958. g00 += 4;
  959. }
  960. }
  961. }
  962. }
  963. }
  964. // pack4to1
  965. if (elempack == 4 && out_elempack == 1)
  966. {
  967. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  968. {
  969. conv1x1s1_sgemm_transform_kernel_pack4to1_bf16s_neon(weight_data, weight_data_pack4to1_bf16, num_input, num_output);
  970. }
  971. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  972. {
  973. conv1x1s1_sgemm_transform_kernel_pack4to1_bf16s_neon(weight_data, weight_data_pack4to1_bf16, num_input, num_output);
  974. }
  975. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  976. {
  977. conv3x3s1_winograd64_transform_kernel_pack4to1_neon(weight_data, weight_data_pack4to1_bf16, num_input, num_output);
  978. }
  979. else
  980. {
  981. // src = kw-kh-inch-outch
  982. // dst = 4a-kw-kh-inch/4a-outch
  983. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  984. weight_data_pack4to1_bf16.create(maxk, num_input/4, num_output, (size_t)2*4, 4);
  985. for (int q=0; q<num_output; q++)
  986. {
  987. const Mat k0 = weight_data_r2.channel(q);
  988. Mat g0 = weight_data_pack4to1_bf16.channel(q);
  989. for (int p=0; p+3<num_input; p+=4)
  990. {
  991. const float* k00 = k0.row(p);
  992. const float* k01 = k0.row(p+1);
  993. const float* k02 = k0.row(p+2);
  994. const float* k03 = k0.row(p+3);
  995. unsigned short* g00 = g0.row<unsigned short>(p/4);
  996. for (int k=0; k<maxk; k++)
  997. {
  998. g00[0] = float32_to_bfloat16(k00[k]);
  999. g00[1] = float32_to_bfloat16(k01[k]);
  1000. g00[2] = float32_to_bfloat16(k02[k]);
  1001. g00[3] = float32_to_bfloat16(k03[k]);
  1002. g00 += 4;
  1003. }
  1004. }
  1005. }
  1006. }
  1007. }
  1008. #endif // __ARM_NEON
  1009. // pack1
  1010. if (elempack == 1 && out_elempack == 1)
  1011. {
  1012. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1013. {
  1014. conv1x1s1_sgemm_transform_kernel_bf16s_neon(weight_data, weight_data_bf16, num_input, num_output);
  1015. }
  1016. else
  1017. {
  1018. ncnn::cast_float32_to_bfloat16(weight_data, weight_data_bf16, opt);
  1019. }
  1020. }
  1021. return 0;
  1022. }
  1023. int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1024. {
  1025. int w = bottom_blob.w;
  1026. int h = bottom_blob.h;
  1027. int channels = bottom_blob.c;
  1028. size_t elemsize = bottom_blob.elemsize;
  1029. int elempack = bottom_blob.elempack;
  1030. // fprintf(stderr, "Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  1031. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  1032. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  1033. Mat bottom_blob_bordered;
  1034. make_padding(bottom_blob, bottom_blob_bordered, opt);
  1035. if (bottom_blob_bordered.empty())
  1036. return -100;
  1037. w = bottom_blob_bordered.w;
  1038. h = bottom_blob_bordered.h;
  1039. int outw = (w - kernel_extent_w) / stride_w + 1;
  1040. int outh = (h - kernel_extent_h) / stride_h + 1;
  1041. int out_elempack = (opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
  1042. size_t out_elemsize = elemsize / elempack * out_elempack;
  1043. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  1044. if (top_blob.empty())
  1045. return -100;
  1046. // FIXME
  1047. // if (opt.use_packing_layout == false && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  1048. // {
  1049. // return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
  1050. // }
  1051. const int maxk = kernel_w * kernel_h;
  1052. // kernel offsets
  1053. std::vector<int> _space_ofs(maxk);
  1054. int* space_ofs = &_space_ofs[0];
  1055. {
  1056. int p1 = 0;
  1057. int p2 = 0;
  1058. int gap = w * dilation_h - kernel_w * dilation_w;
  1059. for (int i = 0; i < kernel_h; i++)
  1060. {
  1061. for (int j = 0; j < kernel_w; j++)
  1062. {
  1063. space_ofs[p1] = p2;
  1064. p1++;
  1065. p2 += dilation_w;
  1066. }
  1067. p2 += gap;
  1068. }
  1069. }
  1070. #if __ARM_NEON
  1071. if (elempack == 4 && out_elempack == 4)
  1072. {
  1073. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1074. {
  1075. conv1x1s1_sgemm_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1076. if (activation)
  1077. {
  1078. activation->forward_inplace(top_blob, opt);
  1079. }
  1080. }
  1081. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1082. {
  1083. conv1x1s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1084. if (activation)
  1085. {
  1086. activation->forward_inplace(top_blob, opt);
  1087. }
  1088. }
  1089. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1090. {
  1091. conv3x3s1_winograd64_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1092. if (activation)
  1093. {
  1094. activation->forward_inplace(top_blob, opt);
  1095. }
  1096. }
  1097. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1098. {
  1099. conv3x3s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1100. if (activation)
  1101. {
  1102. activation->forward_inplace(top_blob, opt);
  1103. }
  1104. }
  1105. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1106. {
  1107. conv5x5s1_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1108. if (activation)
  1109. {
  1110. activation->forward_inplace(top_blob, opt);
  1111. }
  1112. }
  1113. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1114. {
  1115. conv5x5s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4_bf16, bias_data, opt);
  1116. if (activation)
  1117. {
  1118. activation->forward_inplace(top_blob, opt);
  1119. }
  1120. }
  1121. else
  1122. {
  1123. // num_output
  1124. #pragma omp parallel for num_threads(opt.num_threads)
  1125. for (int p=0; p<num_output / out_elempack; p++)
  1126. {
  1127. unsigned short* outptr = top_blob.channel(p);
  1128. for (int i = 0; i < outh; i++)
  1129. {
  1130. for (int j = 0; j < outw; j++)
  1131. {
  1132. float32x4_t _sum = vdupq_n_f32(0.f);
  1133. if (bias_term)
  1134. {
  1135. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  1136. }
  1137. const unsigned short* kptr = weight_data_pack4_bf16.channel(p);
  1138. // channels
  1139. for (int q=0; q<channels; q++)
  1140. {
  1141. const Mat m = bottom_blob_bordered.channel(q);
  1142. const unsigned short* sptr = m.row<const unsigned short>(i*stride_h) + j*stride_w * 4;
  1143. for (int k = 0; k < maxk; k++)
  1144. {
  1145. float32x4_t _val = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16( sptr + space_ofs[k] * 4 ), 16));
  1146. float32x4_t _w0 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16( kptr ), 16));
  1147. float32x4_t _w1 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16( kptr + 4 ), 16));
  1148. float32x4_t _w2 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16( kptr + 8 ), 16));
  1149. float32x4_t _w3 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16( kptr + 12 ), 16));
  1150. #if __aarch64__
  1151. _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
  1152. _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
  1153. _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
  1154. _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
  1155. #else
  1156. _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
  1157. _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
  1158. _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
  1159. _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
  1160. #endif
  1161. kptr += 16;
  1162. }
  1163. }
  1164. _sum = activation_ps(_sum, activation_type, activation_params);
  1165. vst1_u16(outptr + j * 4, vshrn_n_u32(vreinterpretq_u32_f32(_sum), 16));
  1166. }
  1167. outptr += outw * 4;
  1168. }
  1169. }
  1170. }
  1171. }
  1172. if (elempack == 1 && out_elempack == 4)
  1173. {
  1174. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1175. {
  1176. conv3x3s1_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4_bf16, bias_data, opt);
  1177. if (activation)
  1178. {
  1179. activation->forward_inplace(top_blob, opt);
  1180. }
  1181. }
  1182. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1183. {
  1184. conv3x3s2_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4_bf16, bias_data, opt);
  1185. if (activation)
  1186. {
  1187. activation->forward_inplace(top_blob, opt);
  1188. }
  1189. }
  1190. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1191. {
  1192. conv7x7s2_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4_bf16, bias_data, opt);
  1193. if (activation)
  1194. {
  1195. activation->forward_inplace(top_blob, opt);
  1196. }
  1197. }
  1198. else
  1199. {
  1200. // num_output
  1201. #pragma omp parallel for num_threads(opt.num_threads)
  1202. for (int p=0; p<num_output / out_elempack; p++)
  1203. {
  1204. unsigned short* outptr = top_blob.channel(p);
  1205. for (int i = 0; i < outh; i++)
  1206. {
  1207. for (int j = 0; j < outw; j++)
  1208. {
  1209. float32x4_t _sum = vdupq_n_f32(0.f);
  1210. if (bias_term)
  1211. {
  1212. _sum = vld1q_f32(((const float*)bias_data) + p * 4);
  1213. }
  1214. const unsigned short* kptr = weight_data_pack1to4_bf16.channel(p);
  1215. // channels
  1216. for (int q=0; q<channels; q++)
  1217. {
  1218. const Mat m = bottom_blob_bordered.channel(q);
  1219. const unsigned short* sptr = m.row<const unsigned short>(i*stride_h) + j*stride_w;
  1220. for (int k = 0; k < maxk; k++)
  1221. {
  1222. float32x4_t _val = vdupq_n_f32(bfloat16_to_float32( sptr[ space_ofs[k] ] ));
  1223. float32x4_t _w = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16( kptr ), 16));
  1224. _sum = vmlaq_f32(_sum, _val, _w);
  1225. kptr += 4;
  1226. }
  1227. }
  1228. _sum = activation_ps(_sum, activation_type, activation_params);
  1229. vst1_u16(outptr + j * 4, vshrn_n_u32(vreinterpretq_u32_f32(_sum), 16));
  1230. }
  1231. outptr += outw * 4;
  1232. }
  1233. }
  1234. }
  1235. }
  1236. if (elempack == 4 && out_elempack == 1)
  1237. {
  1238. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1239. {
  1240. conv1x1s1_sgemm_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);
  1241. if (activation)
  1242. {
  1243. activation->forward_inplace(top_blob, opt);
  1244. }
  1245. }
  1246. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1247. {
  1248. conv1x1s2_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);
  1249. if (activation)
  1250. {
  1251. activation->forward_inplace(top_blob, opt);
  1252. }
  1253. }
  1254. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1255. {
  1256. // TODO more proper condition
  1257. conv3x3s1_winograd64_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);
  1258. // conv3x3s1_pack4to1_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_pack4to1_bf16, bias_data, opt);
  1259. if (activation)
  1260. {
  1261. activation->forward_inplace(top_blob, opt);
  1262. }
  1263. }
  1264. else
  1265. {
  1266. // num_output
  1267. #pragma omp parallel for num_threads(opt.num_threads)
  1268. for (int p=0; p<num_output; p++)
  1269. {
  1270. unsigned short* outptr = top_blob.channel(p);
  1271. for (int i = 0; i < outh; i++)
  1272. {
  1273. for (int j = 0; j < outw; j++)
  1274. {
  1275. float sum = 0.f;
  1276. if (bias_term)
  1277. {
  1278. sum = bias_data[p];
  1279. }
  1280. const unsigned short* kptr = weight_data_pack4to1_bf16.channel(p);
  1281. // channels
  1282. for (int q=0; q<channels; q++)
  1283. {
  1284. const Mat m = bottom_blob_bordered.channel(q);
  1285. const unsigned short* sptr = m.row<const unsigned short>(i*stride_h) + j*stride_w * 4;
  1286. for (int k = 0; k < maxk; k++)
  1287. {
  1288. float32x4_t _val = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16( sptr + space_ofs[k] * 4 ), 16));
  1289. float32x4_t _w = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16( kptr ), 16));
  1290. float32x4_t _s4 = vmulq_f32(_val, _w);
  1291. #if __aarch64__
  1292. sum += vaddvq_f32(_s4); // dot
  1293. #else
  1294. float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
  1295. _ss = vpadd_f32(_ss, _ss);
  1296. sum += vget_lane_f32(_ss, 0);
  1297. #endif
  1298. kptr += 4;
  1299. }
  1300. }
  1301. sum = activation_ss(sum, activation_type, activation_params);
  1302. outptr[j] = float32_to_bfloat16(sum);
  1303. }
  1304. outptr += outw;
  1305. }
  1306. }
  1307. }
  1308. }
  1309. #endif // __ARM_NEON
  1310. if (elempack == 1 && out_elempack == 1)
  1311. {
  1312. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1313. {
  1314. conv1x1s1_sgemm_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_bf16, bias_data, opt);
  1315. if (activation)
  1316. {
  1317. activation->forward_inplace(top_blob, opt);
  1318. }
  1319. }
  1320. else
  1321. {
  1322. // num_output
  1323. #pragma omp parallel for num_threads(opt.num_threads)
  1324. for (int p=0; p<num_output; p++)
  1325. {
  1326. unsigned short* outptr = top_blob.channel(p);
  1327. for (int i = 0; i < outh; i++)
  1328. {
  1329. for (int j = 0; j < outw; j++)
  1330. {
  1331. float sum = 0.f;
  1332. if (bias_term)
  1333. {
  1334. sum = bias_data[p];
  1335. }
  1336. const unsigned short* kptr = (const unsigned short*)weight_data_bf16 + maxk * channels * p;
  1337. // channels
  1338. for (int q=0; q<channels; q++)
  1339. {
  1340. const Mat m = bottom_blob_bordered.channel(q);
  1341. const unsigned short* sptr = m.row<unsigned short>(i*stride_h) + j*stride_w;
  1342. for (int k = 0; k < maxk; k++)
  1343. {
  1344. float val = bfloat16_to_float32(sptr[ space_ofs[k] ]);
  1345. float w = bfloat16_to_float32(kptr[ k ]);
  1346. sum += val * w;
  1347. }
  1348. kptr += maxk;
  1349. }
  1350. if (activation_type == 1)
  1351. {
  1352. sum = std::max(sum, 0.f);
  1353. }
  1354. else if (activation_type == 2)
  1355. {
  1356. float slope = activation_params[0];
  1357. sum = sum > 0.f ? sum : sum * slope;
  1358. }
  1359. else if (activation_type == 3)
  1360. {
  1361. float min = activation_params[0];
  1362. float max = activation_params[1];
  1363. if (sum < min)
  1364. sum = min;
  1365. if (sum > max)
  1366. sum = max;
  1367. }
  1368. else if (activation_type == 4)
  1369. {
  1370. sum = static_cast<float>(1.f / (1.f + exp(-sum)));
  1371. }
  1372. outptr[j] = float32_to_bfloat16(sum);
  1373. }
  1374. outptr += outw;
  1375. }
  1376. }
  1377. }
  1378. }
  1379. return 0;
  1380. }
  1381. int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
  1382. {
  1383. const int maxk = kernel_w * kernel_h;
  1384. const int num_input = weight_data_size / maxk / num_output;
  1385. use_winograd3x3_int8 = false;
  1386. use_sgemm1x1_int8 = false;
  1387. if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1388. {
  1389. use_winograd3x3_int8 = true;
  1390. // conv3x3s1_winograd23_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_data_int8, num_input, num_output);
  1391. conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_data_int8, num_input, num_output);
  1392. }
  1393. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1394. {
  1395. conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_data_int8, num_input, num_output);
  1396. }
  1397. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1398. {
  1399. use_sgemm1x1_int8 = true;
  1400. conv1x1s1_sgemm_transform_kernel_int8_neon(weight_data, weight_1x1s1_sgemm_data_int8, num_input, num_output);
  1401. }
  1402. else
  1403. {
  1404. conv_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data_int8, num_input, num_output, maxk);
  1405. }
  1406. return 0;
  1407. }
  1408. int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1409. {
  1410. if (dilation_w > 1 || dilation_h > 1)
  1411. {
  1412. return Convolution::forward(bottom_blob, top_blob, opt);
  1413. }
  1414. int w = bottom_blob.w;
  1415. int h = bottom_blob.h;
  1416. // int channels = bottom_blob.c;
  1417. size_t elemsize = bottom_blob.elemsize;
  1418. // fprintf(stderr, "Convolution_arm input %d x %d ksize=%d %d stride=%d %d\n", w, h, kernel_w, kernel_h, stride_w, stride_h);
  1419. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  1420. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  1421. Mat bottom_blob_unbordered = bottom_blob;
  1422. if (elemsize != 1)
  1423. {
  1424. Option opt_g = opt;
  1425. opt_g.blob_allocator = opt.workspace_allocator;
  1426. quantize_float32_to_int8(bottom_blob, bottom_blob_unbordered, bottom_blob_int8_scale, opt_g);
  1427. }
  1428. Mat bottom_blob_bordered;
  1429. make_padding(bottom_blob_unbordered, bottom_blob_bordered, opt);
  1430. if (bottom_blob_bordered.empty())
  1431. return -100;
  1432. w = bottom_blob_bordered.w;
  1433. h = bottom_blob_bordered.h;
  1434. int outw = (w - kernel_extent_w) / stride_w + 1;
  1435. int outh = (h - kernel_extent_h) / stride_h + 1;
  1436. // int8
  1437. size_t out_elemsize = use_int8_requantize ? 1u : 4u;
  1438. top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
  1439. if (top_blob.empty())
  1440. return -100;
  1441. // int8
  1442. if (use_int8_requantize == true)
  1443. {
  1444. Mat top_blob_tm;
  1445. top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
  1446. if (top_blob_tm.empty())
  1447. return -100;
  1448. if (use_sgemm1x1_int8)
  1449. {
  1450. std::vector<float> requantize_scales;
  1451. for (int p=0; p<num_output; p++)
  1452. {
  1453. float scale_in;
  1454. if (weight_data_int8_scales[p] == 0)
  1455. scale_in = 0;
  1456. else
  1457. scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);
  1458. float scale_out = top_blob_int8_scale;
  1459. requantize_scales.push_back(scale_in);
  1460. requantize_scales.push_back(scale_out);
  1461. }
  1462. conv1x1s1_sgemm_int8_requant_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_data_int8, bias_data, requantize_scales, opt);
  1463. if (activation)
  1464. {
  1465. activation->forward_inplace(top_blob, opt);
  1466. }
  1467. return 0;
  1468. }
  1469. else if (use_winograd3x3_int8)
  1470. {
  1471. // conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
  1472. conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
  1473. }
  1474. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1475. {
  1476. conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s2_data_int8, opt);
  1477. }
  1478. else
  1479. {
  1480. conv_im2col_sgemm_int8_neon(bottom_blob_bordered, top_blob_tm, weight_sgemm_data_int8, kernel_w, kernel_h, stride_w, stride_h, opt);
  1481. }
  1482. // requantize, reverse scale inplace
  1483. #pragma omp parallel for num_threads(opt.num_threads)
  1484. for (int p=0; p<num_output; p++)
  1485. {
  1486. Option opt_g = opt;
  1487. opt_g.num_threads = 1;
  1488. opt_g.blob_allocator = top_blob.allocator;
  1489. Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
  1490. Mat top_blob_g = top_blob.channel_range(p, 1);
  1491. // requantize and relu
  1492. float scale_in;
  1493. if (weight_data_int8_scales[p] == 0)
  1494. scale_in = 0;
  1495. else
  1496. scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);
  1497. float scale_out = top_blob_int8_scale;//FIXME load param
  1498. requantize_int8_to_int8(top_blob_tm_g, top_blob_g, scale_in, scale_out, bias_term ? (const float*)bias_data + p : 0, bias_term ? 1 : 0, 0, opt_g);
  1499. }
  1500. }
  1501. else
  1502. {
  1503. if (use_sgemm1x1_int8)
  1504. {
  1505. conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_data_int8, opt);
  1506. }
  1507. else if (use_winograd3x3_int8)
  1508. {
  1509. // conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
  1510. conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
  1511. }
  1512. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1513. {
  1514. conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data_int8, opt);
  1515. }
  1516. else
  1517. {
  1518. conv_im2col_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_sgemm_data_int8, kernel_w, kernel_h, stride_w, stride_h, opt);
  1519. }
  1520. // dequantize, reverse scale inplace
  1521. #pragma omp parallel for num_threads(opt.num_threads)
  1522. for (int p=0; p<num_output; p++)
  1523. {
  1524. Option opt_g = opt;
  1525. opt_g.num_threads = 1;
  1526. opt_g.blob_allocator = top_blob.allocator;
  1527. Mat top_blob_g = top_blob.channel_range(p, 1);
  1528. // dequantize
  1529. float scale_in;
  1530. if (weight_data_int8_scales[p] == 0)
  1531. scale_in = 0;
  1532. else
  1533. scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);
  1534. dequantize_int32_to_float32(top_blob_g, scale_in, bias_term ? (const float*)bias_data + p : 0, bias_term ? 1 : 0, opt_g);
  1535. }
  1536. }
  1537. if (activation)
  1538. {
  1539. activation->forward_inplace(top_blob, opt);
  1540. }
  1541. return 0;
  1542. }
  1543. int Convolution_arm::forwardDilation_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1544. {
  1545. int w = bottom_blob.w;
  1546. int h = bottom_blob.h;
  1547. size_t elemsize = bottom_blob.elemsize;
  1548. const int kernel_size = kernel_w;
  1549. const int stride = stride_w;
  1550. const int dilation = dilation_w;
  1551. const int kernel_extent = dilation * (kernel_size - 1) + 1;
  1552. int outw = (w - kernel_extent) / stride + 1;
  1553. int outh = (h - kernel_extent) / stride + 1;
  1554. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  1555. if (top_blob.empty())
  1556. return -100;
  1557. // Make (dilation * dilation) batches
  1558. Mat inner_bottom_blob;
  1559. Mat inner_top_blob;
  1560. for (int x = 0; x < dilation; x ++)
  1561. {
  1562. for (int y = 0; y < dilation; y ++)
  1563. {
  1564. int inner_w = (w - y + dilation - 1) / dilation;
  1565. int inner_h = (h - x + dilation - 1) / dilation;
  1566. int inner_outw = (inner_w - kernel_size) / stride + 1;
  1567. int inner_outh = (inner_h - kernel_size) / stride + 1;
  1568. inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
  1569. if (inner_bottom_blob.empty())
  1570. return -100;
  1571. inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
  1572. if (inner_top_blob.empty())
  1573. return -100;
  1574. #pragma omp parallel for num_threads(opt.num_threads)
  1575. for (int c = 0; c < bottom_blob.c; c ++)
  1576. {
  1577. float *outptr = inner_bottom_blob.channel(c);
  1578. for (int i = 0; i < inner_h; i ++)
  1579. {
  1580. const float *ptr = (const float *) bottom_blob.channel(c) + dilation * i * w + x * w + y;
  1581. for (int j = 0; j < inner_w; j ++)
  1582. {
  1583. outptr[j] = ptr[j*dilation];
  1584. }
  1585. outptr += inner_w;
  1586. }
  1587. }
  1588. Option opt_g = opt;
  1589. opt_g.blob_allocator = inner_top_blob.allocator;
  1590. convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
  1591. #pragma omp parallel for num_threads(opt.num_threads)
  1592. for (int c = 0; c < num_output; c ++)
  1593. {
  1594. float *outptr = (float *) top_blob.channel(c) + x * outw + y;
  1595. for (int i = 0; i < inner_outh; i ++)
  1596. {
  1597. const float *ptr = (const float *) inner_top_blob.channel(c) + i * inner_outw;
  1598. for (int j = 0; j < inner_outw; j ++)
  1599. {
  1600. outptr[j*dilation] = ptr[j];
  1601. }
  1602. outptr += dilation * outw;
  1603. }
  1604. }
  1605. }
  1606. }
  1607. if (activation)
  1608. {
  1609. activation->forward_inplace(top_blob, opt);
  1610. }
  1611. return 0;
  1612. }
  1613. } // namespace ncnn