You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

convolution_arm.cpp 54 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolution_arm.h"
  15. #include "benchmark.h"
  16. #include "cpu.h"
  17. #include "layer_type.h"
  18. #if __ARM_NEON
  19. #include <arm_neon.h>
  20. #endif // __ARM_NEON
  21. #include "arm_activation.h"
  22. #include "arm_usability.h"
  23. namespace ncnn {
  24. #include "convolution_1x1.h"
  25. #include "convolution_2x2.h"
  26. #include "convolution_3x3.h"
  27. #include "convolution_4x4.h"
  28. #include "convolution_5x5.h"
  29. #include "convolution_7x7.h"
  30. #include "convolution_packed.h"
  31. #include "convolution_3x3_winograd.h"
  32. #include "convolution_im2col_gemm.h"
  33. #if NCNN_BF16
  34. #include "convolution_packed_bf16s.h"
  35. #include "convolution_3x3_winograd_bf16s.h"
  36. #include "convolution_im2col_gemm_bf16s_fp16s.h"
  37. #include "convolution_im2col_gemm_bf16s.h"
  38. #endif // NCNN_BF16
  39. #if NCNN_INT8
  40. #include "convolution_packed_int8.h"
  41. #include "convolution_im2col_gemm_int8.h"
  42. #include "convolution_3x3_winograd_int8.h"
  43. // #include "convolution_3x3_int8.h"
  44. #endif // NCNN_INT8
  45. #if __ARM_NEON
  46. #include "convolution_3x3_pack1to4.h"
  47. #include "convolution_3x3_pack4.h"
  48. #include "convolution_3x3_pack4to1.h"
  49. #include "convolution_5x5_pack4.h"
  50. #include "convolution_7x7_pack1to4.h"
  51. #if NCNN_BF16
  52. #include "convolution_3x3_pack1to4_bf16s.h"
  53. #include "convolution_3x3_pack4_bf16s.h"
  54. #include "convolution_5x5_pack4_bf16s.h"
  55. #include "convolution_7x7_pack1to4_bf16s.h"
  56. #endif // NCNN_BF16
  57. #endif // __ARM_NEON
  58. Convolution_arm::Convolution_arm()
  59. {
  60. #if __ARM_NEON
  61. support_packing = true;
  62. #if NCNN_ARM82
  63. support_fp16_storage = cpu_support_arm_asimdhp();
  64. #endif
  65. #endif // __ARM_NEON
  66. #if NCNN_BF16
  67. support_bf16_storage = true;
  68. #endif
  69. activation = 0;
  70. nT = 0;
  71. convolution_dilation1 = 0;
  72. }
  73. static void convolution_transform_kernel_packed_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
  74. {
  75. const int maxk = kernel_w * kernel_h;
  76. // src = kw-kh-inch-outch
  77. // dst = pb-pa-kw-kh-inch/pa-outch/pb
  78. {
  79. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  80. weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
  81. for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
  82. {
  83. float* g00 = weight_data_tm.channel(q / out_elempack);
  84. for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
  85. {
  86. for (int k = 0; k < maxk; k++)
  87. {
  88. for (int i = 0; i < elempack; i++)
  89. {
  90. for (int j = 0; j < out_elempack; j++)
  91. {
  92. const float* k00 = weight_data_r2.channel(q + j).row(p + i);
  93. g00[0] = k00[k];
  94. g00++;
  95. }
  96. }
  97. }
  98. }
  99. }
  100. }
  101. }
  102. int Convolution_arm::create_pipeline(const Option& opt)
  103. {
  104. if (dynamic_weight)
  105. return 0;
  106. activation = create_activation_layer(activation_type, activation_params, opt);
  107. nT = opt.num_threads;
  108. #if NCNN_INT8
  109. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  110. {
  111. return create_pipeline_int8_arm(opt);
  112. }
  113. #endif
  114. #if NCNN_ARM82
  115. if (support_fp16_storage && opt.use_fp16_storage)
  116. {
  117. return create_pipeline_fp16s(opt);
  118. }
  119. #endif
  120. #if NCNN_BF16
  121. if (opt.use_bf16_storage)
  122. {
  123. return create_pipeline_bf16s(opt);
  124. }
  125. #endif
  126. if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  127. {
  128. convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution);
  129. // set param
  130. ncnn::ParamDict pd;
  131. pd.set(0, num_output); // num_output
  132. pd.set(1, kernel_w);
  133. pd.set(11, kernel_h);
  134. pd.set(2, 1);
  135. pd.set(12, 1);
  136. pd.set(3, 1); // stride_w
  137. pd.set(13, 1); // stride_h
  138. pd.set(4, 0); // pad_w
  139. pd.set(14, 0); // pad_h
  140. pd.set(5, bias_term);
  141. pd.set(6, weight_data_size);
  142. convolution_dilation1->load_param(pd);
  143. // set weights
  144. if (bias_term)
  145. {
  146. ncnn::Mat weights[2];
  147. weights[0] = weight_data;
  148. weights[1] = bias_data;
  149. convolution_dilation1->load_model(ModelBinFromMatArray(weights));
  150. }
  151. else
  152. {
  153. ncnn::Mat weights[1];
  154. weights[0] = weight_data;
  155. convolution_dilation1->load_model(ModelBinFromMatArray(weights));
  156. }
  157. convolution_dilation1->create_pipeline(opt);
  158. return 0;
  159. }
  160. const int maxk = kernel_w * kernel_h;
  161. const int num_input = weight_data_size / maxk / num_output;
  162. int elempack = 1;
  163. int out_elempack = 1;
  164. #if __ARM_NEON
  165. if (opt.use_packing_layout)
  166. {
  167. elempack = num_input % 4 == 0 ? 4 : 1;
  168. out_elempack = num_output % 4 == 0 ? 4 : 1;
  169. }
  170. #endif
  171. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 8 || num_output >= 8);
  172. if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  173. {
  174. // dynamic shape
  175. if (opt.use_winograd63_convolution && (num_input <= 128 && num_output <= 128))
  176. conv3x3s1_winograd63_transform_kernel(weight_data, weight_winograd63_data, num_input, num_output, opt);
  177. else if (opt.use_winograd43_convolution && (num_input >= 8 && num_output >= 8))
  178. conv3x3s1_winograd43_transform_kernel(weight_data, weight_winograd43_data, num_input, num_output, opt);
  179. else
  180. conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);
  181. if (opt.lightmode)
  182. {
  183. weight_data.release();
  184. }
  185. return 0;
  186. }
  187. int l2_cache_size_fp32 = get_cpu_level2_cache_size() / sizeof(float);
  188. bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp32 || (num_input > 16 || num_output > 16);
  189. if (elempack == 4 && out_elempack == 4)
  190. {
  191. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 4 || num_output < 32))
  192. {
  193. prefer_sgemm = false;
  194. }
  195. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  196. {
  197. prefer_sgemm = false;
  198. }
  199. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 8 || num_output < 44))
  200. {
  201. prefer_sgemm = false;
  202. }
  203. }
  204. if (elempack == 1 && out_elempack == 4)
  205. {
  206. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  207. {
  208. prefer_sgemm = false;
  209. }
  210. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  211. {
  212. prefer_sgemm = false;
  213. }
  214. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  215. {
  216. prefer_sgemm = false;
  217. }
  218. }
  219. if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
  220. {
  221. convolution_im2col_gemm_transform_kernel(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
  222. if (opt.lightmode)
  223. {
  224. weight_data.release();
  225. }
  226. return 0;
  227. }
  228. if ((elempack == 4 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  229. || (elempack == 4 && out_elempack == 4 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  230. || (elempack == 4 && out_elempack == 4 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  231. || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  232. || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  233. || (elempack == 1 && out_elempack == 4 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2))
  234. {
  235. convolution_transform_kernel_packed_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
  236. }
  237. else if (elempack == 1 && out_elempack == 1 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  238. {
  239. conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
  240. }
  241. else if ((elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  242. || (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  243. || (elempack == 1 && out_elempack == 1 && kernel_w == 4 && kernel_h == 4 && dilation_w == 1 && dilation_h == 1 && stride_w == 4 && stride_h == 4)
  244. || (elempack == 1 && out_elempack == 1 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  245. || (elempack == 1 && out_elempack == 1 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  246. || (elempack == 1 && out_elempack == 1 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  247. || (elempack == 1 && out_elempack == 1 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2))
  248. {
  249. weight_data_tm = weight_data;
  250. }
  251. else
  252. {
  253. convolution_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
  254. }
  255. if (opt.lightmode)
  256. {
  257. weight_data.release();
  258. }
  259. return 0;
  260. }
  261. int Convolution_arm::destroy_pipeline(const Option& opt)
  262. {
  263. if (activation)
  264. {
  265. activation->destroy_pipeline(opt);
  266. delete activation;
  267. activation = 0;
  268. }
  269. if (convolution_dilation1)
  270. {
  271. convolution_dilation1->destroy_pipeline(opt);
  272. delete convolution_dilation1;
  273. convolution_dilation1 = 0;
  274. }
  275. return 0;
  276. }
  277. int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  278. {
  279. #if NCNN_INT8
  280. if (opt.use_int8_inference && int8_scale_term)
  281. {
  282. return forward_int8_arm(bottom_blob, top_blob, opt);
  283. }
  284. #endif
  285. // flattened blob, implement as InnerProduct
  286. if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
  287. {
  288. Mat bottom_blob_3d;
  289. if (bottom_blob.elemsize % 16 == 0)
  290. {
  291. bottom_blob_3d = bottom_blob;
  292. bottom_blob_3d.dims = 3;
  293. bottom_blob_3d.w = 1;
  294. bottom_blob_3d.h = 1;
  295. bottom_blob_3d.c = bottom_blob.w;
  296. bottom_blob_3d.cstep = 1;
  297. }
  298. else
  299. {
  300. bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator);
  301. }
  302. Mat top_blob_3d;
  303. int ret = forward(bottom_blob_3d, top_blob_3d, opt);
  304. if (ret != 0)
  305. return ret;
  306. if (top_blob_3d.elemsize % 16 == 0)
  307. {
  308. top_blob = top_blob_3d;
  309. top_blob.dims = 1;
  310. top_blob.w = top_blob_3d.c;
  311. top_blob.h = 1;
  312. top_blob.c = 1;
  313. bottom_blob_3d.cstep = top_blob_3d.c;
  314. }
  315. else
  316. {
  317. top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator);
  318. }
  319. return 0;
  320. }
  321. int elembits = bottom_blob.elembits();
  322. #if NCNN_ARM82
  323. if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
  324. {
  325. if (opt.use_fp16_arithmetic)
  326. return forward_fp16sa(bottom_blob, top_blob, opt);
  327. else
  328. return forward_fp16s(bottom_blob, top_blob, opt);
  329. }
  330. #endif
  331. #if NCNN_BF16
  332. if (opt.use_bf16_storage && elembits == 16)
  333. return forward_bf16s(bottom_blob, top_blob, opt);
  334. #endif
  335. int w = bottom_blob.w;
  336. int h = bottom_blob.h;
  337. int channels = bottom_blob.c;
  338. size_t elemsize = bottom_blob.elemsize;
  339. int elempack = bottom_blob.elempack;
  340. // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  341. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  342. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  343. Mat bottom_blob_bordered;
  344. make_padding(bottom_blob, bottom_blob_bordered, opt);
  345. if (bottom_blob_bordered.empty())
  346. return -100;
  347. w = bottom_blob_bordered.w;
  348. h = bottom_blob_bordered.h;
  349. int outw = (w - kernel_extent_w) / stride_w + 1;
  350. int outh = (h - kernel_extent_h) / stride_h + 1;
  351. int out_elempack = 1;
  352. #if __ARM_NEON
  353. if (opt.use_packing_layout)
  354. {
  355. out_elempack = num_output % 4 == 0 ? 4 : 1;
  356. }
  357. #endif
  358. size_t out_elemsize = elemsize / elempack * out_elempack;
  359. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  360. if (top_blob.empty())
  361. return -100;
  362. if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  363. {
  364. if (outw >= dilation_w && outh >= dilation_h)
  365. {
  366. return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
  367. }
  368. }
  369. const int num_input = channels * elempack;
  370. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 8 || num_output >= 8);
  371. if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  372. {
  373. bool prefer_winograd63 = false;
  374. bool prefer_winograd23 = false;
  375. bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;
  376. if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
  377. {
  378. // f23 fallback to f43
  379. prefer_winograd23 = false;
  380. prefer_winograd43 = true;
  381. }
  382. if (prefer_winograd63 && (!opt.use_winograd63_convolution || weight_winograd63_data.empty()))
  383. {
  384. // f63 fallback to f43
  385. prefer_winograd63 = false;
  386. prefer_winograd43 = true;
  387. }
  388. if (prefer_winograd43 && (!opt.use_winograd43_convolution || weight_winograd43_data.empty()))
  389. {
  390. // f43 fallback to f63 or f23
  391. prefer_winograd43 = false;
  392. if (opt.use_winograd63_convolution && !weight_winograd63_data.empty())
  393. {
  394. prefer_winograd63 = true;
  395. }
  396. else
  397. {
  398. prefer_winograd23 = true;
  399. }
  400. }
  401. // NCNN_LOGE("prefer_winograd %d %d %d", prefer_winograd23, prefer_winograd43, prefer_winograd63);
  402. int _nT = nT ? nT : opt.num_threads;
  403. if (nT != 0 && opt.num_threads != nT)
  404. {
  405. // force num_threads the same as in create_pipeline
  406. // so we could use pre-packed A/B from the same tile config
  407. NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
  408. }
  409. if (prefer_winograd23)
  410. {
  411. conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
  412. }
  413. else if (prefer_winograd43)
  414. {
  415. conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
  416. }
  417. else if (prefer_winograd63)
  418. {
  419. conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
  420. }
  421. else
  422. {
  423. // should never reach here
  424. }
  425. if (activation)
  426. {
  427. activation->forward_inplace(top_blob, opt);
  428. }
  429. return 0;
  430. }
  431. int l2_cache_size_fp32 = get_cpu_level2_cache_size() / sizeof(float);
  432. bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp32 || (num_input > 16 || num_output > 16);
  433. if (elempack == 4 && out_elempack == 4)
  434. {
  435. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 4 || num_output < 32))
  436. {
  437. prefer_sgemm = false;
  438. }
  439. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  440. {
  441. prefer_sgemm = false;
  442. }
  443. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 8 || num_output < 44))
  444. {
  445. prefer_sgemm = false;
  446. }
  447. }
  448. if (elempack == 1 && out_elempack == 4)
  449. {
  450. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  451. {
  452. prefer_sgemm = false;
  453. }
  454. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  455. {
  456. prefer_sgemm = false;
  457. }
  458. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  459. {
  460. prefer_sgemm = false;
  461. }
  462. }
  463. if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
  464. {
  465. int _nT = nT ? nT : opt.num_threads;
  466. if (nT != 0 && opt.num_threads != nT)
  467. {
  468. // force num_threads the same as in create_pipeline
  469. // so we could use pre-packed A/B from the same tile config
  470. NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
  471. }
  472. convolution_im2col_gemm(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
  473. if (activation)
  474. {
  475. activation->forward_inplace(top_blob, opt);
  476. }
  477. return 0;
  478. }
  479. #if __ARM_NEON
  480. if (elempack == 4 && out_elempack == 4)
  481. {
  482. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  483. {
  484. conv3x3s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  485. if (activation)
  486. {
  487. activation->forward_inplace(top_blob, opt);
  488. }
  489. }
  490. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  491. {
  492. conv5x5s1_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  493. if (activation)
  494. {
  495. activation->forward_inplace(top_blob, opt);
  496. }
  497. }
  498. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  499. {
  500. conv5x5s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  501. if (activation)
  502. {
  503. activation->forward_inplace(top_blob, opt);
  504. }
  505. }
  506. else
  507. {
  508. convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  509. }
  510. }
  511. if (elempack == 1 && out_elempack == 4)
  512. {
  513. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  514. {
  515. conv3x3s1_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  516. if (activation)
  517. {
  518. activation->forward_inplace(top_blob, opt);
  519. }
  520. }
  521. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  522. {
  523. conv3x3s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  524. if (activation)
  525. {
  526. activation->forward_inplace(top_blob, opt);
  527. }
  528. }
  529. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  530. {
  531. conv7x7s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  532. if (activation)
  533. {
  534. activation->forward_inplace(top_blob, opt);
  535. }
  536. }
  537. else
  538. {
  539. convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  540. }
  541. }
  542. if (elempack == 4 && out_elempack == 1)
  543. {
  544. {
  545. convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  546. }
  547. }
  548. #endif // __ARM_NEON
  549. if (elempack == 1 && out_elempack == 1)
  550. {
  551. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  552. {
  553. conv1x1s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  554. if (activation)
  555. {
  556. activation->forward_inplace(top_blob, opt);
  557. }
  558. }
  559. else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  560. {
  561. conv1x1s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  562. if (activation)
  563. {
  564. activation->forward_inplace(top_blob, opt);
  565. }
  566. }
  567. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  568. {
  569. conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
  570. if (activation)
  571. {
  572. activation->forward_inplace(top_blob, opt);
  573. }
  574. }
  575. else if (kernel_w == 4 && kernel_h == 4 && dilation_w == 1 && dilation_h == 1 && stride_w == 4 && stride_h == 4)
  576. {
  577. conv4x4s4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  578. if (activation)
  579. {
  580. activation->forward_inplace(top_blob, opt);
  581. }
  582. }
  583. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  584. {
  585. conv5x5s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  586. if (activation)
  587. {
  588. activation->forward_inplace(top_blob, opt);
  589. }
  590. }
  591. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  592. {
  593. conv5x5s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  594. if (activation)
  595. {
  596. activation->forward_inplace(top_blob, opt);
  597. }
  598. }
  599. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  600. {
  601. conv7x7s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  602. if (activation)
  603. {
  604. activation->forward_inplace(top_blob, opt);
  605. }
  606. }
  607. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  608. {
  609. conv7x7s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  610. if (activation)
  611. {
  612. activation->forward_inplace(top_blob, opt);
  613. }
  614. }
  615. else
  616. {
  617. convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  618. }
  619. }
  620. return 0;
  621. }
  622. int Convolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
  623. {
  624. const Mat& bottom_blob = bottom_blobs[0];
  625. const Mat& _weight_data = bottom_blobs[1];
  626. Mat& top_blob = top_blobs[0];
  627. const int _kernel_w = _weight_data.w;
  628. const int _kernel_h = _weight_data.h;
  629. const int _num_output = _weight_data.c * _weight_data.elempack;
  630. Mat weight_data_flattened;
  631. flatten(_weight_data, weight_data_flattened, opt);
  632. if (weight_data_flattened.empty())
  633. return -100;
  634. #if NCNN_ARM82
  635. if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
  636. {
  637. Mat weight_data_flattened_fp32;
  638. cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
  639. weight_data_flattened = weight_data_flattened_fp32;
  640. }
  641. #endif // NCNN_ARM82
  642. #if NCNN_BF16
  643. if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
  644. {
  645. Mat weight_data_flattened_fp32;
  646. cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
  647. weight_data_flattened = weight_data_flattened_fp32;
  648. }
  649. #endif // NCNN_BF16
  650. // weight_data_flattened as pack1
  651. weight_data_flattened.w *= weight_data_flattened.elempack;
  652. weight_data_flattened.elemsize /= weight_data_flattened.elempack;
  653. weight_data_flattened.elempack = 1;
  654. Mat bias_data_flattened;
  655. if (bias_term)
  656. {
  657. const Mat& _bias_data = bottom_blobs[2];
  658. flatten(_bias_data, bias_data_flattened, opt);
  659. if (bias_data_flattened.empty())
  660. return -100;
  661. #if NCNN_ARM82
  662. if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
  663. {
  664. Mat bias_data_flattened_fp32;
  665. cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
  666. bias_data_flattened = bias_data_flattened_fp32;
  667. }
  668. #endif // NCNN_ARM82
  669. #if NCNN_BF16
  670. if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
  671. {
  672. Mat bias_data_flattened_fp32;
  673. cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
  674. bias_data_flattened = bias_data_flattened_fp32;
  675. }
  676. #endif // NCNN_BF16
  677. // bias_data_flattened as pack1
  678. bias_data_flattened.w *= bias_data_flattened.elempack;
  679. bias_data_flattened.elemsize /= bias_data_flattened.elempack;
  680. bias_data_flattened.elempack = 1;
  681. }
  682. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
  683. ncnn::ParamDict pd;
  684. pd.set(0, _num_output);
  685. pd.set(1, _kernel_w);
  686. pd.set(11, _kernel_h);
  687. pd.set(2, dilation_w);
  688. pd.set(12, dilation_h);
  689. pd.set(3, stride_w);
  690. pd.set(13, stride_h);
  691. pd.set(4, pad_left);
  692. pd.set(15, pad_right);
  693. pd.set(14, pad_top);
  694. pd.set(16, pad_bottom);
  695. pd.set(18, pad_value);
  696. pd.set(5, bias_term);
  697. pd.set(6, weight_data_flattened.w);
  698. pd.set(8, int8_scale_term);
  699. pd.set(9, activation_type);
  700. pd.set(10, activation_params);
  701. op->load_param(pd);
  702. ncnn::Mat weights[2];
  703. weights[0] = weight_data_flattened;
  704. weights[1] = bias_data_flattened;
  705. op->load_model(ncnn::ModelBinFromMatArray(weights));
  706. op->create_pipeline(opt);
  707. op->forward(bottom_blob, top_blob, opt);
  708. op->destroy_pipeline(opt);
  709. delete op;
  710. return 0;
  711. }
  712. #if NCNN_BF16
  713. static void convolution_transform_kernel_packed_bf16s_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
  714. {
  715. const int maxk = kernel_w * kernel_h;
  716. // src = kw-kh-inch-outch
  717. // dst = pb-pa-kw-kh-inch/pa-outch/pb
  718. {
  719. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  720. weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);
  721. for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
  722. {
  723. unsigned short* g00 = weight_data_tm.channel(q / out_elempack);
  724. for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
  725. {
  726. for (int k = 0; k < maxk; k++)
  727. {
  728. for (int i = 0; i < elempack; i++)
  729. {
  730. for (int j = 0; j < out_elempack; j++)
  731. {
  732. const float* k00 = weight_data_r2.channel(q + j).row(p + i);
  733. g00[0] = float32_to_bfloat16(k00[k]);
  734. g00++;
  735. }
  736. }
  737. }
  738. }
  739. }
  740. }
  741. }
  742. int Convolution_arm::create_pipeline_bf16s(const Option& opt)
  743. {
  744. const int maxk = kernel_w * kernel_h;
  745. const int num_input = weight_data_size / maxk / num_output;
  746. int elempack = 1;
  747. int out_elempack = 1;
  748. #if __ARM_NEON
  749. if (opt.use_packing_layout)
  750. {
  751. elempack = num_input % 4 == 0 ? 4 : 1;
  752. out_elempack = num_output % 4 == 0 ? 4 : 1;
  753. }
  754. #endif
  755. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 8 || num_output >= 8);
  756. if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  757. {
  758. // dynamic shape
  759. if (opt.use_winograd63_convolution && (num_input <= 128 && num_output <= 128))
  760. conv3x3s1_winograd63_transform_kernel(weight_data, weight_winograd63_data, num_input, num_output, opt);
  761. else if (opt.use_winograd43_convolution && (num_input >= 8 && num_output >= 8))
  762. conv3x3s1_winograd43_transform_kernel(weight_data, weight_winograd43_data, num_input, num_output, opt);
  763. else
  764. conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);
  765. if (opt.lightmode)
  766. {
  767. weight_data.release();
  768. }
  769. return 0;
  770. }
  771. int l2_cache_size_bf16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
  772. bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_bf16 || (num_input > 16 || num_output > 16);
  773. if (elempack == 4 && out_elempack == 4)
  774. {
  775. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 4 || num_output < 32))
  776. {
  777. prefer_sgemm = false;
  778. }
  779. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  780. {
  781. prefer_sgemm = false;
  782. }
  783. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 8 || num_output < 44))
  784. {
  785. prefer_sgemm = false;
  786. }
  787. }
  788. if (elempack == 1 && out_elempack == 4)
  789. {
  790. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  791. {
  792. prefer_sgemm = false;
  793. }
  794. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  795. {
  796. prefer_sgemm = false;
  797. }
  798. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  799. {
  800. prefer_sgemm = false;
  801. }
  802. }
  803. if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
  804. {
  805. convolution_im2col_gemm_transform_kernel_bf16s(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
  806. if (opt.lightmode)
  807. {
  808. weight_data.release();
  809. }
  810. return 0;
  811. }
  812. if ((elempack == 4 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  813. || (elempack == 4 && out_elempack == 4 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  814. || (elempack == 4 && out_elempack == 4 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  815. || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  816. || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  817. || (elempack == 1 && out_elempack == 4 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2))
  818. {
  819. convolution_transform_kernel_packed_bf16s_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
  820. }
  821. else
  822. {
  823. convolution_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
  824. }
  825. if (opt.lightmode)
  826. {
  827. weight_data.release();
  828. }
  829. return 0;
  830. }
  831. int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  832. {
  833. int w = bottom_blob.w;
  834. int h = bottom_blob.h;
  835. int channels = bottom_blob.c;
  836. size_t elemsize = bottom_blob.elemsize;
  837. int elempack = bottom_blob.elempack;
  838. // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  839. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  840. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  841. Mat bottom_blob_bordered;
  842. make_padding(bottom_blob, bottom_blob_bordered, opt);
  843. if (bottom_blob_bordered.empty())
  844. return -100;
  845. w = bottom_blob_bordered.w;
  846. h = bottom_blob_bordered.h;
  847. int outw = (w - kernel_extent_w) / stride_w + 1;
  848. int outh = (h - kernel_extent_h) / stride_h + 1;
  849. int out_elempack = 1;
  850. #if __ARM_NEON
  851. if (opt.use_packing_layout)
  852. {
  853. out_elempack = num_output % 4 == 0 ? 4 : 1;
  854. }
  855. #endif
  856. size_t out_elemsize = elemsize / elempack * out_elempack;
  857. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  858. if (top_blob.empty())
  859. return -100;
  860. // TODO dilated conv for bf16s
  861. // if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  862. // {
  863. // return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
  864. // }
  865. const int num_input = channels * elempack;
  866. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 8 || num_output >= 8);
  867. if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  868. {
  869. bool prefer_winograd63 = false;
  870. bool prefer_winograd23 = false;
  871. bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;
  872. if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
  873. {
  874. // f23 fallback to f43
  875. prefer_winograd23 = false;
  876. prefer_winograd43 = true;
  877. }
  878. if (prefer_winograd63 && (!opt.use_winograd63_convolution || weight_winograd63_data.empty()))
  879. {
  880. // f63 fallback to f43
  881. prefer_winograd63 = false;
  882. prefer_winograd43 = true;
  883. }
  884. if (prefer_winograd43 && (!opt.use_winograd43_convolution || weight_winograd43_data.empty()))
  885. {
  886. // f43 fallback to f63 or f23
  887. prefer_winograd43 = false;
  888. if (opt.use_winograd63_convolution && !weight_winograd63_data.empty())
  889. {
  890. prefer_winograd63 = true;
  891. }
  892. else
  893. {
  894. prefer_winograd23 = true;
  895. }
  896. }
  897. // NCNN_LOGE("prefer_winograd %d %d %d", prefer_winograd23, prefer_winograd43, prefer_winograd63);
  898. int _nT = nT ? nT : opt.num_threads;
  899. if (nT != 0 && opt.num_threads != nT)
  900. {
  901. // force num_threads the same as in create_pipeline
  902. // so we could use pre-packed A/B from the same tile config
  903. NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
  904. }
  905. if (prefer_winograd23)
  906. {
  907. conv3x3s1_winograd23_bf16s(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
  908. }
  909. else if (prefer_winograd43)
  910. {
  911. conv3x3s1_winograd43_bf16s(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
  912. }
  913. else if (prefer_winograd63)
  914. {
  915. conv3x3s1_winograd63_bf16s(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
  916. }
  917. else
  918. {
  919. // should never reach here
  920. }
  921. if (activation)
  922. {
  923. activation->forward_inplace(top_blob, opt);
  924. }
  925. return 0;
  926. }
  927. int l2_cache_size_bf16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
  928. bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_bf16 || (num_input > 16 || num_output > 16);
  929. if (elempack == 4 && out_elempack == 4)
  930. {
  931. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 4 || num_output < 32))
  932. {
  933. prefer_sgemm = false;
  934. }
  935. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  936. {
  937. prefer_sgemm = false;
  938. }
  939. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 8 || num_output < 44))
  940. {
  941. prefer_sgemm = false;
  942. }
  943. }
  944. if (elempack == 1 && out_elempack == 4)
  945. {
  946. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  947. {
  948. prefer_sgemm = false;
  949. }
  950. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  951. {
  952. prefer_sgemm = false;
  953. }
  954. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  955. {
  956. prefer_sgemm = false;
  957. }
  958. }
  959. if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
  960. {
  961. int _nT = nT ? nT : opt.num_threads;
  962. if (nT != 0 && opt.num_threads != nT)
  963. {
  964. // force num_threads the same as in create_pipeline
  965. // so we could use pre-packed A/B from the same tile config
  966. NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
  967. }
  968. convolution_im2col_gemm_bf16s(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
  969. if (activation)
  970. {
  971. activation->forward_inplace(top_blob, opt);
  972. }
  973. return 0;
  974. }
  975. #if __ARM_NEON
  976. if (elempack == 4 && out_elempack == 4)
  977. {
  978. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  979. {
  980. conv3x3s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  981. if (activation)
  982. {
  983. activation->forward_inplace(top_blob, opt);
  984. }
  985. }
  986. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  987. {
  988. conv5x5s1_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  989. if (activation)
  990. {
  991. activation->forward_inplace(top_blob, opt);
  992. }
  993. }
  994. else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  995. {
  996. conv5x5s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  997. if (activation)
  998. {
  999. activation->forward_inplace(top_blob, opt);
  1000. }
  1001. }
  1002. else
  1003. {
  1004. convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  1005. }
  1006. }
  1007. if (elempack == 1 && out_elempack == 4)
  1008. {
  1009. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1010. {
  1011. conv3x3s1_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  1012. if (activation)
  1013. {
  1014. activation->forward_inplace(top_blob, opt);
  1015. }
  1016. }
  1017. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1018. {
  1019. conv3x3s2_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  1020. if (activation)
  1021. {
  1022. activation->forward_inplace(top_blob, opt);
  1023. }
  1024. }
  1025. else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  1026. {
  1027. conv7x7s2_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  1028. if (activation)
  1029. {
  1030. activation->forward_inplace(top_blob, opt);
  1031. }
  1032. }
  1033. else
  1034. {
  1035. convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  1036. }
  1037. }
  1038. if (elempack == 4 && out_elempack == 1)
  1039. {
  1040. {
  1041. convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  1042. }
  1043. }
  1044. #endif // __ARM_NEON
  1045. if (elempack == 1 && out_elempack == 1)
  1046. {
  1047. {
  1048. convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  1049. }
  1050. }
  1051. return 0;
  1052. }
  1053. #endif // NCNN_BF16
  1054. #if NCNN_INT8
  1055. int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
  1056. {
  1057. const int maxk = kernel_w * kernel_h;
  1058. const int num_input = weight_data_size / maxk / num_output;
  1059. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1;
  1060. #if NCNN_ARM82DOT
  1061. if (ncnn::cpu_support_arm_asimddp())
  1062. {
  1063. prefer_winograd = false;
  1064. }
  1065. #endif
  1066. if (opt.use_winograd_convolution && prefer_winograd)
  1067. {
  1068. if (opt.use_winograd43_convolution)
  1069. conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt);
  1070. else
  1071. conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt);
  1072. }
  1073. else if (opt.use_sgemm_convolution)
  1074. {
  1075. convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
  1076. }
  1077. else
  1078. {
  1079. convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
  1080. }
  1081. scale_in_data.create(num_output);
  1082. for (int p = 0; p < num_output; p++)
  1083. {
  1084. // requantize and relu
  1085. float scale_in;
  1086. if (weight_data_int8_scales[p] == 0)
  1087. scale_in = 0;
  1088. else
  1089. scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
  1090. scale_in_data[p] = scale_in;
  1091. }
  1092. if (opt.lightmode)
  1093. {
  1094. weight_data.release();
  1095. }
  1096. return 0;
  1097. }
  1098. int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1099. {
  1100. int elembits = bottom_blob.elembits();
  1101. Mat bottom_blob_int8 = bottom_blob;
  1102. if (elembits != 8)
  1103. {
  1104. Option opt_q = opt;
  1105. opt_q.blob_allocator = opt.workspace_allocator;
  1106. quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
  1107. }
  1108. // NCNN_LOGE("Convolution_arm input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);
  1109. Mat bottom_blob_bordered;
  1110. make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
  1111. if (bottom_blob_bordered.empty())
  1112. return -100;
  1113. int w = bottom_blob_bordered.w;
  1114. int h = bottom_blob_bordered.h;
  1115. int elempack = bottom_blob_bordered.elempack;
  1116. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  1117. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  1118. int outw = (w - kernel_extent_w) / stride_w + 1;
  1119. int outh = (h - kernel_extent_h) / stride_h + 1;
  1120. bool use_int8_requantize = int8_scale_term > 100;
  1121. int out_elempack = 1;
  1122. #if __ARM_NEON
  1123. if (opt.use_packing_layout)
  1124. {
  1125. if (use_int8_requantize)
  1126. out_elempack = num_output % 8 == 0 ? 8 : 1;
  1127. else
  1128. out_elempack = num_output % 4 == 0 ? 4 : 1;
  1129. }
  1130. #endif // __ARM_NEON
  1131. size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
  1132. #if NCNN_ARM82
  1133. if (support_fp16_storage && opt.use_fp16_storage)
  1134. {
  1135. out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
  1136. }
  1137. #endif
  1138. if (opt.use_bf16_storage)
  1139. out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
  1140. // NCNN_LOGE("forward_int8_arm %d %d %d %d %d", w, h, bottom_blob_bordered.c, elempack, out_elempack);
  1141. int channels = bottom_blob_bordered.c;
  1142. const int num_input = channels * elempack;
  1143. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1;
  1144. #if NCNN_ARM82DOT
  1145. if (ncnn::cpu_support_arm_asimddp())
  1146. {
  1147. prefer_winograd = false;
  1148. }
  1149. #endif
  1150. int out_elempack_int32 = 1;
  1151. #if __ARM_NEON
  1152. if (opt.use_packing_layout)
  1153. {
  1154. out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
  1155. }
  1156. #endif // __ARM_NEON
  1157. Mat top_blob_int32;
  1158. top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
  1159. if (top_blob_int32.empty())
  1160. return -100;
  1161. int _nT = nT ? nT : opt.num_threads;
  1162. if (nT != 0 && opt.num_threads != nT)
  1163. {
  1164. // force num_threads the same as in create_pipeline
  1165. // so we could use pre-packed A/B from the same tile config
  1166. NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
  1167. }
  1168. if (opt.use_winograd_convolution && prefer_winograd)
  1169. {
  1170. if (opt.use_winograd43_convolution && !weight_winograd43_data.empty())
  1171. conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt);
  1172. else
  1173. conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt);
  1174. }
  1175. else if (opt.use_sgemm_convolution)
  1176. {
  1177. convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
  1178. }
  1179. else
  1180. {
  1181. convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
  1182. }
  1183. bottom_blob_bordered.release();
  1184. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  1185. if (top_blob.empty())
  1186. return -100;
  1187. if (use_int8_requantize)
  1188. {
  1189. requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
  1190. }
  1191. else
  1192. {
  1193. dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);
  1194. if (activation)
  1195. {
  1196. activation->forward_inplace(top_blob, opt);
  1197. }
  1198. }
  1199. return 0;
  1200. }
  1201. #endif // NCNN_INT8
  1202. int Convolution_arm::forwardDilation_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1203. {
  1204. int w = bottom_blob.w;
  1205. int h = bottom_blob.h;
  1206. size_t elemsize = bottom_blob.elemsize;
  1207. const int kernel_size = kernel_w;
  1208. const int stride = stride_w;
  1209. const int dilation = dilation_w;
  1210. const int kernel_extent = dilation * (kernel_size - 1) + 1;
  1211. int outw = (w - kernel_extent) / stride + 1;
  1212. int outh = (h - kernel_extent) / stride + 1;
  1213. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  1214. if (top_blob.empty())
  1215. return -100;
  1216. // Make (dilation * dilation) batches
  1217. Mat inner_bottom_blob;
  1218. Mat inner_top_blob;
  1219. for (int x = 0; x < dilation; x++)
  1220. {
  1221. for (int y = 0; y < dilation; y++)
  1222. {
  1223. int inner_w = (w - y + dilation - 1) / dilation;
  1224. int inner_h = (h - x + dilation - 1) / dilation;
  1225. int inner_outw = (inner_w - kernel_size) / stride + 1;
  1226. int inner_outh = (inner_h - kernel_size) / stride + 1;
  1227. inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
  1228. if (inner_bottom_blob.empty())
  1229. return -100;
  1230. inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
  1231. if (inner_top_blob.empty())
  1232. return -100;
  1233. #pragma omp parallel for num_threads(opt.num_threads)
  1234. for (int c = 0; c < bottom_blob.c; c++)
  1235. {
  1236. float* outptr = inner_bottom_blob.channel(c);
  1237. for (int i = 0; i < inner_h; i++)
  1238. {
  1239. const float* ptr = (const float*)bottom_blob.channel(c) + dilation * i * w + x * w + y;
  1240. for (int j = 0; j < inner_w; j++)
  1241. {
  1242. outptr[j] = ptr[j * dilation];
  1243. }
  1244. outptr += inner_w;
  1245. }
  1246. }
  1247. Option opt_g = opt;
  1248. opt_g.blob_allocator = inner_top_blob.allocator;
  1249. convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
  1250. #pragma omp parallel for num_threads(opt.num_threads)
  1251. for (int c = 0; c < num_output; c++)
  1252. {
  1253. float* outptr = (float*)top_blob.channel(c) + x * outw + y;
  1254. for (int i = 0; i < inner_outh; i++)
  1255. {
  1256. const float* ptr = (const float*)inner_top_blob.channel(c) + i * inner_outw;
  1257. for (int j = 0; j < inner_outw; j++)
  1258. {
  1259. outptr[j * dilation] = ptr[j];
  1260. }
  1261. outptr += dilation * outw;
  1262. }
  1263. }
  1264. }
  1265. }
  1266. if (activation)
  1267. {
  1268. activation->forward_inplace(top_blob, opt);
  1269. }
  1270. return 0;
  1271. }
  1272. } // namespace ncnn