You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

deconvolutiondepthwise_riscv_zfh.cpp 20 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "deconvolutiondepthwise_riscv.h"
  15. #if __riscv_vector
  16. #include <riscv_vector.h>
  17. #endif // __riscv_vector
  18. #include "riscv_activation.h"
  19. #include "riscv_usability.h"
  20. namespace ncnn {
  21. #if NCNN_ZFH
  22. int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
  23. {
  24. #if __riscv_zvfh
  25. const int packn = csrr_vlenb() / 2;
  26. #endif // __riscv_zvfh
  27. const int maxk = kernel_w * kernel_h;
  28. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  29. // depth-wise
  30. if (channels == group && group == num_output)
  31. {
  32. int elempack = 1;
  33. #if __riscv_zvfh
  34. if (opt.use_packing_layout)
  35. {
  36. elempack = channels % packn == 0 ? packn : 1;
  37. }
  38. #endif // __riscv_zvfh
  39. Mat weight_data_transposed(weight_data.w);
  40. {
  41. float* pt = weight_data_transposed;
  42. const float* p = weight_data;
  43. for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
  44. {
  45. for (int k = 0; k < maxk; k++)
  46. {
  47. pt[maxk - 1 - k] = p[k];
  48. }
  49. p += maxk;
  50. pt += maxk;
  51. }
  52. }
  53. #if __riscv_zvfh
  54. // packn
  55. if (elempack == packn)
  56. {
  57. Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
  58. Mat weight_data_r2_packed;
  59. convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt);
  60. ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
  61. }
  62. #endif // __riscv_zvfh
  63. if (elempack == 1)
  64. {
  65. ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_tm, opt);
  66. }
  67. ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
  68. if (opt.lightmode)
  69. weight_data.release();
  70. return 0;
  71. }
  72. // group convolution
  73. create_group_ops(opt);
  74. if (opt.lightmode)
  75. weight_data.release();
  76. return 0;
  77. }
  78. int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  79. {
  80. #if __riscv_zvfh
  81. const int packn = csrr_vlenb() / 2;
  82. const size_t vl = __riscv_vsetvl_e16m1(packn);
  83. #endif // __riscv_zvfh
  84. int w = bottom_blob.w;
  85. int h = bottom_blob.h;
  86. int channels = bottom_blob.c;
  87. size_t elemsize = bottom_blob.elemsize;
  88. int elempack = bottom_blob.elempack;
  89. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  90. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  91. int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
  92. int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
  93. int out_elempack = 1;
  94. #if __riscv_zvfh
  95. if (opt.use_packing_layout)
  96. {
  97. out_elempack = num_output % packn == 0 ? packn : 1;
  98. }
  99. #endif // __riscv_zvfh
  100. size_t out_elemsize = elemsize / elempack * out_elempack;
  101. Mat top_blob_bordered;
  102. if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
  103. {
  104. top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
  105. }
  106. else
  107. {
  108. top_blob_bordered = top_blob;
  109. top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  110. }
  111. if (top_blob_bordered.empty())
  112. return -100;
  113. const int maxk = kernel_w * kernel_h;
  114. // depth-wise
  115. if (channels * elempack == group && group == num_output)
  116. {
  117. #if __riscv_zvfh
  118. if (elempack == packn)
  119. {
  120. {
  121. #pragma omp parallel for num_threads(opt.num_threads)
  122. for (int g = 0; g < channels; g++)
  123. {
  124. __fp16* outptr = top_blob_bordered.channel(g);
  125. const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
  126. const Mat m = bottom_blob.channel(g);
  127. for (int i = 0; i < outh; i++)
  128. {
  129. for (int j = 0; j < outw; j++)
  130. {
  131. vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
  132. if (bias_term)
  133. {
  134. _sum = __riscv_vle32_v_f32m2((const float*)bias_data + g * packn, vl);
  135. }
  136. for (int y = 0; y < kernel_h; y++)
  137. {
  138. int sys = (i + y * dilation_h - (kernel_extent_h - 1));
  139. if (sys < 0 || sys % stride_h != 0)
  140. continue;
  141. int sy = sys / stride_h;
  142. if (sy >= h)
  143. continue;
  144. for (int x = 0; x < kernel_w; x++)
  145. {
  146. int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
  147. if (sxs < 0 || sxs % stride_w != 0)
  148. continue;
  149. int sx = sxs / stride_w;
  150. if (sx >= w)
  151. continue;
  152. const __fp16* sptr = m.row<const __fp16>(sy) + sx * packn;
  153. int k = y * kernel_w + x;
  154. vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl);
  155. vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl);
  156. _sum = __riscv_vfwmacc_vv_f32m2(_sum, _val, _w, vl);
  157. }
  158. }
  159. _sum = activation_ps(_sum, activation_type, activation_params, vl);
  160. __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl);
  161. }
  162. outptr += outw * packn;
  163. }
  164. }
  165. }
  166. }
  167. #endif // __riscv_zvfh
  168. if (elempack == 1)
  169. {
  170. {
  171. #pragma omp parallel for num_threads(opt.num_threads)
  172. for (int g = 0; g < channels; g++)
  173. {
  174. __fp16* outptr = top_blob_bordered.channel(g);
  175. const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
  176. const Mat m = bottom_blob.channel(g);
  177. for (int i = 0; i < outh; i++)
  178. {
  179. for (int j = 0; j < outw; j++)
  180. {
  181. float sum = 0.f;
  182. if (bias_term)
  183. {
  184. sum = bias_data[g];
  185. }
  186. for (int y = 0; y < kernel_h; y++)
  187. {
  188. int sys = (i + y * dilation_h - (kernel_extent_h - 1));
  189. if (sys < 0 || sys % stride_h != 0)
  190. continue;
  191. int sy = sys / stride_h;
  192. if (sy >= h)
  193. continue;
  194. const __fp16* sptr = m.row<const __fp16>(sy);
  195. for (int x = 0; x < kernel_w; x++)
  196. {
  197. int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
  198. if (sxs < 0 || sxs % stride_w != 0)
  199. continue;
  200. int sx = sxs / stride_w;
  201. if (sx >= w)
  202. continue;
  203. float val = (float)sptr[sx];
  204. int k = y * kernel_w + x;
  205. float w = (float)kptr[k];
  206. sum += val * w;
  207. }
  208. }
  209. sum = activation_ss(sum, activation_type, activation_params);
  210. outptr[j] = (__fp16)sum;
  211. }
  212. outptr += outw;
  213. }
  214. }
  215. }
  216. }
  217. }
  218. else
  219. {
  220. // group deconvolution
  221. const int channels_g = channels * elempack / group;
  222. const int num_output_g = num_output / group;
  223. int g_elempack = 1;
  224. int out_g_elempack = 1;
  225. #if __riscv_zvfh
  226. if (opt.use_packing_layout)
  227. {
  228. g_elempack = channels_g % packn == 0 ? packn : 1;
  229. out_g_elempack = num_output_g % packn == 0 ? packn : 1;
  230. }
  231. #endif // __riscv_zvfh
  232. // unpacking
  233. Mat bottom_blob_unpacked = bottom_blob;
  234. if (elempack > g_elempack)
  235. {
  236. Option opt_p = opt;
  237. opt_p.blob_allocator = opt.workspace_allocator;
  238. convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
  239. }
  240. Mat top_blob_bordered_unpacked = top_blob_bordered;
  241. if (out_g_elempack < out_elempack)
  242. {
  243. top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
  244. if (top_blob_bordered_unpacked.empty())
  245. return -100;
  246. }
  247. for (int g = 0; g < group; g++)
  248. {
  249. const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
  250. Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
  251. const ncnn::Layer* op = group_ops[g];
  252. Option opt_g = opt;
  253. opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;
  254. // forward
  255. op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
  256. }
  257. // packing
  258. if (out_g_elempack < out_elempack)
  259. {
  260. convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
  261. }
  262. else
  263. {
  264. top_blob_bordered = top_blob_bordered_unpacked;
  265. }
  266. }
  267. cut_padding(top_blob_bordered, top_blob, opt);
  268. if (top_blob.empty())
  269. return -100;
  270. return 0;
  271. }
  272. int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  273. {
  274. #if __riscv_zvfh
  275. const int packn = csrr_vlenb() / 2;
  276. const size_t vl = __riscv_vsetvl_e16m1(packn);
  277. #endif // __riscv_zvfh
  278. int w = bottom_blob.w;
  279. int h = bottom_blob.h;
  280. int channels = bottom_blob.c;
  281. size_t elemsize = bottom_blob.elemsize;
  282. int elempack = bottom_blob.elempack;
  283. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  284. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  285. int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
  286. int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
  287. int out_elempack = 1;
  288. #if __riscv_zvfh
  289. if (opt.use_packing_layout)
  290. {
  291. out_elempack = num_output % packn == 0 ? packn : 1;
  292. }
  293. #endif // __riscv_zvfh
  294. size_t out_elemsize = elemsize / elempack * out_elempack;
  295. Mat top_blob_bordered;
  296. if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
  297. {
  298. top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
  299. }
  300. else
  301. {
  302. top_blob_bordered = top_blob;
  303. top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  304. }
  305. if (top_blob_bordered.empty())
  306. return -100;
  307. const int maxk = kernel_w * kernel_h;
  308. // depth-wise
  309. if (channels * elempack == group && group == num_output)
  310. {
  311. #if __riscv_zvfh
  312. if (elempack == packn)
  313. {
  314. {
  315. #pragma omp parallel for num_threads(opt.num_threads)
  316. for (int g = 0; g < channels; g++)
  317. {
  318. __fp16* outptr = top_blob_bordered.channel(g);
  319. const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
  320. const Mat m = bottom_blob.channel(g);
  321. for (int i = 0; i < outh; i++)
  322. {
  323. for (int j = 0; j < outw; j++)
  324. {
  325. vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl);
  326. if (bias_term)
  327. {
  328. _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + g * packn, vl);
  329. }
  330. for (int y = 0; y < kernel_h; y++)
  331. {
  332. int sys = (i + y * dilation_h - (kernel_extent_h - 1));
  333. if (sys < 0 || sys % stride_h != 0)
  334. continue;
  335. int sy = sys / stride_h;
  336. if (sy >= h)
  337. continue;
  338. for (int x = 0; x < kernel_w; x++)
  339. {
  340. int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
  341. if (sxs < 0 || sxs % stride_w != 0)
  342. continue;
  343. int sx = sxs / stride_w;
  344. if (sx >= w)
  345. continue;
  346. const __fp16* sptr = m.row<const __fp16>(sy) + sx * packn;
  347. int k = y * kernel_w + x;
  348. vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl);
  349. vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl);
  350. _sum = __riscv_vfmacc_vv_f16m1(_sum, _val, _w, vl);
  351. }
  352. }
  353. _sum = activation_ps(_sum, activation_type, activation_params, vl);
  354. __riscv_vse16_v_f16m1(outptr + j * packn, _sum, vl);
  355. }
  356. outptr += outw * packn;
  357. }
  358. }
  359. }
  360. }
  361. #endif // __riscv_zvfh
  362. if (elempack == 1)
  363. {
  364. {
  365. #pragma omp parallel for num_threads(opt.num_threads)
  366. for (int g = 0; g < channels; g++)
  367. {
  368. __fp16* outptr = top_blob_bordered.channel(g);
  369. const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
  370. const Mat m = bottom_blob.channel(g);
  371. for (int i = 0; i < outh; i++)
  372. {
  373. for (int j = 0; j < outw; j++)
  374. {
  375. float sum = 0.f;
  376. if (bias_term)
  377. {
  378. sum = bias_data[g];
  379. }
  380. for (int y = 0; y < kernel_h; y++)
  381. {
  382. int sys = (i + y * dilation_h - (kernel_extent_h - 1));
  383. if (sys < 0 || sys % stride_h != 0)
  384. continue;
  385. int sy = sys / stride_h;
  386. if (sy >= h)
  387. continue;
  388. const __fp16* sptr = m.row<const __fp16>(sy);
  389. for (int x = 0; x < kernel_w; x++)
  390. {
  391. int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
  392. if (sxs < 0 || sxs % stride_w != 0)
  393. continue;
  394. int sx = sxs / stride_w;
  395. if (sx >= w)
  396. continue;
  397. __fp16 val = sptr[sx];
  398. int k = y * kernel_w + x;
  399. __fp16 w = kptr[k];
  400. sum += val * w;
  401. }
  402. }
  403. sum = activation_ss(sum, activation_type, activation_params);
  404. outptr[j] = (__fp16)sum;
  405. }
  406. outptr += outw;
  407. }
  408. }
  409. }
  410. }
  411. }
  412. else
  413. {
  414. // group deconvolution
  415. const int channels_g = channels * elempack / group;
  416. const int num_output_g = num_output / group;
  417. int g_elempack = 1;
  418. int out_g_elempack = 1;
  419. #if __riscv_zvfh
  420. if (opt.use_packing_layout)
  421. {
  422. g_elempack = channels_g % packn == 0 ? packn : 1;
  423. out_g_elempack = num_output_g % packn == 0 ? packn : 1;
  424. }
  425. #endif // __riscv_zvfh
  426. // unpacking
  427. Mat bottom_blob_unpacked = bottom_blob;
  428. if (elempack > g_elempack)
  429. {
  430. Option opt_p = opt;
  431. opt_p.blob_allocator = opt.workspace_allocator;
  432. convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
  433. }
  434. Mat top_blob_bordered_unpacked = top_blob_bordered;
  435. if (out_g_elempack < out_elempack)
  436. {
  437. top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
  438. if (top_blob_bordered_unpacked.empty())
  439. return -100;
  440. }
  441. for (int g = 0; g < group; g++)
  442. {
  443. const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
  444. Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
  445. const ncnn::Layer* op = group_ops[g];
  446. Option opt_g = opt;
  447. opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;
  448. // forward
  449. op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
  450. }
  451. // packing
  452. if (out_g_elempack < out_elempack)
  453. {
  454. convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
  455. }
  456. else
  457. {
  458. top_blob_bordered = top_blob_bordered_unpacked;
  459. }
  460. }
  461. cut_padding(top_blob_bordered, top_blob, opt);
  462. if (top_blob.empty())
  463. return -100;
  464. return 0;
  465. }
  466. #endif // NCNN_ZFH
  467. } // namespace ncnn