You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

mat_pixel_resize.cpp 39 kB

7 years ago
7 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #include <limits.h>
  16. #include <math.h>
  17. #include <algorithm>
  18. #if __ARM_NEON
  19. #include <arm_neon.h>
  20. #endif // __ARM_NEON
  21. #include "platform.h"
  22. namespace ncnn {
  23. #if NCNN_PIXEL
  24. void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  25. {
  26. const int INTER_RESIZE_COEF_BITS=11;
  27. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  28. // const int ONE=INTER_RESIZE_COEF_SCALE;
  29. double scale_x = (double)srcw / w;
  30. double scale_y = (double)srch / h;
  31. int* buf = new int[w + h + w + h];
  32. int* xofs = buf;//new int[w];
  33. int* yofs = buf + w;//new int[h];
  34. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  35. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  36. float fx;
  37. float fy;
  38. int sx;
  39. int sy;
  40. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  41. for (int dx = 0; dx < w; dx++)
  42. {
  43. fx = (float)((dx + 0.5) * scale_x - 0.5);
  44. sx = floor(fx);
  45. fx -= sx;
  46. if (sx < 0)
  47. {
  48. sx = 0;
  49. fx = 0.f;
  50. }
  51. if (sx >= srcw - 1)
  52. {
  53. sx = srcw - 2;
  54. fx = 1.f;
  55. }
  56. xofs[dx] = sx;
  57. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  58. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  59. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  60. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  61. }
  62. for (int dy = 0; dy < h; dy++)
  63. {
  64. fy = (float)((dy + 0.5) * scale_y - 0.5);
  65. sy = floor(fy);
  66. fy -= sy;
  67. if (sy < 0)
  68. {
  69. sy = 0;
  70. fy = 0.f;
  71. }
  72. if (sy >= srch - 1)
  73. {
  74. sy = srch - 2;
  75. fy = 1.f;
  76. }
  77. yofs[dy] = sy;
  78. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  79. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  80. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  81. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  82. }
  83. #undef SATURATE_CAST_SHORT
  84. // loop body
  85. Mat rowsbuf0(w, (size_t)2u);
  86. Mat rowsbuf1(w, (size_t)2u);
  87. short* rows0 = (short*)rowsbuf0.data;
  88. short* rows1 = (short*)rowsbuf1.data;
  89. int prev_sy1 = -2;
  90. for (int dy = 0; dy < h; dy++ )
  91. {
  92. int sy = yofs[dy];
  93. if (sy == prev_sy1)
  94. {
  95. // reuse all rows
  96. }
  97. else if (sy == prev_sy1 + 1)
  98. {
  99. // hresize one row
  100. short* rows0_old = rows0;
  101. rows0 = rows1;
  102. rows1 = rows0_old;
  103. const unsigned char *S1 = src + srcw * (sy+1);
  104. const short* ialphap = ialpha;
  105. short* rows1p = rows1;
  106. for ( int dx = 0; dx < w; dx++ )
  107. {
  108. int sx = xofs[dx];
  109. short a0 = ialphap[0];
  110. short a1 = ialphap[1];
  111. const unsigned char* S1p = S1 + sx;
  112. rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
  113. ialphap += 2;
  114. }
  115. }
  116. else
  117. {
  118. // hresize two rows
  119. const unsigned char *S0 = src + srcw * (sy);
  120. const unsigned char *S1 = src + srcw * (sy+1);
  121. const short* ialphap = ialpha;
  122. short* rows0p = rows0;
  123. short* rows1p = rows1;
  124. for ( int dx = 0; dx < w; dx++ )
  125. {
  126. int sx = xofs[dx];
  127. short a0 = ialphap[0];
  128. short a1 = ialphap[1];
  129. const unsigned char* S0p = S0 + sx;
  130. const unsigned char* S1p = S1 + sx;
  131. rows0p[dx] = (S0p[0]*a0 + S0p[1]*a1) >> 4;
  132. rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
  133. ialphap += 2;
  134. }
  135. }
  136. prev_sy1 = sy;
  137. // vresize
  138. short b0 = ibeta[0];
  139. short b1 = ibeta[1];
  140. short* rows0p = rows0;
  141. short* rows1p = rows1;
  142. unsigned char* Dp = dst + w * (dy);
  143. #if __ARM_NEON
  144. int nn = w >> 3;
  145. #else
  146. int nn = 0;
  147. #endif
  148. int remain = w - (nn << 3);
  149. #if __ARM_NEON
  150. #if __aarch64__
  151. int16x4_t _b0 = vdup_n_s16(b0);
  152. int16x4_t _b1 = vdup_n_s16(b1);
  153. int32x4_t _v2 = vdupq_n_s32(2);
  154. for (; nn>0; nn--)
  155. {
  156. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  157. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  158. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  159. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  160. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  161. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  162. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  163. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  164. int32x4_t _acc = _v2;
  165. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  166. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  167. int32x4_t _acc_1 = _v2;
  168. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  169. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  170. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  171. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  172. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  173. vst1_u8(Dp, _D);
  174. Dp += 8;
  175. rows0p += 8;
  176. rows1p += 8;
  177. }
  178. #else
  179. if (nn > 0)
  180. {
  181. asm volatile(
  182. "vdup.s16 d16, %8 \n"
  183. "mov r4, #2 \n"
  184. "vdup.s16 d17, %9 \n"
  185. "vdup.s32 q12, r4 \n"
  186. "pld [%0, #128] \n"
  187. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  188. "pld [%1, #128] \n"
  189. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  190. "0: \n"
  191. "vmull.s16 q0, d2, d16 \n"
  192. "vmull.s16 q1, d3, d16 \n"
  193. "vorr.s32 q10, q12, q12 \n"
  194. "vorr.s32 q11, q12, q12 \n"
  195. "vmull.s16 q2, d6, d17 \n"
  196. "vmull.s16 q3, d7, d17 \n"
  197. "vsra.s32 q10, q0, #16 \n"
  198. "vsra.s32 q11, q1, #16 \n"
  199. "pld [%0, #128] \n"
  200. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  201. "vsra.s32 q10, q2, #16 \n"
  202. "vsra.s32 q11, q3, #16 \n"
  203. "pld [%1, #128] \n"
  204. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  205. "vshrn.s32 d20, q10, #2 \n"
  206. "vshrn.s32 d21, q11, #2 \n"
  207. "vqmovun.s16 d20, q10 \n"
  208. "vst1.8 {d20}, [%2]! \n"
  209. "subs %3, #1 \n"
  210. "bne 0b \n"
  211. "sub %0, #16 \n"
  212. "sub %1, #16 \n"
  213. : "=r"(rows0p), // %0
  214. "=r"(rows1p), // %1
  215. "=r"(Dp), // %2
  216. "=r"(nn) // %3
  217. : "0"(rows0p),
  218. "1"(rows1p),
  219. "2"(Dp),
  220. "3"(nn),
  221. "r"(b0), // %8
  222. "r"(b1) // %9
  223. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  224. );
  225. }
  226. #endif // __aarch64__
  227. #endif // __ARM_NEON
  228. for ( ; remain; --remain )
  229. {
  230. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  231. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  232. }
  233. ibeta += 2;
  234. }
  235. delete[] buf;
  236. }
  237. void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  238. {
  239. const int INTER_RESIZE_COEF_BITS=11;
  240. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  241. // const int ONE=INTER_RESIZE_COEF_SCALE;
  242. double scale_x = (double)srcw / w;
  243. double scale_y = (double)srch / h;
  244. int* buf = new int[w + h + w + h];
  245. int* xofs = buf;//new int[w];
  246. int* yofs = buf + w;//new int[h];
  247. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  248. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  249. float fx;
  250. float fy;
  251. int sx;
  252. int sy;
  253. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  254. for (int dx = 0; dx < w; dx++)
  255. {
  256. fx = (float)((dx + 0.5) * scale_x - 0.5);
  257. sx = floor(fx);
  258. fx -= sx;
  259. if (sx < 0)
  260. {
  261. sx = 0;
  262. fx = 0.f;
  263. }
  264. if (sx >= srcw - 1)
  265. {
  266. sx = srcw - 2;
  267. fx = 1.f;
  268. }
  269. xofs[dx] = sx*2;
  270. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  271. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  272. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  273. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  274. }
  275. for (int dy = 0; dy < h; dy++)
  276. {
  277. fy = (float)((dy + 0.5) * scale_y - 0.5);
  278. sy = floor(fy);
  279. fy -= sy;
  280. if (sy < 0)
  281. {
  282. sy = 0;
  283. fy = 0.f;
  284. }
  285. if (sy >= srch - 1)
  286. {
  287. sy = srch - 2;
  288. fy = 1.f;
  289. }
  290. yofs[dy] = sy*2;
  291. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  292. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  293. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  294. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  295. }
  296. #undef SATURATE_CAST_SHORT
  297. // loop body
  298. Mat rowsbuf0(w*2+2, (size_t)2u);
  299. Mat rowsbuf1(w*2+2, (size_t)2u);
  300. short* rows0 = (short*)rowsbuf0.data;
  301. short* rows1 = (short*)rowsbuf1.data;
  302. int prev_sy1 = -4;
  303. for (int dy = 0; dy < h; dy++ )
  304. {
  305. int sy = yofs[dy];
  306. if (sy == prev_sy1)
  307. {
  308. // reuse all rows
  309. }
  310. else if (sy == prev_sy1 + 2)
  311. {
  312. // hresize one row
  313. short* rows0_old = rows0;
  314. rows0 = rows1;
  315. rows1 = rows0_old;
  316. const unsigned char *S1 = src + srcw * (sy+2);
  317. const short* ialphap = ialpha;
  318. short* rows1p = rows1;
  319. for ( int dx = 0; dx < w; dx++ )
  320. {
  321. int sx = xofs[dx];
  322. const unsigned char* S1p = S1 + sx;
  323. #if __ARM_NEON
  324. int16x4_t _a0a1XX = vld1_s16(ialphap);
  325. int16x4_t _a0a0a1a1 = vzip_s16(_a0a1XX, _a0a1XX).val[0];
  326. uint8x8_t _S1 = uint8x8_t();
  327. _S1 = vld1_lane_u8(S1p, _S1, 0);
  328. _S1 = vld1_lane_u8(S1p+1, _S1, 1);
  329. _S1 = vld1_lane_u8(S1p+2, _S1, 2);
  330. _S1 = vld1_lane_u8(S1p+3, _S1, 3);
  331. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  332. int16x4_t _S1lowhigh = vget_low_s16(_S116);
  333. int32x4_t _S1ma0a1 = vmull_s16(_S1lowhigh, _a0a0a1a1);
  334. int32x2_t _rows1low = vadd_s32(vget_low_s32(_S1ma0a1), vget_high_s32(_S1ma0a1));
  335. int32x4_t _rows1 = vcombine_s32(_rows1low, vget_high_s32(_S1ma0a1));
  336. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  337. vst1_s16(rows1p, _rows1_sr4);
  338. #else
  339. short a0 = ialphap[0];
  340. short a1 = ialphap[1];
  341. rows1p[0] = (S1p[0]*a0 + S1p[2]*a1) >> 4;
  342. rows1p[1] = (S1p[1]*a0 + S1p[3]*a1) >> 4;
  343. #endif // __ARM_NEON
  344. ialphap += 2;
  345. rows1p += 2;
  346. }
  347. }
  348. else
  349. {
  350. // hresize two rows
  351. const unsigned char *S0 = src + srcw * (sy);
  352. const unsigned char *S1 = src + srcw * (sy+2);
  353. const short* ialphap = ialpha;
  354. short* rows0p = rows0;
  355. short* rows1p = rows1;
  356. for ( int dx = 0; dx < w; dx++ )
  357. {
  358. int sx = xofs[dx];
  359. short a0 = ialphap[0];
  360. short a1 = ialphap[1];
  361. const unsigned char* S0p = S0 + sx;
  362. const unsigned char* S1p = S1 + sx;
  363. #if __ARM_NEON
  364. int16x4_t _a0 = vdup_n_s16(a0);
  365. int16x4_t _a1 = vdup_n_s16(a1);
  366. uint8x8_t _S0 = uint8x8_t();
  367. uint8x8_t _S1 = uint8x8_t();
  368. _S0 = vld1_lane_u8(S0p, _S0, 0);
  369. _S0 = vld1_lane_u8(S0p+1, _S0, 1);
  370. _S0 = vld1_lane_u8(S0p+2, _S0, 2);
  371. _S0 = vld1_lane_u8(S0p+3, _S0, 3);
  372. _S1 = vld1_lane_u8(S1p, _S1, 0);
  373. _S1 = vld1_lane_u8(S1p+1, _S1, 1);
  374. _S1 = vld1_lane_u8(S1p+2, _S1, 2);
  375. _S1 = vld1_lane_u8(S1p+3, _S1, 3);
  376. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  377. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  378. int16x4_t _S0lowhigh = vget_low_s16(_S016);
  379. int16x4_t _S1lowhigh = vget_low_s16(_S116);
  380. int32x2x2_t _S0S1low_S0S1high = vtrn_s32(vreinterpret_s32_s16(_S0lowhigh), vreinterpret_s32_s16(_S1lowhigh));
  381. int32x4_t _rows01 = vmull_s16(vreinterpret_s16_s32(_S0S1low_S0S1high.val[0]), _a0);
  382. _rows01 = vmlal_s16(_rows01, vreinterpret_s16_s32(_S0S1low_S0S1high.val[1]), _a1);
  383. int16x4_t _rows01_sr4 = vshrn_n_s32(_rows01, 4);
  384. int16x4_t _rows1_sr4 = vext_s16(_rows01_sr4, _rows01_sr4, 2);
  385. vst1_s16(rows0p, _rows01_sr4);
  386. vst1_s16(rows1p, _rows1_sr4);
  387. #else
  388. rows0p[0] = (S0p[0]*a0 + S0p[2]*a1) >> 4;
  389. rows0p[1] = (S0p[1]*a0 + S0p[3]*a1) >> 4;
  390. rows1p[0] = (S1p[0]*a0 + S1p[2]*a1) >> 4;
  391. rows1p[1] = (S1p[1]*a0 + S1p[3]*a1) >> 4;
  392. #endif // __ARM_NEON
  393. ialphap += 2;
  394. rows0p += 2;
  395. rows1p += 2;
  396. }
  397. }
  398. prev_sy1 = sy;
  399. // vresize
  400. short b0 = ibeta[0];
  401. short b1 = ibeta[1];
  402. short* rows0p = rows0;
  403. short* rows1p = rows1;
  404. unsigned char* Dp = dst + w * 2 * (dy);
  405. #if __ARM_NEON
  406. int nn = (w * 2) >> 3;
  407. #else
  408. int nn = 0;
  409. #endif
  410. int remain = (w * 2) - (nn << 3);
  411. #if __ARM_NEON
  412. #if __aarch64__
  413. int16x4_t _b0 = vdup_n_s16(b0);
  414. int16x4_t _b1 = vdup_n_s16(b1);
  415. int32x4_t _v2 = vdupq_n_s32(2);
  416. for (; nn>0; nn--)
  417. {
  418. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  419. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  420. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  421. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  422. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  423. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  424. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  425. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  426. int32x4_t _acc = _v2;
  427. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  428. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  429. int32x4_t _acc_1 = _v2;
  430. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  431. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  432. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  433. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  434. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  435. vst1_u8(Dp, _D);
  436. Dp += 8;
  437. rows0p += 8;
  438. rows1p += 8;
  439. }
  440. #else
  441. if (nn > 0)
  442. {
  443. asm volatile(
  444. "vdup.s16 d16, %8 \n"
  445. "mov r4, #2 \n"
  446. "vdup.s16 d17, %9 \n"
  447. "vdup.s32 q12, r4 \n"
  448. "pld [%0, #128] \n"
  449. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  450. "pld [%1, #128] \n"
  451. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  452. "0: \n"
  453. "vmull.s16 q0, d2, d16 \n"
  454. "vmull.s16 q1, d3, d16 \n"
  455. "vorr.s32 q10, q12, q12 \n"
  456. "vorr.s32 q11, q12, q12 \n"
  457. "vmull.s16 q2, d6, d17 \n"
  458. "vmull.s16 q3, d7, d17 \n"
  459. "vsra.s32 q10, q0, #16 \n"
  460. "vsra.s32 q11, q1, #16 \n"
  461. "pld [%0, #128] \n"
  462. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  463. "vsra.s32 q10, q2, #16 \n"
  464. "vsra.s32 q11, q3, #16 \n"
  465. "pld [%1, #128] \n"
  466. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  467. "vshrn.s32 d20, q10, #2 \n"
  468. "vshrn.s32 d21, q11, #2 \n"
  469. "vqmovun.s16 d20, q10 \n"
  470. "vst1.8 {d20}, [%2]! \n"
  471. "subs %3, #1 \n"
  472. "bne 0b \n"
  473. "sub %0, #16 \n"
  474. "sub %1, #16 \n"
  475. : "=r"(rows0p), // %0
  476. "=r"(rows1p), // %1
  477. "=r"(Dp), // %2
  478. "=r"(nn) // %3
  479. : "0"(rows0p),
  480. "1"(rows1p),
  481. "2"(Dp),
  482. "3"(nn),
  483. "r"(b0), // %8
  484. "r"(b1) // %9
  485. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  486. );
  487. }
  488. #endif // __aarch64__
  489. #endif // __ARM_NEON
  490. for ( ; remain; --remain )
  491. {
  492. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  493. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  494. }
  495. ibeta += 2;
  496. }
  497. delete[] buf;
  498. }
  499. void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  500. {
  501. const int INTER_RESIZE_COEF_BITS=11;
  502. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  503. // const int ONE=INTER_RESIZE_COEF_SCALE;
  504. double scale_x = (double)srcw / w;
  505. double scale_y = (double)srch / h;
  506. int* buf = new int[w + h + w + h];
  507. int* xofs = buf;//new int[w];
  508. int* yofs = buf + w;//new int[h];
  509. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  510. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  511. float fx;
  512. float fy;
  513. int sx;
  514. int sy;
  515. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  516. for (int dx = 0; dx < w; dx++)
  517. {
  518. fx = (float)((dx + 0.5) * scale_x - 0.5);
  519. sx = floor(fx);
  520. fx -= sx;
  521. if (sx < 0)
  522. {
  523. sx = 0;
  524. fx = 0.f;
  525. }
  526. if (sx >= srcw - 1)
  527. {
  528. sx = srcw - 2;
  529. fx = 1.f;
  530. }
  531. xofs[dx] = sx*3;
  532. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  533. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  534. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  535. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  536. }
  537. for (int dy = 0; dy < h; dy++)
  538. {
  539. fy = (float)((dy + 0.5) * scale_y - 0.5);
  540. sy = floor(fy);
  541. fy -= sy;
  542. if (sy < 0)
  543. {
  544. sy = 0;
  545. fy = 0.f;
  546. }
  547. if (sy >= srch - 1)
  548. {
  549. sy = srch - 2;
  550. fy = 1.f;
  551. }
  552. yofs[dy] = sy*3;
  553. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  554. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  555. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  556. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  557. }
  558. #undef SATURATE_CAST_SHORT
  559. // loop body
  560. Mat rowsbuf0(w*3+1, (size_t)2u);
  561. Mat rowsbuf1(w*3+1, (size_t)2u);
  562. short* rows0 = (short*)rowsbuf0.data;
  563. short* rows1 = (short*)rowsbuf1.data;
  564. int prev_sy1 = -6;
  565. for (int dy = 0; dy < h; dy++ )
  566. {
  567. int sy = yofs[dy];
  568. if (sy == prev_sy1)
  569. {
  570. // reuse all rows
  571. }
  572. else if (sy == prev_sy1 + 3)
  573. {
  574. // hresize one row
  575. short* rows0_old = rows0;
  576. rows0 = rows1;
  577. rows1 = rows0_old;
  578. const unsigned char *S1 = src + srcw * (sy+3);
  579. const short* ialphap = ialpha;
  580. short* rows1p = rows1;
  581. for ( int dx = 0; dx < w; dx++ )
  582. {
  583. int sx = xofs[dx];
  584. short a0 = ialphap[0];
  585. short a1 = ialphap[1];
  586. const unsigned char* S1p = S1 + sx;
  587. #if __ARM_NEON
  588. int16x4_t _a0 = vdup_n_s16(a0);
  589. int16x4_t _a1 = vdup_n_s16(a1);
  590. uint8x8_t _S1 = uint8x8_t();
  591. _S1 = vld1_lane_u8(S1p, _S1, 0);
  592. _S1 = vld1_lane_u8(S1p+1, _S1, 1);
  593. _S1 = vld1_lane_u8(S1p+2, _S1, 2);
  594. _S1 = vld1_lane_u8(S1p+3, _S1, 3);
  595. _S1 = vld1_lane_u8(S1p+4, _S1, 4);
  596. _S1 = vld1_lane_u8(S1p+5, _S1, 5);
  597. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  598. int16x4_t _S1low = vget_low_s16(_S116);
  599. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  600. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  601. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  602. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  603. vst1_s16(rows1p, _rows1_sr4);
  604. #else
  605. rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
  606. rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
  607. rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
  608. #endif // __ARM_NEON
  609. ialphap += 2;
  610. rows1p += 3;
  611. }
  612. }
  613. else
  614. {
  615. // hresize two rows
  616. const unsigned char *S0 = src + srcw * (sy);
  617. const unsigned char *S1 = src + srcw * (sy+3);
  618. const short* ialphap = ialpha;
  619. short* rows0p = rows0;
  620. short* rows1p = rows1;
  621. for ( int dx = 0; dx < w; dx++ )
  622. {
  623. int sx = xofs[dx];
  624. short a0 = ialphap[0];
  625. short a1 = ialphap[1];
  626. const unsigned char* S0p = S0 + sx;
  627. const unsigned char* S1p = S1 + sx;
  628. #if __ARM_NEON
  629. int16x4_t _a0 = vdup_n_s16(a0);
  630. int16x4_t _a1 = vdup_n_s16(a1);
  631. uint8x8_t _S0 = uint8x8_t();
  632. uint8x8_t _S1 = uint8x8_t();
  633. _S0 = vld1_lane_u8(S0p, _S0, 0);
  634. _S0 = vld1_lane_u8(S0p+1, _S0, 1);
  635. _S0 = vld1_lane_u8(S0p+2, _S0, 2);
  636. _S0 = vld1_lane_u8(S0p+3, _S0, 3);
  637. _S0 = vld1_lane_u8(S0p+4, _S0, 4);
  638. _S0 = vld1_lane_u8(S0p+5, _S0, 5);
  639. _S1 = vld1_lane_u8(S1p, _S1, 0);
  640. _S1 = vld1_lane_u8(S1p+1, _S1, 1);
  641. _S1 = vld1_lane_u8(S1p+2, _S1, 2);
  642. _S1 = vld1_lane_u8(S1p+3, _S1, 3);
  643. _S1 = vld1_lane_u8(S1p+4, _S1, 4);
  644. _S1 = vld1_lane_u8(S1p+5, _S1, 5);
  645. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  646. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  647. int16x4_t _S0low = vget_low_s16(_S016);
  648. int16x4_t _S1low = vget_low_s16(_S116);
  649. int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3);
  650. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  651. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  652. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  653. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  654. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  655. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  656. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  657. vst1_s16(rows0p, _rows0_sr4);
  658. vst1_s16(rows1p, _rows1_sr4);
  659. #else
  660. rows0p[0] = (S0p[0]*a0 + S0p[3]*a1) >> 4;
  661. rows0p[1] = (S0p[1]*a0 + S0p[4]*a1) >> 4;
  662. rows0p[2] = (S0p[2]*a0 + S0p[5]*a1) >> 4;
  663. rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
  664. rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
  665. rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
  666. #endif // __ARM_NEON
  667. ialphap += 2;
  668. rows0p += 3;
  669. rows1p += 3;
  670. }
  671. }
  672. prev_sy1 = sy;
  673. // vresize
  674. short b0 = ibeta[0];
  675. short b1 = ibeta[1];
  676. short* rows0p = rows0;
  677. short* rows1p = rows1;
  678. unsigned char* Dp = dst + w * 3 * (dy);
  679. #if __ARM_NEON
  680. int nn = (w * 3) >> 3;
  681. #else
  682. int nn = 0;
  683. #endif
  684. int remain = (w * 3) - (nn << 3);
  685. #if __ARM_NEON
  686. #if __aarch64__
  687. int16x4_t _b0 = vdup_n_s16(b0);
  688. int16x4_t _b1 = vdup_n_s16(b1);
  689. int32x4_t _v2 = vdupq_n_s32(2);
  690. for (; nn>0; nn--)
  691. {
  692. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  693. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  694. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  695. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  696. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  697. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  698. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  699. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  700. int32x4_t _acc = _v2;
  701. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  702. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  703. int32x4_t _acc_1 = _v2;
  704. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  705. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  706. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  707. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  708. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  709. vst1_u8(Dp, _D);
  710. Dp += 8;
  711. rows0p += 8;
  712. rows1p += 8;
  713. }
  714. #else
  715. if (nn > 0)
  716. {
  717. asm volatile(
  718. "vdup.s16 d16, %8 \n"
  719. "mov r4, #2 \n"
  720. "vdup.s16 d17, %9 \n"
  721. "vdup.s32 q12, r4 \n"
  722. "pld [%0, #128] \n"
  723. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  724. "pld [%1, #128] \n"
  725. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  726. "0: \n"
  727. "vmull.s16 q0, d2, d16 \n"
  728. "vmull.s16 q1, d3, d16 \n"
  729. "vorr.s32 q10, q12, q12 \n"
  730. "vorr.s32 q11, q12, q12 \n"
  731. "vmull.s16 q2, d6, d17 \n"
  732. "vmull.s16 q3, d7, d17 \n"
  733. "vsra.s32 q10, q0, #16 \n"
  734. "vsra.s32 q11, q1, #16 \n"
  735. "pld [%0, #128] \n"
  736. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  737. "vsra.s32 q10, q2, #16 \n"
  738. "vsra.s32 q11, q3, #16 \n"
  739. "pld [%1, #128] \n"
  740. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  741. "vshrn.s32 d20, q10, #2 \n"
  742. "vshrn.s32 d21, q11, #2 \n"
  743. "vqmovun.s16 d20, q10 \n"
  744. "vst1.8 {d20}, [%2]! \n"
  745. "subs %3, #1 \n"
  746. "bne 0b \n"
  747. "sub %0, #16 \n"
  748. "sub %1, #16 \n"
  749. : "=r"(rows0p), // %0
  750. "=r"(rows1p), // %1
  751. "=r"(Dp), // %2
  752. "=r"(nn) // %3
  753. : "0"(rows0p),
  754. "1"(rows1p),
  755. "2"(Dp),
  756. "3"(nn),
  757. "r"(b0), // %8
  758. "r"(b1) // %9
  759. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  760. );
  761. }
  762. #endif // __aarch64__
  763. #endif // __ARM_NEON
  764. for ( ; remain; --remain )
  765. {
  766. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  767. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  768. }
  769. ibeta += 2;
  770. }
  771. delete[] buf;
  772. }
  773. void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  774. {
  775. const int INTER_RESIZE_COEF_BITS=11;
  776. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  777. // const int ONE=INTER_RESIZE_COEF_SCALE;
  778. double scale_x = (double)srcw / w;
  779. double scale_y = (double)srch / h;
  780. int* buf = new int[w + h + w + h];
  781. int* xofs = buf;//new int[w];
  782. int* yofs = buf + w;//new int[h];
  783. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  784. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  785. float fx;
  786. float fy;
  787. int sx;
  788. int sy;
  789. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  790. for (int dx = 0; dx < w; dx++)
  791. {
  792. fx = (float)((dx + 0.5) * scale_x - 0.5);
  793. sx = floor(fx);
  794. fx -= sx;
  795. if (sx < 0)
  796. {
  797. sx = 0;
  798. fx = 0.f;
  799. }
  800. if (sx >= srcw - 1)
  801. {
  802. sx = srcw - 2;
  803. fx = 1.f;
  804. }
  805. xofs[dx] = sx*4;
  806. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  807. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  808. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  809. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  810. }
  811. for (int dy = 0; dy < h; dy++)
  812. {
  813. fy = (float)((dy + 0.5) * scale_y - 0.5);
  814. sy = floor(fy);
  815. fy -= sy;
  816. if (sy < 0)
  817. {
  818. sy = 0;
  819. fy = 0.f;
  820. }
  821. if (sy >= srch - 1)
  822. {
  823. sy = srch - 2;
  824. fy = 1.f;
  825. }
  826. yofs[dy] = sy*4;
  827. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  828. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  829. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  830. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  831. }
  832. #undef SATURATE_CAST_SHORT
  833. // loop body
  834. Mat rowsbuf0(w*4, (size_t)2u);
  835. Mat rowsbuf1(w*4, (size_t)2u);
  836. short* rows0 = (short*)rowsbuf0.data;
  837. short* rows1 = (short*)rowsbuf1.data;
  838. int prev_sy1 = -8;
  839. for (int dy = 0; dy < h; dy++ )
  840. {
  841. int sy = yofs[dy];
  842. if (sy == prev_sy1)
  843. {
  844. // reuse all rows
  845. }
  846. else if (sy == prev_sy1 + 4)
  847. {
  848. // hresize one row
  849. short* rows0_old = rows0;
  850. rows0 = rows1;
  851. rows1 = rows0_old;
  852. const unsigned char *S1 = src + srcw * (sy+4);
  853. const short* ialphap = ialpha;
  854. short* rows1p = rows1;
  855. for ( int dx = 0; dx < w; dx++ )
  856. {
  857. int sx = xofs[dx];
  858. short a0 = ialphap[0];
  859. short a1 = ialphap[1];
  860. const unsigned char* S1p = S1 + sx;
  861. #if __ARM_NEON
  862. int16x4_t _a0 = vdup_n_s16(a0);
  863. int16x4_t _a1 = vdup_n_s16(a1);
  864. uint8x8_t _S1 = vld1_u8(S1p);
  865. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  866. int16x4_t _S1low = vget_low_s16(_S116);
  867. int16x4_t _S1high = vget_high_s16(_S116);
  868. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  869. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  870. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  871. vst1_s16(rows1p, _rows1_sr4);
  872. #else
  873. rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
  874. rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
  875. rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
  876. rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
  877. #endif // __ARM_NEON
  878. ialphap += 2;
  879. rows1p += 4;
  880. }
  881. }
  882. else
  883. {
  884. // hresize two rows
  885. const unsigned char *S0 = src + srcw * (sy);
  886. const unsigned char *S1 = src + srcw * (sy+4);
  887. const short* ialphap = ialpha;
  888. short* rows0p = rows0;
  889. short* rows1p = rows1;
  890. for ( int dx = 0; dx < w; dx++ )
  891. {
  892. int sx = xofs[dx];
  893. short a0 = ialphap[0];
  894. short a1 = ialphap[1];
  895. const unsigned char* S0p = S0 + sx;
  896. const unsigned char* S1p = S1 + sx;
  897. #if __ARM_NEON
  898. int16x4_t _a0 = vdup_n_s16(a0);
  899. int16x4_t _a1 = vdup_n_s16(a1);
  900. uint8x8_t _S0 = vld1_u8(S0p);
  901. uint8x8_t _S1 = vld1_u8(S1p);
  902. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  903. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  904. int16x4_t _S0low = vget_low_s16(_S016);
  905. int16x4_t _S1low = vget_low_s16(_S116);
  906. int16x4_t _S0high = vget_high_s16(_S016);
  907. int16x4_t _S1high = vget_high_s16(_S116);
  908. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  909. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  910. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  911. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  912. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  913. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  914. vst1_s16(rows0p, _rows0_sr4);
  915. vst1_s16(rows1p, _rows1_sr4);
  916. #else
  917. rows0p[0] = (S0p[0]*a0 + S0p[4]*a1) >> 4;
  918. rows0p[1] = (S0p[1]*a0 + S0p[5]*a1) >> 4;
  919. rows0p[2] = (S0p[2]*a0 + S0p[6]*a1) >> 4;
  920. rows0p[3] = (S0p[3]*a0 + S0p[7]*a1) >> 4;
  921. rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
  922. rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
  923. rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
  924. rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
  925. #endif // __ARM_NEON
  926. ialphap += 2;
  927. rows0p += 4;
  928. rows1p += 4;
  929. }
  930. }
  931. prev_sy1 = sy;
  932. // vresize
  933. short b0 = ibeta[0];
  934. short b1 = ibeta[1];
  935. short* rows0p = rows0;
  936. short* rows1p = rows1;
  937. unsigned char* Dp = dst + w * 4 * (dy);
  938. #if __ARM_NEON
  939. int nn = (w * 4) >> 3;
  940. #else
  941. int nn = 0;
  942. #endif
  943. int remain = (w * 4) - (nn << 3);
  944. #if __ARM_NEON
  945. #if __aarch64__
  946. int16x4_t _b0 = vdup_n_s16(b0);
  947. int16x4_t _b1 = vdup_n_s16(b1);
  948. int32x4_t _v2 = vdupq_n_s32(2);
  949. for (; nn>0; nn--)
  950. {
  951. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  952. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  953. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  954. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  955. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  956. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  957. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  958. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  959. int32x4_t _acc = _v2;
  960. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  961. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  962. int32x4_t _acc_1 = _v2;
  963. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  964. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  965. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  966. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  967. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  968. vst1_u8(Dp, _D);
  969. Dp += 8;
  970. rows0p += 8;
  971. rows1p += 8;
  972. }
  973. #else
  974. if (nn > 0)
  975. {
  976. asm volatile(
  977. "vdup.s16 d16, %8 \n"
  978. "mov r4, #2 \n"
  979. "vdup.s16 d17, %9 \n"
  980. "vdup.s32 q12, r4 \n"
  981. "pld [%0, #128] \n"
  982. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  983. "pld [%1, #128] \n"
  984. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  985. "0: \n"
  986. "vmull.s16 q0, d2, d16 \n"
  987. "vmull.s16 q1, d3, d16 \n"
  988. "vorr.s32 q10, q12, q12 \n"
  989. "vorr.s32 q11, q12, q12 \n"
  990. "vmull.s16 q2, d6, d17 \n"
  991. "vmull.s16 q3, d7, d17 \n"
  992. "vsra.s32 q10, q0, #16 \n"
  993. "vsra.s32 q11, q1, #16 \n"
  994. "pld [%0, #128] \n"
  995. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  996. "vsra.s32 q10, q2, #16 \n"
  997. "vsra.s32 q11, q3, #16 \n"
  998. "pld [%1, #128] \n"
  999. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  1000. "vshrn.s32 d20, q10, #2 \n"
  1001. "vshrn.s32 d21, q11, #2 \n"
  1002. "vqmovun.s16 d20, q10 \n"
  1003. "vst1.8 {d20}, [%2]! \n"
  1004. "subs %3, #1 \n"
  1005. "bne 0b \n"
  1006. "sub %0, #16 \n"
  1007. "sub %1, #16 \n"
  1008. : "=r"(rows0p), // %0
  1009. "=r"(rows1p), // %1
  1010. "=r"(Dp), // %2
  1011. "=r"(nn) // %3
  1012. : "0"(rows0p),
  1013. "1"(rows1p),
  1014. "2"(Dp),
  1015. "3"(nn),
  1016. "r"(b0), // %8
  1017. "r"(b1) // %9
  1018. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  1019. );
  1020. }
  1021. #endif // __aarch64__
  1022. #endif // __ARM_NEON
  1023. for ( ; remain; --remain )
  1024. {
  1025. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  1026. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  1027. }
  1028. ibeta += 2;
  1029. }
  1030. delete[] buf;
  1031. }
  1032. void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  1033. {
  1034. // assert srcw % 2 == 0
  1035. // assert srch % 2 == 0
  1036. // assert w % 2 == 0
  1037. // assert h % 2 == 0
  1038. const unsigned char* srcY = src;
  1039. unsigned char* dstY = dst;
  1040. resize_bilinear_c1(srcY, srcw, srch, dstY, w, h);
  1041. const unsigned char* srcUV = src + srcw * srch;
  1042. unsigned char* dstUV = dst + w * h;
  1043. resize_bilinear_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2);
  1044. }
  1045. #endif // NCNN_PIXEL
  1046. } // namespace ncnn