You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

mat_pixel_resize.cpp 40 kB

7 years ago
7 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #include <limits.h>
  16. #include <math.h>
  17. #include <algorithm>
  18. #if __ARM_NEON
  19. #include <arm_neon.h>
  20. #endif // __ARM_NEON
  21. #include "platform.h"
  22. namespace ncnn {
  23. #if NCNN_PIXEL
  24. void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  25. {
  26. return resize_bilinear_c1(src, srcw, srch, srcw, dst, w, h, w);
  27. }
  28. void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  29. {
  30. return resize_bilinear_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2);
  31. }
  32. void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  33. {
  34. return resize_bilinear_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3);
  35. }
  36. void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  37. {
  38. return resize_bilinear_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4);
  39. }
  // Bilinear resize of a 1-channel 8-bit image from srcw x srch to w x h,
  // using fixed-point weights scaled by 1 << INTER_RESIZE_COEF_BITS.
  //   src / srcstride : source pixels and the byte offset between source rows
  //   dst / stride    : destination pixels and the byte offset between destination rows
  // Step 1 precomputes, for every destination column and row, the source offset
  // (xofs / yofs) and a pair of weights (ialpha / ibeta).
  // Step 2 interpolates horizontally into two 16-bit row buffers and blends
  // those two rows vertically into each destination row.
  // NOTE(review): with srcw == 1 (or srch == 1) the upper clamp below sets the
  // offset to srcw - 2 == -1, so memory before the source pixel/row is
  // addressed - confirm callers never pass single-pixel dimensions.
  40. void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  41. {
  42. const int INTER_RESIZE_COEF_BITS=11;
  43. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  44. // const int ONE=INTER_RESIZE_COEF_SCALE;
  45. double scale_x = (double)srcw / w;
  46. double scale_y = (double)srch / h;
  // One allocation backs all four tables: xofs (w ints), yofs (h ints), then
  // ialpha (w ints viewed as 2*w shorts) and ibeta (h ints viewed as 2*h shorts).
  47. int* buf = new int[w + h + w + h];
  48. int* xofs = buf;//new int[w];
  49. int* yofs = buf + w;//new int[h];
  50. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  51. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  52. float fx;
  53. float fy;
  54. int sx;
  55. int sy;
  // Rounds half away from zero and clamps to the short range.
  // NOTE(review): the expansion ends with ';' and X is not parenthesized, so the
  // macro is only safe in the simple "lhs = SATURATE_CAST_SHORT(var);" form used below.
  56. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  // Horizontal table: map each destination column centre back into the source.
  57. for (int dx = 0; dx < w; dx++)
  58. {
  59. fx = (float)((dx + 0.5) * scale_x - 0.5);
  60. sx = static_cast<int>(floor(fx));
  61. fx -= sx;
  // Clamp on the left edge: use source column 0 with full weight.
  62. if (sx < 0)
  63. {
  64. sx = 0;
  65. fx = 0.f;
  66. }
  // Clamp on the right edge: use the last column pair with full weight on the right one.
  67. if (sx >= srcw - 1)
  68. {
  69. sx = srcw - 2;
  70. fx = 1.f;
  71. }
  72. xofs[dx] = sx;
  73. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  74. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  75. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  76. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  77. }
  // Vertical table: same mapping and clamping for destination rows.
  78. for (int dy = 0; dy < h; dy++)
  79. {
  80. fy = (float)((dy + 0.5) * scale_y - 0.5);
  81. sy = static_cast<int>(floor(fy));
  82. fy -= sy;
  83. if (sy < 0)
  84. {
  85. sy = 0;
  86. fy = 0.f;
  87. }
  88. if (sy >= srch - 1)
  89. {
  90. sy = srch - 2;
  91. fy = 1.f;
  92. }
  93. yofs[dy] = sy;
  94. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  95. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  96. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  97. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  98. }
  99. #undef SATURATE_CAST_SHORT
  100. // loop body
  // Two buffers, used below as arrays of shorts, holding the horizontally
  // interpolated source rows sy (rows0) and sy + 1 (rows1).
  // presumably Mat(w, 2u) allocates w elements of 2 bytes - confirm against mat.h
  101. Mat rowsbuf0(w, (size_t)2u);
  102. Mat rowsbuf1(w, (size_t)2u);
  103. short* rows0 = (short*)rowsbuf0.data;
  104. short* rows1 = (short*)rowsbuf1.data;
  // -2 matches neither sy nor sy - 1 for any sy >= 0, so the first row takes the two-row path.
  105. int prev_sy1 = -2;
  106. for (int dy = 0; dy < h; dy++ )
  107. {
  108. int sy = yofs[dy];
  109. if (sy == prev_sy1)
  110. {
  111. // reuse all rows
  112. }
  113. else if (sy == prev_sy1 + 1)
  114. {
  115. // hresize one row
  // The previous rows1 already holds source row sy: swap the buffers and
  // recompute only rows1 from source row sy + 1.
  116. short* rows0_old = rows0;
  117. rows0 = rows1;
  118. rows1 = rows0_old;
  119. const unsigned char *S1 = src + srcstride * (sy+1);
  120. const short* ialphap = ialpha;
  121. short* rows1p = rows1;
  122. for ( int dx = 0; dx < w; dx++ )
  123. {
  124. int sx = xofs[dx];
  125. short a0 = ialphap[0];
  126. short a1 = ialphap[1];
  127. const unsigned char* S1p = S1 + sx;
  // >> 4 drops 4 of the 11 fractional weight bits so the result is stored as a short.
  128. rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
  129. ialphap += 2;
  130. }
  131. }
  132. else
  133. {
  134. // hresize two rows
  135. const unsigned char *S0 = src + srcstride * (sy);
  136. const unsigned char *S1 = src + srcstride * (sy+1);
  137. const short* ialphap = ialpha;
  138. short* rows0p = rows0;
  139. short* rows1p = rows1;
  140. for ( int dx = 0; dx < w; dx++ )
  141. {
  142. int sx = xofs[dx];
  143. short a0 = ialphap[0];
  144. short a1 = ialphap[1];
  145. const unsigned char* S0p = S0 + sx;
  146. const unsigned char* S1p = S1 + sx;
  147. rows0p[dx] = (S0p[0]*a0 + S0p[1]*a1) >> 4;
  148. rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
  149. ialphap += 2;
  150. }
  151. }
  152. prev_sy1 = sy;
  153. // vresize
  // Blend rows0 and rows1 with this row's weights b0 / b1 (ibeta advances by 2 per row).
  154. short b0 = ibeta[0];
  155. short b1 = ibeta[1];
  156. short* rows0p = rows0;
  157. short* rows1p = rows1;
  158. unsigned char* Dp = dst + stride * (dy);
  // nn counts blocks of 8 output bytes handled by the NEON path; remain is the scalar tail.
  159. #if __ARM_NEON
  160. int nn = w >> 3;
  161. #else
  162. int nn = 0;
  163. #endif
  164. int remain = w - (nn << 3);
  165. #if __ARM_NEON
  166. #if __aarch64__
  167. int16x4_t _b0 = vdup_n_s16(b0);
  168. int16x4_t _b1 = vdup_n_s16(b1);
  169. int32x4_t _v2 = vdupq_n_s32(2);
  // Per iteration: 8 values of each row are widened-multiplied by b0 / b1, each
  // product is shifted right by 16 and accumulated onto the rounding constant 2,
  // the sum is shifted right by 2 and saturated to unsigned 8-bit.
  170. for (; nn>0; nn--)
  171. {
  172. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  173. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  174. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  175. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  176. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  177. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  178. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  179. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  180. int32x4_t _acc = _v2;
  181. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  182. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  183. int32x4_t _acc_1 = _v2;
  184. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  185. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  186. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  187. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  188. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  189. vst1_u8(Dp, _D);
  190. Dp += 8;
  191. rows0p += 8;
  192. rows1p += 8;
  193. }
  194. #else
  195. if (nn > 0)
  196. {
  // ARMv7 inline assembly performing the same multiply / shift-accumulate /
  // narrow / saturate sequence as the intrinsics loop above. The next 8 values
  // of each row are loaded one iteration ahead, and both row pointers are
  // rewound by 16 bytes after the loop to undo that look-ahead.
  // NOTE(review): the last in-loop load therefore reads 8 shorts past the final
  // block consumed, and the loads carry a :128 alignment qualifier - confirm the
  // row buffers are 16-byte aligned and tolerate the over-read.
  197. asm volatile(
  198. "vdup.s16 d16, %8 \n"
  199. "mov r4, #2 \n"
  200. "vdup.s16 d17, %9 \n"
  201. "vdup.s32 q12, r4 \n"
  202. "pld [%0, #128] \n"
  203. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  204. "pld [%1, #128] \n"
  205. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  206. "0: \n"
  207. "vmull.s16 q0, d2, d16 \n"
  208. "vmull.s16 q1, d3, d16 \n"
  209. "vorr.s32 q10, q12, q12 \n"
  210. "vorr.s32 q11, q12, q12 \n"
  211. "vmull.s16 q2, d6, d17 \n"
  212. "vmull.s16 q3, d7, d17 \n"
  213. "vsra.s32 q10, q0, #16 \n"
  214. "vsra.s32 q11, q1, #16 \n"
  215. "pld [%0, #128] \n"
  216. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  217. "vsra.s32 q10, q2, #16 \n"
  218. "vsra.s32 q11, q3, #16 \n"
  219. "pld [%1, #128] \n"
  220. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  221. "vshrn.s32 d20, q10, #2 \n"
  222. "vshrn.s32 d21, q11, #2 \n"
  223. "vqmovun.s16 d20, q10 \n"
  224. "vst1.8 {d20}, [%2]! \n"
  225. "subs %3, #1 \n"
  226. "bne 0b \n"
  227. "sub %0, #16 \n"
  228. "sub %1, #16 \n"
  229. : "=r"(rows0p), // %0
  230. "=r"(rows1p), // %1
  231. "=r"(Dp), // %2
  232. "=r"(nn) // %3
  233. : "0"(rows0p),
  234. "1"(rows1p),
  235. "2"(Dp),
  236. "3"(nn),
  237. "r"(b0), // %8
  238. "r"(b1) // %9
  239. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  240. );
  241. }
  242. #endif // __aarch64__
  243. #endif // __ARM_NEON
  // Scalar tail (and the whole row when NEON is unavailable): same arithmetic as
  // the vector path - two >> 16 products, + 2 for rounding, then >> 2.
  244. for ( ; remain; --remain )
  245. {
  246. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  247. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  248. }
  249. ibeta += 2;
  250. }
  // Releases xofs, yofs, ialpha and ibeta together (they share one allocation).
  251. delete[] buf;
  252. }
  // Bilinear resize of a 2-channel interleaved 8-bit image from srcw x srch to
  // w x h, using fixed-point weights scaled by 1 << INTER_RESIZE_COEF_BITS.
  //   src / srcstride : source pixels and the byte offset between source rows
  //   dst / stride    : destination pixels and the byte offset between destination rows
  // Step 1 precomputes per-column byte offsets (xofs, already multiplied by 2)
  // and per-row offsets (yofs) together with weight pairs (ialpha / ibeta).
  // Step 2 interpolates horizontally into two 16-bit row buffers and blends
  // those two rows vertically into each destination row.
  // NOTE(review): with srcw == 1 (or srch == 1) the upper clamp below sets the
  // offset to srcw - 2 == -1, so memory before the source pixel/row is
  // addressed - confirm callers never pass single-pixel dimensions.
  253. void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  254. {
  255. const int INTER_RESIZE_COEF_BITS=11;
  256. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  257. // const int ONE=INTER_RESIZE_COEF_SCALE;
  258. double scale_x = (double)srcw / w;
  259. double scale_y = (double)srch / h;
  // One allocation backs all four tables: xofs (w ints), yofs (h ints), then
  // ialpha (w ints viewed as 2*w shorts) and ibeta (h ints viewed as 2*h shorts).
  260. int* buf = new int[w + h + w + h];
  261. int* xofs = buf;//new int[w];
  262. int* yofs = buf + w;//new int[h];
  263. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  264. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  265. float fx;
  266. float fy;
  267. int sx;
  268. int sy;
  // Rounds half away from zero and clamps to the short range.
  // NOTE(review): the expansion ends with ';' and X is not parenthesized, so the
  // macro is only safe in the simple "lhs = SATURATE_CAST_SHORT(var);" form used below.
  269. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  // Horizontal table: map each destination column centre back into the source.
  270. for (int dx = 0; dx < w; dx++)
  271. {
  272. fx = (float)((dx + 0.5) * scale_x - 0.5);
  273. sx = static_cast<int>(floor(fx));
  274. fx -= sx;
  275. if (sx < 0)
  276. {
  277. sx = 0;
  278. fx = 0.f;
  279. }
  280. if (sx >= srcw - 1)
  281. {
  282. sx = srcw - 2;
  283. fx = 1.f;
  284. }
  // Stored as a byte offset into the row: 2 bytes per source pixel.
  285. xofs[dx] = sx*2;
  286. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  287. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  288. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  289. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  290. }
  // Vertical table: same mapping and clamping for destination rows.
  291. for (int dy = 0; dy < h; dy++)
  292. {
  293. fy = (float)((dy + 0.5) * scale_y - 0.5);
  294. sy = static_cast<int>(floor(fy));
  295. fy -= sy;
  296. if (sy < 0)
  297. {
  298. sy = 0;
  299. fy = 0.f;
  300. }
  301. if (sy >= srch - 1)
  302. {
  303. sy = srch - 2;
  304. fy = 1.f;
  305. }
  306. yofs[dy] = sy;
  307. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  308. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  309. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  310. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  311. }
  312. #undef SATURATE_CAST_SHORT
  313. // loop body
  // Row buffers used below as arrays of shorts: 2 values per destination pixel,
  // plus 2 spare values because the NEON stores below write 4 shorts per pixel
  // while the write pointer only advances by 2.
  // presumably Mat(n, 2u) allocates n elements of 2 bytes - confirm against mat.h
  314. Mat rowsbuf0(w*2+2, (size_t)2u);
  315. Mat rowsbuf1(w*2+2, (size_t)2u);
  316. short* rows0 = (short*)rowsbuf0.data;
  317. short* rows1 = (short*)rowsbuf1.data;
  // -2 matches neither sy nor sy - 1 for any sy >= 0, so the first row takes the two-row path.
  318. int prev_sy1 = -2;
  319. for (int dy = 0; dy < h; dy++ )
  320. {
  321. int sy = yofs[dy];
  322. if (sy == prev_sy1)
  323. {
  324. // reuse all rows
  325. }
  326. else if (sy == prev_sy1 + 1)
  327. {
  328. // hresize one row
  // The previous rows1 already holds source row sy: swap the buffers and
  // recompute only rows1 from source row sy + 1.
  329. short* rows0_old = rows0;
  330. rows0 = rows1;
  331. rows1 = rows0_old;
  332. const unsigned char *S1 = src + srcstride * (sy+1);
  333. const short* ialphap = ialpha;
  334. short* rows1p = rows1;
  335. for ( int dx = 0; dx < w; dx++ )
  336. {
  337. int sx = xofs[dx];
  338. const unsigned char* S1p = S1 + sx;
  339. #if __ARM_NEON
  // Loads 4 shorts starting at ialphap; only the first two (a0, a1) are used,
  // and the zip turns them into the lane pattern a0 a0 a1 a1.
  340. int16x4_t _a0a1XX = vld1_s16(ialphap);
  341. int16x4_t _a0a0a1a1 = vzip_s16(_a0a1XX, _a0a1XX).val[0];
  // Gather the 4 bytes of two neighbouring 2-channel pixels into lanes 0..3.
  342. uint8x8_t _S1 = uint8x8_t();
  343. _S1 = vld1_lane_u8(S1p, _S1, 0);
  344. _S1 = vld1_lane_u8(S1p+1, _S1, 1);
  345. _S1 = vld1_lane_u8(S1p+2, _S1, 2);
  346. _S1 = vld1_lane_u8(S1p+3, _S1, 3);
  347. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  348. int16x4_t _S1lowhigh = vget_low_s16(_S116);
  349. int32x4_t _S1ma0a1 = vmull_s16(_S1lowhigh, _a0a0a1a1);
  // Adding the low and high halves yields left*a0 + right*a1 for both channels.
  350. int32x2_t _rows1low = vadd_s32(vget_low_s32(_S1ma0a1), vget_high_s32(_S1ma0a1));
  351. int32x4_t _rows1 = vcombine_s32(_rows1low, vget_high_s32(_S1ma0a1));
  352. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  // Stores 4 shorts; only the first 2 are kept because rows1p advances by 2 below.
  353. vst1_s16(rows1p, _rows1_sr4);
  354. #else
  355. short a0 = ialphap[0];
  356. short a1 = ialphap[1];
  357. rows1p[0] = (S1p[0]*a0 + S1p[2]*a1) >> 4;
  358. rows1p[1] = (S1p[1]*a0 + S1p[3]*a1) >> 4;
  359. #endif // __ARM_NEON
  360. ialphap += 2;
  361. rows1p += 2;
  362. }
  363. }
  364. else
  365. {
  366. // hresize two rows
  367. const unsigned char *S0 = src + srcstride * (sy);
  368. const unsigned char *S1 = src + srcstride * (sy+1);
  369. const short* ialphap = ialpha;
  370. short* rows0p = rows0;
  371. short* rows1p = rows1;
  372. for ( int dx = 0; dx < w; dx++ )
  373. {
  374. int sx = xofs[dx];
  375. short a0 = ialphap[0];
  376. short a1 = ialphap[1];
  377. const unsigned char* S0p = S0 + sx;
  378. const unsigned char* S1p = S1 + sx;
  379. #if __ARM_NEON
  380. int16x4_t _a0 = vdup_n_s16(a0);
  381. int16x4_t _a1 = vdup_n_s16(a1);
  382. uint8x8_t _S0 = uint8x8_t();
  383. uint8x8_t _S1 = uint8x8_t();
  384. _S0 = vld1_lane_u8(S0p, _S0, 0);
  385. _S0 = vld1_lane_u8(S0p+1, _S0, 1);
  386. _S0 = vld1_lane_u8(S0p+2, _S0, 2);
  387. _S0 = vld1_lane_u8(S0p+3, _S0, 3);
  388. _S1 = vld1_lane_u8(S1p, _S1, 0);
  389. _S1 = vld1_lane_u8(S1p+1, _S1, 1);
  390. _S1 = vld1_lane_u8(S1p+2, _S1, 2);
  391. _S1 = vld1_lane_u8(S1p+3, _S1, 3);
  392. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  393. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  394. int16x4_t _S0lowhigh = vget_low_s16(_S016);
  395. int16x4_t _S1lowhigh = vget_low_s16(_S116);
  // Transpose on 32-bit pairs: val[0] holds the left pixel of row 0 then row 1,
  // val[1] holds the right pixel of row 0 then row 1.
  396. int32x2x2_t _S0S1low_S0S1high = vtrn_s32(vreinterpret_s32_s16(_S0lowhigh), vreinterpret_s32_s16(_S1lowhigh));
  397. int32x4_t _rows01 = vmull_s16(vreinterpret_s16_s32(_S0S1low_S0S1high.val[0]), _a0);
  398. _rows01 = vmlal_s16(_rows01, vreinterpret_s16_s32(_S0S1low_S0S1high.val[1]), _a1);
  399. int16x4_t _rows01_sr4 = vshrn_n_s32(_rows01, 4);
  // Rotate by 2 lanes so the row-1 results come first for the rows1 store.
  400. int16x4_t _rows1_sr4 = vext_s16(_rows01_sr4, _rows01_sr4, 2);
  // Each store writes 4 shorts; only the first 2 are kept (pointers advance by 2).
  401. vst1_s16(rows0p, _rows01_sr4);
  402. vst1_s16(rows1p, _rows1_sr4);
  403. #else
  404. rows0p[0] = (S0p[0]*a0 + S0p[2]*a1) >> 4;
  405. rows0p[1] = (S0p[1]*a0 + S0p[3]*a1) >> 4;
  406. rows1p[0] = (S1p[0]*a0 + S1p[2]*a1) >> 4;
  407. rows1p[1] = (S1p[1]*a0 + S1p[3]*a1) >> 4;
  408. #endif // __ARM_NEON
  409. ialphap += 2;
  410. rows0p += 2;
  411. rows1p += 2;
  412. }
  413. }
  414. prev_sy1 = sy;
  415. // vresize
  // Blend rows0 and rows1 with this row's weights b0 / b1 (ibeta advances by 2 per row).
  416. short b0 = ibeta[0];
  417. short b1 = ibeta[1];
  418. short* rows0p = rows0;
  419. short* rows1p = rows1;
  420. unsigned char* Dp = dst + stride * (dy);
  // nn counts blocks of 8 output bytes (w * 2 bytes per row) handled by the NEON
  // path; remain is the scalar tail.
  421. #if __ARM_NEON
  422. int nn = (w * 2) >> 3;
  423. #else
  424. int nn = 0;
  425. #endif
  426. int remain = (w * 2) - (nn << 3);
  427. #if __ARM_NEON
  428. #if __aarch64__
  429. int16x4_t _b0 = vdup_n_s16(b0);
  430. int16x4_t _b1 = vdup_n_s16(b1);
  431. int32x4_t _v2 = vdupq_n_s32(2);
  // Per iteration: 8 values of each row are widened-multiplied by b0 / b1, each
  // product is shifted right by 16 and accumulated onto the rounding constant 2,
  // the sum is shifted right by 2 and saturated to unsigned 8-bit.
  432. for (; nn>0; nn--)
  433. {
  434. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  435. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  436. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  437. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  438. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  439. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  440. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  441. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  442. int32x4_t _acc = _v2;
  443. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  444. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  445. int32x4_t _acc_1 = _v2;
  446. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  447. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  448. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  449. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  450. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  451. vst1_u8(Dp, _D);
  452. Dp += 8;
  453. rows0p += 8;
  454. rows1p += 8;
  455. }
  456. #else
  457. if (nn > 0)
  458. {
  // ARMv7 inline assembly performing the same multiply / shift-accumulate /
  // narrow / saturate sequence as the intrinsics loop above. The next 8 values
  // of each row are loaded one iteration ahead, and both row pointers are
  // rewound by 16 bytes after the loop to undo that look-ahead.
  // NOTE(review): the last in-loop load therefore reads 8 shorts past the final
  // block consumed, and the loads carry a :128 alignment qualifier - confirm the
  // row buffers are 16-byte aligned and tolerate the over-read.
  459. asm volatile(
  460. "vdup.s16 d16, %8 \n"
  461. "mov r4, #2 \n"
  462. "vdup.s16 d17, %9 \n"
  463. "vdup.s32 q12, r4 \n"
  464. "pld [%0, #128] \n"
  465. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  466. "pld [%1, #128] \n"
  467. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  468. "0: \n"
  469. "vmull.s16 q0, d2, d16 \n"
  470. "vmull.s16 q1, d3, d16 \n"
  471. "vorr.s32 q10, q12, q12 \n"
  472. "vorr.s32 q11, q12, q12 \n"
  473. "vmull.s16 q2, d6, d17 \n"
  474. "vmull.s16 q3, d7, d17 \n"
  475. "vsra.s32 q10, q0, #16 \n"
  476. "vsra.s32 q11, q1, #16 \n"
  477. "pld [%0, #128] \n"
  478. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  479. "vsra.s32 q10, q2, #16 \n"
  480. "vsra.s32 q11, q3, #16 \n"
  481. "pld [%1, #128] \n"
  482. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  483. "vshrn.s32 d20, q10, #2 \n"
  484. "vshrn.s32 d21, q11, #2 \n"
  485. "vqmovun.s16 d20, q10 \n"
  486. "vst1.8 {d20}, [%2]! \n"
  487. "subs %3, #1 \n"
  488. "bne 0b \n"
  489. "sub %0, #16 \n"
  490. "sub %1, #16 \n"
  491. : "=r"(rows0p), // %0
  492. "=r"(rows1p), // %1
  493. "=r"(Dp), // %2
  494. "=r"(nn) // %3
  495. : "0"(rows0p),
  496. "1"(rows1p),
  497. "2"(Dp),
  498. "3"(nn),
  499. "r"(b0), // %8
  500. "r"(b1) // %9
  501. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  502. );
  503. }
  504. #endif // __aarch64__
  505. #endif // __ARM_NEON
  // Scalar tail (and the whole row when NEON is unavailable): same arithmetic as
  // the vector path - two >> 16 products, + 2 for rounding, then >> 2.
  506. for ( ; remain; --remain )
  507. {
  508. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  509. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  510. }
  511. ibeta += 2;
  512. }
  // Releases xofs, yofs, ialpha and ibeta together (they share one allocation).
  513. delete[] buf;
  514. }
  // Bilinear resize of a 3-channel interleaved 8-bit image from srcw x srch to
  // w x h, using fixed-point weights scaled by 1 << INTER_RESIZE_COEF_BITS.
  //   src / srcstride : source pixels and the byte offset between source rows
  //   dst / stride    : destination pixels and the byte offset between destination rows
  // Step 1 precomputes per-column byte offsets (xofs, already multiplied by 3)
  // and per-row offsets (yofs) together with weight pairs (ialpha / ibeta).
  // Step 2 interpolates horizontally into two 16-bit row buffers and blends
  // those two rows vertically into each destination row.
  // NOTE(review): with srcw == 1 (or srch == 1) the upper clamp below sets the
  // offset to srcw - 2 == -1, so memory before the source pixel/row is
  // addressed - confirm callers never pass single-pixel dimensions.
  515. void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  516. {
  517. const int INTER_RESIZE_COEF_BITS=11;
  518. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  519. // const int ONE=INTER_RESIZE_COEF_SCALE;
  520. double scale_x = (double)srcw / w;
  521. double scale_y = (double)srch / h;
  // One allocation backs all four tables: xofs (w ints), yofs (h ints), then
  // ialpha (w ints viewed as 2*w shorts) and ibeta (h ints viewed as 2*h shorts).
  522. int* buf = new int[w + h + w + h];
  523. int* xofs = buf;//new int[w];
  524. int* yofs = buf + w;//new int[h];
  525. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  526. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  527. float fx;
  528. float fy;
  529. int sx;
  530. int sy;
  // Rounds half away from zero and clamps to the short range.
  // NOTE(review): the expansion ends with ';' and X is not parenthesized, so the
  // macro is only safe in the simple "lhs = SATURATE_CAST_SHORT(var);" form used below.
  531. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  // Horizontal table: map each destination column centre back into the source.
  532. for (int dx = 0; dx < w; dx++)
  533. {
  534. fx = (float)((dx + 0.5) * scale_x - 0.5);
  535. sx = static_cast<int>(floor(fx));
  536. fx -= sx;
  537. if (sx < 0)
  538. {
  539. sx = 0;
  540. fx = 0.f;
  541. }
  542. if (sx >= srcw - 1)
  543. {
  544. sx = srcw - 2;
  545. fx = 1.f;
  546. }
  // Stored as a byte offset into the row: 3 bytes per source pixel.
  547. xofs[dx] = sx*3;
  548. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  549. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  550. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  551. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  552. }
  // Vertical table: same mapping and clamping for destination rows.
  553. for (int dy = 0; dy < h; dy++)
  554. {
  555. fy = (float)((dy + 0.5) * scale_y - 0.5);
  556. sy = static_cast<int>(floor(fy));
  557. fy -= sy;
  558. if (sy < 0)
  559. {
  560. sy = 0;
  561. fy = 0.f;
  562. }
  563. if (sy >= srch - 1)
  564. {
  565. sy = srch - 2;
  566. fy = 1.f;
  567. }
  568. yofs[dy] = sy;
  569. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  570. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  571. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  572. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  573. }
  574. #undef SATURATE_CAST_SHORT
  575. // loop body
  // Row buffers used below as arrays of shorts: 3 values per destination pixel,
  // plus 1 spare value because the NEON stores below write 4 shorts per pixel
  // while the write pointer only advances by 3.
  // presumably Mat(n, 2u) allocates n elements of 2 bytes - confirm against mat.h
  576. Mat rowsbuf0(w*3+1, (size_t)2u);
  577. Mat rowsbuf1(w*3+1, (size_t)2u);
  578. short* rows0 = (short*)rowsbuf0.data;
  579. short* rows1 = (short*)rowsbuf1.data;
  // -2 matches neither sy nor sy - 1 for any sy >= 0, so the first row takes the two-row path.
  580. int prev_sy1 = -2;
  581. for (int dy = 0; dy < h; dy++ )
  582. {
  583. int sy = yofs[dy];
  584. if (sy == prev_sy1)
  585. {
  586. // reuse all rows
  587. }
  588. else if (sy == prev_sy1 + 1)
  589. {
  590. // hresize one row
  // The previous rows1 already holds source row sy: swap the buffers and
  // recompute only rows1 from source row sy + 1.
  591. short* rows0_old = rows0;
  592. rows0 = rows1;
  593. rows1 = rows0_old;
  594. const unsigned char *S1 = src + srcstride * (sy+1);
  595. const short* ialphap = ialpha;
  596. short* rows1p = rows1;
  597. for ( int dx = 0; dx < w; dx++ )
  598. {
  599. int sx = xofs[dx];
  600. short a0 = ialphap[0];
  601. short a1 = ialphap[1];
  602. const unsigned char* S1p = S1 + sx;
  603. #if __ARM_NEON
  604. int16x4_t _a0 = vdup_n_s16(a0);
  605. int16x4_t _a1 = vdup_n_s16(a1);
  // Gather the 6 bytes of two neighbouring 3-channel pixels into lanes 0..5;
  // lanes 6 and 7 keep the zero from the value-initialised vector.
  606. uint8x8_t _S1 = uint8x8_t();
  607. _S1 = vld1_lane_u8(S1p, _S1, 0);
  608. _S1 = vld1_lane_u8(S1p+1, _S1, 1);
  609. _S1 = vld1_lane_u8(S1p+2, _S1, 2);
  610. _S1 = vld1_lane_u8(S1p+3, _S1, 3);
  611. _S1 = vld1_lane_u8(S1p+4, _S1, 4);
  612. _S1 = vld1_lane_u8(S1p+5, _S1, 5);
  613. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  // _S1low = lanes 0..3 (left pixel in lanes 0..2); _S1high = lanes 3..6
  // (right pixel in its first three lanes).
  614. int16x4_t _S1low = vget_low_s16(_S116);
  615. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  616. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  617. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  618. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  // Stores 4 shorts; only the first 3 are kept because rows1p advances by 3 below.
  619. vst1_s16(rows1p, _rows1_sr4);
  620. #else
  621. rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
  622. rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
  623. rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
  624. #endif // __ARM_NEON
  625. ialphap += 2;
  626. rows1p += 3;
  627. }
  628. }
  629. else
  630. {
  631. // hresize two rows
  632. const unsigned char *S0 = src + srcstride * (sy);
  633. const unsigned char *S1 = src + srcstride * (sy+1);
  634. const short* ialphap = ialpha;
  635. short* rows0p = rows0;
  636. short* rows1p = rows1;
  637. for ( int dx = 0; dx < w; dx++ )
  638. {
  639. int sx = xofs[dx];
  640. short a0 = ialphap[0];
  641. short a1 = ialphap[1];
  642. const unsigned char* S0p = S0 + sx;
  643. const unsigned char* S1p = S1 + sx;
  644. #if __ARM_NEON
  645. int16x4_t _a0 = vdup_n_s16(a0);
  646. int16x4_t _a1 = vdup_n_s16(a1);
  // Same gather / widen / multiply-accumulate scheme as the one-row path,
  // applied to source rows sy (S0) and sy + 1 (S1).
  647. uint8x8_t _S0 = uint8x8_t();
  648. uint8x8_t _S1 = uint8x8_t();
  649. _S0 = vld1_lane_u8(S0p, _S0, 0);
  650. _S0 = vld1_lane_u8(S0p+1, _S0, 1);
  651. _S0 = vld1_lane_u8(S0p+2, _S0, 2);
  652. _S0 = vld1_lane_u8(S0p+3, _S0, 3);
  653. _S0 = vld1_lane_u8(S0p+4, _S0, 4);
  654. _S0 = vld1_lane_u8(S0p+5, _S0, 5);
  655. _S1 = vld1_lane_u8(S1p, _S1, 0);
  656. _S1 = vld1_lane_u8(S1p+1, _S1, 1);
  657. _S1 = vld1_lane_u8(S1p+2, _S1, 2);
  658. _S1 = vld1_lane_u8(S1p+3, _S1, 3);
  659. _S1 = vld1_lane_u8(S1p+4, _S1, 4);
  660. _S1 = vld1_lane_u8(S1p+5, _S1, 5);
  661. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  662. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  663. int16x4_t _S0low = vget_low_s16(_S016);
  664. int16x4_t _S1low = vget_low_s16(_S116);
  665. int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3);
  666. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  667. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  668. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  669. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  670. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  671. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  672. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  // Each store writes 4 shorts; only the first 3 are kept (pointers advance by 3).
  673. vst1_s16(rows0p, _rows0_sr4);
  674. vst1_s16(rows1p, _rows1_sr4);
  675. #else
  676. rows0p[0] = (S0p[0]*a0 + S0p[3]*a1) >> 4;
  677. rows0p[1] = (S0p[1]*a0 + S0p[4]*a1) >> 4;
  678. rows0p[2] = (S0p[2]*a0 + S0p[5]*a1) >> 4;
  679. rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
  680. rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
  681. rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
  682. #endif // __ARM_NEON
  683. ialphap += 2;
  684. rows0p += 3;
  685. rows1p += 3;
  686. }
  687. }
  688. prev_sy1 = sy;
  689. // vresize
  // Blend rows0 and rows1 with this row's weights b0 / b1 (ibeta advances by 2 per row).
  690. short b0 = ibeta[0];
  691. short b1 = ibeta[1];
  692. short* rows0p = rows0;
  693. short* rows1p = rows1;
  694. unsigned char* Dp = dst + stride * (dy);
  // nn counts blocks of 8 output bytes (w * 3 bytes per row) handled by the NEON
  // path; remain is the scalar tail.
  695. #if __ARM_NEON
  696. int nn = (w * 3) >> 3;
  697. #else
  698. int nn = 0;
  699. #endif
  700. int remain = (w * 3) - (nn << 3);
  701. #if __ARM_NEON
  702. #if __aarch64__
  703. int16x4_t _b0 = vdup_n_s16(b0);
  704. int16x4_t _b1 = vdup_n_s16(b1);
  705. int32x4_t _v2 = vdupq_n_s32(2);
  // Per iteration: 8 values of each row are widened-multiplied by b0 / b1, each
  // product is shifted right by 16 and accumulated onto the rounding constant 2,
  // the sum is shifted right by 2 and saturated to unsigned 8-bit.
  706. for (; nn>0; nn--)
  707. {
  708. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  709. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  710. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  711. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  712. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  713. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  714. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  715. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  716. int32x4_t _acc = _v2;
  717. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  718. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  719. int32x4_t _acc_1 = _v2;
  720. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  721. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  722. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  723. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  724. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  725. vst1_u8(Dp, _D);
  726. Dp += 8;
  727. rows0p += 8;
  728. rows1p += 8;
  729. }
  730. #else
  731. if (nn > 0)
  732. {
  // ARMv7 inline assembly performing the same multiply / shift-accumulate /
  // narrow / saturate sequence as the intrinsics loop above. The next 8 values
  // of each row are loaded one iteration ahead, and both row pointers are
  // rewound by 16 bytes after the loop to undo that look-ahead.
  // NOTE(review): the last in-loop load therefore reads 8 shorts past the final
  // block consumed, and the loads carry a :128 alignment qualifier - confirm the
  // row buffers are 16-byte aligned and tolerate the over-read.
  733. asm volatile(
  734. "vdup.s16 d16, %8 \n"
  735. "mov r4, #2 \n"
  736. "vdup.s16 d17, %9 \n"
  737. "vdup.s32 q12, r4 \n"
  738. "pld [%0, #128] \n"
  739. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  740. "pld [%1, #128] \n"
  741. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  742. "0: \n"
  743. "vmull.s16 q0, d2, d16 \n"
  744. "vmull.s16 q1, d3, d16 \n"
  745. "vorr.s32 q10, q12, q12 \n"
  746. "vorr.s32 q11, q12, q12 \n"
  747. "vmull.s16 q2, d6, d17 \n"
  748. "vmull.s16 q3, d7, d17 \n"
  749. "vsra.s32 q10, q0, #16 \n"
  750. "vsra.s32 q11, q1, #16 \n"
  751. "pld [%0, #128] \n"
  752. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  753. "vsra.s32 q10, q2, #16 \n"
  754. "vsra.s32 q11, q3, #16 \n"
  755. "pld [%1, #128] \n"
  756. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  757. "vshrn.s32 d20, q10, #2 \n"
  758. "vshrn.s32 d21, q11, #2 \n"
  759. "vqmovun.s16 d20, q10 \n"
  760. "vst1.8 {d20}, [%2]! \n"
  761. "subs %3, #1 \n"
  762. "bne 0b \n"
  763. "sub %0, #16 \n"
  764. "sub %1, #16 \n"
  765. : "=r"(rows0p), // %0
  766. "=r"(rows1p), // %1
  767. "=r"(Dp), // %2
  768. "=r"(nn) // %3
  769. : "0"(rows0p),
  770. "1"(rows1p),
  771. "2"(Dp),
  772. "3"(nn),
  773. "r"(b0), // %8
  774. "r"(b1) // %9
  775. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  776. );
  777. }
  778. #endif // __aarch64__
  779. #endif // __ARM_NEON
  // Scalar tail (and the whole row when NEON is unavailable): same arithmetic as
  // the vector path - two >> 16 products, + 2 for rounding, then >> 2.
  780. for ( ; remain; --remain )
  781. {
  782. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  783. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  784. }
  785. ibeta += 2;
  786. }
  // Releases xofs, yofs, ialpha and ibeta together (they share one allocation).
  787. delete[] buf;
  788. }
  789. void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  790. {
  791. const int INTER_RESIZE_COEF_BITS=11;
  792. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  793. // const int ONE=INTER_RESIZE_COEF_SCALE;
  794. double scale_x = (double)srcw / w;
  795. double scale_y = (double)srch / h;
  796. int* buf = new int[w + h + w + h];
  797. int* xofs = buf;//new int[w];
  798. int* yofs = buf + w;//new int[h];
  799. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  800. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  801. float fx;
  802. float fy;
  803. int sx;
  804. int sy;
  805. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  806. for (int dx = 0; dx < w; dx++)
  807. {
  808. fx = (float)((dx + 0.5) * scale_x - 0.5);
  809. sx = static_cast<int>(floor(fx));
  810. fx -= sx;
  811. if (sx < 0)
  812. {
  813. sx = 0;
  814. fx = 0.f;
  815. }
  816. if (sx >= srcw - 1)
  817. {
  818. sx = srcw - 2;
  819. fx = 1.f;
  820. }
  821. xofs[dx] = sx*4;
  822. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  823. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  824. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  825. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  826. }
  827. for (int dy = 0; dy < h; dy++)
  828. {
  829. fy = (float)((dy + 0.5) * scale_y - 0.5);
  830. sy = static_cast<int>(floor(fy));
  831. fy -= sy;
  832. if (sy < 0)
  833. {
  834. sy = 0;
  835. fy = 0.f;
  836. }
  837. if (sy >= srch - 1)
  838. {
  839. sy = srch - 2;
  840. fy = 1.f;
  841. }
  842. yofs[dy] = sy;
  843. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  844. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  845. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  846. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  847. }
  848. #undef SATURATE_CAST_SHORT
  849. // loop body
  850. Mat rowsbuf0(w*4, (size_t)2u);
  851. Mat rowsbuf1(w*4, (size_t)2u);
  852. short* rows0 = (short*)rowsbuf0.data;
  853. short* rows1 = (short*)rowsbuf1.data;
  854. int prev_sy1 = -2;
  855. for (int dy = 0; dy < h; dy++ )
  856. {
  857. int sy = yofs[dy];
  858. if (sy == prev_sy1)
  859. {
  860. // reuse all rows
  861. }
  862. else if (sy == prev_sy1 + 4)
  863. {
  864. // hresize one row
  865. short* rows0_old = rows0;
  866. rows0 = rows1;
  867. rows1 = rows0_old;
  868. const unsigned char *S1 = src + srcstride * (sy+1);
  869. const short* ialphap = ialpha;
  870. short* rows1p = rows1;
  871. for ( int dx = 0; dx < w; dx++ )
  872. {
  873. int sx = xofs[dx];
  874. short a0 = ialphap[0];
  875. short a1 = ialphap[1];
  876. const unsigned char* S1p = S1 + sx;
  877. #if __ARM_NEON
  878. int16x4_t _a0 = vdup_n_s16(a0);
  879. int16x4_t _a1 = vdup_n_s16(a1);
  880. uint8x8_t _S1 = vld1_u8(S1p);
  881. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  882. int16x4_t _S1low = vget_low_s16(_S116);
  883. int16x4_t _S1high = vget_high_s16(_S116);
  884. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  885. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  886. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  887. vst1_s16(rows1p, _rows1_sr4);
  888. #else
  889. rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
  890. rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
  891. rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
  892. rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
  893. #endif // __ARM_NEON
  894. ialphap += 2;
  895. rows1p += 4;
  896. }
  897. }
  898. else
  899. {
  900. // hresize two rows
  901. const unsigned char *S0 = src + srcstride * (sy);
  902. const unsigned char *S1 = src + srcstride * (sy+1);
  903. const short* ialphap = ialpha;
  904. short* rows0p = rows0;
  905. short* rows1p = rows1;
  906. for ( int dx = 0; dx < w; dx++ )
  907. {
  908. int sx = xofs[dx];
  909. short a0 = ialphap[0];
  910. short a1 = ialphap[1];
  911. const unsigned char* S0p = S0 + sx;
  912. const unsigned char* S1p = S1 + sx;
  913. #if __ARM_NEON
  914. int16x4_t _a0 = vdup_n_s16(a0);
  915. int16x4_t _a1 = vdup_n_s16(a1);
  916. uint8x8_t _S0 = vld1_u8(S0p);
  917. uint8x8_t _S1 = vld1_u8(S1p);
  918. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  919. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  920. int16x4_t _S0low = vget_low_s16(_S016);
  921. int16x4_t _S1low = vget_low_s16(_S116);
  922. int16x4_t _S0high = vget_high_s16(_S016);
  923. int16x4_t _S1high = vget_high_s16(_S116);
  924. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  925. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  926. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  927. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  928. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  929. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  930. vst1_s16(rows0p, _rows0_sr4);
  931. vst1_s16(rows1p, _rows1_sr4);
  932. #else
  933. rows0p[0] = (S0p[0]*a0 + S0p[4]*a1) >> 4;
  934. rows0p[1] = (S0p[1]*a0 + S0p[5]*a1) >> 4;
  935. rows0p[2] = (S0p[2]*a0 + S0p[6]*a1) >> 4;
  936. rows0p[3] = (S0p[3]*a0 + S0p[7]*a1) >> 4;
  937. rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
  938. rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
  939. rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
  940. rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
  941. #endif // __ARM_NEON
  942. ialphap += 2;
  943. rows0p += 4;
  944. rows1p += 4;
  945. }
  946. }
  947. prev_sy1 = sy;
  948. // vresize
  949. short b0 = ibeta[0];
  950. short b1 = ibeta[1];
  951. short* rows0p = rows0;
  952. short* rows1p = rows1;
  953. unsigned char* Dp = dst + stride * (dy);
  954. #if __ARM_NEON
  955. int nn = (w * 4) >> 3;
  956. #else
  957. int nn = 0;
  958. #endif
  959. int remain = (w * 4) - (nn << 3);
  960. #if __ARM_NEON
  961. #if __aarch64__
  962. int16x4_t _b0 = vdup_n_s16(b0);
  963. int16x4_t _b1 = vdup_n_s16(b1);
  964. int32x4_t _v2 = vdupq_n_s32(2);
  965. for (; nn>0; nn--)
  966. {
  967. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  968. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  969. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  970. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  971. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  972. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  973. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  974. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  975. int32x4_t _acc = _v2;
  976. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  977. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  978. int32x4_t _acc_1 = _v2;
  979. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  980. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  981. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  982. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  983. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  984. vst1_u8(Dp, _D);
  985. Dp += 8;
  986. rows0p += 8;
  987. rows1p += 8;
  988. }
  989. #else
  990. if (nn > 0)
  991. {
  992. asm volatile(
  993. "vdup.s16 d16, %8 \n"
  994. "mov r4, #2 \n"
  995. "vdup.s16 d17, %9 \n"
  996. "vdup.s32 q12, r4 \n"
  997. "pld [%0, #128] \n"
  998. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  999. "pld [%1, #128] \n"
  1000. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  1001. "0: \n"
  1002. "vmull.s16 q0, d2, d16 \n"
  1003. "vmull.s16 q1, d3, d16 \n"
  1004. "vorr.s32 q10, q12, q12 \n"
  1005. "vorr.s32 q11, q12, q12 \n"
  1006. "vmull.s16 q2, d6, d17 \n"
  1007. "vmull.s16 q3, d7, d17 \n"
  1008. "vsra.s32 q10, q0, #16 \n"
  1009. "vsra.s32 q11, q1, #16 \n"
  1010. "pld [%0, #128] \n"
  1011. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  1012. "vsra.s32 q10, q2, #16 \n"
  1013. "vsra.s32 q11, q3, #16 \n"
  1014. "pld [%1, #128] \n"
  1015. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  1016. "vshrn.s32 d20, q10, #2 \n"
  1017. "vshrn.s32 d21, q11, #2 \n"
  1018. "vqmovun.s16 d20, q10 \n"
  1019. "vst1.8 {d20}, [%2]! \n"
  1020. "subs %3, #1 \n"
  1021. "bne 0b \n"
  1022. "sub %0, #16 \n"
  1023. "sub %1, #16 \n"
  1024. : "=r"(rows0p), // %0
  1025. "=r"(rows1p), // %1
  1026. "=r"(Dp), // %2
  1027. "=r"(nn) // %3
  1028. : "0"(rows0p),
  1029. "1"(rows1p),
  1030. "2"(Dp),
  1031. "3"(nn),
  1032. "r"(b0), // %8
  1033. "r"(b1) // %9
  1034. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  1035. );
  1036. }
  1037. #endif // __aarch64__
  1038. #endif // __ARM_NEON
  1039. for ( ; remain; --remain )
  1040. {
  1041. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  1042. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  1043. }
  1044. ibeta += 2;
  1045. }
  1046. delete[] buf;
  1047. }
  1048. void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  1049. {
  1050. // assert srcw % 2 == 0
  1051. // assert srch % 2 == 0
  1052. // assert w % 2 == 0
  1053. // assert h % 2 == 0
  1054. const unsigned char* srcY = src;
  1055. unsigned char* dstY = dst;
  1056. resize_bilinear_c1(srcY, srcw, srch, dstY, w, h);
  1057. const unsigned char* srcUV = src + srcw * srch;
  1058. unsigned char* dstUV = dst + w * h;
  1059. resize_bilinear_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2);
  1060. }
  1061. #endif // NCNN_PIXEL
  1062. } // namespace ncnn