You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

mat_pixel_resize.cpp 35 kB

7 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #include <limits.h>
  16. #if __ARM_NEON
  17. #include <arm_neon.h>
  18. #endif // __ARM_NEON
  19. #include "platform.h"
  20. namespace ncnn {
  21. #if NCNN_PIXEL
  22. static void vresize_two(const short* rows0p, const short* rows1p, int wsize, unsigned char* Dp0, unsigned char* Dp1, short b0, short b1, short b2, short b3)
  23. {
  24. int dx = 0;
  25. #if __ARM_NEON
  26. int16x8_t _b0 = vdupq_n_s16(b0);
  27. int16x8_t _b1 = vdupq_n_s16(b1);
  28. int16x8_t _b2 = vdupq_n_s16(b2);
  29. int16x8_t _b3 = vdupq_n_s16(b3);
  30. for (; dx + 15 < wsize; dx += 16)
  31. {
  32. int16x8_t _r00 = vld1q_s16(rows0p);
  33. int16x8_t _r01 = vld1q_s16(rows0p + 8);
  34. int16x8_t _r10 = vld1q_s16(rows1p);
  35. int16x8_t _r11 = vld1q_s16(rows1p + 8);
  36. int16x8_t _acc00 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b0), 1), vqdmulhq_s16(_r10, _b1), 1);
  37. int16x8_t _acc01 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b0), 1), vqdmulhq_s16(_r11, _b1), 1);
  38. int16x8_t _acc10 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b2), 1), vqdmulhq_s16(_r10, _b3), 1);
  39. int16x8_t _acc11 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b2), 1), vqdmulhq_s16(_r11, _b3), 1);
  40. uint8x16_t _Dp0 = vcombine_u8(vqrshrun_n_s16(_acc00, 2), vqrshrun_n_s16(_acc01, 2));
  41. uint8x16_t _Dp1 = vcombine_u8(vqrshrun_n_s16(_acc10, 2), vqrshrun_n_s16(_acc11, 2));
  42. vst1q_u8(Dp0, _Dp0);
  43. vst1q_u8(Dp1, _Dp1);
  44. Dp0 += 16;
  45. Dp1 += 16;
  46. rows0p += 16;
  47. rows1p += 16;
  48. }
  49. for (; dx + 7 < wsize; dx += 8)
  50. {
  51. int16x8_t _r0 = vld1q_s16(rows0p);
  52. int16x8_t _r1 = vld1q_s16(rows1p);
  53. int16x8_t _acc0 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b0), 1), vqdmulhq_s16(_r1, _b1), 1);
  54. int16x8_t _acc1 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b2), 1), vqdmulhq_s16(_r1, _b3), 1);
  55. uint8x8_t _Dp0 = vqrshrun_n_s16(_acc0, 2);
  56. uint8x8_t _Dp1 = vqrshrun_n_s16(_acc1, 2);
  57. vst1_u8(Dp0, _Dp0);
  58. vst1_u8(Dp1, _Dp1);
  59. Dp0 += 8;
  60. Dp1 += 8;
  61. rows0p += 8;
  62. rows1p += 8;
  63. }
  64. #endif // __ARM_NEON
  65. #if __SSE2__
  66. __m128i _b0 = _mm_set1_epi16(b0);
  67. __m128i _b1 = _mm_set1_epi16(b1);
  68. __m128i _b2 = _mm_set1_epi16(b2);
  69. __m128i _b3 = _mm_set1_epi16(b3);
  70. __m128i _v2 = _mm_set1_epi16(2);
  71. for (; dx + 15 < wsize; dx += 16)
  72. {
  73. __m128i _r00 = _mm_loadu_si128((const __m128i*)rows0p);
  74. __m128i _r01 = _mm_loadu_si128((const __m128i*)(rows0p + 8));
  75. __m128i _r10 = _mm_loadu_si128((const __m128i*)rows1p);
  76. __m128i _r11 = _mm_loadu_si128((const __m128i*)(rows1p + 8));
  77. __m128i _acc00 = _mm_add_epi16(_mm_mulhi_epi16(_r00, _b0), _mm_mulhi_epi16(_r10, _b1));
  78. __m128i _acc01 = _mm_add_epi16(_mm_mulhi_epi16(_r01, _b0), _mm_mulhi_epi16(_r11, _b1));
  79. __m128i _acc10 = _mm_add_epi16(_mm_mulhi_epi16(_r00, _b2), _mm_mulhi_epi16(_r10, _b3));
  80. __m128i _acc11 = _mm_add_epi16(_mm_mulhi_epi16(_r01, _b2), _mm_mulhi_epi16(_r11, _b3));
  81. _acc00 = _mm_srai_epi16(_mm_add_epi16(_acc00, _v2), 2);
  82. _acc01 = _mm_srai_epi16(_mm_add_epi16(_acc01, _v2), 2);
  83. _acc10 = _mm_srai_epi16(_mm_add_epi16(_acc10, _v2), 2);
  84. _acc11 = _mm_srai_epi16(_mm_add_epi16(_acc11, _v2), 2);
  85. __m128i _Dp0 = _mm_packus_epi16(_acc00, _acc01);
  86. __m128i _Dp1 = _mm_packus_epi16(_acc10, _acc11);
  87. _mm_storeu_si128((__m128i*)Dp0, _Dp0);
  88. _mm_storeu_si128((__m128i*)Dp1, _Dp1);
  89. Dp0 += 16;
  90. Dp1 += 16;
  91. rows0p += 16;
  92. rows1p += 16;
  93. }
  94. for (; dx + 7 < wsize; dx += 8)
  95. {
  96. __m128i _r0 = _mm_loadu_si128((const __m128i*)rows0p);
  97. __m128i _r1 = _mm_loadu_si128((const __m128i*)rows1p);
  98. __m128i _acc0 = _mm_add_epi16(_mm_mulhi_epi16(_r0, _b0), _mm_mulhi_epi16(_r1, _b1));
  99. __m128i _acc1 = _mm_add_epi16(_mm_mulhi_epi16(_r0, _b2), _mm_mulhi_epi16(_r1, _b3));
  100. _acc0 = _mm_srai_epi16(_mm_add_epi16(_acc0, _v2), 2);
  101. _acc1 = _mm_srai_epi16(_mm_add_epi16(_acc1, _v2), 2);
  102. __m128i _Dp0 = _mm_packus_epi16(_acc0, _acc0);
  103. __m128i _Dp1 = _mm_packus_epi16(_acc1, _acc1);
  104. _mm_storel_epi64((__m128i*)Dp0, _Dp0);
  105. _mm_storel_epi64((__m128i*)Dp1, _Dp1);
  106. Dp0 += 8;
  107. Dp1 += 8;
  108. rows0p += 8;
  109. rows1p += 8;
  110. }
  111. #endif // __SSE2__
  112. for (; dx < wsize; dx++)
  113. {
  114. short s0 = *rows0p++;
  115. short s1 = *rows1p++;
  116. *Dp0++ = (unsigned char)(((short)((b0 * s0) >> 16) + (short)((b1 * s1) >> 16) + 2) >> 2);
  117. *Dp1++ = (unsigned char)(((short)((b2 * s0) >> 16) + (short)((b3 * s1) >> 16) + 2) >> 2);
  118. }
  119. }
  120. static void vresize_one(const short* rows0p, const short* rows1p, int wsize, unsigned char* Dp, short b0, short b1)
  121. {
  122. int dx = 0;
  123. #if __ARM_NEON
  124. int16x8_t _b0 = vdupq_n_s16(b0);
  125. int16x8_t _b1 = vdupq_n_s16(b1);
  126. for (; dx + 15 < wsize; dx += 16)
  127. {
  128. int16x8_t _r00 = vld1q_s16(rows0p);
  129. int16x8_t _r01 = vld1q_s16(rows0p + 8);
  130. int16x8_t _r10 = vld1q_s16(rows1p);
  131. int16x8_t _r11 = vld1q_s16(rows1p + 8);
  132. int16x8_t _acc0 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b0), 1), vqdmulhq_s16(_r10, _b1), 1);
  133. int16x8_t _acc1 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b0), 1), vqdmulhq_s16(_r11, _b1), 1);
  134. uint8x16_t _Dp = vcombine_u8(vqrshrun_n_s16(_acc0, 2), vqrshrun_n_s16(_acc1, 2));
  135. vst1q_u8(Dp, _Dp);
  136. Dp += 16;
  137. rows0p += 16;
  138. rows1p += 16;
  139. }
  140. for (; dx + 7 < wsize; dx += 8)
  141. {
  142. int16x8_t _r0 = vld1q_s16(rows0p);
  143. int16x8_t _r1 = vld1q_s16(rows1p);
  144. int16x8_t _acc = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b0), 1), vqdmulhq_s16(_r1, _b1), 1);
  145. uint8x8_t _Dp = vqrshrun_n_s16(_acc, 2);
  146. vst1_u8(Dp, _Dp);
  147. Dp += 8;
  148. rows0p += 8;
  149. rows1p += 8;
  150. }
  151. #endif // __ARM_NEON
  152. #if __SSE2__
  153. __m128i _b0 = _mm_set1_epi16(b0);
  154. __m128i _b1 = _mm_set1_epi16(b1);
  155. __m128i _v2 = _mm_set1_epi16(2);
  156. for (; dx + 15 < wsize; dx += 16)
  157. {
  158. __m128i _r00 = _mm_loadu_si128((const __m128i*)rows0p);
  159. __m128i _r01 = _mm_loadu_si128((const __m128i*)(rows0p + 8));
  160. __m128i _r10 = _mm_loadu_si128((const __m128i*)rows1p);
  161. __m128i _r11 = _mm_loadu_si128((const __m128i*)(rows1p + 8));
  162. __m128i _acc0 = _mm_add_epi16(_mm_mulhi_epi16(_r00, _b0), _mm_mulhi_epi16(_r10, _b1));
  163. __m128i _acc1 = _mm_add_epi16(_mm_mulhi_epi16(_r01, _b0), _mm_mulhi_epi16(_r11, _b1));
  164. _acc0 = _mm_srai_epi16(_mm_add_epi16(_acc0, _v2), 2);
  165. _acc1 = _mm_srai_epi16(_mm_add_epi16(_acc1, _v2), 2);
  166. __m128i _Dp = _mm_packus_epi16(_acc0, _acc1);
  167. _mm_storeu_si128((__m128i*)Dp, _Dp);
  168. Dp += 16;
  169. rows0p += 16;
  170. rows1p += 16;
  171. }
  172. for (; dx + 7 < wsize; dx += 8)
  173. {
  174. __m128i _r0 = _mm_loadu_si128((const __m128i*)rows0p);
  175. __m128i _r1 = _mm_loadu_si128((const __m128i*)rows1p);
  176. __m128i _acc = _mm_add_epi16(_mm_mulhi_epi16(_r0, _b0), _mm_mulhi_epi16(_r1, _b1));
  177. _acc = _mm_srai_epi16(_mm_add_epi16(_acc, _v2), 2);
  178. __m128i _Dp = _mm_packus_epi16(_acc, _acc);
  179. _mm_storel_epi64((__m128i*)Dp, _Dp);
  180. Dp += 8;
  181. rows0p += 8;
  182. rows1p += 8;
  183. }
  184. #endif // __SSE2__
  185. for (; dx < wsize; dx++)
  186. {
  187. short s0 = *rows0p++;
  188. short s1 = *rows1p++;
  189. *Dp++ = (unsigned char)(((short)((b0 * s0) >> 16) + (short)((b1 * s1) >> 16) + 2) >> 2);
  190. }
  191. }
  192. void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  193. {
  194. return resize_bilinear_c1(src, srcw, srch, srcw, dst, w, h, w);
  195. }
  196. void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  197. {
  198. return resize_bilinear_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2);
  199. }
  200. void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  201. {
  202. return resize_bilinear_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3);
  203. }
  204. void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  205. {
  206. return resize_bilinear_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4);
  207. }
  208. void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  209. {
  210. const int INTER_RESIZE_COEF_BITS = 11;
  211. const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
  212. double scale_x = (double)srcw / w;
  213. double scale_y = (double)srch / h;
  214. int* buf = new int[w + h + w + h];
  215. int* xofs = buf; //new int[w];
  216. int* yofs = buf + w; //new int[h];
  217. short* ialpha = (short*)(buf + w + h); //new short[w * 2];
  218. short* ibeta = (short*)(buf + w + h + w); //new short[h * 2];
  219. float fx;
  220. float fy;
  221. int sx;
  222. int sy;
  223. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  224. for (int dx = 0; dx < w; dx++)
  225. {
  226. fx = (float)((dx + 0.5) * scale_x - 0.5);
  227. sx = static_cast<int>(floor(fx));
  228. fx -= sx;
  229. if (sx < 0)
  230. {
  231. sx = 0;
  232. fx = 0.f;
  233. }
  234. if (sx >= srcw - 1)
  235. {
  236. sx = srcw - 2;
  237. fx = 1.f;
  238. }
  239. xofs[dx] = sx;
  240. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  241. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  242. ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
  243. ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
  244. }
  245. for (int dy = 0; dy < h; dy++)
  246. {
  247. fy = (float)((dy + 0.5) * scale_y - 0.5);
  248. sy = static_cast<int>(floor(fy));
  249. fy -= sy;
  250. if (sy < 0)
  251. {
  252. sy = 0;
  253. fy = 0.f;
  254. }
  255. if (sy >= srch - 1)
  256. {
  257. sy = srch - 2;
  258. fy = 1.f;
  259. }
  260. yofs[dy] = sy;
  261. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  262. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  263. ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
  264. ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
  265. }
  266. #undef SATURATE_CAST_SHORT
  267. // loop body
  268. Mat rowsbuf0(w, (size_t)2u);
  269. Mat rowsbuf1(w, (size_t)2u);
  270. short* rows0 = (short*)rowsbuf0.data;
  271. short* rows1 = (short*)rowsbuf1.data;
  272. int prev_sy1 = -2;
  273. for (int dy = 0; dy < h; dy++)
  274. {
  275. sy = yofs[dy];
  276. if (sy == prev_sy1)
  277. {
  278. // reuse all rows
  279. }
  280. else if (sy == prev_sy1 + 1)
  281. {
  282. // hresize one row
  283. short* rows0_old = rows0;
  284. rows0 = rows1;
  285. rows1 = rows0_old;
  286. const unsigned char* S1 = src + srcstride * (sy + 1);
  287. const short* ialphap = ialpha;
  288. short* rows1p = rows1;
  289. for (int dx = 0; dx < w; dx++)
  290. {
  291. sx = xofs[dx];
  292. short a0 = ialphap[0];
  293. short a1 = ialphap[1];
  294. const unsigned char* S1p = S1 + sx;
  295. rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
  296. ialphap += 2;
  297. }
  298. }
  299. else
  300. {
  301. // hresize two rows
  302. const unsigned char* S0 = src + srcstride * (sy);
  303. const unsigned char* S1 = src + srcstride * (sy + 1);
  304. const short* ialphap = ialpha;
  305. short* rows0p = rows0;
  306. short* rows1p = rows1;
  307. for (int dx = 0; dx < w; dx++)
  308. {
  309. sx = xofs[dx];
  310. short a0 = ialphap[0];
  311. short a1 = ialphap[1];
  312. const unsigned char* S0p = S0 + sx;
  313. const unsigned char* S1p = S1 + sx;
  314. rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4;
  315. rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
  316. ialphap += 2;
  317. }
  318. }
  319. prev_sy1 = sy;
  320. if (dy + 1 < h && yofs[dy + 1] == sy)
  321. {
  322. // vresize for two rows
  323. unsigned char* Dp0 = dst + stride * dy;
  324. unsigned char* Dp1 = dst + stride * (dy + 1);
  325. vresize_two(rows0, rows1, w, Dp0, Dp1, ibeta[0], ibeta[1], ibeta[2], ibeta[3]);
  326. ibeta += 4;
  327. dy += 1;
  328. }
  329. else
  330. {
  331. // vresize
  332. unsigned char* Dp = dst + stride * dy;
  333. vresize_one(rows0, rows1, w, Dp, ibeta[0], ibeta[1]);
  334. ibeta += 2;
  335. }
  336. }
  337. delete[] buf;
  338. }
  339. void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  340. {
  341. const int INTER_RESIZE_COEF_BITS = 11;
  342. const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
  343. double scale_x = (double)srcw / w;
  344. double scale_y = (double)srch / h;
  345. int* buf = new int[w + h + w + h];
  346. int* xofs = buf; //new int[w];
  347. int* yofs = buf + w; //new int[h];
  348. short* ialpha = (short*)(buf + w + h); //new short[w * 2];
  349. short* ibeta = (short*)(buf + w + h + w); //new short[h * 2];
  350. float fx;
  351. float fy;
  352. int sx;
  353. int sy;
  354. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  355. for (int dx = 0; dx < w; dx++)
  356. {
  357. fx = (float)((dx + 0.5) * scale_x - 0.5);
  358. sx = static_cast<int>(floor(fx));
  359. fx -= sx;
  360. if (sx < 0)
  361. {
  362. sx = 0;
  363. fx = 0.f;
  364. }
  365. if (sx >= srcw - 1)
  366. {
  367. sx = srcw - 2;
  368. fx = 1.f;
  369. }
  370. xofs[dx] = sx * 2;
  371. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  372. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  373. ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
  374. ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
  375. }
  376. for (int dy = 0; dy < h; dy++)
  377. {
  378. fy = (float)((dy + 0.5) * scale_y - 0.5);
  379. sy = static_cast<int>(floor(fy));
  380. fy -= sy;
  381. if (sy < 0)
  382. {
  383. sy = 0;
  384. fy = 0.f;
  385. }
  386. if (sy >= srch - 1)
  387. {
  388. sy = srch - 2;
  389. fy = 1.f;
  390. }
  391. yofs[dy] = sy;
  392. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  393. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  394. ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
  395. ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
  396. }
  397. #undef SATURATE_CAST_SHORT
  398. // loop body
  399. Mat rowsbuf0(w * 2 + 2, (size_t)2u);
  400. Mat rowsbuf1(w * 2 + 2, (size_t)2u);
  401. short* rows0 = (short*)rowsbuf0.data;
  402. short* rows1 = (short*)rowsbuf1.data;
  403. int prev_sy1 = -2;
  404. for (int dy = 0; dy < h; dy++)
  405. {
  406. sy = yofs[dy];
  407. if (sy == prev_sy1)
  408. {
  409. // reuse all rows
  410. }
  411. else if (sy == prev_sy1 + 1)
  412. {
  413. // hresize one row
  414. short* rows0_old = rows0;
  415. rows0 = rows1;
  416. rows1 = rows0_old;
  417. const unsigned char* S1 = src + srcstride * (sy + 1);
  418. const short* ialphap = ialpha;
  419. short* rows1p = rows1;
  420. for (int dx = 0; dx < w; dx++)
  421. {
  422. sx = xofs[dx];
  423. const unsigned char* S1p = S1 + sx;
  424. #if __ARM_NEON
  425. int16x4_t _a0a1XX = vld1_s16(ialphap);
  426. int16x4_t _a0a0a1a1 = vzip_s16(_a0a1XX, _a0a1XX).val[0];
  427. uint8x8_t _S1 = uint8x8_t();
  428. _S1 = vld1_lane_u8(S1p, _S1, 0);
  429. _S1 = vld1_lane_u8(S1p + 1, _S1, 1);
  430. _S1 = vld1_lane_u8(S1p + 2, _S1, 2);
  431. _S1 = vld1_lane_u8(S1p + 3, _S1, 3);
  432. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  433. int16x4_t _S1lowhigh = vget_low_s16(_S116);
  434. int32x4_t _S1ma0a1 = vmull_s16(_S1lowhigh, _a0a0a1a1);
  435. int32x2_t _rows1low = vadd_s32(vget_low_s32(_S1ma0a1), vget_high_s32(_S1ma0a1));
  436. int32x4_t _rows1 = vcombine_s32(_rows1low, vget_high_s32(_S1ma0a1));
  437. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  438. vst1_s16(rows1p, _rows1_sr4);
  439. #else
  440. short a0 = ialphap[0];
  441. short a1 = ialphap[1];
  442. rows1p[0] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
  443. rows1p[1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
  444. #endif // __ARM_NEON
  445. ialphap += 2;
  446. rows1p += 2;
  447. }
  448. }
  449. else
  450. {
  451. // hresize two rows
  452. const unsigned char* S0 = src + srcstride * (sy);
  453. const unsigned char* S1 = src + srcstride * (sy + 1);
  454. const short* ialphap = ialpha;
  455. short* rows0p = rows0;
  456. short* rows1p = rows1;
  457. for (int dx = 0; dx < w; dx++)
  458. {
  459. sx = xofs[dx];
  460. short a0 = ialphap[0];
  461. short a1 = ialphap[1];
  462. const unsigned char* S0p = S0 + sx;
  463. const unsigned char* S1p = S1 + sx;
  464. #if __ARM_NEON
  465. int16x4_t _a0 = vdup_n_s16(a0);
  466. int16x4_t _a1 = vdup_n_s16(a1);
  467. uint8x8_t _S0 = uint8x8_t();
  468. uint8x8_t _S1 = uint8x8_t();
  469. _S0 = vld1_lane_u8(S0p, _S0, 0);
  470. _S0 = vld1_lane_u8(S0p + 1, _S0, 1);
  471. _S0 = vld1_lane_u8(S0p + 2, _S0, 2);
  472. _S0 = vld1_lane_u8(S0p + 3, _S0, 3);
  473. _S1 = vld1_lane_u8(S1p, _S1, 0);
  474. _S1 = vld1_lane_u8(S1p + 1, _S1, 1);
  475. _S1 = vld1_lane_u8(S1p + 2, _S1, 2);
  476. _S1 = vld1_lane_u8(S1p + 3, _S1, 3);
  477. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  478. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  479. int16x4_t _S0lowhigh = vget_low_s16(_S016);
  480. int16x4_t _S1lowhigh = vget_low_s16(_S116);
  481. int32x2x2_t _S0S1low_S0S1high = vtrn_s32(vreinterpret_s32_s16(_S0lowhigh), vreinterpret_s32_s16(_S1lowhigh));
  482. int32x4_t _rows01 = vmull_s16(vreinterpret_s16_s32(_S0S1low_S0S1high.val[0]), _a0);
  483. _rows01 = vmlal_s16(_rows01, vreinterpret_s16_s32(_S0S1low_S0S1high.val[1]), _a1);
  484. int16x4_t _rows01_sr4 = vshrn_n_s32(_rows01, 4);
  485. int16x4_t _rows1_sr4 = vext_s16(_rows01_sr4, _rows01_sr4, 2);
  486. vst1_s16(rows0p, _rows01_sr4);
  487. vst1_s16(rows1p, _rows1_sr4);
  488. #else
  489. rows0p[0] = (S0p[0] * a0 + S0p[2] * a1) >> 4;
  490. rows0p[1] = (S0p[1] * a0 + S0p[3] * a1) >> 4;
  491. rows1p[0] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
  492. rows1p[1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
  493. #endif // __ARM_NEON
  494. ialphap += 2;
  495. rows0p += 2;
  496. rows1p += 2;
  497. }
  498. }
  499. prev_sy1 = sy;
  500. if (dy + 1 < h && yofs[dy + 1] == sy)
  501. {
  502. // vresize for two rows
  503. unsigned char* Dp0 = dst + stride * dy;
  504. unsigned char* Dp1 = dst + stride * (dy + 1);
  505. vresize_two(rows0, rows1, w * 2, Dp0, Dp1, ibeta[0], ibeta[1], ibeta[2], ibeta[3]);
  506. ibeta += 4;
  507. dy += 1;
  508. }
  509. else
  510. {
  511. // vresize
  512. unsigned char* Dp = dst + stride * dy;
  513. vresize_one(rows0, rows1, w * 2, Dp, ibeta[0], ibeta[1]);
  514. ibeta += 2;
  515. }
  516. }
  517. delete[] buf;
  518. }
  519. void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  520. {
  521. const int INTER_RESIZE_COEF_BITS = 11;
  522. const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
  523. double scale_x = (double)srcw / w;
  524. double scale_y = (double)srch / h;
  525. int* buf = new int[w + h + w + h];
  526. int* xofs = buf; //new int[w];
  527. int* yofs = buf + w; //new int[h];
  528. short* ialpha = (short*)(buf + w + h); //new short[w * 2];
  529. short* ibeta = (short*)(buf + w + h + w); //new short[h * 2];
  530. float fx;
  531. float fy;
  532. int sx;
  533. int sy;
  534. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  535. for (int dx = 0; dx < w; dx++)
  536. {
  537. fx = (float)((dx + 0.5) * scale_x - 0.5);
  538. sx = static_cast<int>(floor(fx));
  539. fx -= sx;
  540. if (sx < 0)
  541. {
  542. sx = 0;
  543. fx = 0.f;
  544. }
  545. if (sx >= srcw - 1)
  546. {
  547. sx = srcw - 2;
  548. fx = 1.f;
  549. }
  550. xofs[dx] = sx * 3;
  551. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  552. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  553. ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
  554. ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
  555. }
  556. for (int dy = 0; dy < h; dy++)
  557. {
  558. fy = (float)((dy + 0.5) * scale_y - 0.5);
  559. sy = static_cast<int>(floor(fy));
  560. fy -= sy;
  561. if (sy < 0)
  562. {
  563. sy = 0;
  564. fy = 0.f;
  565. }
  566. if (sy >= srch - 1)
  567. {
  568. sy = srch - 2;
  569. fy = 1.f;
  570. }
  571. yofs[dy] = sy;
  572. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  573. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  574. ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
  575. ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
  576. }
  577. #undef SATURATE_CAST_SHORT
  578. // loop body
  579. Mat rowsbuf0(w * 3 + 1, (size_t)2u);
  580. Mat rowsbuf1(w * 3 + 1, (size_t)2u);
  581. short* rows0 = (short*)rowsbuf0.data;
  582. short* rows1 = (short*)rowsbuf1.data;
  583. int prev_sy1 = -2;
  584. for (int dy = 0; dy < h; dy++)
  585. {
  586. sy = yofs[dy];
  587. if (sy == prev_sy1)
  588. {
  589. // reuse all rows
  590. }
  591. else if (sy == prev_sy1 + 1)
  592. {
  593. // hresize one row
  594. short* rows0_old = rows0;
  595. rows0 = rows1;
  596. rows1 = rows0_old;
  597. const unsigned char* S1 = src + srcstride * (sy + 1);
  598. const short* ialphap = ialpha;
  599. short* rows1p = rows1;
  600. for (int dx = 0; dx < w; dx++)
  601. {
  602. sx = xofs[dx];
  603. short a0 = ialphap[0];
  604. short a1 = ialphap[1];
  605. const unsigned char* S1p = S1 + sx;
  606. #if __ARM_NEON
  607. int16x4_t _a0 = vdup_n_s16(a0);
  608. int16x4_t _a1 = vdup_n_s16(a1);
  609. uint8x8_t _S1 = uint8x8_t();
  610. _S1 = vld1_lane_u8(S1p, _S1, 0);
  611. _S1 = vld1_lane_u8(S1p + 1, _S1, 1);
  612. _S1 = vld1_lane_u8(S1p + 2, _S1, 2);
  613. _S1 = vld1_lane_u8(S1p + 3, _S1, 3);
  614. _S1 = vld1_lane_u8(S1p + 4, _S1, 4);
  615. _S1 = vld1_lane_u8(S1p + 5, _S1, 5);
  616. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  617. int16x4_t _S1low = vget_low_s16(_S116);
  618. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  619. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  620. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  621. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  622. vst1_s16(rows1p, _rows1_sr4);
  623. #else
  624. rows1p[0] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
  625. rows1p[1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
  626. rows1p[2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
  627. #endif // __ARM_NEON
  628. ialphap += 2;
  629. rows1p += 3;
  630. }
  631. }
  632. else
  633. {
  634. // hresize two rows
  635. const unsigned char* S0 = src + srcstride * (sy);
  636. const unsigned char* S1 = src + srcstride * (sy + 1);
  637. const short* ialphap = ialpha;
  638. short* rows0p = rows0;
  639. short* rows1p = rows1;
  640. for (int dx = 0; dx < w; dx++)
  641. {
  642. sx = xofs[dx];
  643. short a0 = ialphap[0];
  644. short a1 = ialphap[1];
  645. const unsigned char* S0p = S0 + sx;
  646. const unsigned char* S1p = S1 + sx;
  647. #if __ARM_NEON
  648. int16x4_t _a0 = vdup_n_s16(a0);
  649. int16x4_t _a1 = vdup_n_s16(a1);
  650. uint8x8_t _S0 = uint8x8_t();
  651. uint8x8_t _S1 = uint8x8_t();
  652. _S0 = vld1_lane_u8(S0p, _S0, 0);
  653. _S0 = vld1_lane_u8(S0p + 1, _S0, 1);
  654. _S0 = vld1_lane_u8(S0p + 2, _S0, 2);
  655. _S0 = vld1_lane_u8(S0p + 3, _S0, 3);
  656. _S0 = vld1_lane_u8(S0p + 4, _S0, 4);
  657. _S0 = vld1_lane_u8(S0p + 5, _S0, 5);
  658. _S1 = vld1_lane_u8(S1p, _S1, 0);
  659. _S1 = vld1_lane_u8(S1p + 1, _S1, 1);
  660. _S1 = vld1_lane_u8(S1p + 2, _S1, 2);
  661. _S1 = vld1_lane_u8(S1p + 3, _S1, 3);
  662. _S1 = vld1_lane_u8(S1p + 4, _S1, 4);
  663. _S1 = vld1_lane_u8(S1p + 5, _S1, 5);
  664. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  665. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  666. int16x4_t _S0low = vget_low_s16(_S016);
  667. int16x4_t _S1low = vget_low_s16(_S116);
  668. int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3);
  669. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  670. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  671. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  672. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  673. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  674. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  675. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  676. vst1_s16(rows0p, _rows0_sr4);
  677. vst1_s16(rows1p, _rows1_sr4);
  678. #else
  679. rows0p[0] = (S0p[0] * a0 + S0p[3] * a1) >> 4;
  680. rows0p[1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
  681. rows0p[2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
  682. rows1p[0] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
  683. rows1p[1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
  684. rows1p[2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
  685. #endif // __ARM_NEON
  686. ialphap += 2;
  687. rows0p += 3;
  688. rows1p += 3;
  689. }
  690. }
  691. prev_sy1 = sy;
  692. if (dy + 1 < h && yofs[dy + 1] == sy)
  693. {
  694. // vresize for two rows
  695. unsigned char* Dp0 = dst + stride * dy;
  696. unsigned char* Dp1 = dst + stride * (dy + 1);
  697. vresize_two(rows0, rows1, w * 3, Dp0, Dp1, ibeta[0], ibeta[1], ibeta[2], ibeta[3]);
  698. ibeta += 4;
  699. dy += 1;
  700. }
  701. else
  702. {
  703. // vresize
  704. unsigned char* Dp = dst + stride * dy;
  705. vresize_one(rows0, rows1, w * 3, Dp, ibeta[0], ibeta[1]);
  706. ibeta += 2;
  707. }
  708. }
  709. delete[] buf;
  710. }
  711. void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  712. {
  713. const int INTER_RESIZE_COEF_BITS = 11;
  714. const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
  715. double scale_x = (double)srcw / w;
  716. double scale_y = (double)srch / h;
  717. int* buf = new int[w + h + w + h];
  718. int* xofs = buf; //new int[w];
  719. int* yofs = buf + w; //new int[h];
  720. short* ialpha = (short*)(buf + w + h); //new short[w * 2];
  721. short* ibeta = (short*)(buf + w + h + w); //new short[h * 2];
  722. float fx;
  723. float fy;
  724. int sx;
  725. int sy;
  726. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  727. for (int dx = 0; dx < w; dx++)
  728. {
  729. fx = (float)((dx + 0.5) * scale_x - 0.5);
  730. sx = static_cast<int>(floor(fx));
  731. fx -= sx;
  732. if (sx < 0)
  733. {
  734. sx = 0;
  735. fx = 0.f;
  736. }
  737. if (sx >= srcw - 1)
  738. {
  739. sx = srcw - 2;
  740. fx = 1.f;
  741. }
  742. xofs[dx] = sx * 4;
  743. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  744. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  745. ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
  746. ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
  747. }
  748. for (int dy = 0; dy < h; dy++)
  749. {
  750. fy = (float)((dy + 0.5) * scale_y - 0.5);
  751. sy = static_cast<int>(floor(fy));
  752. fy -= sy;
  753. if (sy < 0)
  754. {
  755. sy = 0;
  756. fy = 0.f;
  757. }
  758. if (sy >= srch - 1)
  759. {
  760. sy = srch - 2;
  761. fy = 1.f;
  762. }
  763. yofs[dy] = sy;
  764. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  765. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  766. ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
  767. ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
  768. }
  769. #undef SATURATE_CAST_SHORT
  770. // loop body
  771. Mat rowsbuf0(w * 4, (size_t)2u);
  772. Mat rowsbuf1(w * 4, (size_t)2u);
  773. short* rows0 = (short*)rowsbuf0.data;
  774. short* rows1 = (short*)rowsbuf1.data;
  775. int prev_sy1 = -2;
  776. for (int dy = 0; dy < h; dy++)
  777. {
  778. sy = yofs[dy];
  779. if (sy == prev_sy1)
  780. {
  781. // reuse all rows
  782. }
  783. else if (sy == prev_sy1 + 1)
  784. {
  785. // hresize one row
  786. short* rows0_old = rows0;
  787. rows0 = rows1;
  788. rows1 = rows0_old;
  789. const unsigned char* S1 = src + srcstride * (sy + 1);
  790. const short* ialphap = ialpha;
  791. short* rows1p = rows1;
  792. for (int dx = 0; dx < w; dx++)
  793. {
  794. sx = xofs[dx];
  795. short a0 = ialphap[0];
  796. short a1 = ialphap[1];
  797. const unsigned char* S1p = S1 + sx;
  798. #if __ARM_NEON
  799. int16x4_t _a0 = vdup_n_s16(a0);
  800. int16x4_t _a1 = vdup_n_s16(a1);
  801. uint8x8_t _S1 = vld1_u8(S1p);
  802. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  803. int16x4_t _S1low = vget_low_s16(_S116);
  804. int16x4_t _S1high = vget_high_s16(_S116);
  805. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  806. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  807. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  808. vst1_s16(rows1p, _rows1_sr4);
  809. #else
  810. rows1p[0] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
  811. rows1p[1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
  812. rows1p[2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
  813. rows1p[3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
  814. #endif // __ARM_NEON
  815. ialphap += 2;
  816. rows1p += 4;
  817. }
  818. }
  819. else
  820. {
  821. // hresize two rows
  822. const unsigned char* S0 = src + srcstride * (sy);
  823. const unsigned char* S1 = src + srcstride * (sy + 1);
  824. const short* ialphap = ialpha;
  825. short* rows0p = rows0;
  826. short* rows1p = rows1;
  827. for (int dx = 0; dx < w; dx++)
  828. {
  829. sx = xofs[dx];
  830. short a0 = ialphap[0];
  831. short a1 = ialphap[1];
  832. const unsigned char* S0p = S0 + sx;
  833. const unsigned char* S1p = S1 + sx;
  834. #if __ARM_NEON
  835. int16x4_t _a0 = vdup_n_s16(a0);
  836. int16x4_t _a1 = vdup_n_s16(a1);
  837. uint8x8_t _S0 = vld1_u8(S0p);
  838. uint8x8_t _S1 = vld1_u8(S1p);
  839. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  840. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  841. int16x4_t _S0low = vget_low_s16(_S016);
  842. int16x4_t _S1low = vget_low_s16(_S116);
  843. int16x4_t _S0high = vget_high_s16(_S016);
  844. int16x4_t _S1high = vget_high_s16(_S116);
  845. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  846. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  847. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  848. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  849. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  850. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  851. vst1_s16(rows0p, _rows0_sr4);
  852. vst1_s16(rows1p, _rows1_sr4);
  853. #else
  854. rows0p[0] = (S0p[0] * a0 + S0p[4] * a1) >> 4;
  855. rows0p[1] = (S0p[1] * a0 + S0p[5] * a1) >> 4;
  856. rows0p[2] = (S0p[2] * a0 + S0p[6] * a1) >> 4;
  857. rows0p[3] = (S0p[3] * a0 + S0p[7] * a1) >> 4;
  858. rows1p[0] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
  859. rows1p[1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
  860. rows1p[2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
  861. rows1p[3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
  862. #endif // __ARM_NEON
  863. ialphap += 2;
  864. rows0p += 4;
  865. rows1p += 4;
  866. }
  867. }
  868. prev_sy1 = sy;
  869. if (dy + 1 < h && yofs[dy + 1] == sy)
  870. {
  871. // vresize for two rows
  872. unsigned char* Dp0 = dst + stride * dy;
  873. unsigned char* Dp1 = dst + stride * (dy + 1);
  874. vresize_two(rows0, rows1, w * 4, Dp0, Dp1, ibeta[0], ibeta[1], ibeta[2], ibeta[3]);
  875. ibeta += 4;
  876. dy += 1;
  877. }
  878. else
  879. {
  880. // vresize
  881. unsigned char* Dp = dst + stride * dy;
  882. vresize_one(rows0, rows1, w * 4, Dp, ibeta[0], ibeta[1]);
  883. ibeta += 2;
  884. }
  885. }
  886. delete[] buf;
  887. }
  888. void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  889. {
  890. // assert srcw % 2 == 0
  891. // assert srch % 2 == 0
  892. // assert w % 2 == 0
  893. // assert h % 2 == 0
  894. const unsigned char* srcY = src;
  895. unsigned char* dstY = dst;
  896. resize_bilinear_c1(srcY, srcw, srch, dstY, w, h);
  897. const unsigned char* srcUV = src + srcw * srch;
  898. unsigned char* dstUV = dst + w * h;
  899. resize_bilinear_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2);
  900. }
  901. #endif // NCNN_PIXEL
  902. } // namespace ncnn