You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

mat_pixel_resize.cpp 41 kB

7 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #include <limits.h>
  16. #include <math.h>
  17. #if __ARM_NEON
  18. #include <arm_neon.h>
  19. #endif // __ARM_NEON
  20. #include "platform.h"
  21. namespace ncnn {
  22. #if NCNN_PIXEL
  23. void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  24. {
  25. return resize_bilinear_c1(src, srcw, srch, srcw, dst, w, h, w);
  26. }
  27. void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  28. {
  29. return resize_bilinear_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2);
  30. }
  31. void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  32. {
  33. return resize_bilinear_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3);
  34. }
  35. void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  36. {
  37. return resize_bilinear_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4);
  38. }
  // Bilinear resize of a single-channel 8-bit image.
  //   src / srcw / srch / srcstride : source pixels, size, and byte distance between source rows
  //   dst / w / h / stride          : destination pixels, size, and byte distance between destination rows
  // Interpolation weights are fixed-point with scale 1 << 11. Work is split into a horizontal
  // pass that fills two 16-bit row buffers and a vertical pass that blends them into each output row.
  39. void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  40. {
  41. const int INTER_RESIZE_COEF_BITS = 11;
  42. const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
  43. // const int ONE=INTER_RESIZE_COEF_SCALE;
  // Source-to-destination size ratios.
  // NOTE(review): w and h are not validated here; w == 0 or h == 0 divides by zero — confirm callers guard this.
  44. double scale_x = (double)srcw / w;
  45. double scale_y = (double)srch / h;
  // One heap block carved into four tables: xofs (w ints), yofs (h ints),
  // ialpha (2 shorts per output column) and ibeta (2 shorts per output row).
  46. int* buf = new int[w + h + w + h];
  47. int* xofs = buf; //new int[w];
  48. int* yofs = buf + w; //new int[h];
  49. short* ialpha = (short*)(buf + w + h); //new short[w * 2];
  50. short* ibeta = (short*)(buf + w + h + w); //new short[h * 2];
  51. float fx;
  52. float fy;
  53. int sx;
  54. int sy;
  // Rounds half away from zero, then clamps to [SHRT_MIN, SHRT_MAX].
  // NOTE(review): the expansion ends with ';' and X is not parenthesized, so the macro is only
  // safe when used as the whole right-hand side of an assignment with a plain variable argument.
  55. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  // Per output column: left source column (xofs) and the fixed-point weights of the left/right taps (ialpha).
  56. for (int dx = 0; dx < w; dx++)
  57. {
  // Pixel-centre mapping: destination centre dx + 0.5 scaled into source space, minus 0.5.
  58. fx = (float)((dx + 0.5) * scale_x - 0.5);
  59. sx = static_cast<int>(floor(fx));
  60. fx -= sx;
  // Clamp at the left edge: use column 0 with full weight on the left tap.
  61. if (sx < 0)
  62. {
  63. sx = 0;
  64. fx = 0.f;
  65. }
  // Clamp at the right edge: use the last column pair with full weight on the right tap.
  // NOTE(review): with srcw == 1 this yields sx == -1 (a0 == 0), so the horizontal pass reads the
  // byte just before the row start, multiplied by zero — confirm callers never pass srcw < 2.
  66. if (sx >= srcw - 1)
  67. {
  68. sx = srcw - 2;
  69. fx = 1.f;
  70. }
  71. xofs[dx] = sx;
  72. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  73. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  74. ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
  75. ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
  76. }
  // Per output row: upper source row (yofs) and the fixed-point weights of the upper/lower taps (ibeta).
  77. for (int dy = 0; dy < h; dy++)
  78. {
  79. fy = (float)((dy + 0.5) * scale_y - 0.5);
  80. sy = static_cast<int>(floor(fy));
  81. fy -= sy;
  82. if (sy < 0)
  83. {
  84. sy = 0;
  85. fy = 0.f;
  86. }
  87. if (sy >= srch - 1)
  88. {
  89. sy = srch - 2;
  90. fy = 1.f;
  91. }
  92. yofs[dy] = sy;
  93. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  94. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  95. ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
  96. ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
  97. }
  98. #undef SATURATE_CAST_SHORT
  99. // loop body
  // rows0 / rows1 hold the horizontally interpolated source rows sy and sy + 1, one short per output column.
  100. Mat rowsbuf0(w, (size_t)2u);
  101. Mat rowsbuf1(w, (size_t)2u);
  102. short* rows0 = (short*)rowsbuf0.data;
  103. short* rows1 = (short*)rowsbuf1.data;
  // -2 matches neither sy nor sy - 1 for any sy >= 0, so the first iteration fills both row buffers.
  // NOTE(review): with srch == 1, sy is clamped to -1 == prev_sy1 + 1, so only rows1 is filled and
  // rows0 is read unfilled with weight b0 == 0 — confirm callers never pass srch < 2.
  104. int prev_sy1 = -2;
  105. for (int dy = 0; dy < h; dy++)
  106. {
  107. sy = yofs[dy];
  108. if (sy == prev_sy1)
  109. {
  110. // reuse all rows
  111. }
  112. else if (sy == prev_sy1 + 1)
  113. {
  114. // hresize one row
  // The previous lower row becomes the new upper row (pointer swap); only row sy + 1 is recomputed.
  115. short* rows0_old = rows0;
  116. rows0 = rows1;
  117. rows1 = rows0_old;
  118. const unsigned char* S1 = src + srcstride * (sy + 1);
  119. const short* ialphap = ialpha;
  120. short* rows1p = rows1;
  121. for (int dx = 0; dx < w; dx++)
  122. {
  123. sx = xofs[dx];
  124. short a0 = ialphap[0];
  125. short a1 = ialphap[1];
  126. const unsigned char* S1p = S1 + sx;
  // Weighted sum is scaled by 2^11; >> 4 keeps 7 fractional bits so 255 * 2048 >> 4 = 32640 fits in a short.
  127. rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
  128. ialphap += 2;
  129. }
  130. }
  131. else
  132. {
  133. // hresize two rows
  134. const unsigned char* S0 = src + srcstride * (sy);
  135. const unsigned char* S1 = src + srcstride * (sy + 1);
  136. const short* ialphap = ialpha;
  137. short* rows0p = rows0;
  138. short* rows1p = rows1;
  139. for (int dx = 0; dx < w; dx++)
  140. {
  141. sx = xofs[dx];
  142. short a0 = ialphap[0];
  143. short a1 = ialphap[1];
  144. const unsigned char* S0p = S0 + sx;
  145. const unsigned char* S1p = S1 + sx;
  146. rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4;
  147. rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
  148. ialphap += 2;
  149. }
  150. }
  151. prev_sy1 = sy;
  152. // vresize
  // Blend the two row buffers with this output row's weights. Each product is shifted right by 16,
  // the sum is rounded (+2) and shifted by 2; together with the earlier >> 4 that removes 22 = 2 * 11 bits.
  153. short b0 = ibeta[0];
  154. short b1 = ibeta[1];
  155. short* rows0p = rows0;
  156. short* rows1p = rows1;
  157. unsigned char* Dp = dst + stride * (dy);
  // The NEON paths produce 8 output bytes per iteration; the scalar loop handles the rest
  // (or everything when NEON is unavailable, since nn is then 0).
  158. #if __ARM_NEON
  159. int nn = w >> 3;
  160. #else
  161. int nn = 0;
  162. #endif
  163. int remain = w - (nn << 3);
  164. #if __ARM_NEON
  165. #if __aarch64__
  166. int16x4_t _b0 = vdup_n_s16(b0);
  167. int16x4_t _b1 = vdup_n_s16(b1);
  168. int32x4_t _v2 = vdupq_n_s32(2);
  169. for (; nn > 0; nn--)
  170. {
  171. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  172. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  173. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
  174. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
  175. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  176. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  177. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  178. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  // Accumulators start at the rounding constant 2; each vsra adds (product >> 16).
  179. int32x4_t _acc = _v2;
  180. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  181. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  182. int32x4_t _acc_1 = _v2;
  183. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  184. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  185. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  186. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  // Saturating narrow to unsigned 8-bit (the scalar tail below uses a plain cast instead).
  187. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  188. vst1_u8(Dp, _D);
  189. Dp += 8;
  190. rows0p += 8;
  191. rows1p += 8;
  192. }
  193. #else
  // ARMv7 inline assembly performing the same computation as the aarch64 intrinsics above.
  // It is software-pipelined: the first 8 shorts of each row are loaded before the loop and the next
  // 8 inside it, so both row pointers overshoot by 16 bytes and are rewound after the loop.
  // NOTE(review): the final in-loop load reads 16 bytes past the 8 * nn elements consumed, and the
  // ":128" qualifier requires 16-byte aligned row buffers — presumably Mat guarantees both; confirm.
  194. if (nn > 0)
  195. {
  196. asm volatile(
  197. "vdup.s16 d16, %8 \n"
  198. "mov r4, #2 \n"
  199. "vdup.s16 d17, %9 \n"
  200. "vdup.s32 q12, r4 \n"
  201. "pld [%0, #128] \n"
  202. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  203. "pld [%1, #128] \n"
  204. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  205. "0: \n"
  206. "vmull.s16 q0, d2, d16 \n"
  207. "vmull.s16 q1, d3, d16 \n"
  208. "vorr.s32 q10, q12, q12 \n"
  209. "vorr.s32 q11, q12, q12 \n"
  210. "vmull.s16 q2, d6, d17 \n"
  211. "vmull.s16 q3, d7, d17 \n"
  212. "vsra.s32 q10, q0, #16 \n"
  213. "vsra.s32 q11, q1, #16 \n"
  214. "pld [%0, #128] \n"
  215. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  216. "vsra.s32 q10, q2, #16 \n"
  217. "vsra.s32 q11, q3, #16 \n"
  218. "pld [%1, #128] \n"
  219. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  220. "vshrn.s32 d20, q10, #2 \n"
  221. "vshrn.s32 d21, q11, #2 \n"
  222. "vqmovun.s16 d20, q10 \n"
  223. "vst1.8 {d20}, [%2]! \n"
  224. "subs %3, #1 \n"
  225. "bne 0b \n"
  226. "sub %0, #16 \n"
  227. "sub %1, #16 \n"
  228. : "=r"(rows0p), // %0
  229. "=r"(rows1p), // %1
  230. "=r"(Dp), // %2
  231. "=r"(nn) // %3
  232. : "0"(rows0p),
  233. "1"(rows1p),
  234. "2"(Dp),
  235. "3"(nn),
  236. "r"(b0), // %8
  237. "r"(b1) // %9
  238. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12");
  239. }
  240. #endif // __aarch64__
  241. #endif // __ARM_NEON
  // Scalar tail: same arithmetic as the vector paths, one output byte at a time.
  242. for (; remain; --remain)
  243. {
  244. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  245. *Dp++ = (unsigned char)(((short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2) >> 2);
  246. }
  // Advance to the weight pair of the next output row.
  247. ibeta += 2;
  248. }
  // buf still points at the start of the allocation; only the derived ibeta pointer was advanced.
  249. delete[] buf;
  250. }
  // Bilinear resize of a 2-channel interleaved 8-bit image.
  //   src / srcw / srch / srcstride : source pixels, size in pixels, and byte distance between source rows
  //   dst / w / h / stride          : destination pixels, size in pixels, and byte distance between destination rows
  // Interpolation weights are fixed-point with scale 1 << 11. Work is split into a horizontal
  // pass that fills two 16-bit row buffers and a vertical pass that blends them into each output row.
  251. void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  252. {
  253. const int INTER_RESIZE_COEF_BITS = 11;
  254. const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
  255. // const int ONE=INTER_RESIZE_COEF_SCALE;
  // Source-to-destination size ratios.
  // NOTE(review): w and h are not validated here; w == 0 or h == 0 divides by zero — confirm callers guard this.
  256. double scale_x = (double)srcw / w;
  257. double scale_y = (double)srch / h;
  // One heap block carved into four tables: xofs (w ints), yofs (h ints),
  // ialpha (2 shorts per output column) and ibeta (2 shorts per output row).
  258. int* buf = new int[w + h + w + h];
  259. int* xofs = buf; //new int[w];
  260. int* yofs = buf + w; //new int[h];
  261. short* ialpha = (short*)(buf + w + h); //new short[w * 2];
  262. short* ibeta = (short*)(buf + w + h + w); //new short[h * 2];
  263. float fx;
  264. float fy;
  265. int sx;
  266. int sy;
  // Rounds half away from zero, then clamps to [SHRT_MIN, SHRT_MAX].
  // NOTE(review): the expansion ends with ';' and X is not parenthesized, so the macro is only
  // safe when used as the whole right-hand side of an assignment with a plain variable argument.
  267. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  // Per output column: byte offset of the left source pixel (xofs) and the weights of the left/right taps (ialpha).
  268. for (int dx = 0; dx < w; dx++)
  269. {
  // Pixel-centre mapping: destination centre dx + 0.5 scaled into source space, minus 0.5.
  270. fx = (float)((dx + 0.5) * scale_x - 0.5);
  271. sx = static_cast<int>(floor(fx));
  272. fx -= sx;
  273. if (sx < 0)
  274. {
  275. sx = 0;
  276. fx = 0.f;
  277. }
  // NOTE(review): with srcw == 1 this yields sx == -1 (a0 == 0), so the horizontal pass reads the
  // bytes just before the row start, multiplied by zero — confirm callers never pass srcw < 2.
  278. if (sx >= srcw - 1)
  279. {
  280. sx = srcw - 2;
  281. fx = 1.f;
  282. }
  // Stored as a byte offset: 2 bytes per pixel.
  283. xofs[dx] = sx * 2;
  284. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  285. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  286. ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
  287. ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
  288. }
  // Per output row: upper source row (yofs) and the fixed-point weights of the upper/lower taps (ibeta).
  289. for (int dy = 0; dy < h; dy++)
  290. {
  291. fy = (float)((dy + 0.5) * scale_y - 0.5);
  292. sy = static_cast<int>(floor(fy));
  293. fy -= sy;
  294. if (sy < 0)
  295. {
  296. sy = 0;
  297. fy = 0.f;
  298. }
  299. if (sy >= srch - 1)
  300. {
  301. sy = srch - 2;
  302. fy = 1.f;
  303. }
  304. yofs[dy] = sy;
  305. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  306. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  307. ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
  308. ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
  309. }
  310. #undef SATURATE_CAST_SHORT
  311. // loop body
  // rows0 / rows1 hold the horizontally interpolated source rows sy and sy + 1, two shorts per output pixel.
  // The 2 extra elements absorb the NEON stores below, which write 4 shorts per pixel while advancing by 2.
  312. Mat rowsbuf0(w * 2 + 2, (size_t)2u);
  313. Mat rowsbuf1(w * 2 + 2, (size_t)2u);
  314. short* rows0 = (short*)rowsbuf0.data;
  315. short* rows1 = (short*)rowsbuf1.data;
  // -2 matches neither sy nor sy - 1 for any sy >= 0, so the first iteration fills both row buffers.
  // NOTE(review): with srch == 1, sy is clamped to -1 == prev_sy1 + 1, so only rows1 is filled and
  // rows0 is read unfilled with weight b0 == 0 — confirm callers never pass srch < 2.
  316. int prev_sy1 = -2;
  317. for (int dy = 0; dy < h; dy++)
  318. {
  319. sy = yofs[dy];
  320. if (sy == prev_sy1)
  321. {
  322. // reuse all rows
  323. }
  324. else if (sy == prev_sy1 + 1)
  325. {
  326. // hresize one row
  // The previous lower row becomes the new upper row (pointer swap); only row sy + 1 is recomputed.
  327. short* rows0_old = rows0;
  328. rows0 = rows1;
  329. rows1 = rows0_old;
  330. const unsigned char* S1 = src + srcstride * (sy + 1);
  331. const short* ialphap = ialpha;
  332. short* rows1p = rows1;
  333. for (int dx = 0; dx < w; dx++)
  334. {
  335. sx = xofs[dx];
  336. const unsigned char* S1p = S1 + sx;
  337. #if __ARM_NEON
  // Loads 4 shorts starting at the a0/a1 pair; only the first two are used. For the last column the
  // extra two come from the ibeta table that follows ialpha inside buf.
  338. int16x4_t _a0a1XX = vld1_s16(ialphap);
  339. int16x4_t _a0a0a1a1 = vzip_s16(_a0a1XX, _a0a1XX).val[0];
  // Gather the two adjacent pixels (4 bytes) lane by lane.
  340. uint8x8_t _S1 = uint8x8_t();
  341. _S1 = vld1_lane_u8(S1p, _S1, 0);
  342. _S1 = vld1_lane_u8(S1p + 1, _S1, 1);
  343. _S1 = vld1_lane_u8(S1p + 2, _S1, 2);
  344. _S1 = vld1_lane_u8(S1p + 3, _S1, 3);
  345. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  346. int16x4_t _S1lowhigh = vget_low_s16(_S116);
  // Lanes: [p0c0*a0, p0c1*a0, p1c0*a1, p1c1*a1]; adding the high half to the low half gives both channels.
  347. int32x4_t _S1ma0a1 = vmull_s16(_S1lowhigh, _a0a0a1a1);
  348. int32x2_t _rows1low = vadd_s32(vget_low_s32(_S1ma0a1), vget_high_s32(_S1ma0a1));
  349. int32x4_t _rows1 = vcombine_s32(_rows1low, vget_high_s32(_S1ma0a1));
  350. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  // Stores 4 shorts; only the first 2 are meaningful, the rest are overwritten by the next pixel or land in the slack.
  351. vst1_s16(rows1p, _rows1_sr4);
  352. #else
  353. short a0 = ialphap[0];
  354. short a1 = ialphap[1];
  // Same channel of the neighbouring pixel is 2 bytes further on.
  355. rows1p[0] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
  356. rows1p[1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
  357. #endif // __ARM_NEON
  358. ialphap += 2;
  359. rows1p += 2;
  360. }
  361. }
  362. else
  363. {
  364. // hresize two rows
  365. const unsigned char* S0 = src + srcstride * (sy);
  366. const unsigned char* S1 = src + srcstride * (sy + 1);
  367. const short* ialphap = ialpha;
  368. short* rows0p = rows0;
  369. short* rows1p = rows1;
  370. for (int dx = 0; dx < w; dx++)
  371. {
  372. sx = xofs[dx];
  373. short a0 = ialphap[0];
  374. short a1 = ialphap[1];
  375. const unsigned char* S0p = S0 + sx;
  376. const unsigned char* S1p = S1 + sx;
  377. #if __ARM_NEON
  378. int16x4_t _a0 = vdup_n_s16(a0);
  379. int16x4_t _a1 = vdup_n_s16(a1);
  380. uint8x8_t _S0 = uint8x8_t();
  381. uint8x8_t _S1 = uint8x8_t();
  382. _S0 = vld1_lane_u8(S0p, _S0, 0);
  383. _S0 = vld1_lane_u8(S0p + 1, _S0, 1);
  384. _S0 = vld1_lane_u8(S0p + 2, _S0, 2);
  385. _S0 = vld1_lane_u8(S0p + 3, _S0, 3);
  386. _S1 = vld1_lane_u8(S1p, _S1, 0);
  387. _S1 = vld1_lane_u8(S1p + 1, _S1, 1);
  388. _S1 = vld1_lane_u8(S1p + 2, _S1, 2);
  389. _S1 = vld1_lane_u8(S1p + 3, _S1, 3);
  390. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  391. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  392. int16x4_t _S0lowhigh = vget_low_s16(_S016);
  393. int16x4_t _S1lowhigh = vget_low_s16(_S116);
  // Transpose 32-bit halves so val[0] = left pixels of rows S0,S1 and val[1] = right pixels of rows S0,S1;
  // one multiply-accumulate then yields [row0 c0, row0 c1, row1 c0, row1 c1].
  394. int32x2x2_t _S0S1low_S0S1high = vtrn_s32(vreinterpret_s32_s16(_S0lowhigh), vreinterpret_s32_s16(_S1lowhigh));
  395. int32x4_t _rows01 = vmull_s16(vreinterpret_s16_s32(_S0S1low_S0S1high.val[0]), _a0);
  396. _rows01 = vmlal_s16(_rows01, vreinterpret_s16_s32(_S0S1low_S0S1high.val[1]), _a1);
  397. int16x4_t _rows01_sr4 = vshrn_n_s32(_rows01, 4);
  // Rotate by two lanes so the row-1 results come first for the rows1 store.
  398. int16x4_t _rows1_sr4 = vext_s16(_rows01_sr4, _rows01_sr4, 2);
  // Each store writes 4 shorts; only the first 2 are meaningful for that buffer.
  399. vst1_s16(rows0p, _rows01_sr4);
  400. vst1_s16(rows1p, _rows1_sr4);
  401. #else
  402. rows0p[0] = (S0p[0] * a0 + S0p[2] * a1) >> 4;
  403. rows0p[1] = (S0p[1] * a0 + S0p[3] * a1) >> 4;
  404. rows1p[0] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
  405. rows1p[1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
  406. #endif // __ARM_NEON
  407. ialphap += 2;
  408. rows0p += 2;
  409. rows1p += 2;
  410. }
  411. }
  412. prev_sy1 = sy;
  413. // vresize
  // Blend the two row buffers with this output row's weights. Each product is shifted right by 16,
  // the sum is rounded (+2) and shifted by 2; together with the earlier >> 4 that removes 22 = 2 * 11 bits.
  414. short b0 = ibeta[0];
  415. short b1 = ibeta[1];
  416. short* rows0p = rows0;
  417. short* rows1p = rows1;
  418. unsigned char* Dp = dst + stride * (dy);
  // A row has w * 2 output bytes; the NEON paths produce 8 per iteration and the scalar loop handles
  // the rest (or everything when NEON is unavailable, since nn is then 0).
  419. #if __ARM_NEON
  420. int nn = (w * 2) >> 3;
  421. #else
  422. int nn = 0;
  423. #endif
  424. int remain = (w * 2) - (nn << 3);
  425. #if __ARM_NEON
  426. #if __aarch64__
  427. int16x4_t _b0 = vdup_n_s16(b0);
  428. int16x4_t _b1 = vdup_n_s16(b1);
  429. int32x4_t _v2 = vdupq_n_s32(2);
  430. for (; nn > 0; nn--)
  431. {
  432. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  433. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  434. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
  435. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
  436. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  437. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  438. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  439. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  // Accumulators start at the rounding constant 2; each vsra adds (product >> 16).
  440. int32x4_t _acc = _v2;
  441. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  442. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  443. int32x4_t _acc_1 = _v2;
  444. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  445. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  446. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  447. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  // Saturating narrow to unsigned 8-bit (the scalar tail below uses a plain cast instead).
  448. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  449. vst1_u8(Dp, _D);
  450. Dp += 8;
  451. rows0p += 8;
  452. rows1p += 8;
  453. }
  454. #else
  // ARMv7 inline assembly performing the same computation as the aarch64 intrinsics above.
  // It is software-pipelined: the first 8 shorts of each row are loaded before the loop and the next
  // 8 inside it, so both row pointers overshoot by 16 bytes and are rewound after the loop.
  // NOTE(review): the final in-loop load reads 16 bytes past the 8 * nn elements consumed, and the
  // ":128" qualifier requires 16-byte aligned row buffers — presumably Mat guarantees both; confirm.
  455. if (nn > 0)
  456. {
  457. asm volatile(
  458. "vdup.s16 d16, %8 \n"
  459. "mov r4, #2 \n"
  460. "vdup.s16 d17, %9 \n"
  461. "vdup.s32 q12, r4 \n"
  462. "pld [%0, #128] \n"
  463. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  464. "pld [%1, #128] \n"
  465. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  466. "0: \n"
  467. "vmull.s16 q0, d2, d16 \n"
  468. "vmull.s16 q1, d3, d16 \n"
  469. "vorr.s32 q10, q12, q12 \n"
  470. "vorr.s32 q11, q12, q12 \n"
  471. "vmull.s16 q2, d6, d17 \n"
  472. "vmull.s16 q3, d7, d17 \n"
  473. "vsra.s32 q10, q0, #16 \n"
  474. "vsra.s32 q11, q1, #16 \n"
  475. "pld [%0, #128] \n"
  476. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  477. "vsra.s32 q10, q2, #16 \n"
  478. "vsra.s32 q11, q3, #16 \n"
  479. "pld [%1, #128] \n"
  480. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  481. "vshrn.s32 d20, q10, #2 \n"
  482. "vshrn.s32 d21, q11, #2 \n"
  483. "vqmovun.s16 d20, q10 \n"
  484. "vst1.8 {d20}, [%2]! \n"
  485. "subs %3, #1 \n"
  486. "bne 0b \n"
  487. "sub %0, #16 \n"
  488. "sub %1, #16 \n"
  489. : "=r"(rows0p), // %0
  490. "=r"(rows1p), // %1
  491. "=r"(Dp), // %2
  492. "=r"(nn) // %3
  493. : "0"(rows0p),
  494. "1"(rows1p),
  495. "2"(Dp),
  496. "3"(nn),
  497. "r"(b0), // %8
  498. "r"(b1) // %9
  499. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12");
  500. }
  501. #endif // __aarch64__
  502. #endif // __ARM_NEON
  // Scalar tail: same arithmetic as the vector paths, one output byte at a time.
  503. for (; remain; --remain)
  504. {
  505. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  506. *Dp++ = (unsigned char)(((short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2) >> 2);
  507. }
  // Advance to the weight pair of the next output row.
  508. ibeta += 2;
  509. }
  // buf still points at the start of the allocation; only the derived ibeta pointer was advanced.
  510. delete[] buf;
  511. }
  // Bilinear resize of a 3-channel interleaved 8-bit image.
  //   src / srcw / srch / srcstride : source pixels, size in pixels, and byte distance between source rows
  //   dst / w / h / stride          : destination pixels, size in pixels, and byte distance between destination rows
  // Interpolation weights are fixed-point with scale 1 << 11. Work is split into a horizontal
  // pass that fills two 16-bit row buffers and a vertical pass that blends them into each output row.
  512. void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  513. {
  514. const int INTER_RESIZE_COEF_BITS = 11;
  515. const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
  516. // const int ONE=INTER_RESIZE_COEF_SCALE;
  // Source-to-destination size ratios.
  // NOTE(review): w and h are not validated here; w == 0 or h == 0 divides by zero — confirm callers guard this.
  517. double scale_x = (double)srcw / w;
  518. double scale_y = (double)srch / h;
  // One heap block carved into four tables: xofs (w ints), yofs (h ints),
  // ialpha (2 shorts per output column) and ibeta (2 shorts per output row).
  519. int* buf = new int[w + h + w + h];
  520. int* xofs = buf; //new int[w];
  521. int* yofs = buf + w; //new int[h];
  522. short* ialpha = (short*)(buf + w + h); //new short[w * 2];
  523. short* ibeta = (short*)(buf + w + h + w); //new short[h * 2];
  524. float fx;
  525. float fy;
  526. int sx;
  527. int sy;
  // Rounds half away from zero, then clamps to [SHRT_MIN, SHRT_MAX].
  // NOTE(review): the expansion ends with ';' and X is not parenthesized, so the macro is only
  // safe when used as the whole right-hand side of an assignment with a plain variable argument.
  528. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  // Per output column: byte offset of the left source pixel (xofs) and the weights of the left/right taps (ialpha).
  529. for (int dx = 0; dx < w; dx++)
  530. {
  // Pixel-centre mapping: destination centre dx + 0.5 scaled into source space, minus 0.5.
  531. fx = (float)((dx + 0.5) * scale_x - 0.5);
  532. sx = static_cast<int>(floor(fx));
  533. fx -= sx;
  534. if (sx < 0)
  535. {
  536. sx = 0;
  537. fx = 0.f;
  538. }
  // NOTE(review): with srcw == 1 this yields sx == -1 (a0 == 0), so the horizontal pass reads the
  // bytes just before the row start, multiplied by zero — confirm callers never pass srcw < 2.
  539. if (sx >= srcw - 1)
  540. {
  541. sx = srcw - 2;
  542. fx = 1.f;
  543. }
  // Stored as a byte offset: 3 bytes per pixel.
  544. xofs[dx] = sx * 3;
  545. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  546. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  547. ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
  548. ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
  549. }
  // Per output row: upper source row (yofs) and the fixed-point weights of the upper/lower taps (ibeta).
  550. for (int dy = 0; dy < h; dy++)
  551. {
  552. fy = (float)((dy + 0.5) * scale_y - 0.5);
  553. sy = static_cast<int>(floor(fy));
  554. fy -= sy;
  555. if (sy < 0)
  556. {
  557. sy = 0;
  558. fy = 0.f;
  559. }
  560. if (sy >= srch - 1)
  561. {
  562. sy = srch - 2;
  563. fy = 1.f;
  564. }
  565. yofs[dy] = sy;
  566. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  567. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  568. ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
  569. ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
  570. }
  571. #undef SATURATE_CAST_SHORT
  572. // loop body
  // rows0 / rows1 hold the horizontally interpolated source rows sy and sy + 1, three shorts per output pixel.
  // The 1 extra element absorbs the NEON stores below, which write 4 shorts per pixel while advancing by 3.
  573. Mat rowsbuf0(w * 3 + 1, (size_t)2u);
  574. Mat rowsbuf1(w * 3 + 1, (size_t)2u);
  575. short* rows0 = (short*)rowsbuf0.data;
  576. short* rows1 = (short*)rowsbuf1.data;
  // -2 matches neither sy nor sy - 1 for any sy >= 0, so the first iteration fills both row buffers.
  // NOTE(review): with srch == 1, sy is clamped to -1 == prev_sy1 + 1, so only rows1 is filled and
  // rows0 is read unfilled with weight b0 == 0 — confirm callers never pass srch < 2.
  577. int prev_sy1 = -2;
  578. for (int dy = 0; dy < h; dy++)
  579. {
  580. sy = yofs[dy];
  581. if (sy == prev_sy1)
  582. {
  583. // reuse all rows
  584. }
  585. else if (sy == prev_sy1 + 1)
  586. {
  587. // hresize one row
  // The previous lower row becomes the new upper row (pointer swap); only row sy + 1 is recomputed.
  588. short* rows0_old = rows0;
  589. rows0 = rows1;
  590. rows1 = rows0_old;
  591. const unsigned char* S1 = src + srcstride * (sy + 1);
  592. const short* ialphap = ialpha;
  593. short* rows1p = rows1;
  594. for (int dx = 0; dx < w; dx++)
  595. {
  596. sx = xofs[dx];
  597. short a0 = ialphap[0];
  598. short a1 = ialphap[1];
  599. const unsigned char* S1p = S1 + sx;
  600. #if __ARM_NEON
  601. int16x4_t _a0 = vdup_n_s16(a0);
  602. int16x4_t _a1 = vdup_n_s16(a1);
  // Gather the two adjacent pixels (6 bytes) lane by lane; lanes 6 and 7 stay at their initial value.
  603. uint8x8_t _S1 = uint8x8_t();
  604. _S1 = vld1_lane_u8(S1p, _S1, 0);
  605. _S1 = vld1_lane_u8(S1p + 1, _S1, 1);
  606. _S1 = vld1_lane_u8(S1p + 2, _S1, 2);
  607. _S1 = vld1_lane_u8(S1p + 3, _S1, 3);
  608. _S1 = vld1_lane_u8(S1p + 4, _S1, 4);
  609. _S1 = vld1_lane_u8(S1p + 5, _S1, 5);
  610. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  // _S1low starts at the left pixel (lanes 0-2); vext by 3 builds a vector starting at the right pixel.
  611. int16x4_t _S1low = vget_low_s16(_S116);
  612. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  613. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  614. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  615. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  // Stores 4 shorts; only the first 3 are meaningful, the 4th is overwritten by the next pixel or lands in the slack.
  616. vst1_s16(rows1p, _rows1_sr4);
  617. #else
  // Same channel of the neighbouring pixel is 3 bytes further on.
  618. rows1p[0] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
  619. rows1p[1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
  620. rows1p[2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
  621. #endif // __ARM_NEON
  622. ialphap += 2;
  623. rows1p += 3;
  624. }
  625. }
  626. else
  627. {
  628. // hresize two rows
  629. const unsigned char* S0 = src + srcstride * (sy);
  630. const unsigned char* S1 = src + srcstride * (sy + 1);
  631. const short* ialphap = ialpha;
  632. short* rows0p = rows0;
  633. short* rows1p = rows1;
  634. for (int dx = 0; dx < w; dx++)
  635. {
  636. sx = xofs[dx];
  637. short a0 = ialphap[0];
  638. short a1 = ialphap[1];
  639. const unsigned char* S0p = S0 + sx;
  640. const unsigned char* S1p = S1 + sx;
  641. #if __ARM_NEON
  642. int16x4_t _a0 = vdup_n_s16(a0);
  643. int16x4_t _a1 = vdup_n_s16(a1);
  644. uint8x8_t _S0 = uint8x8_t();
  645. uint8x8_t _S1 = uint8x8_t();
  646. _S0 = vld1_lane_u8(S0p, _S0, 0);
  647. _S0 = vld1_lane_u8(S0p + 1, _S0, 1);
  648. _S0 = vld1_lane_u8(S0p + 2, _S0, 2);
  649. _S0 = vld1_lane_u8(S0p + 3, _S0, 3);
  650. _S0 = vld1_lane_u8(S0p + 4, _S0, 4);
  651. _S0 = vld1_lane_u8(S0p + 5, _S0, 5);
  652. _S1 = vld1_lane_u8(S1p, _S1, 0);
  653. _S1 = vld1_lane_u8(S1p + 1, _S1, 1);
  654. _S1 = vld1_lane_u8(S1p + 2, _S1, 2);
  655. _S1 = vld1_lane_u8(S1p + 3, _S1, 3);
  656. _S1 = vld1_lane_u8(S1p + 4, _S1, 4);
  657. _S1 = vld1_lane_u8(S1p + 5, _S1, 5);
  658. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  659. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  // *low vectors start at the left pixel; *high vectors (vext by 3) start at the right pixel.
  660. int16x4_t _S0low = vget_low_s16(_S016);
  661. int16x4_t _S1low = vget_low_s16(_S116);
  662. int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3);
  663. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  664. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  665. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  666. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  667. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  668. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  669. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  // Each store writes 4 shorts; only the first 3 are meaningful.
  670. vst1_s16(rows0p, _rows0_sr4);
  671. vst1_s16(rows1p, _rows1_sr4);
  672. #else
  673. rows0p[0] = (S0p[0] * a0 + S0p[3] * a1) >> 4;
  674. rows0p[1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
  675. rows0p[2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
  676. rows1p[0] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
  677. rows1p[1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
  678. rows1p[2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
  679. #endif // __ARM_NEON
  680. ialphap += 2;
  681. rows0p += 3;
  682. rows1p += 3;
  683. }
  684. }
  685. prev_sy1 = sy;
  686. // vresize
  // Blend the two row buffers with this output row's weights. Each product is shifted right by 16,
  // the sum is rounded (+2) and shifted by 2; together with the earlier >> 4 that removes 22 = 2 * 11 bits.
  687. short b0 = ibeta[0];
  688. short b1 = ibeta[1];
  689. short* rows0p = rows0;
  690. short* rows1p = rows1;
  691. unsigned char* Dp = dst + stride * (dy);
  // A row has w * 3 output bytes; the NEON paths produce 8 per iteration and the scalar loop handles
  // the rest (or everything when NEON is unavailable, since nn is then 0).
  692. #if __ARM_NEON
  693. int nn = (w * 3) >> 3;
  694. #else
  695. int nn = 0;
  696. #endif
  697. int remain = (w * 3) - (nn << 3);
  698. #if __ARM_NEON
  699. #if __aarch64__
  700. int16x4_t _b0 = vdup_n_s16(b0);
  701. int16x4_t _b1 = vdup_n_s16(b1);
  702. int32x4_t _v2 = vdupq_n_s32(2);
  703. for (; nn > 0; nn--)
  704. {
  705. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  706. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  707. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
  708. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
  709. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  710. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  711. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  712. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  // Accumulators start at the rounding constant 2; each vsra adds (product >> 16).
  713. int32x4_t _acc = _v2;
  714. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  715. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  716. int32x4_t _acc_1 = _v2;
  717. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  718. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  719. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  720. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  // Saturating narrow to unsigned 8-bit (the scalar tail below uses a plain cast instead).
  721. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  722. vst1_u8(Dp, _D);
  723. Dp += 8;
  724. rows0p += 8;
  725. rows1p += 8;
  726. }
  727. #else
  // ARMv7 inline assembly performing the same computation as the aarch64 intrinsics above.
  // It is software-pipelined: the first 8 shorts of each row are loaded before the loop and the next
  // 8 inside it, so both row pointers overshoot by 16 bytes and are rewound after the loop.
  // NOTE(review): the final in-loop load reads 16 bytes past the 8 * nn elements consumed, and the
  // ":128" qualifier requires 16-byte aligned row buffers — presumably Mat guarantees both; confirm.
  728. if (nn > 0)
  729. {
  730. asm volatile(
  731. "vdup.s16 d16, %8 \n"
  732. "mov r4, #2 \n"
  733. "vdup.s16 d17, %9 \n"
  734. "vdup.s32 q12, r4 \n"
  735. "pld [%0, #128] \n"
  736. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  737. "pld [%1, #128] \n"
  738. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  739. "0: \n"
  740. "vmull.s16 q0, d2, d16 \n"
  741. "vmull.s16 q1, d3, d16 \n"
  742. "vorr.s32 q10, q12, q12 \n"
  743. "vorr.s32 q11, q12, q12 \n"
  744. "vmull.s16 q2, d6, d17 \n"
  745. "vmull.s16 q3, d7, d17 \n"
  746. "vsra.s32 q10, q0, #16 \n"
  747. "vsra.s32 q11, q1, #16 \n"
  748. "pld [%0, #128] \n"
  749. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  750. "vsra.s32 q10, q2, #16 \n"
  751. "vsra.s32 q11, q3, #16 \n"
  752. "pld [%1, #128] \n"
  753. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  754. "vshrn.s32 d20, q10, #2 \n"
  755. "vshrn.s32 d21, q11, #2 \n"
  756. "vqmovun.s16 d20, q10 \n"
  757. "vst1.8 {d20}, [%2]! \n"
  758. "subs %3, #1 \n"
  759. "bne 0b \n"
  760. "sub %0, #16 \n"
  761. "sub %1, #16 \n"
  762. : "=r"(rows0p), // %0
  763. "=r"(rows1p), // %1
  764. "=r"(Dp), // %2
  765. "=r"(nn) // %3
  766. : "0"(rows0p),
  767. "1"(rows1p),
  768. "2"(Dp),
  769. "3"(nn),
  770. "r"(b0), // %8
  771. "r"(b1) // %9
  772. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12");
  773. }
  774. #endif // __aarch64__
  775. #endif // __ARM_NEON
  // Scalar tail: same arithmetic as the vector paths, one output byte at a time.
  776. for (; remain; --remain)
  777. {
  778. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  779. *Dp++ = (unsigned char)(((short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2) >> 2);
  780. }
  // Advance to the weight pair of the next output row.
  781. ibeta += 2;
  782. }
  // buf still points at the start of the allocation; only the derived ibeta pointer was advanced.
  783. delete[] buf;
  784. }
  785. void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  786. {
  787. const int INTER_RESIZE_COEF_BITS = 11;
  788. const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
  789. // const int ONE=INTER_RESIZE_COEF_SCALE;
  790. double scale_x = (double)srcw / w;
  791. double scale_y = (double)srch / h;
  792. int* buf = new int[w + h + w + h];
  793. int* xofs = buf; //new int[w];
  794. int* yofs = buf + w; //new int[h];
  795. short* ialpha = (short*)(buf + w + h); //new short[w * 2];
  796. short* ibeta = (short*)(buf + w + h + w); //new short[h * 2];
  797. float fx;
  798. float fy;
  799. int sx;
  800. int sy;
  801. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  802. for (int dx = 0; dx < w; dx++)
  803. {
  804. fx = (float)((dx + 0.5) * scale_x - 0.5);
  805. sx = static_cast<int>(floor(fx));
  806. fx -= sx;
  807. if (sx < 0)
  808. {
  809. sx = 0;
  810. fx = 0.f;
  811. }
  812. if (sx >= srcw - 1)
  813. {
  814. sx = srcw - 2;
  815. fx = 1.f;
  816. }
  817. xofs[dx] = sx * 4;
  818. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  819. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  820. ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
  821. ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
  822. }
  823. for (int dy = 0; dy < h; dy++)
  824. {
  825. fy = (float)((dy + 0.5) * scale_y - 0.5);
  826. sy = static_cast<int>(floor(fy));
  827. fy -= sy;
  828. if (sy < 0)
  829. {
  830. sy = 0;
  831. fy = 0.f;
  832. }
  833. if (sy >= srch - 1)
  834. {
  835. sy = srch - 2;
  836. fy = 1.f;
  837. }
  838. yofs[dy] = sy;
  839. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  840. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  841. ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
  842. ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
  843. }
  844. #undef SATURATE_CAST_SHORT
  845. // loop body
  846. Mat rowsbuf0(w * 4, (size_t)2u);
  847. Mat rowsbuf1(w * 4, (size_t)2u);
  848. short* rows0 = (short*)rowsbuf0.data;
  849. short* rows1 = (short*)rowsbuf1.data;
  850. int prev_sy1 = -2;
  851. for (int dy = 0; dy < h; dy++)
  852. {
  853. sy = yofs[dy];
  854. if (sy == prev_sy1)
  855. {
  856. // reuse all rows
  857. }
  858. else if (sy == prev_sy1 + 1)
  859. {
  860. // hresize one row
  861. short* rows0_old = rows0;
  862. rows0 = rows1;
  863. rows1 = rows0_old;
  864. const unsigned char* S1 = src + srcstride * (sy + 1);
  865. const short* ialphap = ialpha;
  866. short* rows1p = rows1;
  867. for (int dx = 0; dx < w; dx++)
  868. {
  869. sx = xofs[dx];
  870. short a0 = ialphap[0];
  871. short a1 = ialphap[1];
  872. const unsigned char* S1p = S1 + sx;
  873. #if __ARM_NEON
  874. int16x4_t _a0 = vdup_n_s16(a0);
  875. int16x4_t _a1 = vdup_n_s16(a1);
  876. uint8x8_t _S1 = vld1_u8(S1p);
  877. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  878. int16x4_t _S1low = vget_low_s16(_S116);
  879. int16x4_t _S1high = vget_high_s16(_S116);
  880. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  881. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  882. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  883. vst1_s16(rows1p, _rows1_sr4);
  884. #else
  885. rows1p[0] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
  886. rows1p[1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
  887. rows1p[2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
  888. rows1p[3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
  889. #endif // __ARM_NEON
  890. ialphap += 2;
  891. rows1p += 4;
  892. }
  893. }
  894. else
  895. {
  896. // hresize two rows
  897. const unsigned char* S0 = src + srcstride * (sy);
  898. const unsigned char* S1 = src + srcstride * (sy + 1);
  899. const short* ialphap = ialpha;
  900. short* rows0p = rows0;
  901. short* rows1p = rows1;
  902. for (int dx = 0; dx < w; dx++)
  903. {
  904. sx = xofs[dx];
  905. short a0 = ialphap[0];
  906. short a1 = ialphap[1];
  907. const unsigned char* S0p = S0 + sx;
  908. const unsigned char* S1p = S1 + sx;
  909. #if __ARM_NEON
  910. int16x4_t _a0 = vdup_n_s16(a0);
  911. int16x4_t _a1 = vdup_n_s16(a1);
  912. uint8x8_t _S0 = vld1_u8(S0p);
  913. uint8x8_t _S1 = vld1_u8(S1p);
  914. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  915. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  916. int16x4_t _S0low = vget_low_s16(_S016);
  917. int16x4_t _S1low = vget_low_s16(_S116);
  918. int16x4_t _S0high = vget_high_s16(_S016);
  919. int16x4_t _S1high = vget_high_s16(_S116);
  920. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  921. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  922. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  923. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  924. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  925. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  926. vst1_s16(rows0p, _rows0_sr4);
  927. vst1_s16(rows1p, _rows1_sr4);
  928. #else
  929. rows0p[0] = (S0p[0] * a0 + S0p[4] * a1) >> 4;
  930. rows0p[1] = (S0p[1] * a0 + S0p[5] * a1) >> 4;
  931. rows0p[2] = (S0p[2] * a0 + S0p[6] * a1) >> 4;
  932. rows0p[3] = (S0p[3] * a0 + S0p[7] * a1) >> 4;
  933. rows1p[0] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
  934. rows1p[1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
  935. rows1p[2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
  936. rows1p[3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
  937. #endif // __ARM_NEON
  938. ialphap += 2;
  939. rows0p += 4;
  940. rows1p += 4;
  941. }
  942. }
  943. prev_sy1 = sy;
  944. // vresize
  945. short b0 = ibeta[0];
  946. short b1 = ibeta[1];
  947. short* rows0p = rows0;
  948. short* rows1p = rows1;
  949. unsigned char* Dp = dst + stride * (dy);
  950. #if __ARM_NEON
  951. int nn = (w * 4) >> 3;
  952. #else
  953. int nn = 0;
  954. #endif
  955. int remain = (w * 4) - (nn << 3);
  956. #if __ARM_NEON
  957. #if __aarch64__
  958. int16x4_t _b0 = vdup_n_s16(b0);
  959. int16x4_t _b1 = vdup_n_s16(b1);
  960. int32x4_t _v2 = vdupq_n_s32(2);
  961. for (; nn > 0; nn--)
  962. {
  963. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  964. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  965. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
  966. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
  967. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  968. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  969. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  970. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  971. int32x4_t _acc = _v2;
  972. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  973. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  974. int32x4_t _acc_1 = _v2;
  975. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  976. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  977. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  978. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  979. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  980. vst1_u8(Dp, _D);
  981. Dp += 8;
  982. rows0p += 8;
  983. rows1p += 8;
  984. }
  985. #else
  986. if (nn > 0)
  987. {
  988. asm volatile(
  989. "vdup.s16 d16, %8 \n"
  990. "mov r4, #2 \n"
  991. "vdup.s16 d17, %9 \n"
  992. "vdup.s32 q12, r4 \n"
  993. "pld [%0, #128] \n"
  994. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  995. "pld [%1, #128] \n"
  996. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  997. "0: \n"
  998. "vmull.s16 q0, d2, d16 \n"
  999. "vmull.s16 q1, d3, d16 \n"
  1000. "vorr.s32 q10, q12, q12 \n"
  1001. "vorr.s32 q11, q12, q12 \n"
  1002. "vmull.s16 q2, d6, d17 \n"
  1003. "vmull.s16 q3, d7, d17 \n"
  1004. "vsra.s32 q10, q0, #16 \n"
  1005. "vsra.s32 q11, q1, #16 \n"
  1006. "pld [%0, #128] \n"
  1007. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  1008. "vsra.s32 q10, q2, #16 \n"
  1009. "vsra.s32 q11, q3, #16 \n"
  1010. "pld [%1, #128] \n"
  1011. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  1012. "vshrn.s32 d20, q10, #2 \n"
  1013. "vshrn.s32 d21, q11, #2 \n"
  1014. "vqmovun.s16 d20, q10 \n"
  1015. "vst1.8 {d20}, [%2]! \n"
  1016. "subs %3, #1 \n"
  1017. "bne 0b \n"
  1018. "sub %0, #16 \n"
  1019. "sub %1, #16 \n"
  1020. : "=r"(rows0p), // %0
  1021. "=r"(rows1p), // %1
  1022. "=r"(Dp), // %2
  1023. "=r"(nn) // %3
  1024. : "0"(rows0p),
  1025. "1"(rows1p),
  1026. "2"(Dp),
  1027. "3"(nn),
  1028. "r"(b0), // %8
  1029. "r"(b1) // %9
  1030. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12");
  1031. }
  1032. #endif // __aarch64__
  1033. #endif // __ARM_NEON
  1034. for (; remain; --remain)
  1035. {
  1036. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  1037. *Dp++ = (unsigned char)(((short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2) >> 2);
  1038. }
  1039. ibeta += 2;
  1040. }
  1041. delete[] buf;
  1042. }
  1043. void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  1044. {
  1045. // assert srcw % 2 == 0
  1046. // assert srch % 2 == 0
  1047. // assert w % 2 == 0
  1048. // assert h % 2 == 0
  1049. const unsigned char* srcY = src;
  1050. unsigned char* dstY = dst;
  1051. resize_bilinear_c1(srcY, srcw, srch, dstY, w, h);
  1052. const unsigned char* srcUV = src + srcw * srch;
  1053. unsigned char* dstUV = dst + w * h;
  1054. resize_bilinear_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2);
  1055. }
  1056. #endif // NCNN_PIXEL
  1057. } // namespace ncnn