You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

mat_pixel.cpp 62 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #include <limits.h>
  16. #include <algorithm>
  17. #if __ARM_NEON
  18. #include <arm_neon.h>
  19. #endif // __ARM_NEON
  20. namespace ncnn {
  21. static Mat from_rgb(const unsigned char* rgb, int w, int h)
  22. {
  23. Mat m(w, h, 3);
  24. if (m.empty())
  25. return m;
  26. float* ptr0 = m.channel(0);
  27. float* ptr1 = m.channel(1);
  28. float* ptr2 = m.channel(2);
  29. int size = w * h;
  30. #if __ARM_NEON
  31. int nn = size >> 3;
  32. int remain = size - (nn << 3);
  33. #else
  34. int remain = size;
  35. #endif // __ARM_NEON
  36. #if __ARM_NEON
  37. #if __aarch64__
  38. for (; nn>0; nn--)
  39. {
  40. uint8x8x3_t _rgb = vld3_u8(rgb);
  41. uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
  42. uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
  43. uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
  44. float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
  45. float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
  46. float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
  47. float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
  48. float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
  49. float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
  50. vst1q_f32(ptr0, _rlow);
  51. vst1q_f32(ptr0+4, _rhigh);
  52. vst1q_f32(ptr1, _glow);
  53. vst1q_f32(ptr1+4, _ghigh);
  54. vst1q_f32(ptr2, _blow);
  55. vst1q_f32(ptr2+4, _bhigh);
  56. rgb += 3*8;
  57. ptr0 += 8;
  58. ptr1 += 8;
  59. ptr2 += 8;
  60. }
  61. #else
  62. if (nn > 0)
  63. {
  64. asm volatile(
  65. "0: \n"
  66. "pld [%1, #256] \n"
  67. "vld3.u8 {d0-d2}, [%1]! \n"
  68. "vmovl.u8 q8, d0 \n"
  69. "vmovl.u8 q9, d1 \n"
  70. "vmovl.u8 q10, d2 \n"
  71. "vmovl.u16 q0, d16 \n"
  72. "vmovl.u16 q1, d17 \n"
  73. "vmovl.u16 q2, d18 \n"
  74. "vmovl.u16 q3, d19 \n"
  75. "vmovl.u16 q8, d20 \n"
  76. "vmovl.u16 q9, d21 \n"
  77. "vcvt.f32.u32 q0, q0 \n"
  78. "vcvt.f32.u32 q1, q1 \n"
  79. "vcvt.f32.u32 q2, q2 \n"
  80. "vcvt.f32.u32 q3, q3 \n"
  81. "vcvt.f32.u32 q8, q8 \n"
  82. "subs %0, #1 \n"
  83. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  84. "vcvt.f32.u32 q9, q9 \n"
  85. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  86. "vst1.f32 {d16-d19}, [%4 :128]!\n"
  87. "bne 0b \n"
  88. : "=r"(nn), // %0
  89. "=r"(rgb), // %1
  90. "=r"(ptr0), // %2
  91. "=r"(ptr1), // %3
  92. "=r"(ptr2) // %4
  93. : "0"(nn),
  94. "1"(rgb),
  95. "2"(ptr0),
  96. "3"(ptr1),
  97. "4"(ptr2)
  98. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  99. );
  100. }
  101. #endif // __aarch64__
  102. #endif // __ARM_NEON
  103. for (; remain>0; remain--)
  104. {
  105. *ptr0 = rgb[0];
  106. *ptr1 = rgb[1];
  107. *ptr2 = rgb[2];
  108. rgb += 3;
  109. ptr0++;
  110. ptr1++;
  111. ptr2++;
  112. }
  113. return m;
  114. }
  115. static void to_rgb(const Mat& m, unsigned char* rgb)
  116. {
  117. const float* ptr0 = m.channel(0);
  118. const float* ptr1 = m.channel(1);
  119. const float* ptr2 = m.channel(2);
  120. int size = m.w * m.h;
  121. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  122. int remain = size;
  123. for (; remain>0; remain--)
  124. {
  125. rgb[0] = SATURATE_CAST_UCHAR(*ptr0);
  126. rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
  127. rgb[2] = SATURATE_CAST_UCHAR(*ptr2);
  128. rgb += 3;
  129. ptr0++;
  130. ptr1++;
  131. ptr2++;
  132. }
  133. #undef SATURATE_CAST_UCHAR
  134. }
  135. static Mat from_gray(const unsigned char* gray, int w, int h)
  136. {
  137. Mat m(w, h, 1);
  138. if (m.empty())
  139. return m;
  140. float* ptr = m;
  141. int size = w * h;
  142. #if __ARM_NEON
  143. int nn = size >> 4;
  144. int remain = size - (nn << 4);
  145. #else
  146. int remain = size;
  147. #endif // __ARM_NEON
  148. #if __ARM_NEON
  149. #if __aarch64__
  150. for (; nn>0; nn--)
  151. {
  152. uint8x16_t _gray = vld1q_u8(gray);
  153. uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
  154. uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
  155. float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
  156. float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
  157. float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
  158. float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
  159. vst1q_f32(ptr, _graylow_0);
  160. vst1q_f32(ptr+4, _grayhigh_0);
  161. vst1q_f32(ptr+8, _graylow_1);
  162. vst1q_f32(ptr+12, _grayhigh_1);
  163. gray += 16;
  164. ptr += 16;
  165. }
  166. #else
  167. if (nn > 0)
  168. {
  169. asm volatile(
  170. "0: \n"
  171. "pld [%1, #128] \n"
  172. "vld1.u8 {d0,d1}, [%1]! \n"
  173. "vmovl.u8 q8, d0 \n"
  174. "vmovl.u8 q9, d1 \n"
  175. "vmovl.u16 q0, d16 \n"
  176. "vmovl.u16 q1, d17 \n"
  177. "vmovl.u16 q2, d18 \n"
  178. "vmovl.u16 q3, d19 \n"
  179. "vcvt.f32.u32 q0, q0 \n"
  180. "vcvt.f32.u32 q1, q1 \n"
  181. "vcvt.f32.u32 q2, q2 \n"
  182. "vcvt.f32.u32 q3, q3 \n"
  183. "subs %0, #1 \n"
  184. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  185. "vst1.f32 {d4-d7}, [%2 :128]! \n"
  186. "bne 0b \n"
  187. : "=r"(nn), // %0
  188. "=r"(gray), // %1
  189. "=r"(ptr) // %2
  190. : "0"(nn),
  191. "1"(gray),
  192. "2"(ptr)
  193. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
  194. );
  195. }
  196. #endif // __aarch64__
  197. #endif // __ARM_NEON
  198. for (; remain>0; remain--)
  199. {
  200. *ptr = *gray;
  201. gray++;
  202. ptr++;
  203. }
  204. return m;
  205. }
  206. static void to_gray(const Mat& m, unsigned char* gray)
  207. {
  208. const float* ptr = m;
  209. int size = m.w * m.h;
  210. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  211. int remain = size;
  212. for (; remain>0; remain--)
  213. {
  214. *gray = SATURATE_CAST_UCHAR(*ptr);
  215. gray++;
  216. ptr++;
  217. }
  218. #undef SATURATE_CAST_UCHAR
  219. }
  220. static Mat from_rgba(const unsigned char* rgba, int w, int h)
  221. {
  222. Mat m(w, h, 4);
  223. if (m.empty())
  224. return m;
  225. float* ptr0 = m.channel(0);
  226. float* ptr1 = m.channel(1);
  227. float* ptr2 = m.channel(2);
  228. float* ptr3 = m.channel(3);
  229. int size = w * h;
  230. #if __ARM_NEON
  231. int nn = size >> 3;
  232. int remain = size - (nn << 3);
  233. #else
  234. int remain = size;
  235. #endif // __ARM_NEON
  236. #if __ARM_NEON
  237. #if __aarch64__
  238. for (; nn>0; nn--)
  239. {
  240. uint8x8x4_t _rgba = vld4_u8(rgba);
  241. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  242. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  243. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  244. int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));
  245. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  246. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  247. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  248. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  249. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  250. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  251. float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
  252. float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));
  253. vst1q_f32(ptr0, _rlow);
  254. vst1q_f32(ptr0+4, _rhigh);
  255. vst1q_f32(ptr1, _glow);
  256. vst1q_f32(ptr1+4, _ghigh);
  257. vst1q_f32(ptr2, _blow);
  258. vst1q_f32(ptr2+4, _bhigh);
  259. vst1q_f32(ptr3, _alow);
  260. vst1q_f32(ptr3+4, _ahigh);
  261. rgba += 4*8;
  262. ptr0 += 8;
  263. ptr1 += 8;
  264. ptr2 += 8;
  265. ptr3 += 8;
  266. }
  267. #else
  268. if (nn > 0)
  269. {
  270. asm volatile(
  271. "0: \n"
  272. "pld [%1, #256] \n"
  273. "vld4.u8 {d0-d3}, [%1]! \n"
  274. "vmovl.u8 q8, d0 \n"
  275. "vmovl.u8 q9, d1 \n"
  276. "vmovl.u8 q10, d2 \n"
  277. "vmovl.u8 q11, d3 \n"
  278. "vmovl.u16 q0, d16 \n"
  279. "vmovl.u16 q1, d17 \n"
  280. "vmovl.u16 q2, d18 \n"
  281. "vmovl.u16 q3, d19 \n"
  282. "vmovl.u16 q8, d20 \n"
  283. "vmovl.u16 q9, d21 \n"
  284. "vmovl.u16 q10, d22 \n"
  285. "vmovl.u16 q11, d23 \n"
  286. "vcvt.f32.u32 q0, q0 \n"
  287. "vcvt.f32.u32 q1, q1 \n"
  288. "vcvt.f32.u32 q2, q2 \n"
  289. "vcvt.f32.u32 q3, q3 \n"
  290. "vcvt.f32.u32 q8, q8 \n"
  291. "vcvt.f32.u32 q9, q9 \n"
  292. "subs %0, #1 \n"
  293. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  294. "vcvt.f32.u32 q10, q10 \n"
  295. "vcvt.f32.u32 q11, q11 \n"
  296. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  297. "vst1.f32 {d16-d19}, [%4 :128]!\n"
  298. "vst1.f32 {d20-d23}, [%5 :128]!\n"
  299. "bne 0b \n"
  300. : "=r"(nn), // %0
  301. "=r"(rgba), // %1
  302. "=r"(ptr0), // %2
  303. "=r"(ptr1), // %3
  304. "=r"(ptr2), // %4
  305. "=r"(ptr3) // %5
  306. : "0"(nn),
  307. "1"(rgba),
  308. "2"(ptr0),
  309. "3"(ptr1),
  310. "4"(ptr2),
  311. "5"(ptr3)
  312. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  313. );
  314. }
  315. #endif // __aarch64__
  316. #endif // __ARM_NEON
  317. for (; remain>0; remain--)
  318. {
  319. *ptr0 = rgba[0];
  320. *ptr1 = rgba[1];
  321. *ptr2 = rgba[2];
  322. *ptr3 = rgba[3];
  323. rgba += 4;
  324. ptr0++;
  325. ptr1++;
  326. ptr2++;
  327. ptr3++;
  328. }
  329. return m;
  330. }
  331. static void to_rgba(const Mat& m, unsigned char* rgba)
  332. {
  333. const float* ptr0 = m.channel(0);
  334. const float* ptr1 = m.channel(1);
  335. const float* ptr2 = m.channel(2);
  336. const float* ptr3 = m.channel(3);
  337. int size = m.w * m.h;
  338. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  339. int remain = size;
  340. for (; remain>0; remain--)
  341. {
  342. rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
  343. rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
  344. rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
  345. rgba[3] = SATURATE_CAST_UCHAR(*ptr3);
  346. rgba += 4;
  347. ptr0++;
  348. ptr1++;
  349. ptr2++;
  350. ptr3++;
  351. }
  352. #undef SATURATE_CAST_UCHAR
  353. }
  354. static Mat from_rgb2bgr(const unsigned char* rgb, int w, int h)
  355. {
  356. Mat m(w, h, 3);
  357. if (m.empty())
  358. return m;
  359. float* ptr0 = m.channel(0);
  360. float* ptr1 = m.channel(1);
  361. float* ptr2 = m.channel(2);
  362. int size = w * h;
  363. #if __ARM_NEON
  364. int nn = size >> 3;
  365. int remain = size - (nn << 3);
  366. #else
  367. int remain = size;
  368. #endif // __ARM_NEON
  369. #if __ARM_NEON
  370. #if __aarch64__
  371. for (; nn>0; nn--)
  372. {
  373. uint8x8x3_t _rgb = vld3_u8(rgb);
  374. uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
  375. uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
  376. uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
  377. float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
  378. float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
  379. float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
  380. float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
  381. float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
  382. float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
  383. vst1q_f32(ptr2, _rlow);
  384. vst1q_f32(ptr2+4, _rhigh);
  385. vst1q_f32(ptr1, _glow);
  386. vst1q_f32(ptr1+4, _ghigh);
  387. vst1q_f32(ptr0, _blow);
  388. vst1q_f32(ptr0+4, _bhigh);
  389. rgb += 3*8;
  390. ptr0 += 8;
  391. ptr1 += 8;
  392. ptr2 += 8;
  393. }
  394. #else
  395. if (nn > 0)
  396. {
  397. asm volatile(
  398. "0: \n"
  399. "pld [%1, #256] \n"
  400. "vld3.u8 {d0-d2}, [%1]! \n"
  401. "vmovl.u8 q8, d0 \n"
  402. "vmovl.u8 q9, d1 \n"
  403. "vmovl.u8 q10, d2 \n"
  404. "vmovl.u16 q0, d16 \n"
  405. "vmovl.u16 q1, d17 \n"
  406. "vmovl.u16 q2, d18 \n"
  407. "vmovl.u16 q3, d19 \n"
  408. "vmovl.u16 q8, d20 \n"
  409. "vmovl.u16 q9, d21 \n"
  410. "vcvt.f32.u32 q0, q0 \n"
  411. "vcvt.f32.u32 q1, q1 \n"
  412. "vcvt.f32.u32 q2, q2 \n"
  413. "vcvt.f32.u32 q3, q3 \n"
  414. "vcvt.f32.u32 q8, q8 \n"
  415. "subs %0, #1 \n"
  416. "vst1.f32 {d0-d3}, [%4 :128]! \n"
  417. "vcvt.f32.u32 q9, q9 \n"
  418. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  419. "vst1.f32 {d16-d19}, [%2 :128]!\n"
  420. "bne 0b \n"
  421. : "=r"(nn), // %0
  422. "=r"(rgb), // %1
  423. "=r"(ptr0), // %2
  424. "=r"(ptr1), // %3
  425. "=r"(ptr2) // %4
  426. : "0"(nn),
  427. "1"(rgb),
  428. "2"(ptr0),
  429. "3"(ptr1),
  430. "4"(ptr2)
  431. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  432. );
  433. }
  434. #endif // __aarch64__
  435. #endif // __ARM_NEON
  436. for (; remain>0; remain--)
  437. {
  438. *ptr0 = rgb[2];
  439. *ptr1 = rgb[1];
  440. *ptr2 = rgb[0];
  441. rgb += 3;
  442. ptr0++;
  443. ptr1++;
  444. ptr2++;
  445. }
  446. return m;
  447. }
  448. static void to_bgr2rgb(const Mat& m, unsigned char* rgb)
  449. {
  450. const float* ptr0 = m.channel(0);
  451. const float* ptr1 = m.channel(1);
  452. const float* ptr2 = m.channel(2);
  453. int size = m.w * m.h;
  454. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  455. int remain = size;
  456. for (; remain>0; remain--)
  457. {
  458. rgb[2] = SATURATE_CAST_UCHAR(*ptr0);
  459. rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
  460. rgb[0] = SATURATE_CAST_UCHAR(*ptr2);
  461. rgb += 3;
  462. ptr0++;
  463. ptr1++;
  464. ptr2++;
  465. }
  466. #undef SATURATE_CAST_UCHAR
  467. }
  468. static Mat from_rgb2gray(const unsigned char* rgb, int w, int h)
  469. {
  470. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  471. const unsigned char Y_shift = 8;//14
  472. const unsigned char R2Y = 77;
  473. const unsigned char G2Y = 150;
  474. const unsigned char B2Y = 29;
  475. Mat m(w, h, 1);
  476. if (m.empty())
  477. return m;
  478. float* ptr = m;
  479. int size = w * h;
  480. #if __ARM_NEON
  481. int nn = size >> 3;
  482. int remain = size - (nn << 3);
  483. #else
  484. int remain = size;
  485. #endif // __ARM_NEON
  486. #if __ARM_NEON
  487. #if __aarch64__
  488. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  489. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  490. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  491. for (; nn>0; nn--)
  492. {
  493. uint8x8x3_t _rgb = vld3_u8(rgb);
  494. uint16x8_t _y16 = vmull_u8(_rgb.val[0], _R2Y);
  495. _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
  496. _y16 = vmlal_u8(_y16, _rgb.val[2], _B2Y);
  497. _y16 = vshrq_n_u16(_y16, Y_shift);
  498. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  499. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  500. vst1q_f32(ptr, _ylow);
  501. vst1q_f32(ptr+4, _yhigh);
  502. rgb += 3*8;
  503. ptr += 8;
  504. }
  505. #else
  506. if (nn > 0)
  507. {
  508. asm volatile(
  509. "vdup.u8 d16, %6 \n"
  510. "vdup.u8 d17, %7 \n"
  511. "vdup.u8 d18, %8 \n"
  512. "0: \n"
  513. "pld [%1, #256] \n"
  514. "vld3.u8 {d0-d2}, [%1]! \n"
  515. "vmull.u8 q2, d0, d16 \n"
  516. "vmlal.u8 q2, d1, d17 \n"
  517. "vmlal.u8 q2, d2, d18 \n"
  518. "vshr.u16 q2, q2, #8 \n" // Y_shift
  519. "vmovl.u16 q0, d4 \n"
  520. "vmovl.u16 q1, d5 \n"
  521. "vcvt.f32.u32 q0, q0 \n"
  522. "vcvt.f32.u32 q1, q1 \n"
  523. "subs %0, #1 \n"
  524. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  525. "bne 0b \n"
  526. : "=r"(nn), // %0
  527. "=r"(rgb), // %1
  528. "=r"(ptr) // %2
  529. : "0"(nn),
  530. "1"(rgb),
  531. "2"(ptr),
  532. "r"(R2Y), // %6
  533. "r"(G2Y), // %7
  534. "r"(B2Y) // %8
  535. : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
  536. );
  537. }
  538. #endif // __aarch64__
  539. #endif // __ARM_NEON
  540. for (; remain>0; remain--)
  541. {
  542. *ptr = (rgb[0] * R2Y + rgb[1] * G2Y + rgb[2] * B2Y) >> Y_shift;
  543. rgb += 3;
  544. ptr++;
  545. }
  546. return m;
  547. }
  548. static Mat from_bgr2gray(const unsigned char* bgr, int w, int h)
  549. {
  550. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  551. const unsigned char Y_shift = 8;//14
  552. const unsigned char R2Y = 77;
  553. const unsigned char G2Y = 150;
  554. const unsigned char B2Y = 29;
  555. Mat m(w, h, 1);
  556. if (m.empty())
  557. return m;
  558. float* ptr = m;
  559. int size = w * h;
  560. #if __ARM_NEON
  561. int nn = size >> 3;
  562. int remain = size - (nn << 3);
  563. #else
  564. int remain = size;
  565. #endif // __ARM_NEON
  566. #if __ARM_NEON
  567. #if __aarch64__
  568. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  569. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  570. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  571. for (; nn>0; nn--)
  572. {
  573. uint8x8x3_t _rgb = vld3_u8(bgr);
  574. uint16x8_t _y16 = vmull_u8(_rgb.val[2], _R2Y);
  575. _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
  576. _y16 = vmlal_u8(_y16, _rgb.val[0], _B2Y);
  577. _y16 = vshrq_n_u16(_y16, Y_shift);
  578. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  579. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  580. vst1q_f32(ptr, _ylow);
  581. vst1q_f32(ptr+4, _yhigh);
  582. bgr += 3*8;
  583. ptr += 8;
  584. }
  585. #else
  586. if (nn > 0)
  587. {
  588. asm volatile(
  589. "vdup.u8 d16, %6 \n"
  590. "vdup.u8 d17, %7 \n"
  591. "vdup.u8 d18, %8 \n"
  592. "0: \n"
  593. "pld [%1, #256] \n"
  594. "vld3.u8 {d0-d2}, [%1]! \n"
  595. "vmull.u8 q2, d2, d16 \n"
  596. "vmlal.u8 q2, d1, d17 \n"
  597. "vmlal.u8 q2, d0, d18 \n"
  598. "vshr.u16 q2, q2, #8 \n" // Y_shift
  599. "vmovl.u16 q0, d4 \n"
  600. "vmovl.u16 q1, d5 \n"
  601. "vcvt.f32.u32 q0, q0 \n"
  602. "vcvt.f32.u32 q1, q1 \n"
  603. "subs %0, #1 \n"
  604. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  605. "bne 0b \n"
  606. : "=r"(nn), // %0
  607. "=r"(bgr), // %1
  608. "=r"(ptr) // %2
  609. : "0"(nn),
  610. "1"(bgr),
  611. "2"(ptr),
  612. "r"(R2Y), // %6
  613. "r"(G2Y), // %7
  614. "r"(B2Y) // %8
  615. : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
  616. );
  617. }
  618. #endif // __aarch64__
  619. #endif // __ARM_NEON
  620. for (; remain>0; remain--)
  621. {
  622. *ptr = (bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift;
  623. bgr += 3;
  624. ptr++;
  625. }
  626. return m;
  627. }
  628. static Mat from_gray2rgb(const unsigned char* gray, int w, int h)
  629. {
  630. Mat m(w, h, 3);
  631. if (m.empty())
  632. return m;
  633. float* ptr0 = m.channel(0);
  634. float* ptr1 = m.channel(1);
  635. float* ptr2 = m.channel(2);
  636. int size = w * h;
  637. #if __ARM_NEON
  638. int nn = size >> 4;
  639. int remain = size - (nn << 4);
  640. #else
  641. int remain = size;
  642. #endif // __ARM_NEON
  643. #if __ARM_NEON
  644. #if __aarch64__
  645. for (; nn>0; nn--)
  646. {
  647. uint8x16_t _gray = vld1q_u8(gray);
  648. uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
  649. uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
  650. float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
  651. float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
  652. float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
  653. float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
  654. vst1q_f32(ptr0, _graylow_0);
  655. vst1q_f32(ptr0+4, _grayhigh_0);
  656. vst1q_f32(ptr0+8, _graylow_1);
  657. vst1q_f32(ptr0+12, _grayhigh_1);
  658. vst1q_f32(ptr1, _graylow_0);
  659. vst1q_f32(ptr1+4, _grayhigh_0);
  660. vst1q_f32(ptr1+8, _graylow_1);
  661. vst1q_f32(ptr1+12, _grayhigh_1);
  662. vst1q_f32(ptr2, _graylow_0);
  663. vst1q_f32(ptr2+4, _grayhigh_0);
  664. vst1q_f32(ptr2+8, _graylow_1);
  665. vst1q_f32(ptr2+12, _grayhigh_1);
  666. gray += 16;
  667. ptr0 += 16;
  668. ptr1 += 16;
  669. ptr2 += 16;
  670. }
  671. #else
  672. if (nn > 0)
  673. {
  674. asm volatile(
  675. "0: \n"
  676. "pld [%1, #128] \n"
  677. "vld1.u8 {d0,d1}, [%1]! \n"
  678. "vmovl.u8 q8, d0 \n"
  679. "vmovl.u8 q9, d1 \n"
  680. "vmovl.u16 q0, d16 \n"
  681. "vmovl.u16 q1, d17 \n"
  682. "vmovl.u16 q2, d18 \n"
  683. "vmovl.u16 q3, d19 \n"
  684. "vcvt.f32.u32 q0, q0 \n"
  685. "vcvt.f32.u32 q1, q1 \n"
  686. "vcvt.f32.u32 q2, q2 \n"
  687. "vcvt.f32.u32 q3, q3 \n"
  688. "subs %0, #1 \n"
  689. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  690. "vst1.f32 {d4-d7}, [%2 :128]! \n"
  691. "vst1.f32 {d0-d3}, [%3 :128]! \n"
  692. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  693. "vst1.f32 {d0-d3}, [%4 :128]! \n"
  694. "vst1.f32 {d4-d7}, [%4 :128]! \n"
  695. "bne 0b \n"
  696. : "=r"(nn), // %0
  697. "=r"(gray), // %1
  698. "=r"(ptr0), // %2
  699. "=r"(ptr1), // %3
  700. "=r"(ptr2) // %4
  701. : "0"(nn),
  702. "1"(gray),
  703. "2"(ptr0),
  704. "3"(ptr1),
  705. "4"(ptr2)
  706. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
  707. );
  708. }
  709. #endif // __aarch64__
  710. #endif // __ARM_NEON
  711. for (; remain>0; remain--)
  712. {
  713. *ptr0 = *gray;
  714. *ptr1 = *gray;
  715. *ptr2 = *gray;
  716. gray++;
  717. ptr0++;
  718. ptr1++;
  719. ptr2++;
  720. }
  721. return m;
  722. }
  723. static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h)
  724. {
  725. Mat m(w, h, 3);
  726. if (m.empty())
  727. return m;
  728. float* ptr0 = m.channel(0);
  729. float* ptr1 = m.channel(1);
  730. float* ptr2 = m.channel(2);
  731. int size = w * h;
  732. #if __ARM_NEON
  733. int nn = size >> 3;
  734. int remain = size - (nn << 3);
  735. #else
  736. int remain = size;
  737. #endif // __ARM_NEON
  738. #if __ARM_NEON
  739. #if __aarch64__
  740. for (; nn>0; nn--)
  741. {
  742. uint8x8x4_t _rgba = vld4_u8(rgba);
  743. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  744. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  745. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  746. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  747. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  748. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  749. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  750. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  751. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  752. vst1q_f32(ptr0, _rlow);
  753. vst1q_f32(ptr0+4, _rhigh);
  754. vst1q_f32(ptr1, _glow);
  755. vst1q_f32(ptr1+4, _ghigh);
  756. vst1q_f32(ptr2, _blow);
  757. vst1q_f32(ptr2+4, _bhigh);
  758. rgba += 4*8;
  759. ptr0 += 8;
  760. ptr1 += 8;
  761. ptr2 += 8;
  762. }
  763. #else
  764. if (nn > 0)
  765. {
  766. asm volatile(
  767. "0: \n"
  768. "pld [%1, #256] \n"
  769. "vld4.u8 {d0-d3}, [%1]! \n"
  770. "vmovl.u8 q8, d0 \n"
  771. "vmovl.u8 q9, d1 \n"
  772. "vmovl.u8 q10, d2 \n"
  773. "vmovl.u16 q0, d16 \n"
  774. "vmovl.u16 q1, d17 \n"
  775. "vmovl.u16 q2, d18 \n"
  776. "vmovl.u16 q3, d19 \n"
  777. "vmovl.u16 q8, d20 \n"
  778. "vmovl.u16 q9, d21 \n"
  779. "vcvt.f32.u32 q0, q0 \n"
  780. "vcvt.f32.u32 q1, q1 \n"
  781. "vcvt.f32.u32 q2, q2 \n"
  782. "vcvt.f32.u32 q3, q3 \n"
  783. "vcvt.f32.u32 q8, q8 \n"
  784. "subs %0, #1 \n"
  785. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  786. "vcvt.f32.u32 q9, q9 \n"
  787. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  788. "vst1.f32 {d16-d19}, [%4 :128]!\n"
  789. "bne 0b \n"
  790. : "=r"(nn), // %0
  791. "=r"(rgba), // %1
  792. "=r"(ptr0), // %2
  793. "=r"(ptr1), // %3
  794. "=r"(ptr2) // %4
  795. : "0"(nn),
  796. "1"(rgba),
  797. "2"(ptr0),
  798. "3"(ptr1),
  799. "4"(ptr2)
  800. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
  801. );
  802. }
  803. #endif // __aarch64__
  804. #endif // __ARM_NEON
  805. for (; remain>0; remain--)
  806. {
  807. *ptr0 = rgba[0];
  808. *ptr1 = rgba[1];
  809. *ptr2 = rgba[2];
  810. rgba += 4;
  811. ptr0++;
  812. ptr1++;
  813. ptr2++;
  814. }
  815. return m;
  816. }
  817. static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h)
  818. {
  819. Mat m(w, h, 3);
  820. if (m.empty())
  821. return m;
  822. float* ptr0 = m.channel(0);
  823. float* ptr1 = m.channel(1);
  824. float* ptr2 = m.channel(2);
  825. int size = w * h;
  826. #if __ARM_NEON
  827. int nn = size >> 3;
  828. int remain = size - (nn << 3);
  829. #else
  830. int remain = size;
  831. #endif // __ARM_NEON
  832. #if __ARM_NEON
  833. #if __aarch64__
  834. for (; nn>0; nn--)
  835. {
  836. uint8x8x4_t _rgba = vld4_u8(rgba);
  837. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  838. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  839. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  840. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  841. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  842. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  843. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  844. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  845. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  846. vst1q_f32(ptr2, _rlow);
  847. vst1q_f32(ptr2+4, _rhigh);
  848. vst1q_f32(ptr1, _glow);
  849. vst1q_f32(ptr1+4, _ghigh);
  850. vst1q_f32(ptr0, _blow);
  851. vst1q_f32(ptr0+4, _bhigh);
  852. rgba += 4*8;
  853. ptr0 += 8;
  854. ptr1 += 8;
  855. ptr2 += 8;
  856. }
  857. #else
  858. if (nn > 0)
  859. {
  860. asm volatile(
  861. "0: \n"
  862. "pld [%1, #256] \n"
  863. "vld4.u8 {d0-d3}, [%1]! \n"
  864. "vmovl.u8 q8, d0 \n"
  865. "vmovl.u8 q9, d1 \n"
  866. "vmovl.u8 q10, d2 \n"
  867. "vmovl.u16 q0, d16 \n"
  868. "vmovl.u16 q1, d17 \n"
  869. "vmovl.u16 q2, d18 \n"
  870. "vmovl.u16 q3, d19 \n"
  871. "vmovl.u16 q8, d20 \n"
  872. "vmovl.u16 q9, d21 \n"
  873. "vcvt.f32.u32 q0, q0 \n"
  874. "vcvt.f32.u32 q1, q1 \n"
  875. "vcvt.f32.u32 q2, q2 \n"
  876. "vcvt.f32.u32 q3, q3 \n"
  877. "vcvt.f32.u32 q8, q8 \n"
  878. "subs %0, #1 \n"
  879. "vst1.f32 {d0-d3}, [%4 :128]! \n"
  880. "vcvt.f32.u32 q9, q9 \n"
  881. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  882. "vst1.f32 {d16-d19}, [%2 :128]!\n"
  883. "bne 0b \n"
  884. : "=r"(nn), // %0
  885. "=r"(rgba), // %1
  886. "=r"(ptr0), // %2
  887. "=r"(ptr1), // %3
  888. "=r"(ptr2) // %4
  889. : "0"(nn),
  890. "1"(rgba),
  891. "2"(ptr0),
  892. "3"(ptr1),
  893. "4"(ptr2)
  894. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  895. );
  896. }
  897. #endif // __aarch64__
  898. #endif // __ARM_NEON
  899. for (; remain>0; remain--)
  900. {
  901. *ptr0 = rgba[2];
  902. *ptr1 = rgba[1];
  903. *ptr2 = rgba[0];
  904. rgba += 4;
  905. ptr0++;
  906. ptr1++;
  907. ptr2++;
  908. }
  909. return m;
  910. }
  911. static Mat from_rgba2gray(const unsigned char* rgba, int w, int h)
  912. {
  913. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  914. const unsigned char Y_shift = 8;//14
  915. const unsigned char R2Y = 77;
  916. const unsigned char G2Y = 150;
  917. const unsigned char B2Y = 29;
  918. Mat m(w, h, 1);
  919. if (m.empty())
  920. return m;
  921. float* ptr = m;
  922. int size = w * h;
  923. #if __ARM_NEON
  924. int nn = size >> 3;
  925. int remain = size - (nn << 3);
  926. #else
  927. int remain = size;
  928. #endif // __ARM_NEON
  929. #if __ARM_NEON
  930. #if __aarch64__
  931. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  932. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  933. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  934. for (; nn>0; nn--)
  935. {
  936. uint8x8x4_t _rgba = vld4_u8(rgba);
  937. uint16x8_t _y16 = vmull_u8(_rgba.val[0], _R2Y);
  938. _y16 = vmlal_u8(_y16, _rgba.val[1], _G2Y);
  939. _y16 = vmlal_u8(_y16, _rgba.val[2], _B2Y);
  940. _y16 = vshrq_n_u16(_y16, Y_shift);
  941. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  942. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  943. vst1q_f32(ptr, _ylow);
  944. vst1q_f32(ptr+4, _yhigh);
  945. rgba += 4*8;
  946. ptr += 8;
  947. }
  948. #else
  949. if (nn > 0)
  950. {
  951. asm volatile(
  952. "vdup.u8 d16, %6 \n"
  953. "vdup.u8 d17, %7 \n"
  954. "vdup.u8 d18, %8 \n"
  955. "0: \n"
  956. "pld [%1, #256] \n"
  957. "vld4.u8 {d0-d3}, [%1]! \n"
  958. "vmull.u8 q2, d0, d16 \n"
  959. "vmlal.u8 q2, d1, d17 \n"
  960. "vmlal.u8 q2, d2, d18 \n"
  961. "vshr.u16 q2, q2, #8 \n" // Y_shift
  962. "vmovl.u16 q0, d4 \n"
  963. "vmovl.u16 q1, d5 \n"
  964. "vcvt.f32.u32 q0, q0 \n"
  965. "vcvt.f32.u32 q1, q1 \n"
  966. "subs %0, #1 \n"
  967. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  968. "bne 0b \n"
  969. : "=r"(nn), // %0
  970. "=r"(rgba), // %1
  971. "=r"(ptr) // %2
  972. : "0"(nn),
  973. "1"(rgba),
  974. "2"(ptr),
  975. "r"(R2Y), // %6
  976. "r"(G2Y), // %7
  977. "r"(B2Y) // %8
  978. : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
  979. );
  980. }
  981. #endif // __aarch64__
  982. #endif // __ARM_NEON
  983. for (; remain>0; remain--)
  984. {
  985. *ptr = (rgba[0] * R2Y + rgba[1] * G2Y + rgba[2] * B2Y) >> Y_shift;
  986. rgba += 4;
  987. ptr++;
  988. }
  989. return m;
  990. }
  991. void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  992. {
  993. const int INTER_RESIZE_COEF_BITS=11;
  994. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  995. // const int ONE=INTER_RESIZE_COEF_SCALE;
  996. double scale_x = (double)srcw / w;
  997. double scale_y = (double)srch / h;
  998. int* buf = new int[w + h + w + h];
  999. int* xofs = buf;//new int[w];
  1000. int* yofs = buf + w;//new int[h];
  1001. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  1002. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  1003. float fx;
  1004. float fy;
  1005. int sx;
  1006. int sy;
  1007. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  1008. for (int dx = 0; dx < w; dx++)
  1009. {
  1010. fx = (float)((dx + 0.5) * scale_x - 0.5);
  1011. sx = fx;//cvFloor(fx);
  1012. fx -= sx;
  1013. if (sx >= srcw - 1)
  1014. {
  1015. sx = srcw - 2;
  1016. fx = 1.f;
  1017. }
  1018. xofs[dx] = sx*3;
  1019. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  1020. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  1021. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  1022. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  1023. }
  1024. for (int dy = 0; dy < h; dy++)
  1025. {
  1026. fy = (float)((dy + 0.5) * scale_y - 0.5);
  1027. sy = fy;//cvFloor(fy);
  1028. fy -= sy;
  1029. if (sy >= srch - 1)
  1030. {
  1031. sy = srch - 2;
  1032. fy = 1.f;
  1033. }
  1034. yofs[dy] = sy*3;
  1035. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  1036. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  1037. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  1038. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  1039. }
  1040. #undef SATURATE_CAST_SHORT
  1041. // loop body
  1042. Mat rowsbuf0((w*3 >> 1) + 3);
  1043. Mat rowsbuf1((w*3 >> 1) + 3);
  1044. short* rows0 = (short*)rowsbuf0.data;
  1045. short* rows1 = (short*)rowsbuf1.data;
  1046. int prev_sy1 = -1;
  1047. for (int dy = 0; dy < h; dy++ )
  1048. {
  1049. int sy = yofs[dy];
  1050. if (sy == prev_sy1)
  1051. {
  1052. // hresize one row
  1053. short* rows0_old = rows0;
  1054. rows0 = rows1;
  1055. rows1 = rows0_old;
  1056. const unsigned char *S1 = src + srcw * (sy+3);
  1057. const short* ialphap = ialpha;
  1058. short* rows1p = rows1;
  1059. for ( int dx = 0; dx < w; dx++ )
  1060. {
  1061. int sx = xofs[dx];
  1062. short a0 = ialphap[0];
  1063. short a1 = ialphap[1];
  1064. const unsigned char* S1p = S1 + sx;
  1065. #if __ARM_NEON
  1066. int16x4_t _a0 = vdup_n_s16(a0);
  1067. int16x4_t _a1 = vdup_n_s16(a1);
  1068. uint8x8_t _S1 = vld1_u8(S1p);
  1069. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  1070. int16x4_t _S1low = vget_low_s16(_S116);
  1071. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  1072. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  1073. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  1074. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  1075. vst1_s16(rows1p, _rows1_sr4);
  1076. #else
  1077. rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
  1078. rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
  1079. rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
  1080. #endif // __ARM_NEON
  1081. ialphap += 2;
  1082. rows1p += 3;
  1083. }
  1084. }
  1085. else
  1086. {
  1087. // hresize two rows
  1088. const unsigned char *S0 = src + srcw * (sy);
  1089. const unsigned char *S1 = src + srcw * (sy+3);
  1090. const short* ialphap = ialpha;
  1091. short* rows0p = rows0;
  1092. short* rows1p = rows1;
  1093. for ( int dx = 0; dx < w; dx++ )
  1094. {
  1095. int sx = xofs[dx];
  1096. short a0 = ialphap[0];
  1097. short a1 = ialphap[1];
  1098. const unsigned char* S0p = S0 + sx;
  1099. const unsigned char* S1p = S1 + sx;
  1100. #if __ARM_NEON
  1101. int16x4_t _a0 = vdup_n_s16(a0);
  1102. int16x4_t _a1 = vdup_n_s16(a1);
  1103. uint8x8_t _S0 = vld1_u8(S0p);
  1104. uint8x8_t _S1 = vld1_u8(S1p);
  1105. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  1106. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  1107. int16x4_t _S0low = vget_low_s16(_S016);
  1108. int16x4_t _S1low = vget_low_s16(_S116);
  1109. int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3);
  1110. int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
  1111. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  1112. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  1113. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  1114. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  1115. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  1116. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  1117. vst1_s16(rows0p, _rows0_sr4);
  1118. vst1_s16(rows1p, _rows1_sr4);
  1119. #else
  1120. rows0p[0] = (S0p[0]*a0 + S0p[3]*a1) >> 4;
  1121. rows0p[1] = (S0p[1]*a0 + S0p[4]*a1) >> 4;
  1122. rows0p[2] = (S0p[2]*a0 + S0p[5]*a1) >> 4;
  1123. rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
  1124. rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
  1125. rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
  1126. #endif // __ARM_NEON
  1127. ialphap += 2;
  1128. rows0p += 3;
  1129. rows1p += 3;
  1130. }
  1131. }
  1132. prev_sy1 = sy + 1;
  1133. // vresize
  1134. short b0 = ibeta[0];
  1135. short b1 = ibeta[1];
  1136. short* rows0p = rows0;
  1137. short* rows1p = rows1;
  1138. unsigned char* Dp = dst + w * 3 * (dy);
  1139. #if __ARM_NEON
  1140. int nn = (w * 3) >> 3;
  1141. #else
  1142. int nn = 0;
  1143. #endif
  1144. int remain = (w * 3) - (nn << 3);
  1145. #if __ARM_NEON
  1146. #if __aarch64__
  1147. int16x4_t _b0 = vdup_n_s16(b0);
  1148. int16x4_t _b1 = vdup_n_s16(b1);
  1149. int32x4_t _v2 = vdupq_n_s32(2);
  1150. for (; nn>0; nn--)
  1151. {
  1152. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  1153. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  1154. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  1155. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  1156. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  1157. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  1158. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  1159. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  1160. int32x4_t _acc = _v2;
  1161. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  1162. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  1163. int32x4_t _acc_1 = _v2;
  1164. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  1165. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  1166. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  1167. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  1168. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  1169. vst1_u8(Dp, _D);
  1170. Dp += 8;
  1171. rows0p += 8;
  1172. rows1p += 8;
  1173. }
  1174. #else
  1175. if (nn > 0)
  1176. {
  1177. asm volatile(
  1178. "vdup.s16 d16, %8 \n"
  1179. "mov r4, #2 \n"
  1180. "vdup.s16 d17, %9 \n"
  1181. "vdup.s32 q12, r4 \n"
  1182. "pld [%0, #128] \n"
  1183. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  1184. "pld [%1, #128] \n"
  1185. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  1186. "0: \n"
  1187. "vmull.s16 q0, d2, d16 \n"
  1188. "vmull.s16 q1, d3, d16 \n"
  1189. "vorr.s32 q10, q12, q12 \n"
  1190. "vorr.s32 q11, q12, q12 \n"
  1191. "vmull.s16 q2, d6, d17 \n"
  1192. "vmull.s16 q3, d7, d17 \n"
  1193. "vsra.s32 q10, q0, #16 \n"
  1194. "vsra.s32 q11, q1, #16 \n"
  1195. "pld [%0, #128] \n"
  1196. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  1197. "vsra.s32 q10, q2, #16 \n"
  1198. "vsra.s32 q11, q3, #16 \n"
  1199. "pld [%1, #128] \n"
  1200. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  1201. "vshrn.s32 d20, q10, #2 \n"
  1202. "vshrn.s32 d21, q11, #2 \n"
  1203. "vqmovun.s16 d20, q10 \n"
  1204. "vst1.8 {d20}, [%2]! \n"
  1205. "subs %3, #1 \n"
  1206. "bne 0b \n"
  1207. "sub %0, #16 \n"
  1208. "sub %1, #16 \n"
  1209. : "=r"(rows0p), // %0
  1210. "=r"(rows1p), // %1
  1211. "=r"(Dp), // %2
  1212. "=r"(nn) // %3
  1213. : "0"(rows0p),
  1214. "1"(rows1p),
  1215. "2"(Dp),
  1216. "3"(nn),
  1217. "r"(b0), // %8
  1218. "r"(b1) // %9
  1219. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  1220. );
  1221. }
  1222. #endif // __aarch64__
  1223. #endif // __ARM_NEON
  1224. for ( ; remain; --remain )
  1225. {
  1226. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  1227. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  1228. }
  1229. ibeta += 2;
  1230. }
  1231. delete[] buf;
  1232. }
  1233. void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  1234. {
  1235. const int INTER_RESIZE_COEF_BITS=11;
  1236. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  1237. // const int ONE=INTER_RESIZE_COEF_SCALE;
  1238. double scale_x = (double)srcw / w;
  1239. double scale_y = (double)srch / h;
  1240. int* buf = new int[w + h + w + h];
  1241. int* xofs = buf;//new int[w];
  1242. int* yofs = buf + w;//new int[h];
  1243. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  1244. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  1245. float fx;
  1246. float fy;
  1247. int sx;
  1248. int sy;
  1249. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  1250. for (int dx = 0; dx < w; dx++)
  1251. {
  1252. fx = (float)((dx + 0.5) * scale_x - 0.5);
  1253. sx = fx;//cvFloor(fx);
  1254. fx -= sx;
  1255. if (sx >= srcw - 1)
  1256. {
  1257. sx = srcw - 2;
  1258. fx = 1.f;
  1259. }
  1260. xofs[dx] = sx;
  1261. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  1262. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  1263. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  1264. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  1265. }
  1266. for (int dy = 0; dy < h; dy++)
  1267. {
  1268. fy = (float)((dy + 0.5) * scale_y - 0.5);
  1269. sy = fy;//cvFloor(fy);
  1270. fy -= sy;
  1271. if (sy >= srch - 1)
  1272. {
  1273. sy = srch - 2;
  1274. fy = 1.f;
  1275. }
  1276. yofs[dy] = sy;
  1277. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  1278. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  1279. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  1280. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  1281. }
  1282. #undef SATURATE_CAST_SHORT
  1283. // loop body
  1284. Mat rowsbuf0((w >> 1) + 1);
  1285. Mat rowsbuf1((w >> 1) + 1);
  1286. short* rows0 = (short*)rowsbuf0.data;
  1287. short* rows1 = (short*)rowsbuf1.data;
  1288. int prev_sy1 = -1;
  1289. for (int dy = 0; dy < h; dy++ )
  1290. {
  1291. int sy = yofs[dy];
  1292. if (sy == prev_sy1)
  1293. {
  1294. // hresize one row
  1295. short* rows0_old = rows0;
  1296. rows0 = rows1;
  1297. rows1 = rows0_old;
  1298. const unsigned char *S1 = src + srcw * (sy+1);
  1299. const short* ialphap = ialpha;
  1300. short* rows1p = rows1;
  1301. for ( int dx = 0; dx < w; dx++ )
  1302. {
  1303. int sx = xofs[dx];
  1304. short a0 = ialphap[0];
  1305. short a1 = ialphap[1];
  1306. const unsigned char* S1p = S1 + sx;
  1307. rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
  1308. ialphap += 2;
  1309. }
  1310. }
  1311. else
  1312. {
  1313. // hresize two rows
  1314. const unsigned char *S0 = src + srcw * (sy);
  1315. const unsigned char *S1 = src + srcw * (sy+1);
  1316. const short* ialphap = ialpha;
  1317. short* rows0p = rows0;
  1318. short* rows1p = rows1;
  1319. for ( int dx = 0; dx < w; dx++ )
  1320. {
  1321. int sx = xofs[dx];
  1322. short a0 = ialphap[0];
  1323. short a1 = ialphap[1];
  1324. const unsigned char* S0p = S0 + sx;
  1325. const unsigned char* S1p = S1 + sx;
  1326. rows0p[dx] = (S0p[0]*a0 + S0p[1]*a1) >> 4;
  1327. rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
  1328. ialphap += 2;
  1329. }
  1330. }
  1331. prev_sy1 = sy + 1;
  1332. // vresize
  1333. short b0 = ibeta[0];
  1334. short b1 = ibeta[1];
  1335. short* rows0p = rows0;
  1336. short* rows1p = rows1;
  1337. unsigned char* Dp = dst + w * (dy);
  1338. #if __ARM_NEON
  1339. int nn = w >> 3;
  1340. #else
  1341. int nn = 0;
  1342. #endif
  1343. int remain = w - (nn << 3);
  1344. #if __ARM_NEON
  1345. #if __aarch64__
  1346. int16x4_t _b0 = vdup_n_s16(b0);
  1347. int16x4_t _b1 = vdup_n_s16(b1);
  1348. int32x4_t _v2 = vdupq_n_s32(2);
  1349. for (; nn>0; nn--)
  1350. {
  1351. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  1352. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  1353. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  1354. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  1355. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  1356. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  1357. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  1358. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  1359. int32x4_t _acc = _v2;
  1360. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  1361. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  1362. int32x4_t _acc_1 = _v2;
  1363. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  1364. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  1365. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  1366. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  1367. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  1368. vst1_u8(Dp, _D);
  1369. Dp += 8;
  1370. rows0p += 8;
  1371. rows1p += 8;
  1372. }
  1373. #else
  1374. if (nn > 0)
  1375. {
  1376. asm volatile(
  1377. "vdup.s16 d16, %8 \n"
  1378. "mov r4, #2 \n"
  1379. "vdup.s16 d17, %9 \n"
  1380. "vdup.s32 q12, r4 \n"
  1381. "pld [%0, #128] \n"
  1382. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  1383. "pld [%1, #128] \n"
  1384. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  1385. "0: \n"
  1386. "vmull.s16 q0, d2, d16 \n"
  1387. "vmull.s16 q1, d3, d16 \n"
  1388. "vorr.s32 q10, q12, q12 \n"
  1389. "vorr.s32 q11, q12, q12 \n"
  1390. "vmull.s16 q2, d6, d17 \n"
  1391. "vmull.s16 q3, d7, d17 \n"
  1392. "vsra.s32 q10, q0, #16 \n"
  1393. "vsra.s32 q11, q1, #16 \n"
  1394. "pld [%0, #128] \n"
  1395. "vld1.s32 {d2-d3}, [%0 :128]!\n"
  1396. "vsra.s32 q10, q2, #16 \n"
  1397. "vsra.s32 q11, q3, #16 \n"
  1398. "pld [%1, #128] \n"
  1399. "vld1.s32 {d6-d7}, [%1 :128]!\n"
  1400. "vshrn.s32 d20, q10, #2 \n"
  1401. "vshrn.s32 d21, q11, #2 \n"
  1402. "vqmovun.s16 d20, q10 \n"
  1403. "vst1.8 {d20}, [%2]! \n"
  1404. "subs %3, #1 \n"
  1405. "bne 0b \n"
  1406. "sub %0, #16 \n"
  1407. "sub %1, #16 \n"
  1408. : "=r"(rows0p), // %0
  1409. "=r"(rows1p), // %1
  1410. "=r"(Dp), // %2
  1411. "=r"(nn) // %3
  1412. : "0"(rows0p),
  1413. "1"(rows1p),
  1414. "2"(Dp),
  1415. "3"(nn),
  1416. "r"(b0), // %8
  1417. "r"(b1) // %9
  1418. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  1419. );
  1420. }
  1421. #endif // __aarch64__
  1422. #endif // __ARM_NEON
  1423. for ( ; remain; --remain )
  1424. {
  1425. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  1426. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  1427. }
  1428. ibeta += 2;
  1429. }
  1430. delete[] buf;
  1431. }
  1432. void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
  1433. {
  1434. const int INTER_RESIZE_COEF_BITS=11;
  1435. const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
  1436. // const int ONE=INTER_RESIZE_COEF_SCALE;
  1437. double scale_x = (double)srcw / w;
  1438. double scale_y = (double)srch / h;
  1439. int* buf = new int[w + h + w + h];
  1440. int* xofs = buf;//new int[w];
  1441. int* yofs = buf + w;//new int[h];
  1442. short* ialpha = (short*)(buf + w + h);//new short[w * 2];
  1443. short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
  1444. float fx;
  1445. float fy;
  1446. int sx;
  1447. int sy;
  1448. #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
  1449. for (int dx = 0; dx < w; dx++)
  1450. {
  1451. fx = (float)((dx + 0.5) * scale_x - 0.5);
  1452. sx = fx;//cvFloor(fx);
  1453. fx -= sx;
  1454. if (sx >= srcw - 1)
  1455. {
  1456. sx = srcw - 2;
  1457. fx = 1.f;
  1458. }
  1459. xofs[dx] = sx*4;
  1460. float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
  1461. float a1 = fx * INTER_RESIZE_COEF_SCALE;
  1462. ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
  1463. ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
  1464. }
  1465. for (int dy = 0; dy < h; dy++)
  1466. {
  1467. fy = (float)((dy + 0.5) * scale_y - 0.5);
  1468. sy = fy;//cvFloor(fy);
  1469. fy -= sy;
  1470. if (sy >= srch - 1)
  1471. {
  1472. sy = srch - 2;
  1473. fy = 1.f;
  1474. }
  1475. yofs[dy] = sy*4;
  1476. float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
  1477. float b1 = fy * INTER_RESIZE_COEF_SCALE;
  1478. ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
  1479. ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
  1480. }
  1481. #undef SATURATE_CAST_SHORT
  1482. // loop body
  1483. Mat rowsbuf0((w*4 >> 1) + 4);
  1484. Mat rowsbuf1((w*4 >> 1) + 4);
  1485. short* rows0 = (short*)rowsbuf0.data;
  1486. short* rows1 = (short*)rowsbuf1.data;
  1487. int prev_sy1 = -1;
  1488. for (int dy = 0; dy < h; dy++ )
  1489. {
  1490. int sy = yofs[dy];
  1491. if (sy == prev_sy1)
  1492. {
  1493. // hresize one row
  1494. short* rows0_old = rows0;
  1495. rows0 = rows1;
  1496. rows1 = rows0_old;
  1497. const unsigned char *S1 = src + srcw * (sy+4);
  1498. const short* ialphap = ialpha;
  1499. short* rows1p = rows1;
  1500. for ( int dx = 0; dx < w; dx++ )
  1501. {
  1502. int sx = xofs[dx];
  1503. short a0 = ialphap[0];
  1504. short a1 = ialphap[1];
  1505. const unsigned char* S1p = S1 + sx;
  1506. #if __ARM_NEON
  1507. int16x4_t _a0 = vdup_n_s16(a0);
  1508. int16x4_t _a1 = vdup_n_s16(a1);
  1509. uint8x8_t _S1 = vld1_u8(S1p);
  1510. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  1511. int16x4_t _S1low = vget_low_s16(_S116);
  1512. int16x4_t _S1high = vget_high_s16(_S116);
  1513. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  1514. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  1515. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  1516. vst1_s16(rows1p, _rows1_sr4);
  1517. #else
  1518. rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
  1519. rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
  1520. rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
  1521. rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
  1522. #endif // __ARM_NEON
  1523. ialphap += 2;
  1524. rows1p += 4;
  1525. }
  1526. }
  1527. else
  1528. {
  1529. // hresize two rows
  1530. const unsigned char *S0 = src + srcw * (sy);
  1531. const unsigned char *S1 = src + srcw * (sy+4);
  1532. const short* ialphap = ialpha;
  1533. short* rows0p = rows0;
  1534. short* rows1p = rows1;
  1535. for ( int dx = 0; dx < w; dx++ )
  1536. {
  1537. int sx = xofs[dx];
  1538. short a0 = ialphap[0];
  1539. short a1 = ialphap[1];
  1540. const unsigned char* S0p = S0 + sx;
  1541. const unsigned char* S1p = S1 + sx;
  1542. #if __ARM_NEON
  1543. int16x4_t _a0 = vdup_n_s16(a0);
  1544. int16x4_t _a1 = vdup_n_s16(a1);
  1545. uint8x8_t _S0 = vld1_u8(S0p);
  1546. uint8x8_t _S1 = vld1_u8(S1p);
  1547. int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
  1548. int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
  1549. int16x4_t _S0low = vget_low_s16(_S016);
  1550. int16x4_t _S1low = vget_low_s16(_S116);
  1551. int16x4_t _S0high = vget_high_s16(_S016);
  1552. int16x4_t _S1high = vget_high_s16(_S116);
  1553. int32x4_t _rows0 = vmull_s16(_S0low, _a0);
  1554. int32x4_t _rows1 = vmull_s16(_S1low, _a0);
  1555. _rows0 = vmlal_s16(_rows0, _S0high, _a1);
  1556. _rows1 = vmlal_s16(_rows1, _S1high, _a1);
  1557. int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
  1558. int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
  1559. vst1_s16(rows0p, _rows0_sr4);
  1560. vst1_s16(rows1p, _rows1_sr4);
  1561. #else
  1562. rows0p[0] = (S0p[0]*a0 + S0p[4]*a1) >> 4;
  1563. rows0p[1] = (S0p[1]*a0 + S0p[5]*a1) >> 4;
  1564. rows0p[2] = (S0p[2]*a0 + S0p[6]*a1) >> 4;
  1565. rows0p[3] = (S0p[3]*a0 + S0p[7]*a1) >> 4;
  1566. rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
  1567. rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
  1568. rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
  1569. rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
  1570. #endif // __ARM_NEON
  1571. ialphap += 2;
  1572. rows0p += 4;
  1573. rows1p += 4;
  1574. }
  1575. }
  1576. prev_sy1 = sy + 1;
  1577. // vresize
  1578. short b0 = ibeta[0];
  1579. short b1 = ibeta[1];
  1580. short* rows0p = rows0;
  1581. short* rows1p = rows1;
  1582. unsigned char* Dp = dst + w * 4 * (dy);
  1583. #if __ARM_NEON
  1584. int nn = (w * 4) >> 3;
  1585. #else
  1586. int nn = 0;
  1587. #endif
  1588. int remain = (w * 4) - (nn << 3);
  1589. #if __ARM_NEON
  1590. #if __aarch64__
  1591. int16x4_t _b0 = vdup_n_s16(b0);
  1592. int16x4_t _b1 = vdup_n_s16(b1);
  1593. int32x4_t _v2 = vdupq_n_s32(2);
  1594. for (; nn>0; nn--)
  1595. {
  1596. int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
  1597. int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
  1598. int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
  1599. int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
  1600. int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
  1601. int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
  1602. int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
  1603. int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
  1604. int32x4_t _acc = _v2;
  1605. _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
  1606. _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
  1607. int32x4_t _acc_1 = _v2;
  1608. _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
  1609. _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
  1610. int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
  1611. int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
  1612. uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
  1613. vst1_u8(Dp, _D);
  1614. Dp += 8;
  1615. rows0p += 8;
  1616. rows1p += 8;
  1617. }
  1618. #else
  1619. if (nn > 0)
  1620. {
  1621. asm volatile(
  1622. "vdup.s16 d16, %8 \n"
  1623. "mov r4, #2 \n"
  1624. "vdup.s16 d17, %9 \n"
  1625. "vdup.s32 q12, r4 \n"
  1626. "pld [%0, #128] \n"
  1627. "vld1.s16 {d2-d3}, [%0 :128]!\n"
  1628. "pld [%1, #128] \n"
  1629. "vld1.s16 {d6-d7}, [%1 :128]!\n"
  1630. "0: \n"
  1631. "vmull.s16 q0, d2, d16 \n"
  1632. "vmull.s16 q1, d3, d16 \n"
  1633. "vorr.s32 q10, q12, q12 \n"
  1634. "vorr.s32 q11, q12, q12 \n"
  1635. "vmull.s16 q2, d6, d17 \n"
  1636. "vmull.s16 q3, d7, d17 \n"
  1637. "vsra.s32 q10, q0, #16 \n"
  1638. "vsra.s32 q11, q1, #16 \n"
  1639. "pld [%0, #128] \n"
  1640. "vld1.s32 {d2-d3}, [%0 :128]!\n"
  1641. "vsra.s32 q10, q2, #16 \n"
  1642. "vsra.s32 q11, q3, #16 \n"
  1643. "pld [%1, #128] \n"
  1644. "vld1.s32 {d6-d7}, [%1 :128]!\n"
  1645. "vshrn.s32 d20, q10, #2 \n"
  1646. "vshrn.s32 d21, q11, #2 \n"
  1647. "vqmovun.s16 d20, q10 \n"
  1648. "vst1.8 {d20}, [%2]! \n"
  1649. "subs %3, #1 \n"
  1650. "bne 0b \n"
  1651. "sub %0, #16 \n"
  1652. "sub %1, #16 \n"
  1653. : "=r"(rows0p), // %0
  1654. "=r"(rows1p), // %1
  1655. "=r"(Dp), // %2
  1656. "=r"(nn) // %3
  1657. : "0"(rows0p),
  1658. "1"(rows1p),
  1659. "2"(Dp),
  1660. "3"(nn),
  1661. "r"(b0), // %8
  1662. "r"(b1) // %9
  1663. : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
  1664. );
  1665. }
  1666. #endif // __aarch64__
  1667. #endif // __ARM_NEON
  1668. for ( ; remain; --remain )
  1669. {
  1670. // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
  1671. *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
  1672. }
  1673. ibeta += 2;
  1674. }
  1675. delete[] buf;
  1676. }
  1677. Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h)
  1678. {
  1679. if (type & PIXEL_CONVERT_MASK)
  1680. {
  1681. if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
  1682. return from_rgb2bgr(pixels, w, h);
  1683. if (type == PIXEL_RGB2GRAY)
  1684. return from_rgb2gray(pixels, w, h);
  1685. if (type == PIXEL_BGR2GRAY)
  1686. return from_bgr2gray(pixels, w, h);
  1687. if (type == PIXEL_GRAY2RGB || type == PIXEL_GRAY2BGR)
  1688. return from_gray2rgb(pixels, w, h);
  1689. if (type == PIXEL_RGBA2RGB)
  1690. return from_rgba2rgb(pixels, w, h);
  1691. if (type == PIXEL_RGBA2BGR)
  1692. return from_rgba2bgr(pixels, w, h);
  1693. if (type == PIXEL_RGBA2GRAY)
  1694. return from_rgba2gray(pixels, w, h);
  1695. }
  1696. else
  1697. {
  1698. if (type == PIXEL_RGB || type == PIXEL_BGR)
  1699. return from_rgb(pixels, w, h);
  1700. if (type == PIXEL_GRAY)
  1701. return from_gray(pixels, w, h);
  1702. if (type == PIXEL_RGBA)
  1703. return from_rgba(pixels, w, h);
  1704. }
  1705. return Mat();
  1706. }
  1707. Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height)
  1708. {
  1709. if (w == target_width && h == target_height)
  1710. return Mat::from_pixels(pixels, type, w, h);
  1711. Mat m;
  1712. int type_from = type & PIXEL_FORMAT_MASK;
  1713. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  1714. {
  1715. unsigned char* dst = new unsigned char[target_width * target_height * 3];
  1716. resize_bilinear_c3(pixels, w, h, dst, target_width, target_height);
  1717. m = Mat::from_pixels(dst, type, target_width, target_height);
  1718. delete[] dst;
  1719. }
  1720. else if (type_from == PIXEL_GRAY)
  1721. {
  1722. unsigned char* dst = new unsigned char[target_width * target_height];
  1723. resize_bilinear_c1(pixels, w, h, dst, target_width, target_height);
  1724. m = Mat::from_pixels(dst, type, target_width, target_height);
  1725. delete[] dst;
  1726. }
  1727. else if (type_from == PIXEL_RGBA)
  1728. {
  1729. unsigned char* dst = new unsigned char[target_width * target_height * 4];
  1730. resize_bilinear_c4(pixels, w, h, dst, target_width, target_height);
  1731. m = Mat::from_pixels(dst, type, target_width, target_height);
  1732. delete[] dst;
  1733. }
  1734. return m;
  1735. }
  1736. void Mat::to_pixels(unsigned char* pixels, int type)
  1737. {
  1738. if (type & PIXEL_CONVERT_MASK)
  1739. {
  1740. if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
  1741. return to_bgr2rgb(*this, pixels);
  1742. }
  1743. else
  1744. {
  1745. if (type == PIXEL_RGB || type == PIXEL_BGR)
  1746. return to_rgb(*this, pixels);
  1747. if (type == PIXEL_GRAY)
  1748. return to_gray(*this, pixels);
  1749. if (type == PIXEL_RGBA)
  1750. return to_rgba(*this, pixels);
  1751. }
  1752. }
  1753. void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height)
  1754. {
  1755. if (w == target_width && h == target_height)
  1756. return to_pixels(pixels, type);
  1757. int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
  1758. if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
  1759. {
  1760. unsigned char* src = new unsigned char[w * h * 3];
  1761. to_pixels(src, type);
  1762. resize_bilinear_c3(src, w, h, pixels, target_width, target_height);
  1763. delete[] src;
  1764. }
  1765. else if (type_to == PIXEL_GRAY)
  1766. {
  1767. unsigned char* src = new unsigned char[w * h];
  1768. to_pixels(src, type);
  1769. resize_bilinear_c1(src, w, h, pixels, target_width, target_height);
  1770. delete[] src;
  1771. }
  1772. else if (type_to == PIXEL_RGBA)
  1773. {
  1774. unsigned char* src = new unsigned char[w * h * 4];
  1775. to_pixels(src, type);
  1776. resize_bilinear_c4(src, w, h, pixels, target_width, target_height);
  1777. delete[] src;
  1778. }
  1779. }
  1780. } // namespace ncnn