You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

mat_pixel.cpp 73 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #include <limits.h>
  16. #include <math.h>
  17. #include <algorithm>
  18. #if __ARM_NEON
  19. #include <arm_neon.h>
  20. #endif // __ARM_NEON
  21. #include "platform.h"
  22. namespace ncnn {
  23. #if NCNN_PIXEL
  24. static int from_rgb(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
  25. {
  26. m.create(w, h, 3, 4u, allocator);
  27. if (m.empty())
  28. return -100;
  29. const int wgap = stride - w * 3;
  30. if (wgap == 0)
  31. {
  32. w = w * h;
  33. h = 1;
  34. }
  35. float* ptr0 = m.channel(0);
  36. float* ptr1 = m.channel(1);
  37. float* ptr2 = m.channel(2);
  38. for (int y=0; y<h; y++)
  39. {
  40. #if __ARM_NEON
  41. int nn = w >> 3;
  42. int remain = w - (nn << 3);
  43. #else
  44. int remain = w;
  45. #endif // __ARM_NEON
  46. #if __ARM_NEON
  47. #if __aarch64__
  48. for (; nn>0; nn--)
  49. {
  50. uint8x8x3_t _rgb = vld3_u8(rgb);
  51. uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
  52. uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
  53. uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
  54. float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
  55. float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
  56. float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
  57. float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
  58. float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
  59. float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
  60. vst1q_f32(ptr0, _rlow);
  61. vst1q_f32(ptr0+4, _rhigh);
  62. vst1q_f32(ptr1, _glow);
  63. vst1q_f32(ptr1+4, _ghigh);
  64. vst1q_f32(ptr2, _blow);
  65. vst1q_f32(ptr2+4, _bhigh);
  66. rgb += 3*8;
  67. ptr0 += 8;
  68. ptr1 += 8;
  69. ptr2 += 8;
  70. }
  71. #else
  72. if (nn > 0)
  73. {
  74. asm volatile(
  75. "0: \n"
  76. "pld [%1, #256] \n"
  77. "vld3.u8 {d0-d2}, [%1]! \n"
  78. "vmovl.u8 q8, d0 \n"
  79. "vmovl.u8 q9, d1 \n"
  80. "vmovl.u8 q10, d2 \n"
  81. "vmovl.u16 q0, d16 \n"
  82. "vmovl.u16 q1, d17 \n"
  83. "vmovl.u16 q2, d18 \n"
  84. "vmovl.u16 q3, d19 \n"
  85. "vmovl.u16 q8, d20 \n"
  86. "vmovl.u16 q9, d21 \n"
  87. "vcvt.f32.u32 q0, q0 \n"
  88. "vcvt.f32.u32 q1, q1 \n"
  89. "vcvt.f32.u32 q2, q2 \n"
  90. "vcvt.f32.u32 q3, q3 \n"
  91. "vcvt.f32.u32 q8, q8 \n"
  92. "subs %0, #1 \n"
  93. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  94. "vcvt.f32.u32 q9, q9 \n"
  95. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  96. "vst1.f32 {d16-d19}, [%4 :128]!\n"
  97. "bne 0b \n"
  98. : "=r"(nn), // %0
  99. "=r"(rgb), // %1
  100. "=r"(ptr0), // %2
  101. "=r"(ptr1), // %3
  102. "=r"(ptr2) // %4
  103. : "0"(nn),
  104. "1"(rgb),
  105. "2"(ptr0),
  106. "3"(ptr1),
  107. "4"(ptr2)
  108. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  109. );
  110. }
  111. #endif // __aarch64__
  112. #endif // __ARM_NEON
  113. for (; remain>0; remain--)
  114. {
  115. *ptr0 = rgb[0];
  116. *ptr1 = rgb[1];
  117. *ptr2 = rgb[2];
  118. rgb += 3;
  119. ptr0++;
  120. ptr1++;
  121. ptr2++;
  122. }
  123. rgb += wgap;
  124. }
  125. return 0;
  126. }
  127. static void to_rgb(const Mat& m, unsigned char* rgb, int stride)
  128. {
  129. int w = m.w;
  130. int h = m.h;
  131. const int wgap = stride - w * 3;
  132. if (wgap == 0)
  133. {
  134. w = w * h;
  135. h = 1;
  136. }
  137. const float* ptr0 = m.channel(0);
  138. const float* ptr1 = m.channel(1);
  139. const float* ptr2 = m.channel(2);
  140. for (int y=0; y<h; y++)
  141. {
  142. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  143. #if __ARM_NEON
  144. int nn = w >> 3;
  145. int remain = w - (nn << 3);
  146. #else
  147. int remain = w;
  148. #endif // __ARM_NEON
  149. #if __ARM_NEON
  150. for (; nn>0; nn--)
  151. {
  152. float32x4_t _rlow = vld1q_f32(ptr0);
  153. float32x4_t _rhigh = vld1q_f32(ptr0+4);
  154. float32x4_t _glow = vld1q_f32(ptr1);
  155. float32x4_t _ghigh = vld1q_f32(ptr1+4);
  156. float32x4_t _blow = vld1q_f32(ptr2);
  157. float32x4_t _bhigh = vld1q_f32(ptr2+4);
  158. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  159. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  160. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  161. uint8x8x3_t _rgb;
  162. _rgb.val[0] = vqmovun_s16(_r16);
  163. _rgb.val[1] = vqmovun_s16(_g16);
  164. _rgb.val[2] = vqmovun_s16(_b16);
  165. vst3_u8(rgb, _rgb);
  166. rgb += 3*8;
  167. ptr0 += 8;
  168. ptr1 += 8;
  169. ptr2 += 8;
  170. }
  171. #endif // __ARM_NEON
  172. for (; remain>0; remain--)
  173. {
  174. rgb[0] = SATURATE_CAST_UCHAR(*ptr0);
  175. rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
  176. rgb[2] = SATURATE_CAST_UCHAR(*ptr2);
  177. rgb += 3;
  178. ptr0++;
  179. ptr1++;
  180. ptr2++;
  181. }
  182. #undef SATURATE_CAST_UCHAR
  183. rgb += wgap;
  184. }
  185. }
  186. static int from_gray(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
  187. {
  188. m.create(w, h, 1, 4u, allocator);
  189. if (m.empty())
  190. return -100;
  191. const int wgap = stride - w;
  192. if (wgap == 0)
  193. {
  194. w = w * h;
  195. h = 1;
  196. }
  197. float* ptr = m;
  198. for (int y=0; y<h; y++)
  199. {
  200. #if __ARM_NEON
  201. int nn = w >> 4;
  202. int remain = w - (nn << 4);
  203. #else
  204. int remain = w;
  205. #endif // __ARM_NEON
  206. #if __ARM_NEON
  207. #if __aarch64__
  208. for (; nn>0; nn--)
  209. {
  210. uint8x16_t _gray = vld1q_u8(gray);
  211. uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
  212. uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
  213. float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
  214. float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
  215. float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
  216. float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
  217. vst1q_f32(ptr, _graylow_0);
  218. vst1q_f32(ptr+4, _grayhigh_0);
  219. vst1q_f32(ptr+8, _graylow_1);
  220. vst1q_f32(ptr+12, _grayhigh_1);
  221. gray += 16;
  222. ptr += 16;
  223. }
  224. #else
  225. if (nn > 0)
  226. {
  227. asm volatile(
  228. "0: \n"
  229. "pld [%1, #128] \n"
  230. "vld1.u8 {d0,d1}, [%1]! \n"
  231. "vmovl.u8 q8, d0 \n"
  232. "vmovl.u8 q9, d1 \n"
  233. "vmovl.u16 q0, d16 \n"
  234. "vmovl.u16 q1, d17 \n"
  235. "vmovl.u16 q2, d18 \n"
  236. "vmovl.u16 q3, d19 \n"
  237. "vcvt.f32.u32 q0, q0 \n"
  238. "vcvt.f32.u32 q1, q1 \n"
  239. "vcvt.f32.u32 q2, q2 \n"
  240. "vcvt.f32.u32 q3, q3 \n"
  241. "subs %0, #1 \n"
  242. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  243. "vst1.f32 {d4-d7}, [%2 :128]! \n"
  244. "bne 0b \n"
  245. : "=r"(nn), // %0
  246. "=r"(gray), // %1
  247. "=r"(ptr) // %2
  248. : "0"(nn),
  249. "1"(gray),
  250. "2"(ptr)
  251. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
  252. );
  253. }
  254. #endif // __aarch64__
  255. #endif // __ARM_NEON
  256. for (; remain>0; remain--)
  257. {
  258. *ptr = *gray;
  259. gray++;
  260. ptr++;
  261. }
  262. gray += wgap;
  263. }
  264. return 0;
  265. }
  266. static void to_gray(const Mat& m, unsigned char* gray, int stride)
  267. {
  268. int w = m.w;
  269. int h = m.h;
  270. const int wgap = stride - w;
  271. if (wgap == 0)
  272. {
  273. w = w * h;
  274. h = 1;
  275. }
  276. const float* ptr = m;
  277. for (int y=0; y<h; y++)
  278. {
  279. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  280. #if __ARM_NEON
  281. int nn = w >> 3;
  282. int remain = w - (nn << 3);
  283. #else
  284. int remain = w;
  285. #endif // __ARM_NEON
  286. #if __ARM_NEON
  287. for (; nn>0; nn--)
  288. {
  289. float32x4_t _glow = vld1q_f32(ptr);
  290. float32x4_t _ghigh = vld1q_f32(ptr+4);
  291. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  292. uint8x8_t _gray = vqmovun_s16(_g16);
  293. vst1_u8(gray, _gray);
  294. gray += 8;
  295. ptr += 8;
  296. }
  297. #endif // __ARM_NEON
  298. for (; remain>0; remain--)
  299. {
  300. *gray = SATURATE_CAST_UCHAR(*ptr);
  301. gray++;
  302. ptr++;
  303. }
  304. #undef SATURATE_CAST_UCHAR
  305. gray += wgap;
  306. }
  307. }
  308. static int from_rgba(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  309. {
  310. m.create(w, h, 4, 4u, allocator);
  311. if (m.empty())
  312. return -100;
  313. const int wgap = stride - w * 4;
  314. if (wgap == 0)
  315. {
  316. w = w * h;
  317. h = 1;
  318. }
  319. float* ptr0 = m.channel(0);
  320. float* ptr1 = m.channel(1);
  321. float* ptr2 = m.channel(2);
  322. float* ptr3 = m.channel(3);
  323. for (int y=0; y<h; y++)
  324. {
  325. #if __ARM_NEON
  326. int nn = w >> 3;
  327. int remain = w - (nn << 3);
  328. #else
  329. int remain = w;
  330. #endif // __ARM_NEON
  331. #if __ARM_NEON
  332. #if __aarch64__
  333. for (; nn>0; nn--)
  334. {
  335. uint8x8x4_t _rgba = vld4_u8(rgba);
  336. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  337. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  338. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  339. int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));
  340. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  341. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  342. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  343. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  344. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  345. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  346. float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
  347. float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));
  348. vst1q_f32(ptr0, _rlow);
  349. vst1q_f32(ptr0+4, _rhigh);
  350. vst1q_f32(ptr1, _glow);
  351. vst1q_f32(ptr1+4, _ghigh);
  352. vst1q_f32(ptr2, _blow);
  353. vst1q_f32(ptr2+4, _bhigh);
  354. vst1q_f32(ptr3, _alow);
  355. vst1q_f32(ptr3+4, _ahigh);
  356. rgba += 4*8;
  357. ptr0 += 8;
  358. ptr1 += 8;
  359. ptr2 += 8;
  360. ptr3 += 8;
  361. }
  362. #else
  363. if (nn > 0)
  364. {
  365. asm volatile(
  366. "0: \n"
  367. "pld [%1, #256] \n"
  368. "vld4.u8 {d0-d3}, [%1]! \n"
  369. "vmovl.u8 q8, d0 \n"
  370. "vmovl.u8 q9, d1 \n"
  371. "vmovl.u8 q10, d2 \n"
  372. "vmovl.u8 q11, d3 \n"
  373. "vmovl.u16 q0, d16 \n"
  374. "vmovl.u16 q1, d17 \n"
  375. "vmovl.u16 q2, d18 \n"
  376. "vmovl.u16 q3, d19 \n"
  377. "vmovl.u16 q8, d20 \n"
  378. "vmovl.u16 q9, d21 \n"
  379. "vmovl.u16 q10, d22 \n"
  380. "vmovl.u16 q11, d23 \n"
  381. "vcvt.f32.u32 q0, q0 \n"
  382. "vcvt.f32.u32 q1, q1 \n"
  383. "vcvt.f32.u32 q2, q2 \n"
  384. "vcvt.f32.u32 q3, q3 \n"
  385. "vcvt.f32.u32 q8, q8 \n"
  386. "vcvt.f32.u32 q9, q9 \n"
  387. "subs %0, #1 \n"
  388. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  389. "vcvt.f32.u32 q10, q10 \n"
  390. "vcvt.f32.u32 q11, q11 \n"
  391. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  392. "vst1.f32 {d16-d19}, [%4 :128]!\n"
  393. "vst1.f32 {d20-d23}, [%5 :128]!\n"
  394. "bne 0b \n"
  395. : "=r"(nn), // %0
  396. "=r"(rgba), // %1
  397. "=r"(ptr0), // %2
  398. "=r"(ptr1), // %3
  399. "=r"(ptr2), // %4
  400. "=r"(ptr3) // %5
  401. : "0"(nn),
  402. "1"(rgba),
  403. "2"(ptr0),
  404. "3"(ptr1),
  405. "4"(ptr2),
  406. "5"(ptr3)
  407. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  408. );
  409. }
  410. #endif // __aarch64__
  411. #endif // __ARM_NEON
  412. for (; remain>0; remain--)
  413. {
  414. *ptr0 = rgba[0];
  415. *ptr1 = rgba[1];
  416. *ptr2 = rgba[2];
  417. *ptr3 = rgba[3];
  418. rgba += 4;
  419. ptr0++;
  420. ptr1++;
  421. ptr2++;
  422. ptr3++;
  423. }
  424. rgba += wgap;
  425. }
  426. return 0;
  427. }
  428. static void to_rgba(const Mat& m, unsigned char* rgba, int stride)
  429. {
  430. int w = m.w;
  431. int h = m.h;
  432. const int wgap = stride - w * 4;
  433. if (wgap == 0)
  434. {
  435. w = w * h;
  436. h = 1;
  437. }
  438. const float* ptr0 = m.channel(0);
  439. const float* ptr1 = m.channel(1);
  440. const float* ptr2 = m.channel(2);
  441. const float* ptr3 = m.channel(3);
  442. for (int y=0; y<h; y++)
  443. {
  444. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  445. #if __ARM_NEON
  446. int nn = w >> 3;
  447. int remain = w - (nn << 3);
  448. #else
  449. int remain = w;
  450. #endif // __ARM_NEON
  451. #if __ARM_NEON
  452. for (; nn>0; nn--)
  453. {
  454. float32x4_t _rlow = vld1q_f32(ptr0);
  455. float32x4_t _rhigh = vld1q_f32(ptr0+4);
  456. float32x4_t _glow = vld1q_f32(ptr1);
  457. float32x4_t _ghigh = vld1q_f32(ptr1+4);
  458. float32x4_t _blow = vld1q_f32(ptr2);
  459. float32x4_t _bhigh = vld1q_f32(ptr2+4);
  460. float32x4_t _alow = vld1q_f32(ptr3);
  461. float32x4_t _ahigh = vld1q_f32(ptr3+4);
  462. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  463. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  464. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  465. int16x8_t _a16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_alow)), vmovn_s32(vcvtq_s32_f32(_ahigh)));
  466. uint8x8x4_t _rgba;
  467. _rgba.val[0] = vqmovun_s16(_r16);
  468. _rgba.val[1] = vqmovun_s16(_g16);
  469. _rgba.val[2] = vqmovun_s16(_b16);
  470. _rgba.val[3] = vqmovun_s16(_a16);
  471. vst4_u8(rgba, _rgba);
  472. rgba += 4*8;
  473. ptr0 += 8;
  474. ptr1 += 8;
  475. ptr2 += 8;
  476. ptr3 += 8;
  477. }
  478. #endif // __ARM_NEON
  479. for (; remain>0; remain--)
  480. {
  481. rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
  482. rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
  483. rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
  484. rgba[3] = SATURATE_CAST_UCHAR(*ptr3);
  485. rgba += 4;
  486. ptr0++;
  487. ptr1++;
  488. ptr2++;
  489. ptr3++;
  490. }
  491. #undef SATURATE_CAST_UCHAR
  492. rgba += wgap;
  493. }
  494. }
  495. static int from_rgb2bgr(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
  496. {
  497. m.create(w, h, 3, 4u, allocator);
  498. if (m.empty())
  499. return -100;
  500. const int wgap = stride - w * 3;
  501. if (wgap == 0)
  502. {
  503. w = w * h;
  504. h = 1;
  505. }
  506. float* ptr0 = m.channel(0);
  507. float* ptr1 = m.channel(1);
  508. float* ptr2 = m.channel(2);
  509. for (int y=0; y<h; y++)
  510. {
  511. #if __ARM_NEON
  512. int nn = w >> 3;
  513. int remain = w - (nn << 3);
  514. #else
  515. int remain = w;
  516. #endif // __ARM_NEON
  517. #if __ARM_NEON
  518. #if __aarch64__
  519. for (; nn>0; nn--)
  520. {
  521. uint8x8x3_t _rgb = vld3_u8(rgb);
  522. uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
  523. uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
  524. uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
  525. float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
  526. float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
  527. float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
  528. float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
  529. float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
  530. float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
  531. vst1q_f32(ptr2, _rlow);
  532. vst1q_f32(ptr2+4, _rhigh);
  533. vst1q_f32(ptr1, _glow);
  534. vst1q_f32(ptr1+4, _ghigh);
  535. vst1q_f32(ptr0, _blow);
  536. vst1q_f32(ptr0+4, _bhigh);
  537. rgb += 3*8;
  538. ptr0 += 8;
  539. ptr1 += 8;
  540. ptr2 += 8;
  541. }
  542. #else
  543. if (nn > 0)
  544. {
  545. asm volatile(
  546. "0: \n"
  547. "pld [%1, #256] \n"
  548. "vld3.u8 {d0-d2}, [%1]! \n"
  549. "vmovl.u8 q8, d0 \n"
  550. "vmovl.u8 q9, d1 \n"
  551. "vmovl.u8 q10, d2 \n"
  552. "vmovl.u16 q0, d16 \n"
  553. "vmovl.u16 q1, d17 \n"
  554. "vmovl.u16 q2, d18 \n"
  555. "vmovl.u16 q3, d19 \n"
  556. "vmovl.u16 q8, d20 \n"
  557. "vmovl.u16 q9, d21 \n"
  558. "vcvt.f32.u32 q0, q0 \n"
  559. "vcvt.f32.u32 q1, q1 \n"
  560. "vcvt.f32.u32 q2, q2 \n"
  561. "vcvt.f32.u32 q3, q3 \n"
  562. "vcvt.f32.u32 q8, q8 \n"
  563. "subs %0, #1 \n"
  564. "vst1.f32 {d0-d3}, [%4 :128]! \n"
  565. "vcvt.f32.u32 q9, q9 \n"
  566. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  567. "vst1.f32 {d16-d19}, [%2 :128]!\n"
  568. "bne 0b \n"
  569. : "=r"(nn), // %0
  570. "=r"(rgb), // %1
  571. "=r"(ptr0), // %2
  572. "=r"(ptr1), // %3
  573. "=r"(ptr2) // %4
  574. : "0"(nn),
  575. "1"(rgb),
  576. "2"(ptr0),
  577. "3"(ptr1),
  578. "4"(ptr2)
  579. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  580. );
  581. }
  582. #endif // __aarch64__
  583. #endif // __ARM_NEON
  584. for (; remain>0; remain--)
  585. {
  586. *ptr0 = rgb[2];
  587. *ptr1 = rgb[1];
  588. *ptr2 = rgb[0];
  589. rgb += 3;
  590. ptr0++;
  591. ptr1++;
  592. ptr2++;
  593. }
  594. rgb += wgap;
  595. }
  596. return 0;
  597. }
  598. static void to_bgr2rgb(const Mat& m, unsigned char* rgb, int stride)
  599. {
  600. int w = m.w;
  601. int h = m.h;
  602. const int wgap = stride - w * 3;
  603. if (wgap == 0)
  604. {
  605. w = w * h;
  606. h = 1;
  607. }
  608. const float* ptr0 = m.channel(0);
  609. const float* ptr1 = m.channel(1);
  610. const float* ptr2 = m.channel(2);
  611. for (int y=0; y<h; y++)
  612. {
  613. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  614. #if __ARM_NEON
  615. int nn = w >> 3;
  616. int remain = w - (nn << 3);
  617. #else
  618. int remain = w;
  619. #endif // __ARM_NEON
  620. #if __ARM_NEON
  621. for (; nn>0; nn--)
  622. {
  623. float32x4_t _rlow = vld1q_f32(ptr2);
  624. float32x4_t _rhigh = vld1q_f32(ptr2+4);
  625. float32x4_t _glow = vld1q_f32(ptr1);
  626. float32x4_t _ghigh = vld1q_f32(ptr1+4);
  627. float32x4_t _blow = vld1q_f32(ptr0);
  628. float32x4_t _bhigh = vld1q_f32(ptr0+4);
  629. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  630. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  631. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  632. uint8x8x3_t _rgb;
  633. _rgb.val[0] = vqmovun_s16(_r16);
  634. _rgb.val[1] = vqmovun_s16(_g16);
  635. _rgb.val[2] = vqmovun_s16(_b16);
  636. vst3_u8(rgb, _rgb);
  637. rgb += 3*8;
  638. ptr0 += 8;
  639. ptr1 += 8;
  640. ptr2 += 8;
  641. }
  642. #endif // __ARM_NEON
  643. for (; remain>0; remain--)
  644. {
  645. rgb[2] = SATURATE_CAST_UCHAR(*ptr0);
  646. rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
  647. rgb[0] = SATURATE_CAST_UCHAR(*ptr2);
  648. rgb += 3;
  649. ptr0++;
  650. ptr1++;
  651. ptr2++;
  652. }
  653. #undef SATURATE_CAST_UCHAR
  654. rgb += wgap;
  655. }
  656. }
  657. static int from_rgb2gray(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
  658. {
  659. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  660. const unsigned char Y_shift = 8;//14
  661. const unsigned char R2Y = 77;
  662. const unsigned char G2Y = 150;
  663. const unsigned char B2Y = 29;
  664. m.create(w, h, 1, 4u, allocator);
  665. if (m.empty())
  666. return -100;
  667. const int wgap = stride - w * 3;
  668. if (wgap == 0)
  669. {
  670. w = w * h;
  671. h = 1;
  672. }
  673. float* ptr = m;
  674. for (int y=0; y<h; y++)
  675. {
  676. #if __ARM_NEON
  677. int nn = w >> 3;
  678. int remain = w - (nn << 3);
  679. #else
  680. int remain = w;
  681. #endif // __ARM_NEON
  682. #if __ARM_NEON
  683. #if __aarch64__
  684. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  685. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  686. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  687. for (; nn>0; nn--)
  688. {
  689. uint8x8x3_t _rgb = vld3_u8(rgb);
  690. uint16x8_t _y16 = vmull_u8(_rgb.val[0], _R2Y);
  691. _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
  692. _y16 = vmlal_u8(_y16, _rgb.val[2], _B2Y);
  693. _y16 = vshrq_n_u16(_y16, Y_shift);
  694. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  695. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  696. vst1q_f32(ptr, _ylow);
  697. vst1q_f32(ptr+4, _yhigh);
  698. rgb += 3*8;
  699. ptr += 8;
  700. }
  701. #else
  702. if (nn > 0)
  703. {
  704. asm volatile(
  705. "vdup.u8 d16, %6 \n"
  706. "vdup.u8 d17, %7 \n"
  707. "vdup.u8 d18, %8 \n"
  708. "0: \n"
  709. "pld [%1, #256] \n"
  710. "vld3.u8 {d0-d2}, [%1]! \n"
  711. "vmull.u8 q2, d0, d16 \n"
  712. "vmlal.u8 q2, d1, d17 \n"
  713. "vmlal.u8 q2, d2, d18 \n"
  714. "vshr.u16 q2, q2, #8 \n" // Y_shift
  715. "vmovl.u16 q0, d4 \n"
  716. "vmovl.u16 q1, d5 \n"
  717. "vcvt.f32.u32 q0, q0 \n"
  718. "vcvt.f32.u32 q1, q1 \n"
  719. "subs %0, #1 \n"
  720. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  721. "bne 0b \n"
  722. : "=r"(nn), // %0
  723. "=r"(rgb), // %1
  724. "=r"(ptr) // %2
  725. : "0"(nn),
  726. "1"(rgb),
  727. "2"(ptr),
  728. "r"(R2Y), // %6
  729. "r"(G2Y), // %7
  730. "r"(B2Y) // %8
  731. : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
  732. );
  733. }
  734. #endif // __aarch64__
  735. #endif // __ARM_NEON
  736. for (; remain>0; remain--)
  737. {
  738. *ptr = static_cast<float>((rgb[0] * R2Y + rgb[1] * G2Y + rgb[2] * B2Y) >> Y_shift);
  739. rgb += 3;
  740. ptr++;
  741. }
  742. rgb += wgap;
  743. }
  744. return 0;
  745. }
  746. static int from_rgb2rgba(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
  747. {
  748. m.create(w, h, 4, 4u, allocator);
  749. if (m.empty())
  750. return -100;
  751. Mat rgb_channels = m.channel_range(0, 3);
  752. from_rgb(rgb, w, h, stride, rgb_channels, allocator);
  753. Mat alpha_channel = m.channel(3);
  754. alpha_channel.fill(255.f);
  755. return 0;
  756. }
  757. static void to_rgb2rgba(const Mat& m, unsigned char* rgba, int stride)
  758. {
  759. int w = m.w;
  760. int h = m.h;
  761. const int wgap = stride - w * 4;
  762. if (wgap == 0)
  763. {
  764. w = w * h;
  765. h = 1;
  766. }
  767. const float* ptr0 = m.channel(0);
  768. const float* ptr1 = m.channel(1);
  769. const float* ptr2 = m.channel(2);
  770. for (int y=0; y<h; y++)
  771. {
  772. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  773. #if __ARM_NEON
  774. int nn = w >> 3;
  775. int remain = w - (nn << 3);
  776. #else
  777. int remain = w;
  778. #endif // __ARM_NEON
  779. #if __ARM_NEON
  780. uint8x8_t _a = vdup_n_u8(255);
  781. for (; nn>0; nn--)
  782. {
  783. float32x4_t _rlow = vld1q_f32(ptr0);
  784. float32x4_t _rhigh = vld1q_f32(ptr0+4);
  785. float32x4_t _glow = vld1q_f32(ptr1);
  786. float32x4_t _ghigh = vld1q_f32(ptr1+4);
  787. float32x4_t _blow = vld1q_f32(ptr2);
  788. float32x4_t _bhigh = vld1q_f32(ptr2+4);
  789. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  790. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  791. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  792. uint8x8x4_t _rgba;
  793. _rgba.val[0] = vqmovun_s16(_r16);
  794. _rgba.val[1] = vqmovun_s16(_g16);
  795. _rgba.val[2] = vqmovun_s16(_b16);
  796. _rgba.val[3] = _a;
  797. vst4_u8(rgba, _rgba);
  798. rgba += 4*8;
  799. ptr0 += 8;
  800. ptr1 += 8;
  801. ptr2 += 8;
  802. }
  803. #endif // __ARM_NEON
  804. for (; remain>0; remain--)
  805. {
  806. rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
  807. rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
  808. rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
  809. rgba[3] = 255;
  810. rgba += 4;
  811. ptr0++;
  812. ptr1++;
  813. ptr2++;
  814. }
  815. #undef SATURATE_CAST_UCHAR
  816. rgba += wgap;
  817. }
  818. }
  819. static int from_bgr2gray(const unsigned char* bgr, int w, int h, int stride, Mat& m, Allocator* allocator)
  820. {
  821. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  822. const unsigned char Y_shift = 8;//14
  823. const unsigned char R2Y = 77;
  824. const unsigned char G2Y = 150;
  825. const unsigned char B2Y = 29;
  826. m.create(w, h, 1, 4u, allocator);
  827. if (m.empty())
  828. return -100;
  829. const int wgap = stride - w * 3;
  830. if (wgap == 0)
  831. {
  832. w = w * h;
  833. h = 1;
  834. }
  835. float* ptr = m;
  836. for (int y=0; y<h; y++)
  837. {
  838. #if __ARM_NEON
  839. int nn = w >> 3;
  840. int remain = w - (nn << 3);
  841. #else
  842. int remain = w;
  843. #endif // __ARM_NEON
  844. #if __ARM_NEON
  845. #if __aarch64__
  846. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  847. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  848. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  849. for (; nn>0; nn--)
  850. {
  851. uint8x8x3_t _rgb = vld3_u8(bgr);
  852. uint16x8_t _y16 = vmull_u8(_rgb.val[2], _R2Y);
  853. _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
  854. _y16 = vmlal_u8(_y16, _rgb.val[0], _B2Y);
  855. _y16 = vshrq_n_u16(_y16, Y_shift);
  856. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  857. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  858. vst1q_f32(ptr, _ylow);
  859. vst1q_f32(ptr+4, _yhigh);
  860. bgr += 3*8;
  861. ptr += 8;
  862. }
  863. #else
  864. if (nn > 0)
  865. {
  866. asm volatile(
  867. "vdup.u8 d16, %6 \n"
  868. "vdup.u8 d17, %7 \n"
  869. "vdup.u8 d18, %8 \n"
  870. "0: \n"
  871. "pld [%1, #256] \n"
  872. "vld3.u8 {d0-d2}, [%1]! \n"
  873. "vmull.u8 q2, d2, d16 \n"
  874. "vmlal.u8 q2, d1, d17 \n"
  875. "vmlal.u8 q2, d0, d18 \n"
  876. "vshr.u16 q2, q2, #8 \n" // Y_shift
  877. "vmovl.u16 q0, d4 \n"
  878. "vmovl.u16 q1, d5 \n"
  879. "vcvt.f32.u32 q0, q0 \n"
  880. "vcvt.f32.u32 q1, q1 \n"
  881. "subs %0, #1 \n"
  882. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  883. "bne 0b \n"
  884. : "=r"(nn), // %0
  885. "=r"(bgr), // %1
  886. "=r"(ptr) // %2
  887. : "0"(nn),
  888. "1"(bgr),
  889. "2"(ptr),
  890. "r"(R2Y), // %6
  891. "r"(G2Y), // %7
  892. "r"(B2Y) // %8
  893. : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
  894. );
  895. }
  896. #endif // __aarch64__
  897. #endif // __ARM_NEON
  898. for (; remain>0; remain--)
  899. {
  900. *ptr = static_cast<float>((bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift);
  901. bgr += 3;
  902. ptr++;
  903. }
  904. bgr += wgap;
  905. }
  906. return 0;
  907. }
  908. static int from_bgr2rgba(const unsigned char* bgr, int w, int h, int stride, Mat& m, Allocator* allocator)
  909. {
  910. m.create(w, h, 4, 4u, allocator);
  911. if (m.empty())
  912. return -100;
  913. Mat rgb_channels = m.channel_range(0, 3);
  914. from_rgb2bgr(bgr, w, h, stride, rgb_channels, allocator);
  915. Mat alpha_channel = m.channel(3);
  916. alpha_channel.fill(255.f);
  917. return 0;
  918. }
  919. static void to_bgr2rgba(const Mat& m, unsigned char* rgba, int stride)
  920. {
  921. int w = m.w;
  922. int h = m.h;
  923. const int wgap = stride - w * 4;
  924. if (wgap == 0)
  925. {
  926. w = w * h;
  927. h = 1;
  928. }
  929. const float* ptr0 = m.channel(0);
  930. const float* ptr1 = m.channel(1);
  931. const float* ptr2 = m.channel(2);
  932. for (int y=0; y<h; y++)
  933. {
  934. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  935. #if __ARM_NEON
  936. int nn = w >> 3;
  937. int remain = w - (nn << 3);
  938. #else
  939. int remain = w;
  940. #endif // __ARM_NEON
  941. #if __ARM_NEON
  942. uint8x8_t _a = vdup_n_u8(255);
  943. for (; nn>0; nn--)
  944. {
  945. float32x4_t _rlow = vld1q_f32(ptr2);
  946. float32x4_t _rhigh = vld1q_f32(ptr2+4);
  947. float32x4_t _glow = vld1q_f32(ptr1);
  948. float32x4_t _ghigh = vld1q_f32(ptr1+4);
  949. float32x4_t _blow = vld1q_f32(ptr0);
  950. float32x4_t _bhigh = vld1q_f32(ptr0+4);
  951. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  952. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  953. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  954. uint8x8x4_t _rgba;
  955. _rgba.val[0] = vqmovun_s16(_r16);
  956. _rgba.val[1] = vqmovun_s16(_g16);
  957. _rgba.val[2] = vqmovun_s16(_b16);
  958. _rgba.val[3] = _a;
  959. vst4_u8(rgba, _rgba);
  960. rgba += 4*8;
  961. ptr0 += 8;
  962. ptr1 += 8;
  963. ptr2 += 8;
  964. }
  965. #endif // __ARM_NEON
  966. for (; remain>0; remain--)
  967. {
  968. rgba[0] = SATURATE_CAST_UCHAR(*ptr2);
  969. rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
  970. rgba[2] = SATURATE_CAST_UCHAR(*ptr0);
  971. rgba[3] = 255;
  972. rgba += 4;
  973. ptr0++;
  974. ptr1++;
  975. ptr2++;
  976. }
  977. #undef SATURATE_CAST_UCHAR
  978. rgba += wgap;
  979. }
  980. }
  981. static int from_gray2rgb(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
  982. {
  983. m.create(w, h, 3, 4u, allocator);
  984. if (m.empty())
  985. return -100;
  986. const int wgap = stride - w;
  987. if (wgap == 0)
  988. {
  989. w = w * h;
  990. h = 1;
  991. }
  992. float* ptr0 = m.channel(0);
  993. float* ptr1 = m.channel(1);
  994. float* ptr2 = m.channel(2);
  995. for (int y=0; y<h; y++)
  996. {
  997. #if __ARM_NEON
  998. int nn = w >> 4;
  999. int remain = w - (nn << 4);
  1000. #else
  1001. int remain = w;
  1002. #endif // __ARM_NEON
  1003. #if __ARM_NEON
  1004. #if __aarch64__
  1005. for (; nn>0; nn--)
  1006. {
  1007. uint8x16_t _gray = vld1q_u8(gray);
  1008. uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
  1009. uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
  1010. float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
  1011. float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
  1012. float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
  1013. float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
  1014. vst1q_f32(ptr0, _graylow_0);
  1015. vst1q_f32(ptr0+4, _grayhigh_0);
  1016. vst1q_f32(ptr0+8, _graylow_1);
  1017. vst1q_f32(ptr0+12, _grayhigh_1);
  1018. vst1q_f32(ptr1, _graylow_0);
  1019. vst1q_f32(ptr1+4, _grayhigh_0);
  1020. vst1q_f32(ptr1+8, _graylow_1);
  1021. vst1q_f32(ptr1+12, _grayhigh_1);
  1022. vst1q_f32(ptr2, _graylow_0);
  1023. vst1q_f32(ptr2+4, _grayhigh_0);
  1024. vst1q_f32(ptr2+8, _graylow_1);
  1025. vst1q_f32(ptr2+12, _grayhigh_1);
  1026. gray += 16;
  1027. ptr0 += 16;
  1028. ptr1 += 16;
  1029. ptr2 += 16;
  1030. }
  1031. #else
  1032. if (nn > 0)
  1033. {
  1034. asm volatile(
  1035. "0: \n"
  1036. "pld [%1, #128] \n"
  1037. "vld1.u8 {d0,d1}, [%1]! \n"
  1038. "vmovl.u8 q8, d0 \n"
  1039. "vmovl.u8 q9, d1 \n"
  1040. "vmovl.u16 q0, d16 \n"
  1041. "vmovl.u16 q1, d17 \n"
  1042. "vmovl.u16 q2, d18 \n"
  1043. "vmovl.u16 q3, d19 \n"
  1044. "vcvt.f32.u32 q0, q0 \n"
  1045. "vcvt.f32.u32 q1, q1 \n"
  1046. "vcvt.f32.u32 q2, q2 \n"
  1047. "vcvt.f32.u32 q3, q3 \n"
  1048. "subs %0, #1 \n"
  1049. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  1050. "vst1.f32 {d4-d7}, [%2 :128]! \n"
  1051. "vst1.f32 {d0-d3}, [%3 :128]! \n"
  1052. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  1053. "vst1.f32 {d0-d3}, [%4 :128]! \n"
  1054. "vst1.f32 {d4-d7}, [%4 :128]! \n"
  1055. "bne 0b \n"
  1056. : "=r"(nn), // %0
  1057. "=r"(gray), // %1
  1058. "=r"(ptr0), // %2
  1059. "=r"(ptr1), // %3
  1060. "=r"(ptr2) // %4
  1061. : "0"(nn),
  1062. "1"(gray),
  1063. "2"(ptr0),
  1064. "3"(ptr1),
  1065. "4"(ptr2)
  1066. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
  1067. );
  1068. }
  1069. #endif // __aarch64__
  1070. #endif // __ARM_NEON
  1071. for (; remain>0; remain--)
  1072. {
  1073. *ptr0 = *gray;
  1074. *ptr1 = *gray;
  1075. *ptr2 = *gray;
  1076. gray++;
  1077. ptr0++;
  1078. ptr1++;
  1079. ptr2++;
  1080. }
  1081. gray += wgap;
  1082. }
  1083. return 0;
  1084. }
  1085. static int from_gray2rgba(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
  1086. {
  1087. m.create(w, h, 4, 4u, allocator);
  1088. if (m.empty())
  1089. return -100;
  1090. Mat rgb_channels = m.channel_range(0, 3);
  1091. from_gray2rgb(gray, w, h, stride, rgb_channels, allocator);
  1092. Mat alpha_channel = m.channel(3);
  1093. alpha_channel.fill(255.f);
  1094. return 0;
  1095. }
  1096. static void to_gray2rgba(const Mat& m, unsigned char* rgba, int stride)
  1097. {
  1098. int w = m.w;
  1099. int h = m.h;
  1100. const int wgap = stride - w * 4;
  1101. if (wgap == 0)
  1102. {
  1103. w = w * h;
  1104. h = 1;
  1105. }
  1106. const float* ptr = m;
  1107. for (int y=0; y<h; y++)
  1108. {
  1109. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  1110. #if __ARM_NEON
  1111. int nn = w >> 3;
  1112. int remain = w - (nn << 3);
  1113. #else
  1114. int remain = w;
  1115. #endif // __ARM_NEON
  1116. #if __ARM_NEON
  1117. uint8x8_t _a = vdup_n_u8(255);
  1118. for (; nn>0; nn--)
  1119. {
  1120. float32x4_t _glow = vld1q_f32(ptr);
  1121. float32x4_t _ghigh = vld1q_f32(ptr+4);
  1122. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  1123. uint8x8_t _gray = vqmovun_s16(_g16);
  1124. uint8x8x4_t _rgba;
  1125. _rgba.val[0] = _gray;
  1126. _rgba.val[1] = _gray;
  1127. _rgba.val[2] = _gray;
  1128. _rgba.val[3] = _a;
  1129. vst4_u8(rgba, _rgba);
  1130. rgba += 4*8;
  1131. ptr += 8;
  1132. }
  1133. #endif // __ARM_NEON
  1134. for (; remain>0; remain--)
  1135. {
  1136. unsigned char gray = SATURATE_CAST_UCHAR(*ptr);
  1137. rgba[0] = gray;
  1138. rgba[1] = gray;
  1139. rgba[2] = gray;
  1140. rgba[3] = 255;
  1141. rgba += 4;
  1142. ptr++;
  1143. }
  1144. #undef SATURATE_CAST_UCHAR
  1145. rgba += wgap;
  1146. }
  1147. }
  1148. static int from_rgba2rgb(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  1149. {
  1150. m.create(w, h, 3, 4u, allocator);
  1151. if (m.empty())
  1152. return -100;
  1153. const int wgap = stride - w * 4;
  1154. if (wgap == 0)
  1155. {
  1156. w = w * h;
  1157. h = 1;
  1158. }
  1159. float* ptr0 = m.channel(0);
  1160. float* ptr1 = m.channel(1);
  1161. float* ptr2 = m.channel(2);
  1162. for (int y=0; y<h; y++)
  1163. {
  1164. #if __ARM_NEON
  1165. int nn = w >> 3;
  1166. int remain = w - (nn << 3);
  1167. #else
  1168. int remain = w;
  1169. #endif // __ARM_NEON
  1170. #if __ARM_NEON
  1171. #if __aarch64__
  1172. for (; nn>0; nn--)
  1173. {
  1174. uint8x8x4_t _rgba = vld4_u8(rgba);
  1175. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  1176. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  1177. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  1178. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  1179. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  1180. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  1181. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  1182. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  1183. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  1184. vst1q_f32(ptr0, _rlow);
  1185. vst1q_f32(ptr0+4, _rhigh);
  1186. vst1q_f32(ptr1, _glow);
  1187. vst1q_f32(ptr1+4, _ghigh);
  1188. vst1q_f32(ptr2, _blow);
  1189. vst1q_f32(ptr2+4, _bhigh);
  1190. rgba += 4*8;
  1191. ptr0 += 8;
  1192. ptr1 += 8;
  1193. ptr2 += 8;
  1194. }
  1195. #else
  1196. if (nn > 0)
  1197. {
  1198. asm volatile(
  1199. "0: \n"
  1200. "pld [%1, #256] \n"
  1201. "vld4.u8 {d0-d3}, [%1]! \n"
  1202. "vmovl.u8 q8, d0 \n"
  1203. "vmovl.u8 q9, d1 \n"
  1204. "vmovl.u8 q10, d2 \n"
  1205. "vmovl.u16 q0, d16 \n"
  1206. "vmovl.u16 q1, d17 \n"
  1207. "vmovl.u16 q2, d18 \n"
  1208. "vmovl.u16 q3, d19 \n"
  1209. "vmovl.u16 q8, d20 \n"
  1210. "vmovl.u16 q9, d21 \n"
  1211. "vcvt.f32.u32 q0, q0 \n"
  1212. "vcvt.f32.u32 q1, q1 \n"
  1213. "vcvt.f32.u32 q2, q2 \n"
  1214. "vcvt.f32.u32 q3, q3 \n"
  1215. "vcvt.f32.u32 q8, q8 \n"
  1216. "subs %0, #1 \n"
  1217. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  1218. "vcvt.f32.u32 q9, q9 \n"
  1219. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  1220. "vst1.f32 {d16-d19}, [%4 :128]!\n"
  1221. "bne 0b \n"
  1222. : "=r"(nn), // %0
  1223. "=r"(rgba), // %1
  1224. "=r"(ptr0), // %2
  1225. "=r"(ptr1), // %3
  1226. "=r"(ptr2) // %4
  1227. : "0"(nn),
  1228. "1"(rgba),
  1229. "2"(ptr0),
  1230. "3"(ptr1),
  1231. "4"(ptr2)
  1232. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
  1233. );
  1234. }
  1235. #endif // __aarch64__
  1236. #endif // __ARM_NEON
  1237. for (; remain>0; remain--)
  1238. {
  1239. *ptr0 = rgba[0];
  1240. *ptr1 = rgba[1];
  1241. *ptr2 = rgba[2];
  1242. rgba += 4;
  1243. ptr0++;
  1244. ptr1++;
  1245. ptr2++;
  1246. }
  1247. rgba += wgap;
  1248. }
  1249. return 0;
  1250. }
  1251. static int from_rgba2bgr(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  1252. {
  1253. m.create(w, h, 3, 4u, allocator);
  1254. if (m.empty())
  1255. return -100;
  1256. const int wgap = stride - w * 4;
  1257. if (wgap == 0)
  1258. {
  1259. w = w * h;
  1260. h = 1;
  1261. }
  1262. float* ptr0 = m.channel(0);
  1263. float* ptr1 = m.channel(1);
  1264. float* ptr2 = m.channel(2);
  1265. for (int y=0; y<h; y++)
  1266. {
  1267. #if __ARM_NEON
  1268. int nn = w >> 3;
  1269. int remain = w - (nn << 3);
  1270. #else
  1271. int remain = w;
  1272. #endif // __ARM_NEON
  1273. #if __ARM_NEON
  1274. #if __aarch64__
  1275. for (; nn>0; nn--)
  1276. {
  1277. uint8x8x4_t _rgba = vld4_u8(rgba);
  1278. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  1279. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  1280. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  1281. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  1282. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  1283. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  1284. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  1285. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  1286. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  1287. vst1q_f32(ptr2, _rlow);
  1288. vst1q_f32(ptr2+4, _rhigh);
  1289. vst1q_f32(ptr1, _glow);
  1290. vst1q_f32(ptr1+4, _ghigh);
  1291. vst1q_f32(ptr0, _blow);
  1292. vst1q_f32(ptr0+4, _bhigh);
  1293. rgba += 4*8;
  1294. ptr0 += 8;
  1295. ptr1 += 8;
  1296. ptr2 += 8;
  1297. }
  1298. #else
  1299. if (nn > 0)
  1300. {
  1301. asm volatile(
  1302. "0: \n"
  1303. "pld [%1, #256] \n"
  1304. "vld4.u8 {d0-d3}, [%1]! \n"
  1305. "vmovl.u8 q8, d0 \n"
  1306. "vmovl.u8 q9, d1 \n"
  1307. "vmovl.u8 q10, d2 \n"
  1308. "vmovl.u16 q0, d16 \n"
  1309. "vmovl.u16 q1, d17 \n"
  1310. "vmovl.u16 q2, d18 \n"
  1311. "vmovl.u16 q3, d19 \n"
  1312. "vmovl.u16 q8, d20 \n"
  1313. "vmovl.u16 q9, d21 \n"
  1314. "vcvt.f32.u32 q0, q0 \n"
  1315. "vcvt.f32.u32 q1, q1 \n"
  1316. "vcvt.f32.u32 q2, q2 \n"
  1317. "vcvt.f32.u32 q3, q3 \n"
  1318. "vcvt.f32.u32 q8, q8 \n"
  1319. "subs %0, #1 \n"
  1320. "vst1.f32 {d0-d3}, [%4 :128]! \n"
  1321. "vcvt.f32.u32 q9, q9 \n"
  1322. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  1323. "vst1.f32 {d16-d19}, [%2 :128]!\n"
  1324. "bne 0b \n"
  1325. : "=r"(nn), // %0
  1326. "=r"(rgba), // %1
  1327. "=r"(ptr0), // %2
  1328. "=r"(ptr1), // %3
  1329. "=r"(ptr2) // %4
  1330. : "0"(nn),
  1331. "1"(rgba),
  1332. "2"(ptr0),
  1333. "3"(ptr1),
  1334. "4"(ptr2)
  1335. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  1336. );
  1337. }
  1338. #endif // __aarch64__
  1339. #endif // __ARM_NEON
  1340. for (; remain>0; remain--)
  1341. {
  1342. *ptr0 = rgba[2];
  1343. *ptr1 = rgba[1];
  1344. *ptr2 = rgba[0];
  1345. rgba += 4;
  1346. ptr0++;
  1347. ptr1++;
  1348. ptr2++;
  1349. }
  1350. rgba += wgap;
  1351. }
  1352. return 0;
  1353. }
  1354. static int from_rgba2gray(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  1355. {
  1356. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  1357. const unsigned char Y_shift = 8;//14
  1358. const unsigned char R2Y = 77;
  1359. const unsigned char G2Y = 150;
  1360. const unsigned char B2Y = 29;
  1361. m.create(w, h, 1, 4u, allocator);
  1362. if (m.empty())
  1363. return -100;
  1364. const int wgap = stride - w * 4;
  1365. if (wgap == 0)
  1366. {
  1367. w = w * h;
  1368. h = 1;
  1369. }
  1370. float* ptr = m;
  1371. for (int y=0; y<h; y++)
  1372. {
  1373. #if __ARM_NEON
  1374. int nn = w >> 3;
  1375. int remain = w - (nn << 3);
  1376. #else
  1377. int remain = w;
  1378. #endif // __ARM_NEON
  1379. #if __ARM_NEON
  1380. #if __aarch64__
  1381. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  1382. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  1383. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  1384. for (; nn>0; nn--)
  1385. {
  1386. uint8x8x4_t _rgba = vld4_u8(rgba);
  1387. uint16x8_t _y16 = vmull_u8(_rgba.val[0], _R2Y);
  1388. _y16 = vmlal_u8(_y16, _rgba.val[1], _G2Y);
  1389. _y16 = vmlal_u8(_y16, _rgba.val[2], _B2Y);
  1390. _y16 = vshrq_n_u16(_y16, Y_shift);
  1391. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  1392. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  1393. vst1q_f32(ptr, _ylow);
  1394. vst1q_f32(ptr+4, _yhigh);
  1395. rgba += 4*8;
  1396. ptr += 8;
  1397. }
  1398. #else
  1399. if (nn > 0)
  1400. {
  1401. asm volatile(
  1402. "vdup.u8 d16, %6 \n"
  1403. "vdup.u8 d17, %7 \n"
  1404. "vdup.u8 d18, %8 \n"
  1405. "0: \n"
  1406. "pld [%1, #256] \n"
  1407. "vld4.u8 {d0-d3}, [%1]! \n"
  1408. "vmull.u8 q2, d0, d16 \n"
  1409. "vmlal.u8 q2, d1, d17 \n"
  1410. "vmlal.u8 q2, d2, d18 \n"
  1411. "vshr.u16 q2, q2, #8 \n" // Y_shift
  1412. "vmovl.u16 q0, d4 \n"
  1413. "vmovl.u16 q1, d5 \n"
  1414. "vcvt.f32.u32 q0, q0 \n"
  1415. "vcvt.f32.u32 q1, q1 \n"
  1416. "subs %0, #1 \n"
  1417. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  1418. "bne 0b \n"
  1419. : "=r"(nn), // %0
  1420. "=r"(rgba), // %1
  1421. "=r"(ptr) // %2
  1422. : "0"(nn),
  1423. "1"(rgba),
  1424. "2"(ptr),
  1425. "r"(R2Y), // %6
  1426. "r"(G2Y), // %7
  1427. "r"(B2Y) // %8
  1428. : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
  1429. );
  1430. }
  1431. #endif // __aarch64__
  1432. #endif // __ARM_NEON
  1433. for (; remain>0; remain--)
  1434. {
  1435. *ptr = static_cast<float>((rgba[0] * R2Y + rgba[1] * G2Y + rgba[2] * B2Y) >> Y_shift);
  1436. rgba += 4;
  1437. ptr++;
  1438. }
  1439. rgba += wgap;
  1440. }
  1441. return 0;
  1442. }
  1443. static int from_rgba2bgra(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  1444. {
  1445. m.create(w, h, 4, 4u, allocator);
  1446. if (m.empty())
  1447. return -100;
  1448. const int wgap = stride - w * 4;
  1449. if (wgap == 0)
  1450. {
  1451. w = w * h;
  1452. h = 1;
  1453. }
  1454. float* ptr0 = m.channel(0);
  1455. float* ptr1 = m.channel(1);
  1456. float* ptr2 = m.channel(2);
  1457. float* ptr3 = m.channel(3);
  1458. for (int y=0; y<h; y++)
  1459. {
  1460. #if __ARM_NEON
  1461. int nn = w >> 3;
  1462. int remain = w - (nn << 3);
  1463. #else
  1464. int remain = w;
  1465. #endif // __ARM_NEON
  1466. #if __ARM_NEON
  1467. #if __aarch64__
  1468. for (; nn>0; nn--)
  1469. {
  1470. uint8x8x4_t _rgba = vld4_u8(rgba);
  1471. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  1472. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  1473. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  1474. int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));
  1475. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  1476. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  1477. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  1478. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  1479. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  1480. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  1481. float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
  1482. float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));
  1483. vst1q_f32(ptr2, _rlow);
  1484. vst1q_f32(ptr2+4, _rhigh);
  1485. vst1q_f32(ptr1, _glow);
  1486. vst1q_f32(ptr1+4, _ghigh);
  1487. vst1q_f32(ptr0, _blow);
  1488. vst1q_f32(ptr0+4, _bhigh);
  1489. vst1q_f32(ptr3, _alow);
  1490. vst1q_f32(ptr3+4, _ahigh);
  1491. rgba += 4*8;
  1492. ptr0 += 8;
  1493. ptr1 += 8;
  1494. ptr2 += 8;
  1495. ptr3 += 8;
  1496. }
  1497. #else
  1498. if (nn > 0)
  1499. {
  1500. asm volatile(
  1501. "0: \n"
  1502. "pld [%1, #256] \n"
  1503. "vld4.u8 {d0-d3}, [%1]! \n"
  1504. "vmovl.u8 q8, d0 \n"
  1505. "vmovl.u8 q9, d1 \n"
  1506. "vmovl.u8 q10, d2 \n"
  1507. "vmovl.u8 q11, d3 \n"
  1508. "vmovl.u16 q0, d16 \n"
  1509. "vmovl.u16 q1, d17 \n"
  1510. "vmovl.u16 q2, d18 \n"
  1511. "vmovl.u16 q3, d19 \n"
  1512. "vmovl.u16 q8, d20 \n"
  1513. "vmovl.u16 q9, d21 \n"
  1514. "vmovl.u16 q10, d22 \n"
  1515. "vmovl.u16 q11, d23 \n"
  1516. "vcvt.f32.u32 q0, q0 \n"
  1517. "vcvt.f32.u32 q1, q1 \n"
  1518. "vcvt.f32.u32 q2, q2 \n"
  1519. "vcvt.f32.u32 q3, q3 \n"
  1520. "vcvt.f32.u32 q8, q8 \n"
  1521. "subs %0, #1 \n"
  1522. "vst1.f32 {d0-d3}, [%4 :128]! \n"
  1523. "vcvt.f32.u32 q9, q9 \n"
  1524. "vcvt.f32.u32 q10, q10 \n"
  1525. "vst1.f32 {d4-d7}, [%3 :128]! \n"
  1526. "vcvt.f32.u32 q11, q11 \n"
  1527. "vst1.f32 {d16-d19}, [%2 :128]!\n"
  1528. "vst1.f32 {d20-d23}, [%5 :128]!\n"
  1529. "bne 0b \n"
  1530. : "=r"(nn), // %0
  1531. "=r"(rgba), // %1
  1532. "=r"(ptr0), // %2
  1533. "=r"(ptr1), // %3
  1534. "=r"(ptr2), // %4
  1535. "=r"(ptr3) // %5
  1536. : "0"(nn),
  1537. "1"(rgba),
  1538. "2"(ptr0),
  1539. "3"(ptr1),
  1540. "4"(ptr2),
  1541. "5"(ptr3)
  1542. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  1543. );
  1544. }
  1545. #endif // __aarch64__
  1546. #endif // __ARM_NEON
  1547. for (; remain>0; remain--)
  1548. {
  1549. *ptr0 = rgba[2];
  1550. *ptr1 = rgba[1];
  1551. *ptr2 = rgba[0];
  1552. *ptr3 = rgba[3];
  1553. rgba += 4;
  1554. ptr0++;
  1555. ptr1++;
  1556. ptr2++;
  1557. ptr3++;
  1558. }
  1559. rgba += wgap;
  1560. }
  1561. return 0;
  1562. }
  1563. static void to_rgba2bgra(const Mat& m, unsigned char* bgra, int stride)
  1564. {
  1565. int w = m.w;
  1566. int h = m.h;
  1567. const int wgap = stride - w * 4;
  1568. if (wgap == 0)
  1569. {
  1570. w = w * h;
  1571. h = 1;
  1572. }
  1573. const float* ptr0 = m.channel(0);
  1574. const float* ptr1 = m.channel(1);
  1575. const float* ptr2 = m.channel(2);
  1576. const float* ptr3 = m.channel(3);
  1577. for (int y=0; y<h; y++)
  1578. {
  1579. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  1580. #if __ARM_NEON
  1581. int nn = w >> 3;
  1582. int remain = w - (nn << 3);
  1583. #else
  1584. int remain = w;
  1585. #endif // __ARM_NEON
  1586. #if __ARM_NEON
  1587. for (; nn>0; nn--)
  1588. {
  1589. float32x4_t _rlow = vld1q_f32(ptr0);
  1590. float32x4_t _rhigh = vld1q_f32(ptr0+4);
  1591. float32x4_t _glow = vld1q_f32(ptr1);
  1592. float32x4_t _ghigh = vld1q_f32(ptr1+4);
  1593. float32x4_t _blow = vld1q_f32(ptr2);
  1594. float32x4_t _bhigh = vld1q_f32(ptr2+4);
  1595. float32x4_t _alow = vld1q_f32(ptr3);
  1596. float32x4_t _ahigh = vld1q_f32(ptr3+4);
  1597. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  1598. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  1599. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  1600. int16x8_t _a16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_alow)), vmovn_s32(vcvtq_s32_f32(_ahigh)));
  1601. uint8x8x4_t _bgra;
  1602. _bgra.val[0] = vqmovun_s16(_b16);
  1603. _bgra.val[1] = vqmovun_s16(_g16);
  1604. _bgra.val[2] = vqmovun_s16(_r16);
  1605. _bgra.val[3] = vqmovun_s16(_a16);
  1606. vst4_u8(bgra, _bgra);
  1607. bgra += 4*8;
  1608. ptr0 += 8;
  1609. ptr1 += 8;
  1610. ptr2 += 8;
  1611. ptr3 += 8;
  1612. }
  1613. #endif // __ARM_NEON
  1614. for (; remain>0; remain--)
  1615. {
  1616. bgra[0] = SATURATE_CAST_UCHAR(*ptr2);
  1617. bgra[1] = SATURATE_CAST_UCHAR(*ptr1);
  1618. bgra[2] = SATURATE_CAST_UCHAR(*ptr0);
  1619. bgra[3] = SATURATE_CAST_UCHAR(*ptr3);
  1620. bgra += 4;
  1621. ptr0++;
  1622. ptr1++;
  1623. ptr2++;
  1624. ptr3++;
  1625. }
  1626. #undef SATURATE_CAST_UCHAR
  1627. bgra += wgap;
  1628. }
  1629. }
  1630. static int from_bgra2gray(const unsigned char* bgra, int w, int h, int stride, Mat& m, Allocator* allocator)
  1631. {
  1632. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  1633. const unsigned char Y_shift = 8;//14
  1634. const unsigned char R2Y = 77;
  1635. const unsigned char G2Y = 150;
  1636. const unsigned char B2Y = 29;
  1637. m.create(w, h, 1, 4u, allocator);
  1638. if (m.empty())
  1639. return -100;
  1640. const int wgap = stride - w * 4;
  1641. if (wgap == 0)
  1642. {
  1643. w = w * h;
  1644. h = 1;
  1645. }
  1646. float* ptr = m;
  1647. for (int y=0; y<h; y++)
  1648. {
  1649. #if __ARM_NEON
  1650. int nn = w >> 3;
  1651. int remain = w - (nn << 3);
  1652. #else
  1653. int remain = w;
  1654. #endif // __ARM_NEON
  1655. #if __ARM_NEON
  1656. #if __aarch64__
  1657. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  1658. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  1659. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  1660. for (; nn>0; nn--)
  1661. {
  1662. uint8x8x4_t _bgra = vld4_u8(bgra);
  1663. uint16x8_t _y16 = vmull_u8(_bgra.val[2], _R2Y);
  1664. _y16 = vmlal_u8(_y16, _bgra.val[1], _G2Y);
  1665. _y16 = vmlal_u8(_y16, _bgra.val[0], _B2Y);
  1666. _y16 = vshrq_n_u16(_y16, Y_shift);
  1667. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  1668. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  1669. vst1q_f32(ptr, _ylow);
  1670. vst1q_f32(ptr+4, _yhigh);
  1671. bgra += 4*8;
  1672. ptr += 8;
  1673. }
  1674. #else
  1675. if (nn > 0)
  1676. {
  1677. asm volatile(
  1678. "vdup.u8 d16, %6 \n"
  1679. "vdup.u8 d17, %7 \n"
  1680. "vdup.u8 d18, %8 \n"
  1681. "0: \n"
  1682. "pld [%1, #256] \n"
  1683. "vld4.u8 {d0-d3}, [%1]! \n"
  1684. "vmull.u8 q2, d2, d16 \n"
  1685. "vmlal.u8 q2, d1, d17 \n"
  1686. "vmlal.u8 q2, d0, d18 \n"
  1687. "vshr.u16 q2, q2, #8 \n" // Y_shift
  1688. "vmovl.u16 q0, d4 \n"
  1689. "vmovl.u16 q1, d5 \n"
  1690. "vcvt.f32.u32 q0, q0 \n"
  1691. "vcvt.f32.u32 q1, q1 \n"
  1692. "subs %0, #1 \n"
  1693. "vst1.f32 {d0-d3}, [%2 :128]! \n"
  1694. "bne 0b \n"
  1695. : "=r"(nn), // %0
  1696. "=r"(bgra), // %1
  1697. "=r"(ptr) // %2
  1698. : "0"(nn),
  1699. "1"(bgra),
  1700. "2"(ptr),
  1701. "r"(R2Y), // %6
  1702. "r"(G2Y), // %7
  1703. "r"(B2Y) // %8
  1704. : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
  1705. );
  1706. }
  1707. #endif // __aarch64__
  1708. #endif // __ARM_NEON
  1709. for (; remain>0; remain--)
  1710. {
  1711. *ptr = static_cast<float>((bgra[2] * R2Y + bgra[1] * G2Y + bgra[0] * B2Y) >> Y_shift);
  1712. bgra += 4;
  1713. ptr++;
  1714. }
  1715. bgra += wgap;
  1716. }
  1717. return 0;
  1718. }
  1719. void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
  1720. {
  1721. const unsigned char* yptr = yuv420sp;
  1722. const unsigned char* vuptr = yuv420sp + w * h;
  1723. #if __ARM_NEON
  1724. uint8x8_t _v128 = vdup_n_u8(128);
  1725. int8x8_t _v90 = vdup_n_s8(90);
  1726. int8x8_t _v46 = vdup_n_s8(46);
  1727. int8x8_t _v22 = vdup_n_s8(22);
  1728. int8x8_t _v113 = vdup_n_s8(113);
  1729. #endif // __ARM_NEON
  1730. for (int y=0; y<h; y+=2)
  1731. {
  1732. const unsigned char* yptr0 = yptr;
  1733. const unsigned char* yptr1 = yptr + w;
  1734. unsigned char* rgb0 = rgb;
  1735. unsigned char* rgb1 = rgb + w*3;
  1736. #if __ARM_NEON
  1737. int nn = w >> 3;
  1738. int remain = w - (nn << 3);
  1739. #else
  1740. int remain = w;
  1741. #endif // __ARM_NEON
  1742. #if __ARM_NEON
  1743. #if __aarch64__
  1744. for (; nn>0; nn--)
  1745. {
  1746. int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
  1747. int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));
  1748. int8x8_t _vvuu = vreinterpret_s8_u8(vsub_u8(vld1_u8(vuptr), _v128));
  1749. int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
  1750. int8x8_t _vv = _vvvvuuuu.val[0];
  1751. int8x8_t _uu = _vvvvuuuu.val[1];
  1752. int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
  1753. int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
  1754. _g0 = vmlsl_s8(_g0, _uu, _v22);
  1755. int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);
  1756. int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
  1757. int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
  1758. _g1 = vmlsl_s8(_g1, _uu, _v22);
  1759. int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
  1760. uint8x8x3_t _rgb0;
  1761. _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
  1762. _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
  1763. _rgb0.val[2] = vqshrun_n_s16(_b0, 6);
  1764. uint8x8x3_t _rgb1;
  1765. _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
  1766. _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
  1767. _rgb1.val[2] = vqshrun_n_s16(_b1, 6);
  1768. vst3_u8(rgb0, _rgb0);
  1769. vst3_u8(rgb1, _rgb1);
  1770. yptr0 += 8;
  1771. yptr1 += 8;
  1772. vuptr += 8;
  1773. rgb0 += 24;
  1774. rgb1 += 24;
  1775. }
  1776. #else
  1777. if (nn > 0)
  1778. {
  1779. asm volatile(
  1780. "pld [%3, #128] \n"
  1781. "vld1.u8 {d2}, [%3]! \n"
  1782. "vsub.s8 d2, d2, %12 \n"
  1783. "0: \n"
  1784. "pld [%1, #128] \n"
  1785. "vld1.u8 {d0}, [%1]! \n"
  1786. "pld [%2, #128] \n"
  1787. "vld1.u8 {d1}, [%2]! \n"
  1788. "vshll.u8 q2, d0, #6 \n"
  1789. "vorr d3, d2, d2 \n"
  1790. "vshll.u8 q3, d1, #6 \n"
  1791. "vorr q9, q2, q2 \n"
  1792. "vtrn.s8 d2, d3 \n"
  1793. "vorr q11, q3, q3 \n"
  1794. "vmlsl.s8 q9, d2, %14 \n"
  1795. "vorr q8, q2, q2 \n"
  1796. "vmlsl.s8 q11, d2, %14 \n"
  1797. "vorr q10, q3, q3 \n"
  1798. "vmlal.s8 q8, d2, %13 \n"
  1799. "vmlal.s8 q2, d3, %16 \n"
  1800. "vmlal.s8 q10, d2, %13 \n"
  1801. "vmlsl.s8 q9, d3, %15 \n"
  1802. "vmlal.s8 q3, d3, %16 \n"
  1803. "vmlsl.s8 q11, d3, %15 \n"
  1804. "vqshrun.s16 d24, q8, #6 \n"
  1805. "vqshrun.s16 d26, q2, #6 \n"
  1806. "vqshrun.s16 d4, q10, #6 \n"
  1807. "vqshrun.s16 d25, q9, #6 \n"
  1808. "vqshrun.s16 d6, q3, #6 \n"
  1809. "vqshrun.s16 d5, q11, #6 \n"
  1810. "pld [%3, #128] \n"
  1811. "vld1.u8 {d2}, [%3]! \n"
  1812. "subs %0, #1 \n"
  1813. "vst3.u8 {d24-d26}, [%4]! \n"
  1814. "vsub.s8 d2, d2, %12 \n"
  1815. "vst3.u8 {d4-d6}, [%5]! \n"
  1816. "bne 0b \n"
  1817. "sub %3, #8 \n"
  1818. : "=r"(nn), // %0
  1819. "=r"(yptr0), // %1
  1820. "=r"(yptr1), // %2
  1821. "=r"(vuptr), // %3
  1822. "=r"(rgb0), // %4
  1823. "=r"(rgb1) // %5
  1824. : "0"(nn),
  1825. "1"(yptr0),
  1826. "2"(yptr1),
  1827. "3"(vuptr),
  1828. "4"(rgb0),
  1829. "5"(rgb1),
  1830. "w"(_v128), // %12
  1831. "w"(_v90), // %13
  1832. "w"(_v46), // %14
  1833. "w"(_v22), // %15
  1834. "w"(_v113) // %16
  1835. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26"
  1836. );
  1837. }
  1838. #endif // __aarch64__
  1839. #endif // __ARM_NEON
  1840. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  1841. for (; remain>0; remain-=2)
  1842. {
  1843. // R = 1.164 * yy + 1.596 * vv
  1844. // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
  1845. // B = 1.164 * yy + 2.018 * uu
  1846. // R = Y + (1.370705 * (V-128))
  1847. // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
  1848. // B = Y + (1.732446 * (U-128))
  1849. // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
  1850. // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
  1851. // B = ((Y << 6) + 110.876544 * (U-128)) >> 6
  1852. // R = ((Y << 6) + 90 * (V-128)) >> 6
  1853. // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
  1854. // B = ((Y << 6) + 113 * (U-128)) >> 6
  1855. // R = (yy + 90 * vv) >> 6
  1856. // G = (yy - 46 * vv - 22 * uu) >> 6
  1857. // B = (yy + 113 * uu) >> 6
  1858. int v = vuptr[0] - 128;
  1859. int u = vuptr[1] - 128;
  1860. int ruv = 90 * v;
  1861. int guv = -46 * v + -22 * u;
  1862. int buv = 113 * u;
  1863. int y00 = yptr0[0] << 6;
  1864. rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
  1865. rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
  1866. rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);
  1867. int y01 = yptr0[1] << 6;
  1868. rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
  1869. rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
  1870. rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);
  1871. int y10 = yptr1[0] << 6;
  1872. rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
  1873. rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
  1874. rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);
  1875. int y11 = yptr1[1] << 6;
  1876. rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
  1877. rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
  1878. rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);
  1879. yptr0 += 2;
  1880. yptr1 += 2;
  1881. vuptr += 2;
  1882. rgb0 += 6;
  1883. rgb1 += 6;
  1884. }
  1885. #undef SATURATE_CAST_UCHAR
  1886. yptr += 2*w;
  1887. rgb += 2*3*w;
  1888. }
  1889. }
  1890. Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator)
  1891. {
  1892. int type_from = type & PIXEL_FORMAT_MASK;
  1893. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  1894. {
  1895. return Mat::from_pixels(pixels, type, w, h, w * 3, allocator);
  1896. }
  1897. else if (type_from == PIXEL_GRAY)
  1898. {
  1899. return Mat::from_pixels(pixels, type, w, h, w * 1, allocator);
  1900. }
  1901. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  1902. {
  1903. return Mat::from_pixels(pixels, type, w, h, w * 4, allocator);
  1904. }
  1905. // unknown convert type
  1906. return Mat();
  1907. }
  1908. Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator)
  1909. {
  1910. Mat m;
  1911. if (type & PIXEL_CONVERT_MASK)
  1912. {
  1913. switch (type)
  1914. {
  1915. case PIXEL_RGB2BGR:
  1916. case PIXEL_BGR2RGB:
  1917. from_rgb2bgr(pixels, w, h, stride, m, allocator);
  1918. break;
  1919. case PIXEL_RGB2GRAY:
  1920. from_rgb2gray(pixels, w, h, stride, m, allocator);
  1921. break;
  1922. case PIXEL_RGB2RGBA:
  1923. case PIXEL_BGR2BGRA:
  1924. from_rgb2rgba(pixels, w, h, stride, m, allocator);
  1925. break;
  1926. case PIXEL_BGR2GRAY:
  1927. from_bgr2gray(pixels, w, h, stride, m, allocator);
  1928. break;
  1929. case PIXEL_BGR2RGBA:
  1930. case PIXEL_RGB2BGRA:
  1931. from_bgr2rgba(pixels, w, h, stride, m, allocator);
  1932. break;
  1933. case PIXEL_GRAY2RGB:
  1934. case PIXEL_GRAY2BGR:
  1935. from_gray2rgb(pixels, w, h, stride, m, allocator);
  1936. break;
  1937. case PIXEL_GRAY2RGBA:
  1938. case PIXEL_GRAY2BGRA:
  1939. from_gray2rgba(pixels, w, h, stride, m, allocator);
  1940. break;
  1941. case PIXEL_RGBA2RGB:
  1942. case PIXEL_BGRA2BGR:
  1943. from_rgba2rgb(pixels, w, h, stride, m, allocator);
  1944. break;
  1945. case PIXEL_RGBA2BGR:
  1946. case PIXEL_BGRA2RGB:
  1947. from_rgba2bgr(pixels, w, h, stride, m, allocator);
  1948. break;
  1949. case PIXEL_RGBA2GRAY:
  1950. from_rgba2gray(pixels, w, h, stride, m, allocator);
  1951. break;
  1952. case PIXEL_RGBA2BGRA:
  1953. case PIXEL_BGRA2RGBA:
  1954. from_rgba2bgra(pixels, w, h, stride, m, allocator);
  1955. break;
  1956. case PIXEL_BGRA2GRAY:
  1957. from_bgra2gray(pixels, w, h, stride, m, allocator);
  1958. break;
  1959. default:
  1960. // unimplemented convert type
  1961. break;
  1962. }
  1963. }
  1964. else
  1965. {
  1966. if (type == PIXEL_RGB || type == PIXEL_BGR)
  1967. from_rgb(pixels, w, h, stride, m, allocator);
  1968. if (type == PIXEL_GRAY)
  1969. from_gray(pixels, w, h, stride, m, allocator);
  1970. if (type == PIXEL_RGBA || type == PIXEL_BGRA)
  1971. from_rgba(pixels, w, h, stride, m, allocator);
  1972. }
  1973. return m;
  1974. }
  1975. Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator)
  1976. {
  1977. int type_from = type & PIXEL_FORMAT_MASK;
  1978. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  1979. {
  1980. return Mat::from_pixels_resize(pixels, type, w, h, w * 3, target_width, target_height, allocator);
  1981. }
  1982. else if (type_from == PIXEL_GRAY)
  1983. {
  1984. return Mat::from_pixels_resize(pixels, type, w, h, w * 1, target_width, target_height, allocator);
  1985. }
  1986. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  1987. {
  1988. return Mat::from_pixels_resize(pixels, type, w, h, w * 4, target_width, target_height, allocator);
  1989. }
  1990. // unknown convert type
  1991. return Mat();
  1992. }
  1993. Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator)
  1994. {
  1995. if (w == target_width && h == target_height)
  1996. return Mat::from_pixels(pixels, type, w, h, stride, allocator);
  1997. int type_from = type & PIXEL_FORMAT_MASK;
  1998. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  1999. {
  2000. Mat dst(target_width, target_height, (size_t)3u, 3);
  2001. resize_bilinear_c3(pixels, w, h, stride, dst, target_width, target_height, target_width * 3);
  2002. return Mat::from_pixels(dst, type, target_width, target_height, allocator);
  2003. }
  2004. else if (type_from == PIXEL_GRAY)
  2005. {
  2006. Mat dst(target_width, target_height, (size_t)1u, 1);
  2007. resize_bilinear_c1(pixels, w, h, stride, dst, target_width, target_height, target_width * 1);
  2008. return Mat::from_pixels(dst, type, target_width, target_height, allocator);
  2009. }
  2010. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  2011. {
  2012. Mat dst(target_width, target_height, (size_t)4u, 4);
  2013. resize_bilinear_c4(pixels, w, h, stride, dst, target_width, target_height, target_width * 4);
  2014. return Mat::from_pixels(dst, type, target_width, target_height, allocator);
  2015. }
  2016. // unknown convert type
  2017. return Mat();
  2018. }
  2019. void Mat::to_pixels(unsigned char* pixels, int type) const
  2020. {
  2021. int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
  2022. if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
  2023. {
  2024. to_pixels(pixels, type, w * 3);
  2025. }
  2026. else if (type_to == PIXEL_GRAY)
  2027. {
  2028. to_pixels(pixels, type, w * 1);
  2029. }
  2030. else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
  2031. {
  2032. to_pixels(pixels, type, w * 4);
  2033. }
  2034. }
  2035. void Mat::to_pixels(unsigned char* pixels, int type, int stride) const
  2036. {
  2037. if (type & PIXEL_CONVERT_MASK)
  2038. {
  2039. switch (type)
  2040. {
  2041. case PIXEL_RGB2BGR:
  2042. case PIXEL_BGR2RGB:
  2043. to_bgr2rgb(*this, pixels, stride);
  2044. break;
  2045. case PIXEL_RGB2RGBA:
  2046. case PIXEL_BGR2BGRA:
  2047. to_rgb2rgba(*this, pixels, stride);
  2048. break;
  2049. case PIXEL_BGR2RGBA:
  2050. case PIXEL_RGB2BGRA:
  2051. to_bgr2rgba(*this, pixels, stride);
  2052. break;
  2053. case PIXEL_GRAY2RGBA:
  2054. case PIXEL_GRAY2BGRA:
  2055. to_gray2rgba(*this, pixels, stride);
  2056. break;
  2057. case PIXEL_RGBA2BGRA:
  2058. case PIXEL_BGRA2RGBA:
  2059. to_rgba2bgra(*this, pixels, stride);
  2060. break;
  2061. default:
  2062. // unimplemented convert type
  2063. break;
  2064. }
  2065. }
  2066. else
  2067. {
  2068. if (type == PIXEL_RGB || type == PIXEL_BGR)
  2069. to_rgb(*this, pixels, stride);
  2070. if (type == PIXEL_GRAY)
  2071. to_gray(*this, pixels, stride);
  2072. if (type == PIXEL_RGBA || type == PIXEL_BGRA)
  2073. to_rgba(*this, pixels, stride);
  2074. }
  2075. }
  2076. void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const
  2077. {
  2078. int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
  2079. if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
  2080. {
  2081. to_pixels_resize(pixels, type, target_width, target_height, target_width * 3);
  2082. }
  2083. else if (type_to == PIXEL_GRAY)
  2084. {
  2085. to_pixels_resize(pixels, type, target_width, target_height, target_width * 1);
  2086. }
  2087. else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
  2088. {
  2089. to_pixels_resize(pixels, type, target_width, target_height, target_width * 4);
  2090. }
  2091. }
  2092. void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const
  2093. {
  2094. if (w == target_width && h == target_height)
  2095. return to_pixels(pixels, type);
  2096. int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
  2097. if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
  2098. {
  2099. Mat src(w, h, (size_t)3u, 3);
  2100. to_pixels(src, type);
  2101. resize_bilinear_c3(src, w, h, w * 3, pixels, target_width, target_height, target_stride);
  2102. }
  2103. else if (type_to == PIXEL_GRAY)
  2104. {
  2105. Mat src(w, h, (size_t)1u, 1);
  2106. to_pixels(src, type);
  2107. resize_bilinear_c1(src, w, h, w * 1, pixels, target_width, target_height, target_stride);
  2108. }
  2109. else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
  2110. {
  2111. Mat src(w, h, (size_t)4u, 4);
  2112. to_pixels(src, type);
  2113. resize_bilinear_c4(src, w, h, w * 4, pixels, target_width, target_height, target_stride);
  2114. }
  2115. }
  2116. #endif // NCNN_PIXEL
  2117. } // namespace ncnn