You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

mat_pixel.cpp 90 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
5 years ago
6 years ago
6 years ago
5 years ago
6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
5 years ago
6 years ago
6 years ago
6 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #include <limits.h>
  16. #include <math.h>
  17. #if __ARM_NEON
  18. #include <arm_neon.h>
  19. #endif // __ARM_NEON
  20. #include "platform.h"
  21. namespace ncnn {
  22. #if NCNN_PIXEL
  23. static int from_rgb(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
  24. {
  25. m.create(w, h, 3, 4u, allocator);
  26. if (m.empty())
  27. return -100;
  28. const int wgap = stride - w * 3;
  29. if (wgap == 0)
  30. {
  31. w = w * h;
  32. h = 1;
  33. }
  34. float* ptr0 = m.channel(0);
  35. float* ptr1 = m.channel(1);
  36. float* ptr2 = m.channel(2);
  37. for (int y = 0; y < h; y++)
  38. {
  39. #if __ARM_NEON
  40. int nn = w >> 3;
  41. int remain = w - (nn << 3);
  42. #else
  43. int remain = w;
  44. #endif // __ARM_NEON
  45. #if __ARM_NEON
  46. #if __aarch64__
  47. for (; nn > 0; nn--)
  48. {
  49. uint8x8x3_t _rgb = vld3_u8(rgb);
  50. uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
  51. uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
  52. uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
  53. float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
  54. float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
  55. float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
  56. float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
  57. float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
  58. float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
  59. vst1q_f32(ptr0, _rlow);
  60. vst1q_f32(ptr0 + 4, _rhigh);
  61. vst1q_f32(ptr1, _glow);
  62. vst1q_f32(ptr1 + 4, _ghigh);
  63. vst1q_f32(ptr2, _blow);
  64. vst1q_f32(ptr2 + 4, _bhigh);
  65. rgb += 3 * 8;
  66. ptr0 += 8;
  67. ptr1 += 8;
  68. ptr2 += 8;
  69. }
  70. #else
  71. if (nn > 0)
  72. {
  73. asm volatile(
  74. "0: \n"
  75. "pld [%1, #256] \n"
  76. "vld3.u8 {d0-d2}, [%1]! \n"
  77. "vmovl.u8 q8, d0 \n"
  78. "vmovl.u8 q9, d1 \n"
  79. "vmovl.u8 q10, d2 \n"
  80. "vmovl.u16 q0, d16 \n"
  81. "vmovl.u16 q1, d17 \n"
  82. "vmovl.u16 q2, d18 \n"
  83. "vmovl.u16 q3, d19 \n"
  84. "vmovl.u16 q8, d20 \n"
  85. "vmovl.u16 q9, d21 \n"
  86. "vcvt.f32.u32 q0, q0 \n"
  87. "vcvt.f32.u32 q1, q1 \n"
  88. "vcvt.f32.u32 q2, q2 \n"
  89. "vcvt.f32.u32 q3, q3 \n"
  90. "vcvt.f32.u32 q8, q8 \n"
  91. "subs %0, #1 \n"
  92. "vst1.f32 {d0-d3}, [%2]! \n"
  93. "vcvt.f32.u32 q9, q9 \n"
  94. "vst1.f32 {d4-d7}, [%3]! \n"
  95. "vst1.f32 {d16-d19}, [%4]! \n"
  96. "bne 0b \n"
  97. : "=r"(nn), // %0
  98. "=r"(rgb), // %1
  99. "=r"(ptr0), // %2
  100. "=r"(ptr1), // %3
  101. "=r"(ptr2) // %4
  102. : "0"(nn),
  103. "1"(rgb),
  104. "2"(ptr0),
  105. "3"(ptr1),
  106. "4"(ptr2)
  107. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
  108. }
  109. #endif // __aarch64__
  110. #endif // __ARM_NEON
  111. for (; remain > 0; remain--)
  112. {
  113. *ptr0 = rgb[0];
  114. *ptr1 = rgb[1];
  115. *ptr2 = rgb[2];
  116. rgb += 3;
  117. ptr0++;
  118. ptr1++;
  119. ptr2++;
  120. }
  121. rgb += wgap;
  122. }
  123. return 0;
  124. }
  125. static void to_rgb(const Mat& m, unsigned char* rgb, int stride)
  126. {
  127. int w = m.w;
  128. int h = m.h;
  129. const int wgap = stride - w * 3;
  130. if (wgap == 0)
  131. {
  132. w = w * h;
  133. h = 1;
  134. }
  135. const float* ptr0 = m.channel(0);
  136. const float* ptr1 = m.channel(1);
  137. const float* ptr2 = m.channel(2);
  138. for (int y = 0; y < h; y++)
  139. {
  140. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  141. #if __ARM_NEON
  142. int nn = w >> 3;
  143. int remain = w - (nn << 3);
  144. #else
  145. int remain = w;
  146. #endif // __ARM_NEON
  147. #if __ARM_NEON
  148. for (; nn > 0; nn--)
  149. {
  150. float32x4_t _rlow = vld1q_f32(ptr0);
  151. float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
  152. float32x4_t _glow = vld1q_f32(ptr1);
  153. float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
  154. float32x4_t _blow = vld1q_f32(ptr2);
  155. float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
  156. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  157. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  158. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  159. uint8x8x3_t _rgb;
  160. _rgb.val[0] = vqmovun_s16(_r16);
  161. _rgb.val[1] = vqmovun_s16(_g16);
  162. _rgb.val[2] = vqmovun_s16(_b16);
  163. vst3_u8(rgb, _rgb);
  164. rgb += 3 * 8;
  165. ptr0 += 8;
  166. ptr1 += 8;
  167. ptr2 += 8;
  168. }
  169. #endif // __ARM_NEON
  170. for (; remain > 0; remain--)
  171. {
  172. rgb[0] = SATURATE_CAST_UCHAR(*ptr0);
  173. rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
  174. rgb[2] = SATURATE_CAST_UCHAR(*ptr2);
  175. rgb += 3;
  176. ptr0++;
  177. ptr1++;
  178. ptr2++;
  179. }
  180. #undef SATURATE_CAST_UCHAR
  181. rgb += wgap;
  182. }
  183. }
  184. static int from_gray(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
  185. {
  186. m.create(w, h, 1, 4u, allocator);
  187. if (m.empty())
  188. return -100;
  189. const int wgap = stride - w;
  190. if (wgap == 0)
  191. {
  192. w = w * h;
  193. h = 1;
  194. }
  195. float* ptr = m;
  196. for (int y = 0; y < h; y++)
  197. {
  198. #if __ARM_NEON
  199. int nn = w >> 4;
  200. int remain = w - (nn << 4);
  201. #else
  202. int remain = w;
  203. #endif // __ARM_NEON
  204. #if __ARM_NEON
  205. #if __aarch64__
  206. for (; nn > 0; nn--)
  207. {
  208. uint8x16_t _gray = vld1q_u8(gray);
  209. uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
  210. uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
  211. float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
  212. float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
  213. float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
  214. float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
  215. vst1q_f32(ptr, _graylow_0);
  216. vst1q_f32(ptr + 4, _grayhigh_0);
  217. vst1q_f32(ptr + 8, _graylow_1);
  218. vst1q_f32(ptr + 12, _grayhigh_1);
  219. gray += 16;
  220. ptr += 16;
  221. }
  222. #else
  223. if (nn > 0)
  224. {
  225. asm volatile(
  226. "0: \n"
  227. "pld [%1, #128] \n"
  228. "vld1.u8 {d0,d1}, [%1]! \n"
  229. "vmovl.u8 q8, d0 \n"
  230. "vmovl.u8 q9, d1 \n"
  231. "vmovl.u16 q0, d16 \n"
  232. "vmovl.u16 q1, d17 \n"
  233. "vmovl.u16 q2, d18 \n"
  234. "vmovl.u16 q3, d19 \n"
  235. "vcvt.f32.u32 q0, q0 \n"
  236. "vcvt.f32.u32 q1, q1 \n"
  237. "vcvt.f32.u32 q2, q2 \n"
  238. "vcvt.f32.u32 q3, q3 \n"
  239. "subs %0, #1 \n"
  240. "vst1.f32 {d0-d3}, [%2]! \n"
  241. "vst1.f32 {d4-d7}, [%2]! \n"
  242. "bne 0b \n"
  243. : "=r"(nn), // %0
  244. "=r"(gray), // %1
  245. "=r"(ptr) // %2
  246. : "0"(nn),
  247. "1"(gray),
  248. "2"(ptr)
  249. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
  250. }
  251. #endif // __aarch64__
  252. #endif // __ARM_NEON
  253. for (; remain > 0; remain--)
  254. {
  255. *ptr = *gray;
  256. gray++;
  257. ptr++;
  258. }
  259. gray += wgap;
  260. }
  261. return 0;
  262. }
  263. static void to_gray(const Mat& m, unsigned char* gray, int stride)
  264. {
  265. int w = m.w;
  266. int h = m.h;
  267. const int wgap = stride - w;
  268. if (wgap == 0)
  269. {
  270. w = w * h;
  271. h = 1;
  272. }
  273. const float* ptr = m;
  274. for (int y = 0; y < h; y++)
  275. {
  276. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  277. #if __ARM_NEON
  278. int nn = w >> 3;
  279. int remain = w - (nn << 3);
  280. #else
  281. int remain = w;
  282. #endif // __ARM_NEON
  283. #if __ARM_NEON
  284. for (; nn > 0; nn--)
  285. {
  286. float32x4_t _glow = vld1q_f32(ptr);
  287. float32x4_t _ghigh = vld1q_f32(ptr + 4);
  288. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  289. uint8x8_t _gray = vqmovun_s16(_g16);
  290. vst1_u8(gray, _gray);
  291. gray += 8;
  292. ptr += 8;
  293. }
  294. #endif // __ARM_NEON
  295. for (; remain > 0; remain--)
  296. {
  297. *gray = SATURATE_CAST_UCHAR(*ptr);
  298. gray++;
  299. ptr++;
  300. }
  301. #undef SATURATE_CAST_UCHAR
  302. gray += wgap;
  303. }
  304. }
  305. static int from_rgba(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  306. {
  307. m.create(w, h, 4, 4u, allocator);
  308. if (m.empty())
  309. return -100;
  310. const int wgap = stride - w * 4;
  311. if (wgap == 0)
  312. {
  313. w = w * h;
  314. h = 1;
  315. }
  316. float* ptr0 = m.channel(0);
  317. float* ptr1 = m.channel(1);
  318. float* ptr2 = m.channel(2);
  319. float* ptr3 = m.channel(3);
  320. for (int y = 0; y < h; y++)
  321. {
  322. #if __ARM_NEON
  323. int nn = w >> 3;
  324. int remain = w - (nn << 3);
  325. #else
  326. int remain = w;
  327. #endif // __ARM_NEON
  328. #if __ARM_NEON
  329. #if __aarch64__
  330. for (; nn > 0; nn--)
  331. {
  332. uint8x8x4_t _rgba = vld4_u8(rgba);
  333. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  334. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  335. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  336. int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));
  337. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  338. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  339. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  340. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  341. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  342. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  343. float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
  344. float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));
  345. vst1q_f32(ptr0, _rlow);
  346. vst1q_f32(ptr0 + 4, _rhigh);
  347. vst1q_f32(ptr1, _glow);
  348. vst1q_f32(ptr1 + 4, _ghigh);
  349. vst1q_f32(ptr2, _blow);
  350. vst1q_f32(ptr2 + 4, _bhigh);
  351. vst1q_f32(ptr3, _alow);
  352. vst1q_f32(ptr3 + 4, _ahigh);
  353. rgba += 4 * 8;
  354. ptr0 += 8;
  355. ptr1 += 8;
  356. ptr2 += 8;
  357. ptr3 += 8;
  358. }
  359. #else
  360. if (nn > 0)
  361. {
  362. asm volatile(
  363. "0: \n"
  364. "pld [%1, #256] \n"
  365. "vld4.u8 {d0-d3}, [%1]! \n"
  366. "vmovl.u8 q8, d0 \n"
  367. "vmovl.u8 q9, d1 \n"
  368. "vmovl.u8 q10, d2 \n"
  369. "vmovl.u8 q11, d3 \n"
  370. "vmovl.u16 q0, d16 \n"
  371. "vmovl.u16 q1, d17 \n"
  372. "vmovl.u16 q2, d18 \n"
  373. "vmovl.u16 q3, d19 \n"
  374. "vmovl.u16 q8, d20 \n"
  375. "vmovl.u16 q9, d21 \n"
  376. "vmovl.u16 q10, d22 \n"
  377. "vmovl.u16 q11, d23 \n"
  378. "vcvt.f32.u32 q0, q0 \n"
  379. "vcvt.f32.u32 q1, q1 \n"
  380. "vcvt.f32.u32 q2, q2 \n"
  381. "vcvt.f32.u32 q3, q3 \n"
  382. "vcvt.f32.u32 q8, q8 \n"
  383. "vcvt.f32.u32 q9, q9 \n"
  384. "subs %0, #1 \n"
  385. "vst1.f32 {d0-d3}, [%2]! \n"
  386. "vcvt.f32.u32 q10, q10 \n"
  387. "vcvt.f32.u32 q11, q11 \n"
  388. "vst1.f32 {d4-d7}, [%3]! \n"
  389. "vst1.f32 {d16-d19}, [%4]! \n"
  390. "vst1.f32 {d20-d23}, [%5]! \n"
  391. "bne 0b \n"
  392. : "=r"(nn), // %0
  393. "=r"(rgba), // %1
  394. "=r"(ptr0), // %2
  395. "=r"(ptr1), // %3
  396. "=r"(ptr2), // %4
  397. "=r"(ptr3) // %5
  398. : "0"(nn),
  399. "1"(rgba),
  400. "2"(ptr0),
  401. "3"(ptr1),
  402. "4"(ptr2),
  403. "5"(ptr3)
  404. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  405. }
  406. #endif // __aarch64__
  407. #endif // __ARM_NEON
  408. for (; remain > 0; remain--)
  409. {
  410. *ptr0 = rgba[0];
  411. *ptr1 = rgba[1];
  412. *ptr2 = rgba[2];
  413. *ptr3 = rgba[3];
  414. rgba += 4;
  415. ptr0++;
  416. ptr1++;
  417. ptr2++;
  418. ptr3++;
  419. }
  420. rgba += wgap;
  421. }
  422. return 0;
  423. }
  424. static void to_rgba(const Mat& m, unsigned char* rgba, int stride)
  425. {
  426. int w = m.w;
  427. int h = m.h;
  428. const int wgap = stride - w * 4;
  429. if (wgap == 0)
  430. {
  431. w = w * h;
  432. h = 1;
  433. }
  434. const float* ptr0 = m.channel(0);
  435. const float* ptr1 = m.channel(1);
  436. const float* ptr2 = m.channel(2);
  437. const float* ptr3 = m.channel(3);
  438. for (int y = 0; y < h; y++)
  439. {
  440. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  441. #if __ARM_NEON
  442. int nn = w >> 3;
  443. int remain = w - (nn << 3);
  444. #else
  445. int remain = w;
  446. #endif // __ARM_NEON
  447. #if __ARM_NEON
  448. for (; nn > 0; nn--)
  449. {
  450. float32x4_t _rlow = vld1q_f32(ptr0);
  451. float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
  452. float32x4_t _glow = vld1q_f32(ptr1);
  453. float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
  454. float32x4_t _blow = vld1q_f32(ptr2);
  455. float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
  456. float32x4_t _alow = vld1q_f32(ptr3);
  457. float32x4_t _ahigh = vld1q_f32(ptr3 + 4);
  458. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  459. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  460. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  461. int16x8_t _a16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_alow)), vmovn_s32(vcvtq_s32_f32(_ahigh)));
  462. uint8x8x4_t _rgba;
  463. _rgba.val[0] = vqmovun_s16(_r16);
  464. _rgba.val[1] = vqmovun_s16(_g16);
  465. _rgba.val[2] = vqmovun_s16(_b16);
  466. _rgba.val[3] = vqmovun_s16(_a16);
  467. vst4_u8(rgba, _rgba);
  468. rgba += 4 * 8;
  469. ptr0 += 8;
  470. ptr1 += 8;
  471. ptr2 += 8;
  472. ptr3 += 8;
  473. }
  474. #endif // __ARM_NEON
  475. for (; remain > 0; remain--)
  476. {
  477. rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
  478. rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
  479. rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
  480. rgba[3] = SATURATE_CAST_UCHAR(*ptr3);
  481. rgba += 4;
  482. ptr0++;
  483. ptr1++;
  484. ptr2++;
  485. ptr3++;
  486. }
  487. #undef SATURATE_CAST_UCHAR
  488. rgba += wgap;
  489. }
  490. }
  491. static int from_rgb2bgr(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
  492. {
  493. m.create(w, h, 3, 4u, allocator);
  494. if (m.empty())
  495. return -100;
  496. const int wgap = stride - w * 3;
  497. if (wgap == 0)
  498. {
  499. w = w * h;
  500. h = 1;
  501. }
  502. float* ptr0 = m.channel(0);
  503. float* ptr1 = m.channel(1);
  504. float* ptr2 = m.channel(2);
  505. for (int y = 0; y < h; y++)
  506. {
  507. #if __ARM_NEON
  508. int nn = w >> 3;
  509. int remain = w - (nn << 3);
  510. #else
  511. int remain = w;
  512. #endif // __ARM_NEON
  513. #if __ARM_NEON
  514. #if __aarch64__
  515. for (; nn > 0; nn--)
  516. {
  517. uint8x8x3_t _rgb = vld3_u8(rgb);
  518. uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
  519. uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
  520. uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
  521. float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
  522. float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
  523. float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
  524. float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
  525. float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
  526. float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
  527. vst1q_f32(ptr2, _rlow);
  528. vst1q_f32(ptr2 + 4, _rhigh);
  529. vst1q_f32(ptr1, _glow);
  530. vst1q_f32(ptr1 + 4, _ghigh);
  531. vst1q_f32(ptr0, _blow);
  532. vst1q_f32(ptr0 + 4, _bhigh);
  533. rgb += 3 * 8;
  534. ptr0 += 8;
  535. ptr1 += 8;
  536. ptr2 += 8;
  537. }
  538. #else
  539. if (nn > 0)
  540. {
  541. asm volatile(
  542. "0: \n"
  543. "pld [%1, #256] \n"
  544. "vld3.u8 {d0-d2}, [%1]! \n"
  545. "vmovl.u8 q8, d0 \n"
  546. "vmovl.u8 q9, d1 \n"
  547. "vmovl.u8 q10, d2 \n"
  548. "vmovl.u16 q0, d16 \n"
  549. "vmovl.u16 q1, d17 \n"
  550. "vmovl.u16 q2, d18 \n"
  551. "vmovl.u16 q3, d19 \n"
  552. "vmovl.u16 q8, d20 \n"
  553. "vmovl.u16 q9, d21 \n"
  554. "vcvt.f32.u32 q0, q0 \n"
  555. "vcvt.f32.u32 q1, q1 \n"
  556. "vcvt.f32.u32 q2, q2 \n"
  557. "vcvt.f32.u32 q3, q3 \n"
  558. "vcvt.f32.u32 q8, q8 \n"
  559. "subs %0, #1 \n"
  560. "vst1.f32 {d0-d3}, [%4]! \n"
  561. "vcvt.f32.u32 q9, q9 \n"
  562. "vst1.f32 {d4-d7}, [%3]! \n"
  563. "vst1.f32 {d16-d19}, [%2]! \n"
  564. "bne 0b \n"
  565. : "=r"(nn), // %0
  566. "=r"(rgb), // %1
  567. "=r"(ptr0), // %2
  568. "=r"(ptr1), // %3
  569. "=r"(ptr2) // %4
  570. : "0"(nn),
  571. "1"(rgb),
  572. "2"(ptr0),
  573. "3"(ptr1),
  574. "4"(ptr2)
  575. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
  576. }
  577. #endif // __aarch64__
  578. #endif // __ARM_NEON
  579. for (; remain > 0; remain--)
  580. {
  581. *ptr0 = rgb[2];
  582. *ptr1 = rgb[1];
  583. *ptr2 = rgb[0];
  584. rgb += 3;
  585. ptr0++;
  586. ptr1++;
  587. ptr2++;
  588. }
  589. rgb += wgap;
  590. }
  591. return 0;
  592. }
  593. static void to_bgr2rgb(const Mat& m, unsigned char* rgb, int stride)
  594. {
  595. int w = m.w;
  596. int h = m.h;
  597. const int wgap = stride - w * 3;
  598. if (wgap == 0)
  599. {
  600. w = w * h;
  601. h = 1;
  602. }
  603. const float* ptr0 = m.channel(0);
  604. const float* ptr1 = m.channel(1);
  605. const float* ptr2 = m.channel(2);
  606. for (int y = 0; y < h; y++)
  607. {
  608. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  609. #if __ARM_NEON
  610. int nn = w >> 3;
  611. int remain = w - (nn << 3);
  612. #else
  613. int remain = w;
  614. #endif // __ARM_NEON
  615. #if __ARM_NEON
  616. for (; nn > 0; nn--)
  617. {
  618. float32x4_t _rlow = vld1q_f32(ptr2);
  619. float32x4_t _rhigh = vld1q_f32(ptr2 + 4);
  620. float32x4_t _glow = vld1q_f32(ptr1);
  621. float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
  622. float32x4_t _blow = vld1q_f32(ptr0);
  623. float32x4_t _bhigh = vld1q_f32(ptr0 + 4);
  624. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  625. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  626. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  627. uint8x8x3_t _rgb;
  628. _rgb.val[0] = vqmovun_s16(_r16);
  629. _rgb.val[1] = vqmovun_s16(_g16);
  630. _rgb.val[2] = vqmovun_s16(_b16);
  631. vst3_u8(rgb, _rgb);
  632. rgb += 3 * 8;
  633. ptr0 += 8;
  634. ptr1 += 8;
  635. ptr2 += 8;
  636. }
  637. #endif // __ARM_NEON
  638. for (; remain > 0; remain--)
  639. {
  640. rgb[2] = SATURATE_CAST_UCHAR(*ptr0);
  641. rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
  642. rgb[0] = SATURATE_CAST_UCHAR(*ptr2);
  643. rgb += 3;
  644. ptr0++;
  645. ptr1++;
  646. ptr2++;
  647. }
  648. #undef SATURATE_CAST_UCHAR
  649. rgb += wgap;
  650. }
  651. }
  652. static int from_rgb2gray(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
  653. {
  654. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  655. const unsigned char Y_shift = 8; //14
  656. const unsigned char R2Y = 77;
  657. const unsigned char G2Y = 150;
  658. const unsigned char B2Y = 29;
  659. m.create(w, h, 1, 4u, allocator);
  660. if (m.empty())
  661. return -100;
  662. const int wgap = stride - w * 3;
  663. if (wgap == 0)
  664. {
  665. w = w * h;
  666. h = 1;
  667. }
  668. float* ptr = m;
  669. for (int y = 0; y < h; y++)
  670. {
  671. #if __ARM_NEON
  672. int nn = w >> 3;
  673. int remain = w - (nn << 3);
  674. #else
  675. int remain = w;
  676. #endif // __ARM_NEON
  677. #if __ARM_NEON
  678. #if __aarch64__
  679. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  680. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  681. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  682. for (; nn > 0; nn--)
  683. {
  684. uint8x8x3_t _rgb = vld3_u8(rgb);
  685. uint16x8_t _y16 = vmull_u8(_rgb.val[0], _R2Y);
  686. _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
  687. _y16 = vmlal_u8(_y16, _rgb.val[2], _B2Y);
  688. _y16 = vshrq_n_u16(_y16, Y_shift);
  689. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  690. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  691. vst1q_f32(ptr, _ylow);
  692. vst1q_f32(ptr + 4, _yhigh);
  693. rgb += 3 * 8;
  694. ptr += 8;
  695. }
  696. #else
  697. if (nn > 0)
  698. {
  699. asm volatile(
  700. "vdup.u8 d16, %6 \n"
  701. "vdup.u8 d17, %7 \n"
  702. "vdup.u8 d18, %8 \n"
  703. "0: \n"
  704. "pld [%1, #256] \n"
  705. "vld3.u8 {d0-d2}, [%1]! \n"
  706. "vmull.u8 q2, d0, d16 \n"
  707. "vmlal.u8 q2, d1, d17 \n"
  708. "vmlal.u8 q2, d2, d18 \n"
  709. "vshr.u16 q2, q2, #8 \n" // Y_shift
  710. "vmovl.u16 q0, d4 \n"
  711. "vmovl.u16 q1, d5 \n"
  712. "vcvt.f32.u32 q0, q0 \n"
  713. "vcvt.f32.u32 q1, q1 \n"
  714. "subs %0, #1 \n"
  715. "vst1.f32 {d0-d3}, [%2]! \n"
  716. "bne 0b \n"
  717. : "=r"(nn), // %0
  718. "=r"(rgb), // %1
  719. "=r"(ptr) // %2
  720. : "0"(nn),
  721. "1"(rgb),
  722. "2"(ptr),
  723. "r"(R2Y), // %6
  724. "r"(G2Y), // %7
  725. "r"(B2Y) // %8
  726. : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
  727. }
  728. #endif // __aarch64__
  729. #endif // __ARM_NEON
  730. for (; remain > 0; remain--)
  731. {
  732. *ptr = static_cast<float>((rgb[0] * R2Y + rgb[1] * G2Y + rgb[2] * B2Y) >> Y_shift);
  733. rgb += 3;
  734. ptr++;
  735. }
  736. rgb += wgap;
  737. }
  738. return 0;
  739. }
  740. static int from_rgb2rgba(const unsigned char* rgb, int w, int h, int stride, Mat& m, Allocator* allocator)
  741. {
  742. m.create(w, h, 4, 4u, allocator);
  743. if (m.empty())
  744. return -100;
  745. Mat rgb_channels = m.channel_range(0, 3);
  746. from_rgb(rgb, w, h, stride, rgb_channels, allocator);
  747. Mat alpha_channel = m.channel(3);
  748. alpha_channel.fill(255.f);
  749. return 0;
  750. }
  751. static void to_rgb2rgba(const Mat& m, unsigned char* rgba, int stride)
  752. {
  753. int w = m.w;
  754. int h = m.h;
  755. const int wgap = stride - w * 4;
  756. if (wgap == 0)
  757. {
  758. w = w * h;
  759. h = 1;
  760. }
  761. const float* ptr0 = m.channel(0);
  762. const float* ptr1 = m.channel(1);
  763. const float* ptr2 = m.channel(2);
  764. for (int y = 0; y < h; y++)
  765. {
  766. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  767. #if __ARM_NEON
  768. int nn = w >> 3;
  769. int remain = w - (nn << 3);
  770. #else
  771. int remain = w;
  772. #endif // __ARM_NEON
  773. #if __ARM_NEON
  774. uint8x8_t _a = vdup_n_u8(255);
  775. for (; nn > 0; nn--)
  776. {
  777. float32x4_t _rlow = vld1q_f32(ptr0);
  778. float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
  779. float32x4_t _glow = vld1q_f32(ptr1);
  780. float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
  781. float32x4_t _blow = vld1q_f32(ptr2);
  782. float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
  783. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  784. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  785. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  786. uint8x8x4_t _rgba;
  787. _rgba.val[0] = vqmovun_s16(_r16);
  788. _rgba.val[1] = vqmovun_s16(_g16);
  789. _rgba.val[2] = vqmovun_s16(_b16);
  790. _rgba.val[3] = _a;
  791. vst4_u8(rgba, _rgba);
  792. rgba += 4 * 8;
  793. ptr0 += 8;
  794. ptr1 += 8;
  795. ptr2 += 8;
  796. }
  797. #endif // __ARM_NEON
  798. for (; remain > 0; remain--)
  799. {
  800. rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
  801. rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
  802. rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
  803. rgba[3] = 255;
  804. rgba += 4;
  805. ptr0++;
  806. ptr1++;
  807. ptr2++;
  808. }
  809. #undef SATURATE_CAST_UCHAR
  810. rgba += wgap;
  811. }
  812. }
  813. static int from_bgr2gray(const unsigned char* bgr, int w, int h, int stride, Mat& m, Allocator* allocator)
  814. {
  815. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  816. const unsigned char Y_shift = 8; //14
  817. const unsigned char R2Y = 77;
  818. const unsigned char G2Y = 150;
  819. const unsigned char B2Y = 29;
  820. m.create(w, h, 1, 4u, allocator);
  821. if (m.empty())
  822. return -100;
  823. const int wgap = stride - w * 3;
  824. if (wgap == 0)
  825. {
  826. w = w * h;
  827. h = 1;
  828. }
  829. float* ptr = m;
  830. for (int y = 0; y < h; y++)
  831. {
  832. #if __ARM_NEON
  833. int nn = w >> 3;
  834. int remain = w - (nn << 3);
  835. #else
  836. int remain = w;
  837. #endif // __ARM_NEON
  838. #if __ARM_NEON
  839. #if __aarch64__
  840. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  841. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  842. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  843. for (; nn > 0; nn--)
  844. {
  845. uint8x8x3_t _rgb = vld3_u8(bgr);
  846. uint16x8_t _y16 = vmull_u8(_rgb.val[2], _R2Y);
  847. _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
  848. _y16 = vmlal_u8(_y16, _rgb.val[0], _B2Y);
  849. _y16 = vshrq_n_u16(_y16, Y_shift);
  850. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  851. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  852. vst1q_f32(ptr, _ylow);
  853. vst1q_f32(ptr + 4, _yhigh);
  854. bgr += 3 * 8;
  855. ptr += 8;
  856. }
  857. #else
  858. if (nn > 0)
  859. {
  860. asm volatile(
  861. "vdup.u8 d16, %6 \n"
  862. "vdup.u8 d17, %7 \n"
  863. "vdup.u8 d18, %8 \n"
  864. "0: \n"
  865. "pld [%1, #256] \n"
  866. "vld3.u8 {d0-d2}, [%1]! \n"
  867. "vmull.u8 q2, d2, d16 \n"
  868. "vmlal.u8 q2, d1, d17 \n"
  869. "vmlal.u8 q2, d0, d18 \n"
  870. "vshr.u16 q2, q2, #8 \n" // Y_shift
  871. "vmovl.u16 q0, d4 \n"
  872. "vmovl.u16 q1, d5 \n"
  873. "vcvt.f32.u32 q0, q0 \n"
  874. "vcvt.f32.u32 q1, q1 \n"
  875. "subs %0, #1 \n"
  876. "vst1.f32 {d0-d3}, [%2]! \n"
  877. "bne 0b \n"
  878. : "=r"(nn), // %0
  879. "=r"(bgr), // %1
  880. "=r"(ptr) // %2
  881. : "0"(nn),
  882. "1"(bgr),
  883. "2"(ptr),
  884. "r"(R2Y), // %6
  885. "r"(G2Y), // %7
  886. "r"(B2Y) // %8
  887. : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
  888. }
  889. #endif // __aarch64__
  890. #endif // __ARM_NEON
  891. for (; remain > 0; remain--)
  892. {
  893. *ptr = static_cast<float>((bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift);
  894. bgr += 3;
  895. ptr++;
  896. }
  897. bgr += wgap;
  898. }
  899. return 0;
  900. }
  901. static int from_bgr2rgba(const unsigned char* bgr, int w, int h, int stride, Mat& m, Allocator* allocator)
  902. {
  903. m.create(w, h, 4, 4u, allocator);
  904. if (m.empty())
  905. return -100;
  906. Mat rgb_channels = m.channel_range(0, 3);
  907. from_rgb2bgr(bgr, w, h, stride, rgb_channels, allocator);
  908. Mat alpha_channel = m.channel(3);
  909. alpha_channel.fill(255.f);
  910. return 0;
  911. }
  912. static void to_bgr2rgba(const Mat& m, unsigned char* rgba, int stride)
  913. {
  914. int w = m.w;
  915. int h = m.h;
  916. const int wgap = stride - w * 4;
  917. if (wgap == 0)
  918. {
  919. w = w * h;
  920. h = 1;
  921. }
  922. const float* ptr0 = m.channel(0);
  923. const float* ptr1 = m.channel(1);
  924. const float* ptr2 = m.channel(2);
  925. for (int y = 0; y < h; y++)
  926. {
  927. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  928. #if __ARM_NEON
  929. int nn = w >> 3;
  930. int remain = w - (nn << 3);
  931. #else
  932. int remain = w;
  933. #endif // __ARM_NEON
  934. #if __ARM_NEON
  935. uint8x8_t _a = vdup_n_u8(255);
  936. for (; nn > 0; nn--)
  937. {
  938. float32x4_t _rlow = vld1q_f32(ptr2);
  939. float32x4_t _rhigh = vld1q_f32(ptr2 + 4);
  940. float32x4_t _glow = vld1q_f32(ptr1);
  941. float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
  942. float32x4_t _blow = vld1q_f32(ptr0);
  943. float32x4_t _bhigh = vld1q_f32(ptr0 + 4);
  944. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  945. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  946. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  947. uint8x8x4_t _rgba;
  948. _rgba.val[0] = vqmovun_s16(_r16);
  949. _rgba.val[1] = vqmovun_s16(_g16);
  950. _rgba.val[2] = vqmovun_s16(_b16);
  951. _rgba.val[3] = _a;
  952. vst4_u8(rgba, _rgba);
  953. rgba += 4 * 8;
  954. ptr0 += 8;
  955. ptr1 += 8;
  956. ptr2 += 8;
  957. }
  958. #endif // __ARM_NEON
  959. for (; remain > 0; remain--)
  960. {
  961. rgba[0] = SATURATE_CAST_UCHAR(*ptr2);
  962. rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
  963. rgba[2] = SATURATE_CAST_UCHAR(*ptr0);
  964. rgba[3] = 255;
  965. rgba += 4;
  966. ptr0++;
  967. ptr1++;
  968. ptr2++;
  969. }
  970. #undef SATURATE_CAST_UCHAR
  971. rgba += wgap;
  972. }
  973. }
  974. static int from_gray2rgb(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
  975. {
  976. m.create(w, h, 3, 4u, allocator);
  977. if (m.empty())
  978. return -100;
  979. const int wgap = stride - w;
  980. if (wgap == 0)
  981. {
  982. w = w * h;
  983. h = 1;
  984. }
  985. float* ptr0 = m.channel(0);
  986. float* ptr1 = m.channel(1);
  987. float* ptr2 = m.channel(2);
  988. for (int y = 0; y < h; y++)
  989. {
  990. #if __ARM_NEON
  991. int nn = w >> 4;
  992. int remain = w - (nn << 4);
  993. #else
  994. int remain = w;
  995. #endif // __ARM_NEON
  996. #if __ARM_NEON
  997. #if __aarch64__
  998. for (; nn > 0; nn--)
  999. {
  1000. uint8x16_t _gray = vld1q_u8(gray);
  1001. uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
  1002. uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
  1003. float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
  1004. float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
  1005. float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
  1006. float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
  1007. vst1q_f32(ptr0, _graylow_0);
  1008. vst1q_f32(ptr0 + 4, _grayhigh_0);
  1009. vst1q_f32(ptr0 + 8, _graylow_1);
  1010. vst1q_f32(ptr0 + 12, _grayhigh_1);
  1011. vst1q_f32(ptr1, _graylow_0);
  1012. vst1q_f32(ptr1 + 4, _grayhigh_0);
  1013. vst1q_f32(ptr1 + 8, _graylow_1);
  1014. vst1q_f32(ptr1 + 12, _grayhigh_1);
  1015. vst1q_f32(ptr2, _graylow_0);
  1016. vst1q_f32(ptr2 + 4, _grayhigh_0);
  1017. vst1q_f32(ptr2 + 8, _graylow_1);
  1018. vst1q_f32(ptr2 + 12, _grayhigh_1);
  1019. gray += 16;
  1020. ptr0 += 16;
  1021. ptr1 += 16;
  1022. ptr2 += 16;
  1023. }
  1024. #else
  1025. if (nn > 0)
  1026. {
  1027. asm volatile(
  1028. "0: \n"
  1029. "pld [%1, #128] \n"
  1030. "vld1.u8 {d0,d1}, [%1]! \n"
  1031. "vmovl.u8 q8, d0 \n"
  1032. "vmovl.u8 q9, d1 \n"
  1033. "vmovl.u16 q0, d16 \n"
  1034. "vmovl.u16 q1, d17 \n"
  1035. "vmovl.u16 q2, d18 \n"
  1036. "vmovl.u16 q3, d19 \n"
  1037. "vcvt.f32.u32 q0, q0 \n"
  1038. "vcvt.f32.u32 q1, q1 \n"
  1039. "vcvt.f32.u32 q2, q2 \n"
  1040. "vcvt.f32.u32 q3, q3 \n"
  1041. "subs %0, #1 \n"
  1042. "vst1.f32 {d0-d3}, [%2]! \n"
  1043. "vst1.f32 {d4-d7}, [%2]! \n"
  1044. "vst1.f32 {d0-d3}, [%3]! \n"
  1045. "vst1.f32 {d4-d7}, [%3]! \n"
  1046. "vst1.f32 {d0-d3}, [%4]! \n"
  1047. "vst1.f32 {d4-d7}, [%4]! \n"
  1048. "bne 0b \n"
  1049. : "=r"(nn), // %0
  1050. "=r"(gray), // %1
  1051. "=r"(ptr0), // %2
  1052. "=r"(ptr1), // %3
  1053. "=r"(ptr2) // %4
  1054. : "0"(nn),
  1055. "1"(gray),
  1056. "2"(ptr0),
  1057. "3"(ptr1),
  1058. "4"(ptr2)
  1059. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
  1060. }
  1061. #endif // __aarch64__
  1062. #endif // __ARM_NEON
  1063. for (; remain > 0; remain--)
  1064. {
  1065. *ptr0 = *gray;
  1066. *ptr1 = *gray;
  1067. *ptr2 = *gray;
  1068. gray++;
  1069. ptr0++;
  1070. ptr1++;
  1071. ptr2++;
  1072. }
  1073. gray += wgap;
  1074. }
  1075. return 0;
  1076. }
  1077. static int from_gray2rgba(const unsigned char* gray, int w, int h, int stride, Mat& m, Allocator* allocator)
  1078. {
  1079. m.create(w, h, 4, 4u, allocator);
  1080. if (m.empty())
  1081. return -100;
  1082. Mat rgb_channels = m.channel_range(0, 3);
  1083. from_gray2rgb(gray, w, h, stride, rgb_channels, allocator);
  1084. Mat alpha_channel = m.channel(3);
  1085. alpha_channel.fill(255.f);
  1086. return 0;
  1087. }
  1088. static void to_gray2rgba(const Mat& m, unsigned char* rgba, int stride)
  1089. {
  1090. int w = m.w;
  1091. int h = m.h;
  1092. const int wgap = stride - w * 4;
  1093. if (wgap == 0)
  1094. {
  1095. w = w * h;
  1096. h = 1;
  1097. }
  1098. const float* ptr = m;
  1099. for (int y = 0; y < h; y++)
  1100. {
  1101. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  1102. #if __ARM_NEON
  1103. int nn = w >> 3;
  1104. int remain = w - (nn << 3);
  1105. #else
  1106. int remain = w;
  1107. #endif // __ARM_NEON
  1108. #if __ARM_NEON
  1109. uint8x8_t _a = vdup_n_u8(255);
  1110. for (; nn > 0; nn--)
  1111. {
  1112. float32x4_t _glow = vld1q_f32(ptr);
  1113. float32x4_t _ghigh = vld1q_f32(ptr + 4);
  1114. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  1115. uint8x8_t _gray = vqmovun_s16(_g16);
  1116. uint8x8x4_t _rgba;
  1117. _rgba.val[0] = _gray;
  1118. _rgba.val[1] = _gray;
  1119. _rgba.val[2] = _gray;
  1120. _rgba.val[3] = _a;
  1121. vst4_u8(rgba, _rgba);
  1122. rgba += 4 * 8;
  1123. ptr += 8;
  1124. }
  1125. #endif // __ARM_NEON
  1126. for (; remain > 0; remain--)
  1127. {
  1128. unsigned char gray = SATURATE_CAST_UCHAR(*ptr);
  1129. rgba[0] = gray;
  1130. rgba[1] = gray;
  1131. rgba[2] = gray;
  1132. rgba[3] = 255;
  1133. rgba += 4;
  1134. ptr++;
  1135. }
  1136. #undef SATURATE_CAST_UCHAR
  1137. rgba += wgap;
  1138. }
  1139. }
  1140. static int from_rgba2rgb(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  1141. {
  1142. m.create(w, h, 3, 4u, allocator);
  1143. if (m.empty())
  1144. return -100;
  1145. const int wgap = stride - w * 4;
  1146. if (wgap == 0)
  1147. {
  1148. w = w * h;
  1149. h = 1;
  1150. }
  1151. float* ptr0 = m.channel(0);
  1152. float* ptr1 = m.channel(1);
  1153. float* ptr2 = m.channel(2);
  1154. for (int y = 0; y < h; y++)
  1155. {
  1156. #if __ARM_NEON
  1157. int nn = w >> 3;
  1158. int remain = w - (nn << 3);
  1159. #else
  1160. int remain = w;
  1161. #endif // __ARM_NEON
  1162. #if __ARM_NEON
  1163. #if __aarch64__
  1164. for (; nn > 0; nn--)
  1165. {
  1166. uint8x8x4_t _rgba = vld4_u8(rgba);
  1167. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  1168. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  1169. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  1170. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  1171. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  1172. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  1173. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  1174. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  1175. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  1176. vst1q_f32(ptr0, _rlow);
  1177. vst1q_f32(ptr0 + 4, _rhigh);
  1178. vst1q_f32(ptr1, _glow);
  1179. vst1q_f32(ptr1 + 4, _ghigh);
  1180. vst1q_f32(ptr2, _blow);
  1181. vst1q_f32(ptr2 + 4, _bhigh);
  1182. rgba += 4 * 8;
  1183. ptr0 += 8;
  1184. ptr1 += 8;
  1185. ptr2 += 8;
  1186. }
  1187. #else
  1188. if (nn > 0)
  1189. {
  1190. asm volatile(
  1191. "0: \n"
  1192. "pld [%1, #256] \n"
  1193. "vld4.u8 {d0-d3}, [%1]! \n"
  1194. "vmovl.u8 q8, d0 \n"
  1195. "vmovl.u8 q9, d1 \n"
  1196. "vmovl.u8 q10, d2 \n"
  1197. "vmovl.u16 q0, d16 \n"
  1198. "vmovl.u16 q1, d17 \n"
  1199. "vmovl.u16 q2, d18 \n"
  1200. "vmovl.u16 q3, d19 \n"
  1201. "vmovl.u16 q8, d20 \n"
  1202. "vmovl.u16 q9, d21 \n"
  1203. "vcvt.f32.u32 q0, q0 \n"
  1204. "vcvt.f32.u32 q1, q1 \n"
  1205. "vcvt.f32.u32 q2, q2 \n"
  1206. "vcvt.f32.u32 q3, q3 \n"
  1207. "vcvt.f32.u32 q8, q8 \n"
  1208. "subs %0, #1 \n"
  1209. "vst1.f32 {d0-d3}, [%2]! \n"
  1210. "vcvt.f32.u32 q9, q9 \n"
  1211. "vst1.f32 {d4-d7}, [%3]! \n"
  1212. "vst1.f32 {d16-d19}, [%4]! \n"
  1213. "bne 0b \n"
  1214. : "=r"(nn), // %0
  1215. "=r"(rgba), // %1
  1216. "=r"(ptr0), // %2
  1217. "=r"(ptr1), // %3
  1218. "=r"(ptr2) // %4
  1219. : "0"(nn),
  1220. "1"(rgba),
  1221. "2"(ptr0),
  1222. "3"(ptr1),
  1223. "4"(ptr2)
  1224. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
  1225. }
  1226. #endif // __aarch64__
  1227. #endif // __ARM_NEON
  1228. for (; remain > 0; remain--)
  1229. {
  1230. *ptr0 = rgba[0];
  1231. *ptr1 = rgba[1];
  1232. *ptr2 = rgba[2];
  1233. rgba += 4;
  1234. ptr0++;
  1235. ptr1++;
  1236. ptr2++;
  1237. }
  1238. rgba += wgap;
  1239. }
  1240. return 0;
  1241. }
  1242. static int from_rgba2bgr(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  1243. {
  1244. m.create(w, h, 3, 4u, allocator);
  1245. if (m.empty())
  1246. return -100;
  1247. const int wgap = stride - w * 4;
  1248. if (wgap == 0)
  1249. {
  1250. w = w * h;
  1251. h = 1;
  1252. }
  1253. float* ptr0 = m.channel(0);
  1254. float* ptr1 = m.channel(1);
  1255. float* ptr2 = m.channel(2);
  1256. for (int y = 0; y < h; y++)
  1257. {
  1258. #if __ARM_NEON
  1259. int nn = w >> 3;
  1260. int remain = w - (nn << 3);
  1261. #else
  1262. int remain = w;
  1263. #endif // __ARM_NEON
  1264. #if __ARM_NEON
  1265. #if __aarch64__
  1266. for (; nn > 0; nn--)
  1267. {
  1268. uint8x8x4_t _rgba = vld4_u8(rgba);
  1269. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  1270. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  1271. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  1272. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  1273. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  1274. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  1275. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  1276. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  1277. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  1278. vst1q_f32(ptr2, _rlow);
  1279. vst1q_f32(ptr2 + 4, _rhigh);
  1280. vst1q_f32(ptr1, _glow);
  1281. vst1q_f32(ptr1 + 4, _ghigh);
  1282. vst1q_f32(ptr0, _blow);
  1283. vst1q_f32(ptr0 + 4, _bhigh);
  1284. rgba += 4 * 8;
  1285. ptr0 += 8;
  1286. ptr1 += 8;
  1287. ptr2 += 8;
  1288. }
  1289. #else
  1290. if (nn > 0)
  1291. {
  1292. asm volatile(
  1293. "0: \n"
  1294. "pld [%1, #256] \n"
  1295. "vld4.u8 {d0-d3}, [%1]! \n"
  1296. "vmovl.u8 q8, d0 \n"
  1297. "vmovl.u8 q9, d1 \n"
  1298. "vmovl.u8 q10, d2 \n"
  1299. "vmovl.u16 q0, d16 \n"
  1300. "vmovl.u16 q1, d17 \n"
  1301. "vmovl.u16 q2, d18 \n"
  1302. "vmovl.u16 q3, d19 \n"
  1303. "vmovl.u16 q8, d20 \n"
  1304. "vmovl.u16 q9, d21 \n"
  1305. "vcvt.f32.u32 q0, q0 \n"
  1306. "vcvt.f32.u32 q1, q1 \n"
  1307. "vcvt.f32.u32 q2, q2 \n"
  1308. "vcvt.f32.u32 q3, q3 \n"
  1309. "vcvt.f32.u32 q8, q8 \n"
  1310. "subs %0, #1 \n"
  1311. "vst1.f32 {d0-d3}, [%4]! \n"
  1312. "vcvt.f32.u32 q9, q9 \n"
  1313. "vst1.f32 {d4-d7}, [%3]! \n"
  1314. "vst1.f32 {d16-d19}, [%2]! \n"
  1315. "bne 0b \n"
  1316. : "=r"(nn), // %0
  1317. "=r"(rgba), // %1
  1318. "=r"(ptr0), // %2
  1319. "=r"(ptr1), // %3
  1320. "=r"(ptr2) // %4
  1321. : "0"(nn),
  1322. "1"(rgba),
  1323. "2"(ptr0),
  1324. "3"(ptr1),
  1325. "4"(ptr2)
  1326. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
  1327. }
  1328. #endif // __aarch64__
  1329. #endif // __ARM_NEON
  1330. for (; remain > 0; remain--)
  1331. {
  1332. *ptr0 = rgba[2];
  1333. *ptr1 = rgba[1];
  1334. *ptr2 = rgba[0];
  1335. rgba += 4;
  1336. ptr0++;
  1337. ptr1++;
  1338. ptr2++;
  1339. }
  1340. rgba += wgap;
  1341. }
  1342. return 0;
  1343. }
  1344. static int from_rgba2gray(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  1345. {
  1346. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  1347. const unsigned char Y_shift = 8; //14
  1348. const unsigned char R2Y = 77;
  1349. const unsigned char G2Y = 150;
  1350. const unsigned char B2Y = 29;
  1351. m.create(w, h, 1, 4u, allocator);
  1352. if (m.empty())
  1353. return -100;
  1354. const int wgap = stride - w * 4;
  1355. if (wgap == 0)
  1356. {
  1357. w = w * h;
  1358. h = 1;
  1359. }
  1360. float* ptr = m;
  1361. for (int y = 0; y < h; y++)
  1362. {
  1363. #if __ARM_NEON
  1364. int nn = w >> 3;
  1365. int remain = w - (nn << 3);
  1366. #else
  1367. int remain = w;
  1368. #endif // __ARM_NEON
  1369. #if __ARM_NEON
  1370. #if __aarch64__
  1371. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  1372. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  1373. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  1374. for (; nn > 0; nn--)
  1375. {
  1376. uint8x8x4_t _rgba = vld4_u8(rgba);
  1377. uint16x8_t _y16 = vmull_u8(_rgba.val[0], _R2Y);
  1378. _y16 = vmlal_u8(_y16, _rgba.val[1], _G2Y);
  1379. _y16 = vmlal_u8(_y16, _rgba.val[2], _B2Y);
  1380. _y16 = vshrq_n_u16(_y16, Y_shift);
  1381. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  1382. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  1383. vst1q_f32(ptr, _ylow);
  1384. vst1q_f32(ptr + 4, _yhigh);
  1385. rgba += 4 * 8;
  1386. ptr += 8;
  1387. }
  1388. #else
  1389. if (nn > 0)
  1390. {
  1391. asm volatile(
  1392. "vdup.u8 d16, %6 \n"
  1393. "vdup.u8 d17, %7 \n"
  1394. "vdup.u8 d18, %8 \n"
  1395. "0: \n"
  1396. "pld [%1, #256] \n"
  1397. "vld4.u8 {d0-d3}, [%1]! \n"
  1398. "vmull.u8 q2, d0, d16 \n"
  1399. "vmlal.u8 q2, d1, d17 \n"
  1400. "vmlal.u8 q2, d2, d18 \n"
  1401. "vshr.u16 q2, q2, #8 \n" // Y_shift
  1402. "vmovl.u16 q0, d4 \n"
  1403. "vmovl.u16 q1, d5 \n"
  1404. "vcvt.f32.u32 q0, q0 \n"
  1405. "vcvt.f32.u32 q1, q1 \n"
  1406. "subs %0, #1 \n"
  1407. "vst1.f32 {d0-d3}, [%2]! \n"
  1408. "bne 0b \n"
  1409. : "=r"(nn), // %0
  1410. "=r"(rgba), // %1
  1411. "=r"(ptr) // %2
  1412. : "0"(nn),
  1413. "1"(rgba),
  1414. "2"(ptr),
  1415. "r"(R2Y), // %6
  1416. "r"(G2Y), // %7
  1417. "r"(B2Y) // %8
  1418. : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
  1419. }
  1420. #endif // __aarch64__
  1421. #endif // __ARM_NEON
  1422. for (; remain > 0; remain--)
  1423. {
  1424. *ptr = static_cast<float>((rgba[0] * R2Y + rgba[1] * G2Y + rgba[2] * B2Y) >> Y_shift);
  1425. rgba += 4;
  1426. ptr++;
  1427. }
  1428. rgba += wgap;
  1429. }
  1430. return 0;
  1431. }
  1432. static int from_rgba2bgra(const unsigned char* rgba, int w, int h, int stride, Mat& m, Allocator* allocator)
  1433. {
  1434. m.create(w, h, 4, 4u, allocator);
  1435. if (m.empty())
  1436. return -100;
  1437. const int wgap = stride - w * 4;
  1438. if (wgap == 0)
  1439. {
  1440. w = w * h;
  1441. h = 1;
  1442. }
  1443. float* ptr0 = m.channel(0);
  1444. float* ptr1 = m.channel(1);
  1445. float* ptr2 = m.channel(2);
  1446. float* ptr3 = m.channel(3);
  1447. for (int y = 0; y < h; y++)
  1448. {
  1449. #if __ARM_NEON
  1450. int nn = w >> 3;
  1451. int remain = w - (nn << 3);
  1452. #else
  1453. int remain = w;
  1454. #endif // __ARM_NEON
  1455. #if __ARM_NEON
  1456. #if __aarch64__
  1457. for (; nn > 0; nn--)
  1458. {
  1459. uint8x8x4_t _rgba = vld4_u8(rgba);
  1460. int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
  1461. int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
  1462. int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
  1463. int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));
  1464. float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
  1465. float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
  1466. float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
  1467. float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
  1468. float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
  1469. float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
  1470. float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
  1471. float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));
  1472. vst1q_f32(ptr2, _rlow);
  1473. vst1q_f32(ptr2 + 4, _rhigh);
  1474. vst1q_f32(ptr1, _glow);
  1475. vst1q_f32(ptr1 + 4, _ghigh);
  1476. vst1q_f32(ptr0, _blow);
  1477. vst1q_f32(ptr0 + 4, _bhigh);
  1478. vst1q_f32(ptr3, _alow);
  1479. vst1q_f32(ptr3 + 4, _ahigh);
  1480. rgba += 4 * 8;
  1481. ptr0 += 8;
  1482. ptr1 += 8;
  1483. ptr2 += 8;
  1484. ptr3 += 8;
  1485. }
  1486. #else
  1487. if (nn > 0)
  1488. {
  1489. asm volatile(
  1490. "0: \n"
  1491. "pld [%1, #256] \n"
  1492. "vld4.u8 {d0-d3}, [%1]! \n"
  1493. "vmovl.u8 q8, d0 \n"
  1494. "vmovl.u8 q9, d1 \n"
  1495. "vmovl.u8 q10, d2 \n"
  1496. "vmovl.u8 q11, d3 \n"
  1497. "vmovl.u16 q0, d16 \n"
  1498. "vmovl.u16 q1, d17 \n"
  1499. "vmovl.u16 q2, d18 \n"
  1500. "vmovl.u16 q3, d19 \n"
  1501. "vmovl.u16 q8, d20 \n"
  1502. "vmovl.u16 q9, d21 \n"
  1503. "vmovl.u16 q10, d22 \n"
  1504. "vmovl.u16 q11, d23 \n"
  1505. "vcvt.f32.u32 q0, q0 \n"
  1506. "vcvt.f32.u32 q1, q1 \n"
  1507. "vcvt.f32.u32 q2, q2 \n"
  1508. "vcvt.f32.u32 q3, q3 \n"
  1509. "vcvt.f32.u32 q8, q8 \n"
  1510. "subs %0, #1 \n"
  1511. "vst1.f32 {d0-d3}, [%4]! \n"
  1512. "vcvt.f32.u32 q9, q9 \n"
  1513. "vcvt.f32.u32 q10, q10 \n"
  1514. "vst1.f32 {d4-d7}, [%3]! \n"
  1515. "vcvt.f32.u32 q11, q11 \n"
  1516. "vst1.f32 {d16-d19}, [%2]! \n"
  1517. "vst1.f32 {d20-d23}, [%5]! \n"
  1518. "bne 0b \n"
  1519. : "=r"(nn), // %0
  1520. "=r"(rgba), // %1
  1521. "=r"(ptr0), // %2
  1522. "=r"(ptr1), // %3
  1523. "=r"(ptr2), // %4
  1524. "=r"(ptr3) // %5
  1525. : "0"(nn),
  1526. "1"(rgba),
  1527. "2"(ptr0),
  1528. "3"(ptr1),
  1529. "4"(ptr2),
  1530. "5"(ptr3)
  1531. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  1532. }
  1533. #endif // __aarch64__
  1534. #endif // __ARM_NEON
  1535. for (; remain > 0; remain--)
  1536. {
  1537. *ptr0 = rgba[2];
  1538. *ptr1 = rgba[1];
  1539. *ptr2 = rgba[0];
  1540. *ptr3 = rgba[3];
  1541. rgba += 4;
  1542. ptr0++;
  1543. ptr1++;
  1544. ptr2++;
  1545. ptr3++;
  1546. }
  1547. rgba += wgap;
  1548. }
  1549. return 0;
  1550. }
  1551. static void to_rgba2bgra(const Mat& m, unsigned char* bgra, int stride)
  1552. {
  1553. int w = m.w;
  1554. int h = m.h;
  1555. const int wgap = stride - w * 4;
  1556. if (wgap == 0)
  1557. {
  1558. w = w * h;
  1559. h = 1;
  1560. }
  1561. const float* ptr0 = m.channel(0);
  1562. const float* ptr1 = m.channel(1);
  1563. const float* ptr2 = m.channel(2);
  1564. const float* ptr3 = m.channel(3);
  1565. for (int y = 0; y < h; y++)
  1566. {
  1567. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  1568. #if __ARM_NEON
  1569. int nn = w >> 3;
  1570. int remain = w - (nn << 3);
  1571. #else
  1572. int remain = w;
  1573. #endif // __ARM_NEON
  1574. #if __ARM_NEON
  1575. for (; nn > 0; nn--)
  1576. {
  1577. float32x4_t _rlow = vld1q_f32(ptr0);
  1578. float32x4_t _rhigh = vld1q_f32(ptr0 + 4);
  1579. float32x4_t _glow = vld1q_f32(ptr1);
  1580. float32x4_t _ghigh = vld1q_f32(ptr1 + 4);
  1581. float32x4_t _blow = vld1q_f32(ptr2);
  1582. float32x4_t _bhigh = vld1q_f32(ptr2 + 4);
  1583. float32x4_t _alow = vld1q_f32(ptr3);
  1584. float32x4_t _ahigh = vld1q_f32(ptr3 + 4);
  1585. int16x8_t _r16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_rlow)), vmovn_s32(vcvtq_s32_f32(_rhigh)));
  1586. int16x8_t _g16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_glow)), vmovn_s32(vcvtq_s32_f32(_ghigh)));
  1587. int16x8_t _b16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_blow)), vmovn_s32(vcvtq_s32_f32(_bhigh)));
  1588. int16x8_t _a16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(_alow)), vmovn_s32(vcvtq_s32_f32(_ahigh)));
  1589. uint8x8x4_t _bgra;
  1590. _bgra.val[0] = vqmovun_s16(_b16);
  1591. _bgra.val[1] = vqmovun_s16(_g16);
  1592. _bgra.val[2] = vqmovun_s16(_r16);
  1593. _bgra.val[3] = vqmovun_s16(_a16);
  1594. vst4_u8(bgra, _bgra);
  1595. bgra += 4 * 8;
  1596. ptr0 += 8;
  1597. ptr1 += 8;
  1598. ptr2 += 8;
  1599. ptr3 += 8;
  1600. }
  1601. #endif // __ARM_NEON
  1602. for (; remain > 0; remain--)
  1603. {
  1604. bgra[0] = SATURATE_CAST_UCHAR(*ptr2);
  1605. bgra[1] = SATURATE_CAST_UCHAR(*ptr1);
  1606. bgra[2] = SATURATE_CAST_UCHAR(*ptr0);
  1607. bgra[3] = SATURATE_CAST_UCHAR(*ptr3);
  1608. bgra += 4;
  1609. ptr0++;
  1610. ptr1++;
  1611. ptr2++;
  1612. ptr3++;
  1613. }
  1614. #undef SATURATE_CAST_UCHAR
  1615. bgra += wgap;
  1616. }
  1617. }
  1618. static int from_bgra2gray(const unsigned char* bgra, int w, int h, int stride, Mat& m, Allocator* allocator)
  1619. {
  1620. // coeffs for r g b = 0.299f, 0.587f, 0.114f
  1621. const unsigned char Y_shift = 8; //14
  1622. const unsigned char R2Y = 77;
  1623. const unsigned char G2Y = 150;
  1624. const unsigned char B2Y = 29;
  1625. m.create(w, h, 1, 4u, allocator);
  1626. if (m.empty())
  1627. return -100;
  1628. const int wgap = stride - w * 4;
  1629. if (wgap == 0)
  1630. {
  1631. w = w * h;
  1632. h = 1;
  1633. }
  1634. float* ptr = m;
  1635. for (int y = 0; y < h; y++)
  1636. {
  1637. #if __ARM_NEON
  1638. int nn = w >> 3;
  1639. int remain = w - (nn << 3);
  1640. #else
  1641. int remain = w;
  1642. #endif // __ARM_NEON
  1643. #if __ARM_NEON
  1644. #if __aarch64__
  1645. uint8x8_t _R2Y = vdup_n_u8(R2Y);
  1646. uint8x8_t _G2Y = vdup_n_u8(G2Y);
  1647. uint8x8_t _B2Y = vdup_n_u8(B2Y);
  1648. for (; nn > 0; nn--)
  1649. {
  1650. uint8x8x4_t _bgra = vld4_u8(bgra);
  1651. uint16x8_t _y16 = vmull_u8(_bgra.val[2], _R2Y);
  1652. _y16 = vmlal_u8(_y16, _bgra.val[1], _G2Y);
  1653. _y16 = vmlal_u8(_y16, _bgra.val[0], _B2Y);
  1654. _y16 = vshrq_n_u16(_y16, Y_shift);
  1655. float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
  1656. float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
  1657. vst1q_f32(ptr, _ylow);
  1658. vst1q_f32(ptr + 4, _yhigh);
  1659. bgra += 4 * 8;
  1660. ptr += 8;
  1661. }
  1662. #else
  1663. if (nn > 0)
  1664. {
  1665. asm volatile(
  1666. "vdup.u8 d16, %6 \n"
  1667. "vdup.u8 d17, %7 \n"
  1668. "vdup.u8 d18, %8 \n"
  1669. "0: \n"
  1670. "pld [%1, #256] \n"
  1671. "vld4.u8 {d0-d3}, [%1]! \n"
  1672. "vmull.u8 q2, d2, d16 \n"
  1673. "vmlal.u8 q2, d1, d17 \n"
  1674. "vmlal.u8 q2, d0, d18 \n"
  1675. "vshr.u16 q2, q2, #8 \n" // Y_shift
  1676. "vmovl.u16 q0, d4 \n"
  1677. "vmovl.u16 q1, d5 \n"
  1678. "vcvt.f32.u32 q0, q0 \n"
  1679. "vcvt.f32.u32 q1, q1 \n"
  1680. "subs %0, #1 \n"
  1681. "vst1.f32 {d0-d3}, [%2]! \n"
  1682. "bne 0b \n"
  1683. : "=r"(nn), // %0
  1684. "=r"(bgra), // %1
  1685. "=r"(ptr) // %2
  1686. : "0"(nn),
  1687. "1"(bgra),
  1688. "2"(ptr),
  1689. "r"(R2Y), // %6
  1690. "r"(G2Y), // %7
  1691. "r"(B2Y) // %8
  1692. : "cc", "memory", "q0", "q1", "q2", "q8", "q9");
  1693. }
  1694. #endif // __aarch64__
  1695. #endif // __ARM_NEON
  1696. for (; remain > 0; remain--)
  1697. {
  1698. *ptr = static_cast<float>((bgra[2] * R2Y + bgra[1] * G2Y + bgra[0] * B2Y) >> Y_shift);
  1699. bgra += 4;
  1700. ptr++;
  1701. }
  1702. bgra += wgap;
  1703. }
  1704. return 0;
  1705. }
  // Converts a YUV420 semi-planar image to packed 8-bit RGB.
  //
  // Layout as read by this code: w * h luma bytes start at yuv420sp, and the
  // interleaved chroma bytes start at yuv420sp + w * h, stored V first, then U
  // (see vuptr[0] / vuptr[1] in the scalar loop). One V/U pair is applied to a
  // 2x2 block of luma samples, so every outer iteration emits two output rows.
  //
  // rgb receives 3 bytes per pixel in R, G, B order, rows w * 3 bytes apart.
  //
  // The arithmetic is fixed point with 6 fractional bits (coefficients 90, 46,
  // 22 and 113, derived in the comment block inside the scalar loop); results
  // are clamped to [0, 255].
  //
  // NOTE(review): rows and columns are consumed in pairs, so w and h are
  // presumably expected to be even -- confirm against callers.
  1706. void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
  1707. {
  1708. const unsigned char* yptr = yuv420sp;
  // chroma bytes follow directly after the w * h luma bytes
  1709. const unsigned char* vuptr = yuv420sp + w * h;
  1710. #if __ARM_NEON
  // loop-invariant constants: chroma bias and the fixed-point coefficients
  1711. uint8x8_t _v128 = vdup_n_u8(128);
  1712. int8x8_t _v90 = vdup_n_s8(90);
  1713. int8x8_t _v46 = vdup_n_s8(46);
  1714. int8x8_t _v22 = vdup_n_s8(22);
  1715. int8x8_t _v113 = vdup_n_s8(113);
  1716. #endif // __ARM_NEON
  // two luma rows (and two rgb rows) are handled per iteration; vuptr keeps
  // advancing across iterations and is never reset
  1717. for (int y = 0; y < h; y += 2)
  1718. {
  1719. const unsigned char* yptr0 = yptr;
  1720. const unsigned char* yptr1 = yptr + w;
  1721. unsigned char* rgb0 = rgb;
  1722. unsigned char* rgb1 = rgb + w * 3;
  1723. #if __ARM_NEON
  // nn: number of 8-pixel groups for the NEON path; remain: leftover pixels
  // that go through the scalar loop
  1724. int nn = w >> 3;
  1725. int remain = w - (nn << 3);
  1726. #else
  1727. int remain = w;
  1728. #endif // __ARM_NEON
  1729. #if __ARM_NEON
  1730. #if __aarch64__
  1731. for (; nn > 0; nn--)
  1732. {
  // widen 8 luma bytes per row to 16 bit and pre-scale by 64 (Y << 6)
  1733. int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
  1734. int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));
  // 8 chroma bytes = 4 V/U pairs, re-centred around zero
  1735. int8x8_t _vvuu = vreinterpret_s8_u8(vsub_u8(vld1_u8(vuptr), _v128));
  // transposing the vector with itself duplicates the even lanes (V) into
  // val[0] and the odd lanes (U) into val[1], one copy per adjacent pixel
  1736. int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
  1737. int8x8_t _vv = _vvvvuuuu.val[0];
  1738. int8x8_t _uu = _vvvvuuuu.val[1];
  1739. int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
  1740. int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
  1741. _g0 = vmlsl_s8(_g0, _uu, _v22);
  1742. int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);
  1743. int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
  1744. int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
  1745. _g1 = vmlsl_s8(_g1, _uu, _v22);
  1746. int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
  // drop the 6 fractional bits with unsigned saturation to 8 bit
  1747. uint8x8x3_t _rgb0;
  1748. _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
  1749. _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
  1750. _rgb0.val[2] = vqshrun_n_s16(_b0, 6);
  1751. uint8x8x3_t _rgb1;
  1752. _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
  1753. _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
  1754. _rgb1.val[2] = vqshrun_n_s16(_b1, 6);
  // interleaving store: R,G,B,R,G,B,... for 8 pixels on each row
  1755. vst3_u8(rgb0, _rgb0);
  1756. vst3_u8(rgb1, _rgb1);
  1757. yptr0 += 8;
  1758. yptr1 += 8;
  1759. vuptr += 8;
  1760. rgb0 += 24;
  1761. rgb1 += 24;
  1762. }
  1763. #else
  // ARMv7 inline-assembly version of the 8-pixel step above. The chroma
  // vector for the next iteration is loaded inside the loop body, so %3 is
  // moved back by 8 once the loop has finished.
  // NOTE(review): that look-ahead load touches 8 chroma bytes beyond the ones
  // consumed by the final iteration -- confirm the input buffer tolerates it.
  1764. if (nn > 0)
  1765. {
  1766. asm volatile(
  1767. "pld [%3, #128] \n"
  1768. "vld1.u8 {d2}, [%3]! \n"
  1769. "vsub.s8 d2, d2, %12 \n"
  1770. "0: \n"
  1771. "pld [%1, #128] \n"
  1772. "vld1.u8 {d0}, [%1]! \n"
  1773. "pld [%2, #128] \n"
  1774. "vld1.u8 {d1}, [%2]! \n"
  1775. "vshll.u8 q2, d0, #6 \n"
  1776. "vorr d3, d2, d2 \n"
  1777. "vshll.u8 q3, d1, #6 \n"
  1778. "vorr q9, q2, q2 \n"
  1779. "vtrn.s8 d2, d3 \n"
  1780. "vorr q11, q3, q3 \n"
  1781. "vmlsl.s8 q9, d2, %14 \n"
  1782. "vorr q8, q2, q2 \n"
  1783. "vmlsl.s8 q11, d2, %14 \n"
  1784. "vorr q10, q3, q3 \n"
  1785. "vmlal.s8 q8, d2, %13 \n"
  1786. "vmlal.s8 q2, d3, %16 \n"
  1787. "vmlal.s8 q10, d2, %13 \n"
  1788. "vmlsl.s8 q9, d3, %15 \n"
  1789. "vmlal.s8 q3, d3, %16 \n"
  1790. "vmlsl.s8 q11, d3, %15 \n"
  1791. "vqshrun.s16 d24, q8, #6 \n"
  1792. "vqshrun.s16 d26, q2, #6 \n"
  1793. "vqshrun.s16 d4, q10, #6 \n"
  1794. "vqshrun.s16 d25, q9, #6 \n"
  1795. "vqshrun.s16 d6, q3, #6 \n"
  1796. "vqshrun.s16 d5, q11, #6 \n"
  1797. "pld [%3, #128] \n"
  1798. "vld1.u8 {d2}, [%3]! \n"
  1799. "subs %0, #1 \n"
  1800. "vst3.u8 {d24-d26}, [%4]! \n"
  1801. "vsub.s8 d2, d2, %12 \n"
  1802. "vst3.u8 {d4-d6}, [%5]! \n"
  1803. "bne 0b \n"
  1804. "sub %3, #8 \n"
  1805. : "=r"(nn), // %0
  1806. "=r"(yptr0), // %1
  1807. "=r"(yptr1), // %2
  1808. "=r"(vuptr), // %3
  1809. "=r"(rgb0), // %4
  1810. "=r"(rgb1) // %5
  1811. : "0"(nn),
  1812. "1"(yptr0),
  1813. "2"(yptr1),
  1814. "3"(vuptr),
  1815. "4"(rgb0),
  1816. "5"(rgb1),
  1817. "w"(_v128), // %12
  1818. "w"(_v90), // %13
  1819. "w"(_v46), // %14
  1820. "w"(_v22), // %15
  1821. "w"(_v113) // %16
  1822. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
  1823. }
  1824. #endif // __aarch64__
  1825. #endif // __ARM_NEON
  // clamp to [0, 255]; the macro body already ends with a semicolon
  1826. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  // scalar path: each iteration converts a 2x2 pixel block that shares one
  // V/U pair
  1827. for (; remain > 0; remain -= 2)
  1828. {
  1829. // R = 1.164 * yy + 1.596 * vv
  1830. // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
  1831. // B = 1.164 * yy + 2.018 * uu
  1832. // R = Y + (1.370705 * (V-128))
  1833. // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
  1834. // B = Y + (1.732446 * (U-128))
  1835. // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
  1836. // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
  1837. // B = ((Y << 6) + 110.876544 * (U-128)) >> 6
  1838. // R = ((Y << 6) + 90 * (V-128)) >> 6
  1839. // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
  1840. // B = ((Y << 6) + 113 * (U-128)) >> 6
  1841. // R = (yy + 90 * vv) >> 6
  1842. // G = (yy - 46 * vv - 22 * uu) >> 6
  1843. // B = (yy + 113 * uu) >> 6
  // V comes first in the interleaved chroma data, U second
  1844. int v = vuptr[0] - 128;
  1845. int u = vuptr[1] - 128;
  // chroma contributions are computed once and reused for all four pixels
  1846. int ruv = 90 * v;
  1847. int guv = -46 * v + -22 * u;
  1848. int buv = 113 * u;
  1849. int y00 = yptr0[0] << 6;
  1850. rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
  1851. rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
  1852. rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);
  1853. int y01 = yptr0[1] << 6;
  1854. rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
  1855. rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
  1856. rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);
  1857. int y10 = yptr1[0] << 6;
  1858. rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
  1859. rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
  1860. rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);
  1861. int y11 = yptr1[1] << 6;
  1862. rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
  1863. rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
  1864. rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);
  1865. yptr0 += 2;
  1866. yptr1 += 2;
  1867. vuptr += 2;
  1868. rgb0 += 6;
  1869. rgb1 += 6;
  1870. }
  1871. #undef SATURATE_CAST_UCHAR
  // step over the two luma rows / two rgb rows that were just produced
  1872. yptr += 2 * w;
  1873. rgb += 2 * 3 * w;
  1874. }
  1875. }
  // Converts a YUV420 semi-planar image with U-first chroma to packed 8-bit RGB.
  //
  // Layout as read by this code: w * h luma bytes start at yuv420sp, and the
  // interleaved chroma bytes start at yuv420sp + w * h, stored U first, then V
  // (see uvptr[0] / uvptr[1] in the scalar loop). One U/V pair is applied to a
  // 2x2 block of luma samples, so every outer iteration emits two output rows.
  //
  // rgb receives 3 bytes per pixel in R, G, B order, rows w * 3 bytes apart.
  //
  // The arithmetic is fixed point with 6 fractional bits (coefficients 90, 46,
  // 22 and 113, derived in the comment block inside the scalar loop); results
  // are clamped to [0, 255].
  //
  // NOTE(review): rows and columns are consumed in pairs, so w and h are
  // presumably expected to be even -- confirm against callers.
  1876. void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
  1877. {
  1878. const unsigned char* yptr = yuv420sp;
  // chroma bytes follow directly after the w * h luma bytes
  1879. const unsigned char* uvptr = yuv420sp + w * h;
  1880. #if __ARM_NEON
  // loop-invariant constants: chroma bias and the fixed-point coefficients
  1881. uint8x8_t _v128 = vdup_n_u8(128);
  1882. int8x8_t _v90 = vdup_n_s8(90);
  1883. int8x8_t _v46 = vdup_n_s8(46);
  1884. int8x8_t _v22 = vdup_n_s8(22);
  1885. int8x8_t _v113 = vdup_n_s8(113);
  1886. #endif // __ARM_NEON
  // two luma rows (and two rgb rows) are handled per iteration; uvptr keeps
  // advancing across iterations and is never reset
  1887. for (int y = 0; y < h; y += 2)
  1888. {
  1889. const unsigned char* yptr0 = yptr;
  1890. const unsigned char* yptr1 = yptr + w;
  1891. unsigned char* rgb0 = rgb;
  1892. unsigned char* rgb1 = rgb + w * 3;
  1893. #if __ARM_NEON
  // nn: number of 8-pixel groups for the NEON path; remain: leftover pixels
  // that go through the scalar loop
  1894. int nn = w >> 3;
  1895. int remain = w - (nn << 3);
  1896. #else
  1897. int remain = w;
  1898. #endif // __ARM_NEON
  1899. #if __ARM_NEON
  1900. #if __aarch64__
  1901. for (; nn > 0; nn--)
  1902. {
  // widen 8 luma bytes per row to 16 bit and pre-scale by 64 (Y << 6)
  1903. int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
  1904. int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));
  // 8 chroma bytes = 4 U/V pairs, re-centred around zero
  1905. int8x8_t _uuvv = vreinterpret_s8_u8(vsub_u8(vld1_u8(uvptr), _v128));
  // transposing the vector with itself duplicates the even lanes (U) into
  // val[0] and the odd lanes (V) into val[1], one copy per adjacent pixel
  1906. int8x8x2_t _uuuuvvvv = vtrn_s8(_uuvv, _uuvv);
  1907. int8x8_t _uu = _uuuuvvvv.val[0];
  1908. int8x8_t _vv = _uuuuvvvv.val[1];
  1909. int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
  1910. int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
  1911. _g0 = vmlsl_s8(_g0, _uu, _v22);
  1912. int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);
  1913. int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
  1914. int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
  1915. _g1 = vmlsl_s8(_g1, _uu, _v22);
  1916. int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
  // drop the 6 fractional bits with unsigned saturation to 8 bit
  1917. uint8x8x3_t _rgb0;
  1918. _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
  1919. _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
  1920. _rgb0.val[2] = vqshrun_n_s16(_b0, 6);
  1921. uint8x8x3_t _rgb1;
  1922. _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
  1923. _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
  1924. _rgb1.val[2] = vqshrun_n_s16(_b1, 6);
  // interleaving store: R,G,B,R,G,B,... for 8 pixels on each row
  1925. vst3_u8(rgb0, _rgb0);
  1926. vst3_u8(rgb1, _rgb1);
  1927. yptr0 += 8;
  1928. yptr1 += 8;
  1929. uvptr += 8;
  1930. rgb0 += 24;
  1931. rgb1 += 24;
  1932. }
  1933. #else
  // ARMv7 inline-assembly version of the 8-pixel step above; after vtrn, d2
  // holds the U lanes and d3 the V lanes. The chroma vector for the next
  // iteration is loaded inside the loop body, so %3 is moved back by 8 once
  // the loop has finished.
  // NOTE(review): that look-ahead load touches 8 chroma bytes beyond the ones
  // consumed by the final iteration -- confirm the input buffer tolerates it.
  1934. if (nn > 0)
  1935. {
  1936. asm volatile(
  1937. "pld [%3, #128] \n"
  1938. "vld1.u8 {d2}, [%3]! \n"
  1939. "vsub.s8 d2, d2, %12 \n"
  1940. "0: \n"
  1941. "pld [%1, #128] \n"
  1942. "vld1.u8 {d0}, [%1]! \n"
  1943. "pld [%2, #128] \n"
  1944. "vld1.u8 {d1}, [%2]! \n"
  1945. "vshll.u8 q2, d0, #6 \n"
  1946. "vorr d3, d2, d2 \n"
  1947. "vshll.u8 q3, d1, #6 \n"
  1948. "vorr q9, q2, q2 \n"
  1949. "vtrn.s8 d2, d3 \n"
  1950. "vorr q11, q3, q3 \n"
  1951. "vmlsl.s8 q9, d3, %14 \n"
  1952. "vorr q8, q2, q2 \n"
  1953. "vmlsl.s8 q11, d3, %14 \n"
  1954. "vorr q10, q3, q3 \n"
  1955. "vmlal.s8 q8, d3, %13 \n"
  1956. "vmlal.s8 q2, d2, %16 \n"
  1957. "vmlal.s8 q10, d3, %13 \n"
  1958. "vmlsl.s8 q9, d2, %15 \n"
  1959. "vmlal.s8 q3, d2, %16 \n"
  1960. "vmlsl.s8 q11, d2, %15 \n"
  1961. "vqshrun.s16 d24, q8, #6 \n"
  1962. "vqshrun.s16 d26, q2, #6 \n"
  1963. "vqshrun.s16 d4, q10, #6 \n"
  1964. "vqshrun.s16 d25, q9, #6 \n"
  1965. "vqshrun.s16 d6, q3, #6 \n"
  1966. "vqshrun.s16 d5, q11, #6 \n"
  1967. "pld [%3, #128] \n"
  1968. "vld1.u8 {d2}, [%3]! \n"
  1969. "subs %0, #1 \n"
  1970. "vst3.u8 {d24-d26}, [%4]! \n"
  1971. "vsub.s8 d2, d2, %12 \n"
  1972. "vst3.u8 {d4-d6}, [%5]! \n"
  1973. "bne 0b \n"
  1974. "sub %3, #8 \n"
  1975. : "=r"(nn), // %0
  1976. "=r"(yptr0), // %1
  1977. "=r"(yptr1), // %2
  1978. "=r"(uvptr), // %3
  1979. "=r"(rgb0), // %4
  1980. "=r"(rgb1) // %5
  1981. : "0"(nn),
  1982. "1"(yptr0),
  1983. "2"(yptr1),
  1984. "3"(uvptr),
  1985. "4"(rgb0),
  1986. "5"(rgb1),
  1987. "w"(_v128), // %12
  1988. "w"(_v90), // %13
  1989. "w"(_v46), // %14
  1990. "w"(_v22), // %15
  1991. "w"(_v113) // %16
  1992. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
  1993. }
  1994. #endif // __aarch64__
  1995. #endif // __ARM_NEON
  // clamp to [0, 255]; the macro body already ends with a semicolon
  1996. #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
  // scalar path: each iteration converts a 2x2 pixel block that shares one
  // U/V pair
  1997. for (; remain > 0; remain -= 2)
  1998. {
  1999. // R = 1.164 * yy + 1.596 * vv
  2000. // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
  2001. // B = 1.164 * yy + 2.018 * uu
  2002. // R = Y + (1.370705 * (V-128))
  2003. // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
  2004. // B = Y + (1.732446 * (U-128))
  2005. // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
  2006. // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
  2007. // B = ((Y << 6) + 110.876544 * (U-128)) >> 6
  2008. // R = ((Y << 6) + 90 * (V-128)) >> 6
  2009. // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
  2010. // B = ((Y << 6) + 113 * (U-128)) >> 6
  2011. // R = (yy + 90 * vv) >> 6
  2012. // G = (yy - 46 * vv - 22 * uu) >> 6
  2013. // B = (yy + 113 * uu) >> 6
  // U comes first in the interleaved chroma data, V second
  2014. int u = uvptr[0] - 128;
  2015. int v = uvptr[1] - 128;
  // chroma contributions are computed once and reused for all four pixels
  2016. int ruv = 90 * v;
  2017. int guv = -46 * v + -22 * u;
  2018. int buv = 113 * u;
  2019. int y00 = yptr0[0] << 6;
  2020. rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
  2021. rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
  2022. rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);
  2023. int y01 = yptr0[1] << 6;
  2024. rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
  2025. rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
  2026. rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);
  2027. int y10 = yptr1[0] << 6;
  2028. rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
  2029. rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
  2030. rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);
  2031. int y11 = yptr1[1] << 6;
  2032. rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
  2033. rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
  2034. rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);
  2035. yptr0 += 2;
  2036. yptr1 += 2;
  2037. uvptr += 2;
  2038. rgb0 += 6;
  2039. rgb1 += 6;
  2040. }
  2041. #undef SATURATE_CAST_UCHAR
  // step over the two luma rows / two rgb rows that were just produced
  2042. yptr += 2 * w;
  2043. rgb += 2 * 3 * w;
  2044. }
  2045. }
  // Converts a YUV420 semi-planar image (V-first interleaved chroma) to packed
  // 8-bit RGB at half resolution in both dimensions.
  //
  // yuv  : w * h luma bytes followed by interleaved chroma pairs, V then U
  //        (puv[0] is V, puv[1] is U).
  // w, h : dimensions of the source image in pixels.
  // rgb  : destination; one R,G,B triple is written per 2x2 block of source
  //        pixels, i.e. (w / 2) * (h / 2) * 3 bytes, tightly packed.
  //
  // For every 2x2 block the four luma samples are averaged and combined with
  // the block's single V/U pair, using the same 6-bit fixed-point coefficients
  // (90, -46, -22, 113) as the full-resolution converters.
  void yuv420sp2rgb_half(const unsigned char* yuv, int w, int h, unsigned char* rgb)
  {
      const unsigned char* puv = yuv + w * h;
      const unsigned char *py0 = yuv, *py1 = yuv + w;
      const int hstep = h / 2;
  #if __ARM_NEON
      // wstep: 16-pixel groups handled by NEON; tailstep: remaining 2-pixel
      // columns handled by the scalar loop
      const int wstep = w / 16, tailstep = (w - wstep * 16) / 2;
      uint8x8_t _u128 = vdup_n_u8(128);
      int8x8_t _s90 = vdup_n_s8(90);
      int8x8_t _sn46 = vdup_n_s8(-46);
      int8x8_t _s113 = vdup_n_s8(113);
      int8x8_t _sn22 = vdup_n_s8(-22);
      int16x8_t _s0 = vdupq_n_s16(0);
      int16x8_t _s16320 = vdupq_n_s16(16320); // 255 << 6
  #else
      const int tailstep = w / 2;
  #endif

      for (int i = 0; i < hstep; ++i)
      {
  #if __ARM_NEON
          for (int j = 0; j < wstep; ++j)
          {
              uint8x16_t y0 = vld1q_u8(py0);
              uint8x16_t y1 = vld1q_u8(py1);

              // first 8 Y: add the two rows, then add horizontal neighbours
              uint16x8_t low = vaddl_u8(vget_low_u8(y0), vget_low_u8(y1));
              uint16x4_t low_sum = vpadd_u16(vget_low_u16(low), vget_high_u16(low));

              // last 8 Y
              uint16x8_t high = vaddl_u8(vget_high_u8(y0), vget_high_u8(y1));
              uint16x4_t high_sum = vpadd_u16(vget_low_u16(high), vget_high_u16(high));

              uint16x8_t y8_sum = vcombine_u16(low_sum, high_sum);
              // y8 = (y8_sum >> 2) << 6 = y8_sum << 4;
              int16x8_t y8 = vreinterpretq_s16_u16(vshlq_n_u16(y8_sum, 4));

              // prepare uv: de-interleave 8 V/U pairs and re-centre around zero
              uint8x8x2_t vu = vld2_u8(puv);
              int8x8_t v = vreinterpret_s8_u8(vsub_u8(vu.val[0], _u128));
              int8x8_t u = vreinterpret_s8_u8(vsub_u8(vu.val[1], _u128));

              int16x8_t r_acc = vmlal_s8(y8, v, _s90);
              int16x8_t g_acc = vmlal_s8(y8, v, _sn46);
              g_acc = vmlal_s8(g_acc, u, _sn22);
              int16x8_t b_acc = vmlal_s8(y8, u, _s113);

              // clamp to [0, 255 << 6] first, then drop the 6 fractional bits
  #define SHIFT_6_SATURATE(FROM, TO)                     \
      FROM = vmaxq_s16(vminq_s16((FROM), _s16320), _s0); \
      uint8x8_t TO = vshrn_n_u16(vreinterpretq_u16_s16((FROM)), 6);

              SHIFT_6_SATURATE(b_acc, b_out)
              SHIFT_6_SATURATE(g_acc, g_out)
              SHIFT_6_SATURATE(r_acc, r_out)
  #undef SHIFT_6_SATURATE

              uint8x8x3_t _rgb;
              _rgb.val[0] = r_out;
              _rgb.val[1] = g_out;
              _rgb.val[2] = b_out;
              vst3_u8(rgb, _rgb);

              rgb += 24;
              py0 += 16;
              py1 += 16;
              puv += 16;
          }
  #endif

          for (int idx = 0; idx < tailstep; ++idx)
          {
              // Sum of the 2x2 luma block, scaled so that (sum / 4) << 6 == sum << 4.
              // The block is py0[0], py0[1], py1[0], py1[1]; this matches what the
              // NEON path computes (the previous py1[2] read a sample from the
              // neighbouring block).
              int y = (static_cast<int>(py0[0]) + py0[1] + py1[0] + py1[1]) << 4;
              int v = static_cast<int>(puv[0]) - 128;
              int u = static_cast<int>(puv[1]) - 128;

              int ruv = 90 * v;
              int guv = -46 * v + -22 * u;
              int buv = 113 * u;

              // clamp to [0, 255]; the macro body already ends with a semicolon
  #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
              rgb[0] = SATURATE_CAST_UCHAR((y + ruv) >> 6);
              rgb[1] = SATURATE_CAST_UCHAR((y + guv) >> 6);
              rgb[2] = SATURATE_CAST_UCHAR((y + buv) >> 6);
  #undef SATURATE_CAST_UCHAR

              rgb += 3;
              py0 += 2;
              py1 += 2;
              puv += 2;
          }

          // next two row: py0/py1 have each advanced by one full row, so py1 now
          // points at the first row of the next pair
          py0 = py1;
          py1 = py0 + w;
      }
  }
  2128. Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator)
  2129. {
  2130. int type_from = type & PIXEL_FORMAT_MASK;
  2131. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  2132. {
  2133. return Mat::from_pixels(pixels, type, w, h, w * 3, allocator);
  2134. }
  2135. else if (type_from == PIXEL_GRAY)
  2136. {
  2137. return Mat::from_pixels(pixels, type, w, h, w * 1, allocator);
  2138. }
  2139. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  2140. {
  2141. return Mat::from_pixels(pixels, type, w, h, w * 4, allocator);
  2142. }
  2143. // unknown convert type
  2144. NCNN_LOGE("unknown convert type %d", type);
  2145. return Mat();
  2146. }
  2147. Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator)
  2148. {
  2149. Mat m;
  2150. if (type & PIXEL_CONVERT_MASK)
  2151. {
  2152. switch (type)
  2153. {
  2154. case PIXEL_RGB2BGR:
  2155. case PIXEL_BGR2RGB:
  2156. from_rgb2bgr(pixels, w, h, stride, m, allocator);
  2157. break;
  2158. case PIXEL_RGB2GRAY:
  2159. from_rgb2gray(pixels, w, h, stride, m, allocator);
  2160. break;
  2161. case PIXEL_RGB2RGBA:
  2162. case PIXEL_BGR2BGRA:
  2163. from_rgb2rgba(pixels, w, h, stride, m, allocator);
  2164. break;
  2165. case PIXEL_BGR2GRAY:
  2166. from_bgr2gray(pixels, w, h, stride, m, allocator);
  2167. break;
  2168. case PIXEL_BGR2RGBA:
  2169. case PIXEL_RGB2BGRA:
  2170. from_bgr2rgba(pixels, w, h, stride, m, allocator);
  2171. break;
  2172. case PIXEL_GRAY2RGB:
  2173. case PIXEL_GRAY2BGR:
  2174. from_gray2rgb(pixels, w, h, stride, m, allocator);
  2175. break;
  2176. case PIXEL_GRAY2RGBA:
  2177. case PIXEL_GRAY2BGRA:
  2178. from_gray2rgba(pixels, w, h, stride, m, allocator);
  2179. break;
  2180. case PIXEL_RGBA2RGB:
  2181. case PIXEL_BGRA2BGR:
  2182. from_rgba2rgb(pixels, w, h, stride, m, allocator);
  2183. break;
  2184. case PIXEL_RGBA2BGR:
  2185. case PIXEL_BGRA2RGB:
  2186. from_rgba2bgr(pixels, w, h, stride, m, allocator);
  2187. break;
  2188. case PIXEL_RGBA2GRAY:
  2189. from_rgba2gray(pixels, w, h, stride, m, allocator);
  2190. break;
  2191. case PIXEL_RGBA2BGRA:
  2192. case PIXEL_BGRA2RGBA:
  2193. from_rgba2bgra(pixels, w, h, stride, m, allocator);
  2194. break;
  2195. case PIXEL_BGRA2GRAY:
  2196. from_bgra2gray(pixels, w, h, stride, m, allocator);
  2197. break;
  2198. default:
  2199. // unimplemented convert type
  2200. NCNN_LOGE("unimplemented convert type %d", type);
  2201. break;
  2202. }
  2203. }
  2204. else
  2205. {
  2206. if (type == PIXEL_RGB || type == PIXEL_BGR)
  2207. from_rgb(pixels, w, h, stride, m, allocator);
  2208. if (type == PIXEL_GRAY)
  2209. from_gray(pixels, w, h, stride, m, allocator);
  2210. if (type == PIXEL_RGBA || type == PIXEL_BGRA)
  2211. from_rgba(pixels, w, h, stride, m, allocator);
  2212. }
  2213. return m;
  2214. }
  2215. Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator)
  2216. {
  2217. int type_from = type & PIXEL_FORMAT_MASK;
  2218. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  2219. {
  2220. return Mat::from_pixels_resize(pixels, type, w, h, w * 3, target_width, target_height, allocator);
  2221. }
  2222. else if (type_from == PIXEL_GRAY)
  2223. {
  2224. return Mat::from_pixels_resize(pixels, type, w, h, w * 1, target_width, target_height, allocator);
  2225. }
  2226. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  2227. {
  2228. return Mat::from_pixels_resize(pixels, type, w, h, w * 4, target_width, target_height, allocator);
  2229. }
  2230. // unknown convert type
  2231. NCNN_LOGE("unknown convert type %d", type);
  2232. return Mat();
  2233. }
  2234. Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator)
  2235. {
  2236. if (w == target_width && h == target_height)
  2237. return Mat::from_pixels(pixels, type, w, h, stride, allocator);
  2238. int type_from = type & PIXEL_FORMAT_MASK;
  2239. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  2240. {
  2241. Mat dst(target_width, target_height, (size_t)3u, 3);
  2242. resize_bilinear_c3(pixels, w, h, stride, dst, target_width, target_height, target_width * 3);
  2243. return Mat::from_pixels(dst, type, target_width, target_height, allocator);
  2244. }
  2245. else if (type_from == PIXEL_GRAY)
  2246. {
  2247. Mat dst(target_width, target_height, (size_t)1u, 1);
  2248. resize_bilinear_c1(pixels, w, h, stride, dst, target_width, target_height, target_width * 1);
  2249. return Mat::from_pixels(dst, type, target_width, target_height, allocator);
  2250. }
  2251. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  2252. {
  2253. Mat dst(target_width, target_height, (size_t)4u, 4);
  2254. resize_bilinear_c4(pixels, w, h, stride, dst, target_width, target_height, target_width * 4);
  2255. return Mat::from_pixels(dst, type, target_width, target_height, allocator);
  2256. }
  2257. // unknown convert type
  2258. NCNN_LOGE("unknown convert type %d", type);
  2259. return Mat();
  2260. }
  2261. Mat Mat::from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator)
  2262. {
  2263. if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
  2264. {
  2265. NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
  2266. return Mat();
  2267. }
  2268. int type_from = type & PIXEL_FORMAT_MASK;
  2269. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  2270. {
  2271. return from_pixels(pixels + (roiy * w + roix) * 3, type, roiw, roih, w * 3, allocator);
  2272. }
  2273. else if (type_from == PIXEL_GRAY)
  2274. {
  2275. return from_pixels(pixels + (roiy * w + roix) * 1, type, roiw, roih, w * 1, allocator);
  2276. }
  2277. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  2278. {
  2279. return from_pixels(pixels + (roiy * w + roix) * 4, type, roiw, roih, w * 4, allocator);
  2280. }
  2281. // unknown convert type
  2282. NCNN_LOGE("unknown convert type %d", type);
  2283. return Mat();
  2284. }
  2285. Mat Mat::from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator)
  2286. {
  2287. if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
  2288. {
  2289. NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
  2290. return Mat();
  2291. }
  2292. int type_from = type & PIXEL_FORMAT_MASK;
  2293. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  2294. {
  2295. return from_pixels(pixels + roiy * stride + roix * 3, type, roiw, roih, stride, allocator);
  2296. }
  2297. else if (type_from == PIXEL_GRAY)
  2298. {
  2299. return from_pixels(pixels + roiy * stride + roix * 1, type, roiw, roih, stride, allocator);
  2300. }
  2301. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  2302. {
  2303. return from_pixels(pixels + roiy * stride + roix * 4, type, roiw, roih, stride, allocator);
  2304. }
  2305. // unknown convert type
  2306. NCNN_LOGE("unknown convert type %d", type);
  2307. return Mat();
  2308. }
  2309. Mat Mat::from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator)
  2310. {
  2311. if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
  2312. {
  2313. NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
  2314. return Mat();
  2315. }
  2316. int type_from = type & PIXEL_FORMAT_MASK;
  2317. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  2318. {
  2319. return from_pixels_resize(pixels + (roiy * w + roix) * 3, type, roiw, roih, w * 3, target_width, target_height, allocator);
  2320. }
  2321. else if (type_from == PIXEL_GRAY)
  2322. {
  2323. return from_pixels_resize(pixels + (roiy * w + roix) * 1, type, roiw, roih, w * 1, target_width, target_height, allocator);
  2324. }
  2325. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  2326. {
  2327. return from_pixels_resize(pixels + (roiy * w + roix) * 4, type, roiw, roih, w * 4, target_width, target_height, allocator);
  2328. }
  2329. // unknown convert type
  2330. NCNN_LOGE("unknown convert type %d", type);
  2331. return Mat();
  2332. }
  2333. Mat Mat::from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator)
  2334. {
  2335. if (roix < 0 || roiy < 0 || roiw <= 0 || roih <= 0 || roix + roiw > w || roiy + roih > h)
  2336. {
  2337. NCNN_LOGE("roi %d %d %d %d out of image %d %d", roix, roiy, roiw, roih, w, h);
  2338. return Mat();
  2339. }
  2340. int type_from = type & PIXEL_FORMAT_MASK;
  2341. if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
  2342. {
  2343. return from_pixels_resize(pixels + roiy * stride + roix * 3, type, roiw, roih, stride, target_width, target_height, allocator);
  2344. }
  2345. else if (type_from == PIXEL_GRAY)
  2346. {
  2347. return from_pixels_resize(pixels + roiy * stride + roix * 1, type, roiw, roih, stride, target_width, target_height, allocator);
  2348. }
  2349. else if (type_from == PIXEL_RGBA || type_from == PIXEL_BGRA)
  2350. {
  2351. return from_pixels_resize(pixels + roiy * stride + roix * 4, type, roiw, roih, stride, target_width, target_height, allocator);
  2352. }
  2353. // unknown convert type
  2354. NCNN_LOGE("unknown convert type %d", type);
  2355. return Mat();
  2356. }
  2357. void Mat::to_pixels(unsigned char* pixels, int type) const
  2358. {
  2359. int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
  2360. if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
  2361. {
  2362. to_pixels(pixels, type, w * 3);
  2363. }
  2364. else if (type_to == PIXEL_GRAY)
  2365. {
  2366. to_pixels(pixels, type, w * 1);
  2367. }
  2368. else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
  2369. {
  2370. to_pixels(pixels, type, w * 4);
  2371. }
  2372. }
  2373. void Mat::to_pixels(unsigned char* pixels, int type, int stride) const
  2374. {
  2375. if (type & PIXEL_CONVERT_MASK)
  2376. {
  2377. switch (type)
  2378. {
  2379. case PIXEL_RGB2BGR:
  2380. case PIXEL_BGR2RGB:
  2381. to_bgr2rgb(*this, pixels, stride);
  2382. break;
  2383. case PIXEL_RGB2RGBA:
  2384. case PIXEL_BGR2BGRA:
  2385. to_rgb2rgba(*this, pixels, stride);
  2386. break;
  2387. case PIXEL_BGR2RGBA:
  2388. case PIXEL_RGB2BGRA:
  2389. to_bgr2rgba(*this, pixels, stride);
  2390. break;
  2391. case PIXEL_GRAY2RGBA:
  2392. case PIXEL_GRAY2BGRA:
  2393. to_gray2rgba(*this, pixels, stride);
  2394. break;
  2395. case PIXEL_RGBA2BGRA:
  2396. case PIXEL_BGRA2RGBA:
  2397. to_rgba2bgra(*this, pixels, stride);
  2398. break;
  2399. default:
  2400. // unimplemented convert type
  2401. NCNN_LOGE("unimplemented convert type %d", type);
  2402. break;
  2403. }
  2404. }
  2405. else
  2406. {
  2407. if (type == PIXEL_RGB || type == PIXEL_BGR)
  2408. to_rgb(*this, pixels, stride);
  2409. if (type == PIXEL_GRAY)
  2410. to_gray(*this, pixels, stride);
  2411. if (type == PIXEL_RGBA || type == PIXEL_BGRA)
  2412. to_rgba(*this, pixels, stride);
  2413. }
  2414. }
  2415. void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const
  2416. {
  2417. int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
  2418. if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
  2419. {
  2420. to_pixels_resize(pixels, type, target_width, target_height, target_width * 3);
  2421. }
  2422. else if (type_to == PIXEL_GRAY)
  2423. {
  2424. to_pixels_resize(pixels, type, target_width, target_height, target_width * 1);
  2425. }
  2426. else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
  2427. {
  2428. to_pixels_resize(pixels, type, target_width, target_height, target_width * 4);
  2429. }
  2430. }
  2431. void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const
  2432. {
  2433. if (w == target_width && h == target_height)
  2434. return to_pixels(pixels, type);
  2435. int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
  2436. if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
  2437. {
  2438. Mat src(w, h, (size_t)3u, 3);
  2439. to_pixels(src, type);
  2440. resize_bilinear_c3(src, w, h, w * 3, pixels, target_width, target_height, target_stride);
  2441. }
  2442. else if (type_to == PIXEL_GRAY)
  2443. {
  2444. Mat src(w, h, (size_t)1u, 1);
  2445. to_pixels(src, type);
  2446. resize_bilinear_c1(src, w, h, w * 1, pixels, target_width, target_height, target_stride);
  2447. }
  2448. else if (type_to == PIXEL_RGBA || type_to == PIXEL_BGRA)
  2449. {
  2450. Mat src(w, h, (size_t)4u, 4);
  2451. to_pixels(src, type);
  2452. resize_bilinear_c4(src, w, h, w * 4, pixels, target_width, target_height, target_stride);
  2453. }
  2454. }
  2455. #endif // NCNN_PIXEL
  2456. } // namespace ncnn