You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

mat_pixel_rotate.cpp 234 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "mat.h"
  15. #if __ARM_NEON
  16. #include <arm_neon.h>
  17. #endif // __ARM_NEON
  18. #include "platform.h"
  19. namespace ncnn {
  20. #if NCNN_PIXEL_ROTATE
  21. // should be a kanna ascii art here in my local branch
  22. // but we shall ask the original art author for permission first ...
  23. // https://www.reddit.com/r/anime/comments/5uxjn4/i_recreated_the_kanna_ascii_art_from_kobayashisan/
  // Copy a 1-byte-per-pixel image from src to dst row by row, with no
  // geometric change (presumably rotate type 1 is the identity orientation;
  // confirm against the dispatcher).
  //
  //   src        first byte of the source image
  //   srcw       pixels per source row; srcw bytes are copied per row
  //   srch       number of rows to copy
  //   srcstride  bytes between the starts of consecutive source rows
  //   dst        first byte of the destination image
  //   w          destination width in pixels, used only to derive row padding
  //   h          unused; the row count is taken from srch
  //   stride     bytes between the starts of consecutive destination rows
  //
  // NOTE(review): srcw bytes are written per row and then (stride - w) bytes
  // are skipped, so destination rows start on multiples of stride only when
  // w == srcw. Presumably callers always pass matching sizes; confirm.
  static void kanna_rotate_1_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  {
      (void)h;

      // padding bytes at the end of each source / destination row
      const int srcwgap = srcstride - srcw;
      const int wgap = stride - w;

      // two row cursors so that a pair of rows is copied per iteration
      const unsigned char* src0 = src;
      const unsigned char* src1 = src + srcstride;
      unsigned char* dst0 = dst;
      unsigned char* dst1 = dst + stride;

      int y = 0;
      for (; y + 1 < srch; y += 2)
      {
  #if __ARM_NEON
          // nn = number of 32-byte chunks, remain = trailing bytes copied one by one
          int nn = srcw >> 5;
          int remain = srcw - (nn << 5);
  #if __aarch64__
          for (; nn > 0; nn--)
          {
              uint8x16_t _src0 = vld1q_u8(src0);
              uint8x16_t _src0n = vld1q_u8(src0 + 16);
              vst1q_u8(dst0, _src0);
              vst1q_u8(dst0 + 16, _src0n);

              uint8x16_t _src1 = vld1q_u8(src1);
              uint8x16_t _src1n = vld1q_u8(src1 + 16);
              vst1q_u8(dst1, _src1);
              vst1q_u8(dst1 + 16, _src1n);

              src0 += 32;
              src1 += 32;
              dst0 += 32;
              dst1 += 32;
          }
  #else
          if (nn > 0)
          {
              // each pass loads 32 bytes from both rows (post-incrementing the
              // pointers) and stores them; the loop ends when nn reaches zero
              asm volatile(
                  "0: \n"
                  "pld [%1, #256] \n"
                  "vld1.u8 {d0-d3}, [%1]! \n"
                  "pld [%2, #256] \n"
                  "vld1.u8 {d4-d7}, [%2]! \n"
                  "subs %0, #1 \n"
                  "vst1.u8 {d0-d3}, [%3]! \n"
                  "vst1.u8 {d4-d7}, [%4]! \n"
                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(src1), // %2
                  "=r"(dst0), // %3
                  "=r"(dst1)  // %4
                  : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                  : "cc", "memory", "q0", "q1", "q2", "q3");
          }
  #endif // __aarch64__
  #else
          int remain = srcw;
  #endif // __ARM_NEON

          for (; remain > 0; remain--)
          {
              *dst0++ = *src0++;
              *dst1++ = *src1++;
          }

          // skip the row padding, then jump over the row the other cursor handled
          src0 += srcwgap + srcstride;
          src1 += srcwgap + srcstride;
          dst0 += wgap + stride;
          dst1 += wgap + stride;
      }

      // at most one row is left when srch is odd
      for (; y < srch; y++)
      {
  #if __ARM_NEON
          int nn = srcw >> 5;
          int remain = srcw - (nn << 5);
  #if __aarch64__
          for (; nn > 0; nn--)
          {
              uint8x16_t _src = vld1q_u8(src0);
              uint8x16_t _src2 = vld1q_u8(src0 + 16);
              vst1q_u8(dst0, _src);
              vst1q_u8(dst0 + 16, _src2);

              src0 += 32;
              dst0 += 32;
          }
  #else
          if (nn > 0)
          {
              asm volatile(
                  "0: \n"
                  "pld [%1, #256] \n"
                  "vld1.u8 {d0-d3}, [%1]! \n"
                  "subs %0, #1 \n"
                  "vst1.u8 {d0-d3}, [%2]! \n"
                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(dst0)  // %2
                  : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                  : "cc", "memory", "q0", "q1");
          }
  #endif // __aarch64__
  #else
          int remain = srcw;
  #endif // __ARM_NEON

          for (; remain > 0; remain--)
          {
              *dst0++ = *src0++;
          }

          src0 += srcwgap;
          dst0 += wgap;
      }
  }
  // Copy a 2-bytes-per-pixel image from src to dst row by row, with no
  // geometric change (presumably rotate type 1 is the identity orientation;
  // confirm against the dispatcher).
  //
  //   src        first byte of the source image
  //   srcw       pixels per source row; srcw * 2 bytes are copied per row
  //   srch       number of rows to copy
  //   srcstride  bytes between the starts of consecutive source rows
  //   dst        first byte of the destination image
  //   w          destination width in pixels, used only to derive row padding
  //   h          unused; the row count is taken from srch
  //   stride     bytes between the starts of consecutive destination rows
  //
  // NOTE(review): srcw * 2 bytes are written per row and then (stride - w * 2)
  // bytes are skipped, so destination rows start on multiples of stride only
  // when w == srcw. Presumably callers always pass matching sizes; confirm.
  static void kanna_rotate_1_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  {
      (void)h;

      // padding bytes at the end of each source / destination row
      const int srcwgap = srcstride - srcw * 2;
      const int wgap = stride - w * 2;

      // payload bytes per row
      int size = srcw * 2;

      // two row cursors so that a pair of rows is copied per iteration
      const unsigned char* src0 = src;
      const unsigned char* src1 = src + srcstride;
      unsigned char* dst0 = dst;
      unsigned char* dst1 = dst + stride;

      int y = 0;
      for (; y + 1 < srch; y += 2)
      {
  #if __ARM_NEON
          // nn = number of 32-byte chunks, remain = trailing bytes copied one by one
          int nn = size >> 5;
          int remain = size - (nn << 5);
  #if __aarch64__
          for (; nn > 0; nn--)
          {
              uint8x16_t _src0 = vld1q_u8(src0);
              uint8x16_t _src0n = vld1q_u8(src0 + 16);
              vst1q_u8(dst0, _src0);
              vst1q_u8(dst0 + 16, _src0n);

              uint8x16_t _src1 = vld1q_u8(src1);
              uint8x16_t _src1n = vld1q_u8(src1 + 16);
              vst1q_u8(dst1, _src1);
              vst1q_u8(dst1 + 16, _src1n);

              src0 += 32;
              src1 += 32;
              dst0 += 32;
              dst1 += 32;
          }
  #else
          if (nn > 0)
          {
              // each pass loads 32 bytes from both rows (post-incrementing the
              // pointers) and stores them; the loop ends when nn reaches zero
              asm volatile(
                  "0: \n"
                  "pld [%1, #256] \n"
                  "vld1.u8 {d0-d3}, [%1]! \n"
                  "pld [%2, #256] \n"
                  "vld1.u8 {d4-d7}, [%2]! \n"
                  "subs %0, #1 \n"
                  "vst1.u8 {d0-d3}, [%3]! \n"
                  "vst1.u8 {d4-d7}, [%4]! \n"
                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(src1), // %2
                  "=r"(dst0), // %3
                  "=r"(dst1)  // %4
                  : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                  : "cc", "memory", "q0", "q1", "q2", "q3");
          }
  #endif // __aarch64__
  #else
          int remain = size;
  #endif // __ARM_NEON

          for (; remain > 0; remain--)
          {
              *dst0++ = *src0++;
              *dst1++ = *src1++;
          }

          // skip the row padding, then jump over the row the other cursor handled
          src0 += srcwgap + srcstride;
          src1 += srcwgap + srcstride;
          dst0 += wgap + stride;
          dst1 += wgap + stride;
      }

      // at most one row is left when srch is odd
      for (; y < srch; y++)
      {
  #if __ARM_NEON
          int nn = size >> 5;
          int remain = size - (nn << 5);
  #if __aarch64__
          for (; nn > 0; nn--)
          {
              uint8x16_t _src = vld1q_u8(src0);
              uint8x16_t _src2 = vld1q_u8(src0 + 16);
              vst1q_u8(dst0, _src);
              vst1q_u8(dst0 + 16, _src2);

              src0 += 32;
              dst0 += 32;
          }
  #else
          if (nn > 0)
          {
              asm volatile(
                  "0: \n"
                  "pld [%1, #256] \n"
                  "vld1.u8 {d0-d3}, [%1]! \n"
                  "subs %0, #1 \n"
                  "vst1.u8 {d0-d3}, [%2]! \n"
                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(dst0)  // %2
                  : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                  : "cc", "memory", "q0", "q1");
          }
  #endif // __aarch64__
  #else
          int remain = size;
  #endif // __ARM_NEON

          for (; remain > 0; remain--)
          {
              *dst0++ = *src0++;
          }

          src0 += srcwgap;
          dst0 += wgap;
      }
  }
  // Copy a 3-bytes-per-pixel image from src to dst row by row, with no
  // geometric change (presumably rotate type 1 is the identity orientation;
  // confirm against the dispatcher).
  //
  //   src        first byte of the source image
  //   srcw       pixels per source row; srcw * 3 bytes are copied per row
  //   srch       number of rows to copy
  //   srcstride  bytes between the starts of consecutive source rows
  //   dst        first byte of the destination image
  //   w          destination width in pixels, used only to derive row padding
  //   h          unused; the row count is taken from srch
  //   stride     bytes between the starts of consecutive destination rows
  //
  // NOTE(review): srcw * 3 bytes are written per row and then (stride - w * 3)
  // bytes are skipped, so destination rows start on multiples of stride only
  // when w == srcw. Presumably callers always pass matching sizes; confirm.
  static void kanna_rotate_1_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  {
      (void)h;

      // padding bytes at the end of each source / destination row
      const int srcwgap = srcstride - srcw * 3;
      const int wgap = stride - w * 3;

      // payload bytes per row
      int size = srcw * 3;

      // two row cursors so that a pair of rows is copied per iteration
      const unsigned char* src0 = src;
      const unsigned char* src1 = src + srcstride;
      unsigned char* dst0 = dst;
      unsigned char* dst1 = dst + stride;

      int y = 0;
      for (; y + 1 < srch; y += 2)
      {
  #if __ARM_NEON
          // nn = number of 32-byte chunks, remain = trailing bytes copied one by one
          int nn = size >> 5;
          int remain = size - (nn << 5);
  #if __aarch64__
          for (; nn > 0; nn--)
          {
              uint8x16_t _src0 = vld1q_u8(src0);
              uint8x16_t _src0n = vld1q_u8(src0 + 16);
              vst1q_u8(dst0, _src0);
              vst1q_u8(dst0 + 16, _src0n);

              uint8x16_t _src1 = vld1q_u8(src1);
              uint8x16_t _src1n = vld1q_u8(src1 + 16);
              vst1q_u8(dst1, _src1);
              vst1q_u8(dst1 + 16, _src1n);

              src0 += 32;
              src1 += 32;
              dst0 += 32;
              dst1 += 32;
          }
  #else
          if (nn > 0)
          {
              // each pass loads 32 bytes from both rows (post-incrementing the
              // pointers) and stores them; the loop ends when nn reaches zero
              asm volatile(
                  "0: \n"
                  "pld [%1, #256] \n"
                  "vld1.u8 {d0-d3}, [%1]! \n"
                  "pld [%2, #256] \n"
                  "vld1.u8 {d4-d7}, [%2]! \n"
                  "subs %0, #1 \n"
                  "vst1.u8 {d0-d3}, [%3]! \n"
                  "vst1.u8 {d4-d7}, [%4]! \n"
                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(src1), // %2
                  "=r"(dst0), // %3
                  "=r"(dst1)  // %4
                  : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                  : "cc", "memory", "q0", "q1", "q2", "q3");
          }
  #endif // __aarch64__
  #else
          int remain = size;
  #endif // __ARM_NEON

          for (; remain > 0; remain--)
          {
              *dst0++ = *src0++;
              *dst1++ = *src1++;
          }

          // skip the row padding, then jump over the row the other cursor handled
          src0 += srcwgap + srcstride;
          src1 += srcwgap + srcstride;
          dst0 += wgap + stride;
          dst1 += wgap + stride;
      }

      // at most one row is left when srch is odd
      for (; y < srch; y++)
      {
  #if __ARM_NEON
          int nn = size >> 5;
          int remain = size - (nn << 5);
  #if __aarch64__
          for (; nn > 0; nn--)
          {
              uint8x16_t _src = vld1q_u8(src0);
              uint8x16_t _src2 = vld1q_u8(src0 + 16);
              vst1q_u8(dst0, _src);
              vst1q_u8(dst0 + 16, _src2);

              src0 += 32;
              dst0 += 32;
          }
  #else
          if (nn > 0)
          {
              asm volatile(
                  "0: \n"
                  "pld [%1, #256] \n"
                  "vld1.u8 {d0-d3}, [%1]! \n"
                  "subs %0, #1 \n"
                  "vst1.u8 {d0-d3}, [%2]! \n"
                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(dst0)  // %2
                  : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                  : "cc", "memory", "q0", "q1");
          }
  #endif // __aarch64__
  #else
          int remain = size;
  #endif // __ARM_NEON

          for (; remain > 0; remain--)
          {
              *dst0++ = *src0++;
          }

          src0 += srcwgap;
          dst0 += wgap;
      }
  }
  368. static void kanna_rotate_1_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  369. {
  370. const int srcwgap = srcstride - srcw * 4;
  371. const int wgap = stride - w * 4;
  372. int size = srcw * 4;
  373. const unsigned char* src0 = src;
  374. const unsigned char* src1 = src + srcstride;
  375. unsigned char* dst0 = dst;
  376. unsigned char* dst1 = dst + stride;
  377. int y = 0;
  378. for (; y + 1 < srch; y += 2)
  379. {
  380. #if __ARM_NEON
  381. int nn = size >> 5;
  382. int remain = size - (nn << 5);
  383. #if __aarch64__
  384. for (; nn > 0; nn--)
  385. {
  386. uint8x16_t _src0 = vld1q_u8(src0);
  387. uint8x16_t _src0n = vld1q_u8(src0 + 16);
  388. vst1q_u8(dst0, _src0);
  389. vst1q_u8(dst0 + 16, _src0n);
  390. uint8x16_t _src1 = vld1q_u8(src1);
  391. uint8x16_t _src1n = vld1q_u8(src1 + 16);
  392. vst1q_u8(dst1, _src1);
  393. vst1q_u8(dst1 + 16, _src1n);
  394. src0 += 32;
  395. src1 += 32;
  396. dst0 += 32;
  397. dst1 += 32;
  398. }
  399. #else
  400. if (nn > 0)
  401. {
  402. asm volatile(
  403. "0: \n"
  404. "pld [%1, #256] \n"
  405. "vld1.u8 {d0-d3}, [%1]! \n"
  406. "pld [%2, #256] \n"
  407. "vld1.u8 {d4-d7}, [%2]! \n"
  408. "subs %0, #1 \n"
  409. "vst1.u8 {d0-d3}, [%3]! \n"
  410. "vst1.u8 {d4-d7}, [%4]! \n"
  411. "bne 0b \n"
  412. : "=r"(nn), // %0
  413. "=r"(src0), // %1
  414. "=r"(src1), // %2
  415. "=r"(dst0), // %3
  416. "=r"(dst1) // %4
  417. : "0"(nn),
  418. "1"(src0),
  419. "2"(src1),
  420. "3"(dst0),
  421. "4"(dst1)
  422. : "cc", "memory", "q0", "q1", "q2", "q3");
  423. }
  424. #endif // __aarch64__
  425. #else
  426. int remain = size;
  427. #endif // __ARM_NEON
  428. for (; remain > 0; remain--)
  429. {
  430. *dst0++ = *src0++;
  431. *dst1++ = *src1++;
  432. }
  433. src0 += srcwgap + srcstride;
  434. src1 += srcwgap + srcstride;
  435. dst0 += wgap + stride;
  436. dst1 += wgap + stride;
  437. }
  438. for (; y < srch; y++)
  439. {
  440. #if __ARM_NEON
  441. int nn = size >> 5;
  442. int remain = size - (nn << 5);
  443. #if __aarch64__
  444. for (; nn > 0; nn--)
  445. {
  446. uint8x16_t _src = vld1q_u8(src0);
  447. uint8x16_t _src2 = vld1q_u8(src0 + 16);
  448. vst1q_u8(dst0, _src);
  449. vst1q_u8(dst0 + 16, _src2);
  450. src0 += 32;
  451. dst0 += 32;
  452. }
  453. #else
  454. if (nn > 0)
  455. {
  456. asm volatile(
  457. "0: \n"
  458. "pld [%1, #256] \n"
  459. "vld1.u8 {d0-d3}, [%1]! \n"
  460. "subs %0, #1 \n"
  461. "vst1.u8 {d0-d3}, [%2]! \n"
  462. "bne 0b \n"
  463. : "=r"(nn), // %0
  464. "=r"(src0), // %1
  465. "=r"(dst0) // %2
  466. : "0"(nn),
  467. "1"(src0),
  468. "2"(dst0)
  469. : "cc", "memory", "q0", "q1");
  470. }
  471. #endif // __aarch64__
  472. #else
  473. int remain = size;
  474. #endif // __ARM_NEON
  475. for (; remain > 0; remain--)
  476. {
  477. *dst0++ = *src0++;
  478. }
  479. src0 += srcwgap;
  480. dst0 += wgap;
  481. }
  482. }
  483. static void kanna_rotate_2_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  484. {
  485. const int srcwgap = srcstride - srcw;
  486. const int wgap = stride + w;
  487. const unsigned char* src0 = src;
  488. unsigned char* dst0 = dst + w - 1;
  489. int y = 0;
  490. for (; y < srch; y++)
  491. {
  492. #if __ARM_NEON
  493. dst0 -= 15;
  494. int nn = srcw >> 4;
  495. int remain = srcw - (nn << 4);
  496. #if __aarch64__
  497. for (; nn > 0; nn--)
  498. {
  499. uint8x8_t _src = vld1_u8(src0);
  500. uint8x8_t _src2 = vld1_u8(src0 + 8);
  501. _src = vrev64_u8(_src);
  502. _src2 = vrev64_u8(_src2);
  503. vst1_u8(dst0, _src2);
  504. vst1_u8(dst0 + 8, _src);
  505. src0 += 16;
  506. dst0 -= 16;
  507. }
  508. #else
  509. if (nn > 0)
  510. {
  511. asm volatile(
  512. "mov r4, #-16 \n"
  513. "0: \n"
  514. "pld [%1, #128] \n"
  515. "vld1.u8 {d0-d1}, [%1]! \n"
  516. "vrev64.u8 d3, d0 \n"
  517. "vrev64.u8 d2, d1 \n"
  518. "subs %0, #1 \n"
  519. "vst1.u8 {d2-d3}, [%2], r4 \n"
  520. "bne 0b \n"
  521. : "=r"(nn), // %0
  522. "=r"(src0), // %1
  523. "=r"(dst0) // %2
  524. : "0"(nn),
  525. "1"(src0),
  526. "2"(dst0)
  527. : "cc", "memory", "q0", "q1", "r4");
  528. }
  529. #endif // __aarch64__
  530. dst0 += 15;
  531. #else
  532. int remain = srcw;
  533. #endif // __ARM_NEON
  534. for (; remain > 0; remain--)
  535. {
  536. *dst0 = *src0;
  537. src0 += 1;
  538. dst0 -= 1;
  539. }
  540. src0 += srcwgap;
  541. dst0 += wgap;
  542. }
  543. }
  544. static void kanna_rotate_2_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  545. {
  546. const int srcwgap = srcstride - srcw * 2;
  547. const int wgap = stride + w * 2;
  548. const unsigned char* src0 = src;
  549. unsigned char* dst0 = dst + w * 2 - 2;
  550. int y = 0;
  551. for (; y < srch; y++)
  552. {
  553. #if __ARM_NEON
  554. dst0 -= 7 * 2;
  555. int nn = srcw >> 4;
  556. int remain = srcw - (nn << 4);
  557. #if __aarch64__
  558. for (; nn > 0; nn--)
  559. {
  560. uint8x8x2_t _src = vld2_u8(src0);
  561. uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);
  562. _src.val[0] = vrev64_u8(_src.val[0]);
  563. _src.val[1] = vrev64_u8(_src.val[1]);
  564. _src2.val[0] = vrev64_u8(_src2.val[0]);
  565. _src2.val[1] = vrev64_u8(_src2.val[1]);
  566. vst2_u8(dst0, _src);
  567. vst2_u8(dst0 - 8 * 2, _src2);
  568. src0 += 16 * 2;
  569. dst0 -= 16 * 2;
  570. }
  571. #else
  572. if (nn > 0)
  573. {
  574. asm volatile(
  575. "mov r4, #-16 \n"
  576. "0: \n"
  577. "pld [%1, #128] \n"
  578. "vld2.u8 {d0-d1}, [%1]! \n"
  579. "vrev64.u8 d0, d0 \n"
  580. "pld [%1, #128] \n"
  581. "vld2.u8 {d2-d3}, [%1]! \n"
  582. "vrev64.u8 d1, d1 \n"
  583. "vrev64.u8 d2, d2 \n"
  584. "vst2.u8 {d0-d1}, [%2], r4 \n"
  585. "vrev64.u8 d3, d3 \n"
  586. "subs %0, #1 \n"
  587. "vst2.u8 {d2-d3}, [%2], r4 \n"
  588. "bne 0b \n"
  589. : "=r"(nn), // %0
  590. "=r"(src0), // %1
  591. "=r"(dst0) // %2
  592. : "0"(nn),
  593. "1"(src0),
  594. "2"(dst0)
  595. : "cc", "memory", "q0", "q1", "r4");
  596. }
  597. #endif // __aarch64__
  598. dst0 += 7 * 2;
  599. #else
  600. int remain = srcw;
  601. #endif // __ARM_NEON
  602. for (; remain > 0; remain--)
  603. {
  604. dst0[0] = src0[0];
  605. dst0[1] = src0[1];
  606. src0 += 2;
  607. dst0 -= 2;
  608. }
  609. src0 += srcwgap;
  610. dst0 += wgap;
  611. }
  612. }
  613. static void kanna_rotate_2_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  614. {
  615. const int srcwgap = srcstride - srcw * 3;
  616. const int wgap = stride + w * 3;
  617. const unsigned char* src0 = src;
  618. unsigned char* dst0 = dst + w * 3 - 3;
  619. int y = 0;
  620. for (; y < srch; y++)
  621. {
  622. #if __ARM_NEON
  623. dst0 -= 7 * 3;
  624. int nn = srcw >> 4;
  625. int remain = srcw - (nn << 4);
  626. #if __aarch64__
  627. for (; nn > 0; nn--)
  628. {
  629. uint8x8x3_t _src = vld3_u8(src0);
  630. uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);
  631. _src.val[0] = vrev64_u8(_src.val[0]);
  632. _src.val[1] = vrev64_u8(_src.val[1]);
  633. _src.val[2] = vrev64_u8(_src.val[2]);
  634. _src2.val[0] = vrev64_u8(_src2.val[0]);
  635. _src2.val[1] = vrev64_u8(_src2.val[1]);
  636. _src2.val[2] = vrev64_u8(_src2.val[2]);
  637. vst3_u8(dst0, _src);
  638. vst3_u8(dst0 - 8 * 3, _src2);
  639. src0 += 16 * 3;
  640. dst0 -= 16 * 3;
  641. }
  642. #else
  643. if (nn > 0)
  644. {
  645. asm volatile(
  646. "mov r4, #-24 \n"
  647. "0: \n"
  648. "pld [%1, #192] \n"
  649. "vld3.u8 {d0-d2}, [%1]! \n"
  650. "vrev64.u8 d0, d0 \n"
  651. "vrev64.u8 d1, d1 \n"
  652. "pld [%1, #192] \n"
  653. "vld3.u8 {d4-d6}, [%1]! \n"
  654. "vrev64.u8 d2, d2 \n"
  655. "vrev64.u8 d4, d4 \n"
  656. "vst3.u8 {d0-d2}, [%2], r4 \n"
  657. "vrev64.u8 d5, d5 \n"
  658. "vrev64.u8 d6, d6 \n"
  659. "subs %0, #1 \n"
  660. "vst3.u8 {d4-d6}, [%2], r4 \n"
  661. "bne 0b \n"
  662. : "=r"(nn), // %0
  663. "=r"(src0), // %1
  664. "=r"(dst0) // %2
  665. : "0"(nn),
  666. "1"(src0),
  667. "2"(dst0)
  668. : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
  669. }
  670. #endif // __aarch64__
  671. dst0 += 7 * 3;
  672. #else
  673. int remain = srcw;
  674. #endif // __ARM_NEON
  675. for (; remain > 0; remain--)
  676. {
  677. dst0[0] = src0[0];
  678. dst0[1] = src0[1];
  679. dst0[2] = src0[2];
  680. src0 += 3;
  681. dst0 -= 3;
  682. }
  683. src0 += srcwgap;
  684. dst0 += wgap;
  685. }
  686. }
  687. static void kanna_rotate_2_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  688. {
  689. const int srcwgap = srcstride - srcw * 4;
  690. const int wgap = stride + w * 4;
  691. const unsigned char* src0 = src;
  692. unsigned char* dst0 = dst + w * 4 - 4;
  693. int y = 0;
  694. for (; y < srch; y++)
  695. {
  696. #if __ARM_NEON
  697. dst0 -= 7 * 4;
  698. int nn = srcw >> 4;
  699. int remain = srcw - (nn << 4);
  700. #if __aarch64__
  701. for (; nn > 0; nn--)
  702. {
  703. uint8x8x4_t _src = vld4_u8(src0);
  704. uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);
  705. _src.val[0] = vrev64_u8(_src.val[0]);
  706. _src.val[1] = vrev64_u8(_src.val[1]);
  707. _src.val[2] = vrev64_u8(_src.val[2]);
  708. _src.val[3] = vrev64_u8(_src.val[3]);
  709. _src2.val[0] = vrev64_u8(_src2.val[0]);
  710. _src2.val[1] = vrev64_u8(_src2.val[1]);
  711. _src2.val[2] = vrev64_u8(_src2.val[2]);
  712. _src2.val[3] = vrev64_u8(_src2.val[3]);
  713. vst4_u8(dst0, _src);
  714. vst4_u8(dst0 - 8 * 4, _src2);
  715. src0 += 16 * 4;
  716. dst0 -= 16 * 4;
  717. }
  718. #else
  719. if (nn > 0)
  720. {
  721. asm volatile(
  722. "mov r4, #-32 \n"
  723. "0: \n"
  724. "pld [%1, #256] \n"
  725. "vld4.u8 {d0-d3}, [%1]! \n"
  726. "vrev64.u8 d0, d0 \n"
  727. "vrev64.u8 d1, d1 \n"
  728. "vrev64.u8 d2, d2 \n"
  729. "pld [%1, #256] \n"
  730. "vld4.u8 {d4-d7}, [%1]! \n"
  731. "vrev64.u8 d3, d3 \n"
  732. "vrev64.u8 d4, d4 \n"
  733. "vrev64.u8 d5, d5 \n"
  734. "vst4.u8 {d0-d3}, [%2], r4 \n"
  735. "vrev64.u8 d6, d6 \n"
  736. "vrev64.u8 d7, d7 \n"
  737. "subs %0, #1 \n"
  738. "vst4.u8 {d4-d7}, [%2], r4 \n"
  739. "bne 0b \n"
  740. : "=r"(nn), // %0
  741. "=r"(src0), // %1
  742. "=r"(dst0) // %2
  743. : "0"(nn),
  744. "1"(src0),
  745. "2"(dst0)
  746. : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
  747. }
  748. #endif // __aarch64__
  749. dst0 += 7 * 4;
  750. #else
  751. int remain = srcw;
  752. #endif // __ARM_NEON
  753. for (; remain > 0; remain--)
  754. {
  755. dst0[0] = src0[0];
  756. dst0[1] = src0[1];
  757. dst0[2] = src0[2];
  758. dst0[3] = src0[3];
  759. src0 += 4;
  760. dst0 -= 4;
  761. }
  762. src0 += srcwgap;
  763. dst0 += wgap;
  764. }
  765. }
  766. static void kanna_rotate_3_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  767. {
  768. const int srcwgap = srcstride - srcw;
  769. const int wgap = stride - w;
  770. // point to the last dst pixel
  771. unsigned char* dstend = dst + stride * h - wgap;
  772. const unsigned char* src0 = src;
  773. unsigned char* dst0 = dstend - 1;
  774. int y = 0;
  775. for (; y < srch; y++)
  776. {
  777. #if __ARM_NEON
  778. dst0 -= 15;
  779. int nn = srcw >> 4;
  780. int remain = srcw - (nn << 4);
  781. #if __aarch64__
  782. for (; nn > 0; nn--)
  783. {
  784. uint8x8_t _src = vld1_u8(src0);
  785. uint8x8_t _src2 = vld1_u8(src0 + 8);
  786. _src = vrev64_u8(_src);
  787. _src2 = vrev64_u8(_src2);
  788. vst1_u8(dst0, _src2);
  789. vst1_u8(dst0 + 8, _src);
  790. src0 += 16;
  791. dst0 -= 16;
  792. }
  793. #else
  794. if (nn > 0)
  795. {
  796. asm volatile(
  797. "mov r4, #-16 \n"
  798. "0: \n"
  799. "pld [%1, #128] \n"
  800. "vld1.u8 {d0-d1}, [%1]! \n"
  801. "vrev64.u8 d3, d0 \n"
  802. "vrev64.u8 d2, d1 \n"
  803. "subs %0, #1 \n"
  804. "vst1.u8 {d2-d3}, [%2], r4 \n"
  805. "bne 0b \n"
  806. : "=r"(nn), // %0
  807. "=r"(src0), // %1
  808. "=r"(dst0) // %2
  809. : "0"(nn),
  810. "1"(src0),
  811. "2"(dst0)
  812. : "cc", "memory", "q0", "q1", "r4");
  813. }
  814. #endif // __aarch64__
  815. dst0 += 15;
  816. #else
  817. int remain = srcw;
  818. #endif // __ARM_NEON
  819. for (; remain > 0; remain--)
  820. {
  821. *dst0 = *src0;
  822. src0 += 1;
  823. dst0 -= 1;
  824. }
  825. src0 += srcwgap;
  826. dst0 -= wgap;
  827. }
  828. }
  829. static void kanna_rotate_3_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  830. {
  831. const int srcwgap = srcstride - srcw * 2;
  832. const int wgap = stride - w * 2;
  833. // point to the last dst pixel
  834. unsigned char* dstend = dst + stride * h - wgap;
  835. const unsigned char* src0 = src;
  836. unsigned char* dst0 = dstend - 2;
  837. int y = 0;
  838. for (; y < srch; y++)
  839. {
  840. #if __ARM_NEON
  841. dst0 -= 7 * 2;
  842. int nn = srcw >> 4;
  843. int remain = srcw - (nn << 4);
  844. #if __aarch64__
  845. for (; nn > 0; nn--)
  846. {
  847. uint8x8x2_t _src = vld2_u8(src0);
  848. uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);
  849. _src.val[0] = vrev64_u8(_src.val[0]);
  850. _src.val[1] = vrev64_u8(_src.val[1]);
  851. _src2.val[0] = vrev64_u8(_src2.val[0]);
  852. _src2.val[1] = vrev64_u8(_src2.val[1]);
  853. vst2_u8(dst0, _src);
  854. vst2_u8(dst0 - 8 * 2, _src2);
  855. src0 += 16 * 2;
  856. dst0 -= 16 * 2;
  857. }
  858. #else
  859. if (nn > 0)
  860. {
  861. asm volatile(
  862. "mov r4, #-16 \n"
  863. "0: \n"
  864. "pld [%1, #128] \n"
  865. "vld2.u8 {d0-d1}, [%1]! \n"
  866. "vrev64.u8 d0, d0 \n"
  867. "pld [%1, #128] \n"
  868. "vld2.u8 {d2-d3}, [%1]! \n"
  869. "vrev64.u8 d1, d1 \n"
  870. "vrev64.u8 d2, d2 \n"
  871. "vst2.u8 {d0-d1}, [%2], r4 \n"
  872. "vrev64.u8 d3, d3 \n"
  873. "subs %0, #1 \n"
  874. "vst2.u8 {d2-d3}, [%2], r4 \n"
  875. "bne 0b \n"
  876. : "=r"(nn), // %0
  877. "=r"(src0), // %1
  878. "=r"(dst0) // %2
  879. : "0"(nn),
  880. "1"(src0),
  881. "2"(dst0)
  882. : "cc", "memory", "q0", "q1", "r4");
  883. }
  884. #endif // __aarch64__
  885. dst0 += 7 * 2;
  886. #else
  887. int remain = srcw;
  888. #endif // __ARM_NEON
  889. for (; remain > 0; remain--)
  890. {
  891. dst0[0] = src0[0];
  892. dst0[1] = src0[1];
  893. src0 += 2;
  894. dst0 -= 2;
  895. }
  896. src0 += srcwgap;
  897. dst0 -= wgap;
  898. }
  899. }
  900. static void kanna_rotate_3_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  901. {
  902. const int srcwgap = srcstride - srcw * 3;
  903. const int wgap = stride - w * 3;
  904. // point to the last dst pixel
  905. unsigned char* dstend = dst + stride * h - wgap;
  906. const unsigned char* src0 = src;
  907. unsigned char* dst0 = dstend - 3;
  908. int y = 0;
  909. for (; y < srch; y++)
  910. {
  911. #if __ARM_NEON
  912. dst0 -= 7 * 3;
  913. int nn = srcw >> 4;
  914. int remain = srcw - (nn << 4);
  915. #if __aarch64__
  916. for (; nn > 0; nn--)
  917. {
  918. uint8x8x3_t _src = vld3_u8(src0);
  919. uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);
  920. _src.val[0] = vrev64_u8(_src.val[0]);
  921. _src.val[1] = vrev64_u8(_src.val[1]);
  922. _src.val[2] = vrev64_u8(_src.val[2]);
  923. _src2.val[0] = vrev64_u8(_src2.val[0]);
  924. _src2.val[1] = vrev64_u8(_src2.val[1]);
  925. _src2.val[2] = vrev64_u8(_src2.val[2]);
  926. vst3_u8(dst0, _src);
  927. vst3_u8(dst0 - 8 * 3, _src2);
  928. src0 += 16 * 3;
  929. dst0 -= 16 * 3;
  930. }
  931. #else
  932. if (nn > 0)
  933. {
  934. asm volatile(
  935. "mov r4, #-24 \n"
  936. "0: \n"
  937. "pld [%1, #192] \n"
  938. "vld3.u8 {d0-d2}, [%1]! \n"
  939. "vrev64.u8 d0, d0 \n"
  940. "vrev64.u8 d1, d1 \n"
  941. "pld [%1, #192] \n"
  942. "vld3.u8 {d4-d6}, [%1]! \n"
  943. "vrev64.u8 d2, d2 \n"
  944. "vrev64.u8 d4, d4 \n"
  945. "vst3.u8 {d0-d2}, [%2], r4 \n"
  946. "vrev64.u8 d5, d5 \n"
  947. "vrev64.u8 d6, d6 \n"
  948. "subs %0, #1 \n"
  949. "vst3.u8 {d4-d6}, [%2], r4 \n"
  950. "bne 0b \n"
  951. : "=r"(nn), // %0
  952. "=r"(src0), // %1
  953. "=r"(dst0) // %2
  954. : "0"(nn),
  955. "1"(src0),
  956. "2"(dst0)
  957. : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
  958. }
  959. #endif // __aarch64__
  960. dst0 += 7 * 3;
  961. #else
  962. int remain = srcw;
  963. #endif // __ARM_NEON
  964. for (; remain > 0; remain--)
  965. {
  966. dst0[0] = src0[0];
  967. dst0[1] = src0[1];
  968. dst0[2] = src0[2];
  969. src0 += 3;
  970. dst0 -= 3;
  971. }
  972. src0 += srcwgap;
  973. dst0 -= wgap;
  974. }
  975. }
  976. static void kanna_rotate_3_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  977. {
  978. const int srcwgap = srcstride - srcw * 4;
  979. const int wgap = stride - w * 4;
  980. // point to the last dst pixel
  981. unsigned char* dstend = dst + stride * h - wgap;
  982. const unsigned char* src0 = src;
  983. unsigned char* dst0 = dstend - 4;
  984. int y = 0;
  985. for (; y < srch; y++)
  986. {
  987. #if __ARM_NEON
  988. dst0 -= 7 * 4;
  989. int nn = srcw >> 4;
  990. int remain = srcw - (nn << 4);
  991. #if __aarch64__
  992. for (; nn > 0; nn--)
  993. {
  994. uint8x8x4_t _src = vld4_u8(src0);
  995. uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);
  996. _src.val[0] = vrev64_u8(_src.val[0]);
  997. _src.val[1] = vrev64_u8(_src.val[1]);
  998. _src.val[2] = vrev64_u8(_src.val[2]);
  999. _src.val[3] = vrev64_u8(_src.val[3]);
  1000. _src2.val[0] = vrev64_u8(_src2.val[0]);
  1001. _src2.val[1] = vrev64_u8(_src2.val[1]);
  1002. _src2.val[2] = vrev64_u8(_src2.val[2]);
  1003. _src2.val[3] = vrev64_u8(_src2.val[3]);
  1004. vst4_u8(dst0, _src);
  1005. vst4_u8(dst0 - 8 * 4, _src2);
  1006. src0 += 16 * 4;
  1007. dst0 -= 16 * 4;
  1008. }
  1009. #else
  1010. if (nn > 0)
  1011. {
  1012. asm volatile(
  1013. "mov r4, #-32 \n"
  1014. "0: \n"
  1015. "pld [%1, #256] \n"
  1016. "vld4.u8 {d0-d3}, [%1]! \n"
  1017. "vrev64.u8 d0, d0 \n"
  1018. "vrev64.u8 d1, d1 \n"
  1019. "vrev64.u8 d2, d2 \n"
  1020. "pld [%1, #256] \n"
  1021. "vld4.u8 {d4-d7}, [%1]! \n"
  1022. "vrev64.u8 d3, d3 \n"
  1023. "vrev64.u8 d4, d4 \n"
  1024. "vrev64.u8 d5, d5 \n"
  1025. "vst4.u8 {d0-d3}, [%2], r4 \n"
  1026. "vrev64.u8 d6, d6 \n"
  1027. "vrev64.u8 d7, d7 \n"
  1028. "subs %0, #1 \n"
  1029. "vst4.u8 {d4-d7}, [%2], r4 \n"
  1030. "bne 0b \n"
  1031. : "=r"(nn), // %0
  1032. "=r"(src0), // %1
  1033. "=r"(dst0) // %2
  1034. : "0"(nn),
  1035. "1"(src0),
  1036. "2"(dst0)
  1037. : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
  1038. }
  1039. #endif // __aarch64__
  1040. dst0 += 7 * 4;
  1041. #else
  1042. int remain = srcw;
  1043. #endif // __ARM_NEON
  1044. for (; remain > 0; remain--)
  1045. {
  1046. dst0[0] = src0[0];
  1047. dst0[1] = src0[1];
  1048. dst0[2] = src0[2];
  1049. dst0[3] = src0[3];
  1050. src0 += 4;
  1051. dst0 -= 4;
  1052. }
  1053. src0 += srcwgap;
  1054. dst0 -= wgap;
  1055. }
  1056. }
  1057. static void kanna_rotate_4_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  1058. {
  1059. const int srcwgap = srcstride - srcw;
  1060. const int wgap = stride + w;
  1061. // point to the last dst pixel row
  1062. unsigned char* dstend = dst + stride * (h - 1);
  1063. const unsigned char* src0 = src;
  1064. const unsigned char* src1 = src + srcstride;
  1065. unsigned char* dst0 = dstend;
  1066. unsigned char* dst1 = dstend - stride;
  1067. int y = 0;
  1068. for (; y + 1 < srch; y += 2)
  1069. {
  1070. #if __ARM_NEON
  1071. int nn = srcw >> 5;
  1072. int remain = srcw - (nn << 5);
  1073. #if __aarch64__
  1074. for (; nn > 0; nn--)
  1075. {
  1076. uint8x16_t _src0 = vld1q_u8(src0);
  1077. uint8x16_t _src0n = vld1q_u8(src0 + 16);
  1078. vst1q_u8(dst0, _src0);
  1079. vst1q_u8(dst0 + 16, _src0n);
  1080. uint8x16_t _src1 = vld1q_u8(src1);
  1081. uint8x16_t _src1n = vld1q_u8(src1 + 16);
  1082. vst1q_u8(dst1, _src1);
  1083. vst1q_u8(dst1 + 16, _src1n);
  1084. src0 += 32;
  1085. src1 += 32;
  1086. dst0 += 32;
  1087. dst1 += 32;
  1088. }
  1089. #else
  1090. if (nn > 0)
  1091. {
  1092. asm volatile(
  1093. "0: \n"
  1094. "pld [%1, #256] \n"
  1095. "vld1.u8 {d0-d3}, [%1]! \n"
  1096. "pld [%2, #256] \n"
  1097. "vld1.u8 {d4-d7}, [%2]! \n"
  1098. "subs %0, #1 \n"
  1099. "vst1.u8 {d0-d3}, [%3]! \n"
  1100. "vst1.u8 {d4-d7}, [%4]! \n"
  1101. "bne 0b \n"
  1102. : "=r"(nn), // %0
  1103. "=r"(src0), // %1
  1104. "=r"(src1), // %2
  1105. "=r"(dst0), // %3
  1106. "=r"(dst1) // %4
  1107. : "0"(nn),
  1108. "1"(src0),
  1109. "2"(src1),
  1110. "3"(dst0),
  1111. "4"(dst1)
  1112. : "cc", "memory", "q0", "q1", "q2", "q3");
  1113. }
  1114. #endif // __aarch64__
  1115. #else
  1116. int remain = srcw;
  1117. #endif // __ARM_NEON
  1118. for (; remain > 0; remain--)
  1119. {
  1120. *dst0++ = *src0++;
  1121. *dst1++ = *src1++;
  1122. }
  1123. src0 += srcwgap + srcstride;
  1124. src1 += srcwgap + srcstride;
  1125. dst0 -= wgap + stride;
  1126. dst1 -= wgap + stride;
  1127. }
  1128. for (; y < srch; y++)
  1129. {
  1130. #if __ARM_NEON
  1131. int nn = srcw >> 5;
  1132. int remain = srcw - (nn << 5);
  1133. #if __aarch64__
  1134. for (; nn > 0; nn--)
  1135. {
  1136. uint8x16_t _src = vld1q_u8(src0);
  1137. uint8x16_t _src2 = vld1q_u8(src0 + 16);
  1138. vst1q_u8(dst0, _src);
  1139. vst1q_u8(dst0 + 16, _src2);
  1140. src0 += 32;
  1141. dst0 += 32;
  1142. }
  1143. #else
  1144. if (nn > 0)
  1145. {
  1146. asm volatile(
  1147. "0: \n"
  1148. "pld [%1, #256] \n"
  1149. "vld1.u8 {d0-d3}, [%1]! \n"
  1150. "subs %0, #1 \n"
  1151. "vst1.u8 {d0-d3}, [%2]! \n"
  1152. "bne 0b \n"
  1153. : "=r"(nn), // %0
  1154. "=r"(src0), // %1
  1155. "=r"(dst0) // %2
  1156. : "0"(nn),
  1157. "1"(src0),
  1158. "2"(dst0)
  1159. : "cc", "memory", "q0", "q1");
  1160. }
  1161. #endif // __aarch64__
  1162. #else
  1163. int remain = srcw;
  1164. #endif // __ARM_NEON
  1165. for (; remain > 0; remain--)
  1166. {
  1167. *dst0++ = *src0++;
  1168. }
  1169. src0 += srcwgap;
  1170. dst0 -= wgap;
  1171. }
  1172. }
  1173. static void kanna_rotate_4_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  1174. {
  1175. const int srcwgap = srcstride - srcw * 2;
  1176. const int wgap = stride + w * 2;
  1177. // point to the last dst pixel row
  1178. unsigned char* dstend = dst + stride * (h - 1);
  1179. int size = srcw * 2;
  1180. const unsigned char* src0 = src;
  1181. const unsigned char* src1 = src + srcstride;
  1182. unsigned char* dst0 = dstend;
  1183. unsigned char* dst1 = dstend - stride;
  1184. int y = 0;
  1185. for (; y + 1 < srch; y += 2)
  1186. {
  1187. #if __ARM_NEON
  1188. int nn = size >> 5;
  1189. int remain = size - (nn << 5);
  1190. #if __aarch64__
  1191. for (; nn > 0; nn--)
  1192. {
  1193. uint8x16_t _src0 = vld1q_u8(src0);
  1194. uint8x16_t _src0n = vld1q_u8(src0 + 16);
  1195. vst1q_u8(dst0, _src0);
  1196. vst1q_u8(dst0 + 16, _src0n);
  1197. uint8x16_t _src1 = vld1q_u8(src1);
  1198. uint8x16_t _src1n = vld1q_u8(src1 + 16);
  1199. vst1q_u8(dst1, _src1);
  1200. vst1q_u8(dst1 + 16, _src1n);
  1201. src0 += 32;
  1202. src1 += 32;
  1203. dst0 += 32;
  1204. dst1 += 32;
  1205. }
  1206. #else
  1207. if (nn > 0)
  1208. {
  1209. asm volatile(
  1210. "0: \n"
  1211. "pld [%1, #256] \n"
  1212. "vld1.u8 {d0-d3}, [%1]! \n"
  1213. "pld [%2, #256] \n"
  1214. "vld1.u8 {d4-d7}, [%2]! \n"
  1215. "subs %0, #1 \n"
  1216. "vst1.u8 {d0-d3}, [%3]! \n"
  1217. "vst1.u8 {d4-d7}, [%4]! \n"
  1218. "bne 0b \n"
  1219. : "=r"(nn), // %0
  1220. "=r"(src0), // %1
  1221. "=r"(src1), // %2
  1222. "=r"(dst0), // %3
  1223. "=r"(dst1) // %4
  1224. : "0"(nn),
  1225. "1"(src0),
  1226. "2"(src1),
  1227. "3"(dst0),
  1228. "4"(dst1)
  1229. : "cc", "memory", "q0", "q1", "q2", "q3");
  1230. }
  1231. #endif // __aarch64__
  1232. #else
  1233. int remain = size;
  1234. #endif // __ARM_NEON
  1235. for (; remain > 0; remain--)
  1236. {
  1237. *dst0++ = *src0++;
  1238. *dst1++ = *src1++;
  1239. }
  1240. src0 += srcwgap + srcstride;
  1241. src1 += srcwgap + srcstride;
  1242. dst0 -= wgap + stride;
  1243. dst1 -= wgap + stride;
  1244. }
  1245. for (; y < srch; y++)
  1246. {
  1247. #if __ARM_NEON
  1248. int nn = size >> 5;
  1249. int remain = size - (nn << 5);
  1250. #if __aarch64__
  1251. for (; nn > 0; nn--)
  1252. {
  1253. uint8x16_t _src = vld1q_u8(src0);
  1254. uint8x16_t _src2 = vld1q_u8(src0 + 16);
  1255. vst1q_u8(dst0, _src);
  1256. vst1q_u8(dst0 + 16, _src2);
  1257. src0 += 32;
  1258. dst0 += 32;
  1259. }
  1260. #else
  1261. if (nn > 0)
  1262. {
  1263. asm volatile(
  1264. "0: \n"
  1265. "pld [%1, #256] \n"
  1266. "vld1.u8 {d0-d3}, [%1]! \n"
  1267. "subs %0, #1 \n"
  1268. "vst1.u8 {d0-d3}, [%2]! \n"
  1269. "bne 0b \n"
  1270. : "=r"(nn), // %0
  1271. "=r"(src0), // %1
  1272. "=r"(dst0) // %2
  1273. : "0"(nn),
  1274. "1"(src0),
  1275. "2"(dst0)
  1276. : "cc", "memory", "q0", "q1");
  1277. }
  1278. #endif // __aarch64__
  1279. #else
  1280. int remain = size;
  1281. #endif // __ARM_NEON
  1282. for (; remain > 0; remain--)
  1283. {
  1284. *dst0++ = *src0++;
  1285. }
  1286. src0 += srcwgap;
  1287. dst0 -= wgap;
  1288. }
  1289. }
  1290. static void kanna_rotate_4_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  1291. {
  1292. const int srcwgap = srcstride - srcw * 3;
  1293. const int wgap = stride + w * 3;
  1294. // point to the last dst pixel row
  1295. unsigned char* dstend = dst + stride * (h - 1);
  1296. int size = srcw * 3;
  1297. const unsigned char* src0 = src;
  1298. const unsigned char* src1 = src + srcstride;
  1299. unsigned char* dst0 = dstend;
  1300. unsigned char* dst1 = dstend - stride;
  1301. int y = 0;
  1302. for (; y + 1 < srch; y += 2)
  1303. {
  1304. #if __ARM_NEON
  1305. int nn = size >> 5;
  1306. int remain = size - (nn << 5);
  1307. #if __aarch64__
  1308. for (; nn > 0; nn--)
  1309. {
  1310. uint8x16_t _src0 = vld1q_u8(src0);
  1311. uint8x16_t _src0n = vld1q_u8(src0 + 16);
  1312. vst1q_u8(dst0, _src0);
  1313. vst1q_u8(dst0 + 16, _src0n);
  1314. uint8x16_t _src1 = vld1q_u8(src1);
  1315. uint8x16_t _src1n = vld1q_u8(src1 + 16);
  1316. vst1q_u8(dst1, _src1);
  1317. vst1q_u8(dst1 + 16, _src1n);
  1318. src0 += 32;
  1319. src1 += 32;
  1320. dst0 += 32;
  1321. dst1 += 32;
  1322. }
  1323. #else
  1324. if (nn > 0)
  1325. {
  1326. asm volatile(
  1327. "0: \n"
  1328. "pld [%1, #256] \n"
  1329. "vld1.u8 {d0-d3}, [%1]! \n"
  1330. "pld [%2, #256] \n"
  1331. "vld1.u8 {d4-d7}, [%2]! \n"
  1332. "subs %0, #1 \n"
  1333. "vst1.u8 {d0-d3}, [%3]! \n"
  1334. "vst1.u8 {d4-d7}, [%4]! \n"
  1335. "bne 0b \n"
  1336. : "=r"(nn), // %0
  1337. "=r"(src0), // %1
  1338. "=r"(src1), // %2
  1339. "=r"(dst0), // %3
  1340. "=r"(dst1) // %4
  1341. : "0"(nn),
  1342. "1"(src0),
  1343. "2"(src1),
  1344. "3"(dst0),
  1345. "4"(dst1)
  1346. : "cc", "memory", "q0", "q1", "q2", "q3");
  1347. }
  1348. #endif // __aarch64__
  1349. #else
  1350. int remain = size;
  1351. #endif // __ARM_NEON
  1352. for (; remain > 0; remain--)
  1353. {
  1354. *dst0++ = *src0++;
  1355. *dst1++ = *src1++;
  1356. }
  1357. src0 += srcwgap + srcstride;
  1358. src1 += srcwgap + srcstride;
  1359. dst0 -= wgap + stride;
  1360. dst1 -= wgap + stride;
  1361. }
  1362. for (; y < srch; y++)
  1363. {
  1364. #if __ARM_NEON
  1365. int nn = size >> 5;
  1366. int remain = size - (nn << 5);
  1367. #if __aarch64__
  1368. for (; nn > 0; nn--)
  1369. {
  1370. uint8x16_t _src = vld1q_u8(src0);
  1371. uint8x16_t _src2 = vld1q_u8(src0 + 16);
  1372. vst1q_u8(dst0, _src);
  1373. vst1q_u8(dst0 + 16, _src2);
  1374. src0 += 32;
  1375. dst0 += 32;
  1376. }
  1377. #else
  1378. if (nn > 0)
  1379. {
  1380. asm volatile(
  1381. "0: \n"
  1382. "pld [%1, #256] \n"
  1383. "vld1.u8 {d0-d3}, [%1]! \n"
  1384. "subs %0, #1 \n"
  1385. "vst1.u8 {d0-d3}, [%2]! \n"
  1386. "bne 0b \n"
  1387. : "=r"(nn), // %0
  1388. "=r"(src0), // %1
  1389. "=r"(dst0) // %2
  1390. : "0"(nn),
  1391. "1"(src0),
  1392. "2"(dst0)
  1393. : "cc", "memory", "q0", "q1");
  1394. }
  1395. #endif // __aarch64__
  1396. #else
  1397. int remain = size;
  1398. #endif // __ARM_NEON
  1399. for (; remain > 0; remain--)
  1400. {
  1401. *dst0++ = *src0++;
  1402. }
  1403. src0 += srcwgap;
  1404. dst0 -= wgap;
  1405. }
  1406. }
  1407. static void kanna_rotate_4_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  1408. {
  1409. const int srcwgap = srcstride - srcw * 4;
  1410. const int wgap = stride + w * 4;
  1411. // point to the last dst pixel row
  1412. unsigned char* dstend = dst + stride * (h - 1);
  1413. int size = srcw * 4;
  1414. const unsigned char* src0 = src;
  1415. const unsigned char* src1 = src + srcstride;
  1416. unsigned char* dst0 = dstend;
  1417. unsigned char* dst1 = dstend - stride;
  1418. int y = 0;
  1419. for (; y + 1 < srch; y += 2)
  1420. {
  1421. #if __ARM_NEON
  1422. int nn = size >> 5;
  1423. int remain = size - (nn << 5);
  1424. #if __aarch64__
  1425. for (; nn > 0; nn--)
  1426. {
  1427. uint8x16_t _src0 = vld1q_u8(src0);
  1428. uint8x16_t _src0n = vld1q_u8(src0 + 16);
  1429. vst1q_u8(dst0, _src0);
  1430. vst1q_u8(dst0 + 16, _src0n);
  1431. uint8x16_t _src1 = vld1q_u8(src1);
  1432. uint8x16_t _src1n = vld1q_u8(src1 + 16);
  1433. vst1q_u8(dst1, _src1);
  1434. vst1q_u8(dst1 + 16, _src1n);
  1435. src0 += 32;
  1436. src1 += 32;
  1437. dst0 += 32;
  1438. dst1 += 32;
  1439. }
  1440. #else
  1441. if (nn > 0)
  1442. {
  1443. asm volatile(
  1444. "0: \n"
  1445. "pld [%1, #256] \n"
  1446. "vld1.u8 {d0-d3}, [%1]! \n"
  1447. "pld [%2, #256] \n"
  1448. "vld1.u8 {d4-d7}, [%2]! \n"
  1449. "subs %0, #1 \n"
  1450. "vst1.u8 {d0-d3}, [%3]! \n"
  1451. "vst1.u8 {d4-d7}, [%4]! \n"
  1452. "bne 0b \n"
  1453. : "=r"(nn), // %0
  1454. "=r"(src0), // %1
  1455. "=r"(src1), // %2
  1456. "=r"(dst0), // %3
  1457. "=r"(dst1) // %4
  1458. : "0"(nn),
  1459. "1"(src0),
  1460. "2"(src1),
  1461. "3"(dst0),
  1462. "4"(dst1)
  1463. : "cc", "memory", "q0", "q1", "q2", "q3");
  1464. }
  1465. #endif // __aarch64__
  1466. #else
  1467. int remain = size;
  1468. #endif // __ARM_NEON
  1469. for (; remain > 0; remain--)
  1470. {
  1471. *dst0++ = *src0++;
  1472. *dst1++ = *src1++;
  1473. }
  1474. src0 += srcwgap + srcstride;
  1475. src1 += srcwgap + srcstride;
  1476. dst0 -= wgap + stride;
  1477. dst1 -= wgap + stride;
  1478. }
  1479. for (; y < srch; y++)
  1480. {
  1481. #if __ARM_NEON
  1482. int nn = size >> 5;
  1483. int remain = size - (nn << 5);
  1484. #if __aarch64__
  1485. for (; nn > 0; nn--)
  1486. {
  1487. uint8x16_t _src = vld1q_u8(src0);
  1488. uint8x16_t _src2 = vld1q_u8(src0 + 16);
  1489. vst1q_u8(dst0, _src);
  1490. vst1q_u8(dst0 + 16, _src2);
  1491. src0 += 32;
  1492. dst0 += 32;
  1493. }
  1494. #else
  1495. if (nn > 0)
  1496. {
  1497. asm volatile(
  1498. "0: \n"
  1499. "pld [%1, #256] \n"
  1500. "vld1.u8 {d0-d3}, [%1]! \n"
  1501. "subs %0, #1 \n"
  1502. "vst1.u8 {d0-d3}, [%2]! \n"
  1503. "bne 0b \n"
  1504. : "=r"(nn), // %0
  1505. "=r"(src0), // %1
  1506. "=r"(dst0) // %2
  1507. : "0"(nn),
  1508. "1"(src0),
  1509. "2"(dst0)
  1510. : "cc", "memory", "q0", "q1");
  1511. }
  1512. #endif // __aarch64__
  1513. #else
  1514. int remain = size;
  1515. #endif // __ARM_NEON
  1516. for (; remain > 0; remain--)
  1517. {
  1518. *dst0++ = *src0++;
  1519. }
  1520. src0 += srcwgap;
  1521. dst0 -= wgap;
  1522. }
  1523. }
  1524. static void kanna_rotate_5_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
  1525. {
  1526. const int srcwgap = srcstride - srcw;
  1527. const unsigned char* src0 = src;
  1528. int y = 0;
  1529. #if __ARM_NEON
  1530. for (; y + 7 < srch; y += 8)
  1531. {
  1532. const unsigned char* src1 = src0 + srcstride;
  1533. unsigned char* dst0 = dst + y;
  1534. unsigned char* dst1 = dst + y + stride;
  1535. int src_step = 2 * srcstride;
  1536. int dst_step = 2 * stride;
  1537. int nn = srcw >> 3;
  1538. int remain = srcw - (nn << 3);
  1539. #if __aarch64__
  1540. for (; nn > 0; nn--)
  1541. {
  1542. uint8x8_t _src0 = vld1_u8(src0);
  1543. uint8x8_t _src1 = vld1_u8(src1);
  1544. uint8x8_t _src2 = vld1_u8(src0 + src_step);
  1545. uint8x8_t _src3 = vld1_u8(src1 + src_step);
  1546. uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
  1547. uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
  1548. uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
  1549. uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
  1550. uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
  1551. uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
  1552. uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
  1553. uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);
  1554. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
  1555. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
  1556. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
  1557. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
  1558. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
  1559. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
  1560. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
  1561. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
  1562. uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  1563. uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  1564. uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  1565. uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  1566. uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  1567. uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  1568. uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  1569. uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  1570. vst1_u8(dst0, _dst0);
  1571. vst1_u8(dst1, _dst1);
  1572. vst1_u8(dst0 + dst_step, _dst2);
  1573. vst1_u8(dst1 + dst_step, _dst3);
  1574. vst1_u8(dst0 + 2 * dst_step, _dst4);
  1575. vst1_u8(dst1 + 2 * dst_step, _dst5);
  1576. vst1_u8(dst0 + 3 * dst_step, _dst6);
  1577. vst1_u8(dst1 + 3 * dst_step, _dst7);
  1578. src0 += 8;
  1579. src1 += 8;
  1580. dst0 += 4 * dst_step;
  1581. dst1 += 4 * dst_step;
  1582. }
  1583. #else
  1584. if (nn > 0)
  1585. {
  1586. asm volatile(
  1587. "0: \n"
  1588. "pld [%1, #64] \n"
  1589. "vld1.u8 {d0}, [%1], %10 \n"
  1590. "pld [%2, #64] \n"
  1591. "vld1.u8 {d1}, [%2], %10 \n"
  1592. "pld [%1, #64] \n"
  1593. "vld1.u8 {d2}, [%1], %10 \n"
  1594. "vtrn.u8 d0, d1 \n" // _src01t_r
  1595. "pld [%2, #64] \n"
  1596. "vld1.u8 {d3}, [%2], %10 \n"
  1597. "pld [%1, #64] \n"
  1598. "vld1.u8 {d4}, [%1], %10 \n"
  1599. "vtrn.u8 d2, d3 \n" // _src23t_r
  1600. "pld [%2, #64] \n"
  1601. "vld1.u8 {d5}, [%2], %10 \n"
  1602. "pld [%1, #64] \n"
  1603. "vld1.u8 {d6}, [%1], %10 \n"
  1604. "vtrn.u8 d4, d5 \n" // _src45t_r
  1605. "pld [%2, #64] \n"
  1606. "vld1.u8 {d7}, [%2], %10 \n"
  1607. "vtrn.u8 d6, d7 \n" // _src67t_r
  1608. "sub %1, %1, %10, lsl #2 \n" // restore src0
  1609. "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
  1610. "sub %2, %2, %10, lsl #2 \n" // restore src1
  1611. "vtrn.u16 q2, q3 \n" // _src13tt_r _src46tt_r
  1612. "add %1, #8 \n" // src0 += 8
  1613. "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
  1614. "add %2, #8 \n" // src1 += 8
  1615. "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
  1616. "vst1.u8 {d0}, [%3], %11 \n"
  1617. "vst1.u8 {d1}, [%4], %11 \n"
  1618. "subs %0, #1 \n"
  1619. "vst1.u8 {d2}, [%3], %11 \n"
  1620. "vst1.u8 {d3}, [%4], %11 \n"
  1621. "vst1.u8 {d4}, [%3], %11 \n"
  1622. "vst1.u8 {d5}, [%4], %11 \n"
  1623. "vst1.u8 {d6}, [%3], %11 \n"
  1624. "vst1.u8 {d7}, [%4], %11 \n"
  1625. "bne 0b \n"
  1626. : "=r"(nn), // %0
  1627. "=r"(src0), // %1
  1628. "=r"(src1), // %2
  1629. "=r"(dst0), // %3
  1630. "=r"(dst1) // %4
  1631. : "0"(nn),
  1632. "1"(src0),
  1633. "2"(src1),
  1634. "3"(dst0),
  1635. "4"(dst1),
  1636. "r"(src_step), // %10
  1637. "r"(dst_step) // %11
  1638. : "cc", "memory", "q0", "q1", "q2", "q3");
  1639. }
  1640. #endif // __aarch64__
  1641. for (; remain > 0; remain--)
  1642. {
  1643. dst0[0] = src0[0];
  1644. dst0[1] = src1[0];
  1645. dst0[2] = src0[0 + src_step];
  1646. dst0[3] = src1[0 + src_step];
  1647. dst0[4] = src0[0 + 2 * src_step];
  1648. dst0[5] = src1[0 + 2 * src_step];
  1649. dst0[6] = src0[0 + 3 * src_step];
  1650. dst0[7] = src1[0 + 3 * src_step];
  1651. src0 += 1;
  1652. src1 += 1;
  1653. dst0 += stride;
  1654. }
  1655. src0 += srcwgap + 7 * srcstride;
  1656. }
  1657. #endif // __ARM_NEON
  1658. for (; y < srch; y++)
  1659. {
  1660. unsigned char* dst0 = dst + y;
  1661. int x = 0;
  1662. for (; x < srcw; x++)
  1663. {
  1664. *dst0 = *src0;
  1665. src0 += 1;
  1666. dst0 += stride;
  1667. }
  1668. src0 += srcwgap;
  1669. }
  1670. }
  1671. static void kanna_rotate_5_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
  1672. {
  1673. const int srcwgap = srcstride - srcw * 2;
  1674. const unsigned char* src0 = src;
  1675. int y = 0;
  1676. #if __ARM_NEON
  1677. for (; y + 7 < srch; y += 8)
  1678. {
  1679. const unsigned char* src1 = src0 + srcstride;
  1680. unsigned char* dst0 = dst + y * 2;
  1681. unsigned char* dst1 = dst + y * 2 + stride;
  1682. int src_step = 2 * srcstride;
  1683. int dst_step = 2 * stride;
  1684. int nn = srcw >> 3;
  1685. int remain = srcw - (nn << 3);
  1686. #if __aarch64__
  1687. for (; nn > 0; nn--)
  1688. {
  1689. uint8x8x2_t _src0 = vld2_u8(src0);
  1690. uint8x8x2_t _src1 = vld2_u8(src1);
  1691. uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
  1692. uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
  1693. uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
  1694. uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
  1695. uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
  1696. uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
  1697. uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
  1698. uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
  1699. uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
  1700. uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
  1701. uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
  1702. uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
  1703. uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
  1704. uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
  1705. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
  1706. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
  1707. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
  1708. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
  1709. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
  1710. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
  1711. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
  1712. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
  1713. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
  1714. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
  1715. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
  1716. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
  1717. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
  1718. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
  1719. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
  1720. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
  1721. uint8x8x2_t _dst0;
  1722. uint8x8x2_t _dst1;
  1723. uint8x8x2_t _dst2;
  1724. uint8x8x2_t _dst3;
  1725. uint8x8x2_t _dst4;
  1726. uint8x8x2_t _dst5;
  1727. uint8x8x2_t _dst6;
  1728. uint8x8x2_t _dst7;
  1729. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  1730. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  1731. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  1732. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  1733. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  1734. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  1735. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  1736. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  1737. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  1738. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  1739. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  1740. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  1741. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  1742. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  1743. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  1744. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  1745. vst2_u8(dst0, _dst0);
  1746. vst2_u8(dst1, _dst1);
  1747. vst2_u8(dst0 + dst_step, _dst2);
  1748. vst2_u8(dst1 + dst_step, _dst3);
  1749. vst2_u8(dst0 + 2 * dst_step, _dst4);
  1750. vst2_u8(dst1 + 2 * dst_step, _dst5);
  1751. vst2_u8(dst0 + 3 * dst_step, _dst6);
  1752. vst2_u8(dst1 + 3 * dst_step, _dst7);
  1753. src0 += 2 * 8;
  1754. src1 += 2 * 8;
  1755. dst0 += 4 * dst_step;
  1756. dst1 += 4 * dst_step;
  1757. }
  1758. #else
  1759. if (nn > 0)
  1760. {
  1761. asm volatile(
  1762. "0: \n"
  1763. "pld [%1, #128] \n"
  1764. "vld2.u8 {d0-d1}, [%1], %10 \n"
  1765. "pld [%2, #128] \n"
  1766. "vld2.u8 {d2-d3}, [%2], %10 \n"
  1767. "pld [%1, #128] \n"
  1768. "vld2.u8 {d4-d5}, [%1], %10 \n"
  1769. "vtrn.u8 q0, q1 \n" // _src01t_r
  1770. "pld [%2, #128] \n"
  1771. "vld2.u8 {d6-d7}, [%2], %10 \n"
  1772. "pld [%1, #128] \n"
  1773. "vld2.u8 {d16-d17}, [%1], %10\n"
  1774. "vtrn.u8 q2, q3 \n" // _src23t_r
  1775. "pld [%2, #128] \n"
  1776. "vld2.u8 {d18-d19}, [%2], %10\n"
  1777. "pld [%1, #128] \n"
  1778. "vld2.u8 {d20-d21}, [%1], %10\n"
  1779. "vtrn.u8 q8, q9 \n" // _src45t_r
  1780. "pld [%2, #128] \n"
  1781. "vld2.u8 {d22-d23}, [%2], %10\n"
  1782. "vtrn.u8 q10, q11 \n" // _src67t_r
  1783. "sub %1, %1, %10, lsl #2 \n" // restore src0
  1784. "vtrn.u16 q0, q2 \n" // _src02tt_r
  1785. "sub %2, %2, %10, lsl #2 \n" // restore src1
  1786. "vtrn.u16 q1, q3 \n" // _src13tt_r
  1787. "add %1, #16 \n" // src0 += 16
  1788. "vtrn.u16 q8, q10 \n" // _src46tt_r
  1789. "add %2, #16 \n" // src1 += 16
  1790. "vtrn.u16 q9, q11 \n" // _src57tt_r
  1791. "vtrn.u32 q0, q8 \n" // _src04ttt_r
  1792. "vtrn.u32 q1, q9 \n" // _src15ttt_r
  1793. "vst2.u8 {d0-d1}, [%3], %11 \n"
  1794. "vtrn.u32 q2, q10 \n" // _src26ttt_r
  1795. "vst2.u8 {d2-d3}, [%4], %11 \n"
  1796. "vtrn.u32 q3, q11 \n" // _src37ttt_r
  1797. "vst2.u8 {d4-d5}, [%3], %11 \n"
  1798. "subs %0, #1 \n"
  1799. "vst2.u8 {d6-d7}, [%4], %11 \n"
  1800. "vst2.u8 {d16-d17}, [%3], %11\n"
  1801. "vst2.u8 {d18-d19}, [%4], %11\n"
  1802. "vst2.u8 {d20-d21}, [%3], %11\n"
  1803. "vst2.u8 {d22-d23}, [%4], %11\n"
  1804. "bne 0b \n"
  1805. : "=r"(nn), // %0
  1806. "=r"(src0), // %1
  1807. "=r"(src1), // %2
  1808. "=r"(dst0), // %3
  1809. "=r"(dst1) // %4
  1810. : "0"(nn),
  1811. "1"(src0),
  1812. "2"(src1),
  1813. "3"(dst0),
  1814. "4"(dst1),
  1815. "r"(src_step), // %10
  1816. "r"(dst_step) // %11
  1817. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  1818. }
  1819. #endif // __aarch64__
  1820. for (; remain > 0; remain--)
  1821. {
  1822. dst0[0] = src0[0];
  1823. dst0[1] = src0[1];
  1824. dst0[2] = src1[0];
  1825. dst0[3] = src1[1];
  1826. dst0[4] = src0[0 + src_step];
  1827. dst0[5] = src0[1 + src_step];
  1828. dst0[6] = src1[0 + src_step];
  1829. dst0[7] = src1[1 + src_step];
  1830. dst0[8] = src0[0 + 2 * src_step];
  1831. dst0[9] = src0[1 + 2 * src_step];
  1832. dst0[10] = src1[0 + 2 * src_step];
  1833. dst0[11] = src1[1 + 2 * src_step];
  1834. dst0[12] = src0[0 + 3 * src_step];
  1835. dst0[13] = src0[1 + 3 * src_step];
  1836. dst0[14] = src1[0 + 3 * src_step];
  1837. dst0[15] = src1[1 + 3 * src_step];
  1838. src0 += 2;
  1839. src1 += 2;
  1840. dst0 += stride;
  1841. }
  1842. src0 += srcwgap + 7 * srcstride;
  1843. }
  1844. #endif // __ARM_NEON
  1845. for (; y < srch; y++)
  1846. {
  1847. unsigned char* dst0 = dst + y * 2;
  1848. int x = 0;
  1849. for (; x < srcw; x++)
  1850. {
  1851. dst0[0] = src0[0];
  1852. dst0[1] = src0[1];
  1853. src0 += 2;
  1854. dst0 += stride;
  1855. }
  1856. src0 += srcwgap;
  1857. }
  1858. }
  1859. static void kanna_rotate_5_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
  1860. {
  1861. const int srcwgap = srcstride - srcw * 3;
  1862. const unsigned char* src0 = src;
  1863. int y = 0;
  1864. #if __ARM_NEON
  1865. for (; y + 7 < srch; y += 8)
  1866. {
  1867. const unsigned char* src1 = src0 + srcstride;
  1868. unsigned char* dst0 = dst + y * 3;
  1869. unsigned char* dst1 = dst + y * 3 + stride;
  1870. int src_step = 2 * srcstride;
  1871. int dst_step = 2 * stride;
  1872. int nn = srcw >> 3;
  1873. int remain = srcw - (nn << 3);
  1874. #if __aarch64__
  1875. for (; nn > 0; nn--)
  1876. {
  1877. uint8x8x3_t _src0 = vld3_u8(src0);
  1878. uint8x8x3_t _src1 = vld3_u8(src1);
  1879. uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
  1880. uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
  1881. uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
  1882. uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
  1883. uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
  1884. uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
  1885. uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
  1886. uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
  1887. uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
  1888. uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
  1889. uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
  1890. uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
  1891. uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
  1892. uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
  1893. uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
  1894. uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
  1895. uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
  1896. uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
  1897. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
  1898. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
  1899. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
  1900. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
  1901. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
  1902. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
  1903. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
  1904. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
  1905. uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
  1906. uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
  1907. uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
  1908. uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
  1909. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
  1910. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
  1911. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
  1912. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
  1913. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
  1914. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
  1915. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
  1916. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
  1917. uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
  1918. uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
  1919. uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
  1920. uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
  1921. uint8x8x3_t _dst0;
  1922. uint8x8x3_t _dst1;
  1923. uint8x8x3_t _dst2;
  1924. uint8x8x3_t _dst3;
  1925. uint8x8x3_t _dst4;
  1926. uint8x8x3_t _dst5;
  1927. uint8x8x3_t _dst6;
  1928. uint8x8x3_t _dst7;
  1929. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  1930. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  1931. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  1932. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  1933. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  1934. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  1935. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  1936. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  1937. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  1938. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  1939. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  1940. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  1941. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  1942. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  1943. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  1944. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  1945. _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
  1946. _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
  1947. _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
  1948. _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
  1949. _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
  1950. _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
  1951. _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
  1952. _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
  1953. vst3_u8(dst0, _dst0);
  1954. vst3_u8(dst1, _dst1);
  1955. vst3_u8(dst0 + dst_step, _dst2);
  1956. vst3_u8(dst1 + dst_step, _dst3);
  1957. vst3_u8(dst0 + 2 * dst_step, _dst4);
  1958. vst3_u8(dst1 + 2 * dst_step, _dst5);
  1959. vst3_u8(dst0 + 3 * dst_step, _dst6);
  1960. vst3_u8(dst1 + 3 * dst_step, _dst7);
  1961. src0 += 3 * 8;
  1962. src1 += 3 * 8;
  1963. dst0 += 4 * dst_step;
  1964. dst1 += 4 * dst_step;
  1965. }
  1966. #else
  1967. if (nn > 0)
  1968. {
  1969. asm volatile(
  1970. "0: \n"
  1971. "pld [%1, #192] \n"
  1972. "vld3.u8 {d0-d2}, [%1], %10 \n"
  1973. "pld [%2, #192] \n"
  1974. "vld3.u8 {d4-d6}, [%2], %10 \n"
  1975. "pld [%1, #192] \n"
  1976. "vld3.u8 {d8-d10}, [%1], %10 \n"
  1977. "vtrn.u8 q0, q2 \n" // _src01t_r
  1978. "vtrn.u8 d2, d6 \n"
  1979. "pld [%2, #192] \n"
  1980. "vld3.u8 {d12-d14}, [%2], %10\n"
  1981. "pld [%1, #192] \n"
  1982. "vld3.u8 {d16-d18}, [%1], %10\n"
  1983. "vtrn.u8 q4, q6 \n" // _src23t_r
  1984. "vtrn.u8 d10, d14 \n"
  1985. "pld [%2, #192] \n"
  1986. "vld3.u8 {d20-d22}, [%2], %10\n"
  1987. "pld [%1, #192] \n"
  1988. "vld3.u8 {d24-d26}, [%1], %10\n"
  1989. "vtrn.u8 q8, q10 \n" // _src45t_r
  1990. "vtrn.u8 d18, d22 \n"
  1991. "pld [%2, #192] \n"
  1992. "vld3.u8 {d28-d30}, [%2], %10\n"
  1993. "vtrn.u8 q12, q14 \n" // _src67t_r
  1994. "vtrn.u8 d26, d30 \n"
  1995. "sub %1, %1, %10, lsl #2 \n" // restore src0
  1996. "vtrn.u16 q0, q4 \n" // _src02tt_r
  1997. "vtrn.u16 d2, d10 \n"
  1998. "sub %2, %2, %10, lsl #2 \n" // restore src1
  1999. "vtrn.u16 q2, q6 \n" // _src13tt_r
  2000. "vtrn.u16 d6, d14 \n"
  2001. "add %1, #24 \n" // src0 += 24
  2002. "vtrn.u16 q8, q12 \n" // _src46tt_r
  2003. "vtrn.u16 d18, d26 \n"
  2004. "add %2, #24 \n" // src1 += 24
  2005. "vtrn.u16 q10, q14 \n" // _src57tt_r
  2006. "vtrn.u16 d22, d30 \n"
  2007. "vtrn.u32 q0, q8 \n" // _src04ttt_r
  2008. "vtrn.u32 d2, d18 \n"
  2009. "vtrn.u32 q2, q10 \n" // _src15ttt_r
  2010. "vst3.u8 {d0-d2}, [%3], %11 \n"
  2011. "vtrn.u32 d6, d22 \n"
  2012. "vtrn.u32 q4, q12 \n" // _src26ttt_r
  2013. "vst3.u8 {d4-d6}, [%4], %11 \n"
  2014. "vtrn.u32 d10, d26 \n"
  2015. "vtrn.u32 q6, q14 \n" // _src37ttt_r
  2016. "vst3.u8 {d8-d10}, [%3], %11 \n"
  2017. "vtrn.u32 d14, d30 \n"
  2018. "subs %0, #1 \n"
  2019. "vst3.u8 {d16-d18}, [%3], %11\n"
  2020. "vst3.u8 {d12-d14}, [%4], %11\n"
  2021. "vst3.u8 {d20-d22}, [%4], %11\n"
  2022. "vst3.u8 {d24-d26}, [%3], %11\n"
  2023. "vst3.u8 {d28-d30}, [%4], %11\n"
  2024. "bne 0b \n"
  2025. : "=r"(nn), // %0
  2026. "=r"(src0), // %1
  2027. "=r"(src1), // %2
  2028. "=r"(dst0), // %3
  2029. "=r"(dst1) // %4
  2030. : "0"(nn),
  2031. "1"(src0),
  2032. "2"(src1),
  2033. "3"(dst0),
  2034. "4"(dst1),
  2035. "r"(src_step), // %10
  2036. "r"(dst_step) // %11
  2037. : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
  2038. }
  2039. #endif // __aarch64__
  2040. for (; remain > 0; remain--)
  2041. {
  2042. dst0[0] = src0[0];
  2043. dst0[1] = src0[1];
  2044. dst0[2] = src0[2];
  2045. dst0[3] = src1[0];
  2046. dst0[4] = src1[1];
  2047. dst0[5] = src1[2];
  2048. dst0[6] = src0[0 + src_step];
  2049. dst0[7] = src0[1 + src_step];
  2050. dst0[8] = src0[2 + src_step];
  2051. dst0[9] = src1[0 + src_step];
  2052. dst0[10] = src1[1 + src_step];
  2053. dst0[11] = src1[2 + src_step];
  2054. dst0[12] = src0[0 + 2 * src_step];
  2055. dst0[13] = src0[1 + 2 * src_step];
  2056. dst0[14] = src0[2 + 2 * src_step];
  2057. dst0[15] = src1[0 + 2 * src_step];
  2058. dst0[16] = src1[1 + 2 * src_step];
  2059. dst0[17] = src1[2 + 2 * src_step];
  2060. dst0[18] = src0[0 + 3 * src_step];
  2061. dst0[19] = src0[1 + 3 * src_step];
  2062. dst0[20] = src0[2 + 3 * src_step];
  2063. dst0[21] = src1[0 + 3 * src_step];
  2064. dst0[22] = src1[1 + 3 * src_step];
  2065. dst0[23] = src1[2 + 3 * src_step];
  2066. src0 += 3;
  2067. src1 += 3;
  2068. dst0 += stride;
  2069. }
  2070. src0 += srcwgap + 7 * srcstride;
  2071. }
  2072. #endif // __ARM_NEON
  2073. for (; y < srch; y++)
  2074. {
  2075. unsigned char* dst0 = dst + y * 3;
  2076. int x = 0;
  2077. for (; x < srcw; x++)
  2078. {
  2079. dst0[0] = src0[0];
  2080. dst0[1] = src0[1];
  2081. dst0[2] = src0[2];
  2082. src0 += 3;
  2083. dst0 += stride;
  2084. }
  2085. src0 += srcwgap;
  2086. }
  2087. }
  2088. static void kanna_rotate_5_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
  2089. {
  2090. const int srcwgap = srcstride - srcw * 4;
  2091. const unsigned char* src0 = src;
  2092. int y = 0;
  2093. #if __ARM_NEON
  2094. for (; y + 7 < srch; y += 8)
  2095. {
  2096. const unsigned char* src1 = src0 + srcstride;
  2097. unsigned char* dst0 = dst + y * 4;
  2098. unsigned char* dst1 = dst + y * 4 + stride;
  2099. int src_step = 2 * srcstride;
  2100. int dst_step = 2 * stride;
  2101. int nn = srcw >> 3;
  2102. int remain = srcw - (nn << 3);
  2103. #if __aarch64__
  2104. for (; nn > 0; nn--)
  2105. {
  2106. uint8x8x4_t _src0 = vld4_u8(src0);
  2107. uint8x8x4_t _src1 = vld4_u8(src1);
  2108. uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
  2109. uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
  2110. uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
  2111. uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
  2112. uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
  2113. uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
  2114. uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
  2115. uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
  2116. uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
  2117. uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
  2118. uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
  2119. uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
  2120. uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
  2121. uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
  2122. uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
  2123. uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
  2124. uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
  2125. uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
  2126. uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
  2127. uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
  2128. uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
  2129. uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);
  2130. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
  2131. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
  2132. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
  2133. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
  2134. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
  2135. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
  2136. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
  2137. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
  2138. uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
  2139. uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
  2140. uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
  2141. uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
  2142. uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
  2143. uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
  2144. uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
  2145. uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));
  2146. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
  2147. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
  2148. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
  2149. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
  2150. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
  2151. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
  2152. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
  2153. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
  2154. uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
  2155. uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
  2156. uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
  2157. uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
  2158. uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
  2159. uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
  2160. uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
  2161. uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));
  2162. uint8x8x4_t _dst0;
  2163. uint8x8x4_t _dst1;
  2164. uint8x8x4_t _dst2;
  2165. uint8x8x4_t _dst3;
  2166. uint8x8x4_t _dst4;
  2167. uint8x8x4_t _dst5;
  2168. uint8x8x4_t _dst6;
  2169. uint8x8x4_t _dst7;
  2170. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  2171. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  2172. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  2173. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  2174. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  2175. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  2176. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  2177. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  2178. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  2179. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  2180. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  2181. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  2182. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  2183. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  2184. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  2185. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  2186. _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
  2187. _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
  2188. _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
  2189. _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
  2190. _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
  2191. _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
  2192. _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
  2193. _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
  2194. _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
  2195. _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
  2196. _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
  2197. _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
  2198. _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
  2199. _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
  2200. _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
  2201. _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
  2202. vst4_u8(dst0, _dst0);
  2203. vst4_u8(dst1, _dst1);
  2204. vst4_u8(dst0 + dst_step, _dst2);
  2205. vst4_u8(dst1 + dst_step, _dst3);
  2206. vst4_u8(dst0 + 2 * dst_step, _dst4);
  2207. vst4_u8(dst1 + 2 * dst_step, _dst5);
  2208. vst4_u8(dst0 + 3 * dst_step, _dst6);
  2209. vst4_u8(dst1 + 3 * dst_step, _dst7);
  2210. src0 += 4 * 8;
  2211. src1 += 4 * 8;
  2212. dst0 += 4 * dst_step;
  2213. dst1 += 4 * dst_step;
  2214. }
  2215. #else
  2216. if (nn > 0)
  2217. {
  2218. asm volatile(
  2219. "0: \n"
  2220. "pld [%1, #256] \n"
  2221. "vld4.u8 {d0-d3}, [%1], %10 \n"
  2222. "pld [%2, #256] \n"
  2223. "vld4.u8 {d4-d7}, [%2], %10 \n"
  2224. "pld [%1, #256] \n"
  2225. "vld4.u8 {d8-d11}, [%1], %10 \n"
  2226. "vtrn.u8 q0, q2 \n" // _src01t_r
  2227. "vtrn.u8 q1, q3 \n"
  2228. "pld [%2, #256] \n"
  2229. "vld4.u8 {d12-d15}, [%2], %10\n"
  2230. "pld [%1, #256] \n"
  2231. "vld4.u8 {d16-d19}, [%1], %10\n"
  2232. "vtrn.u8 q4, q6 \n" // _src23t_r
  2233. "vtrn.u8 q5, q7 \n"
  2234. "pld [%2, #256] \n"
  2235. "vld4.u8 {d20-d23}, [%2], %10\n"
  2236. "pld [%1, #256] \n"
  2237. "vld4.u8 {d24-d27}, [%1], %10\n"
  2238. "vtrn.u8 q8, q10 \n" // _src45t_r
  2239. "vtrn.u8 q9, q11 \n"
  2240. "pld [%2, #256] \n"
  2241. "vld4.u8 {d28-d31}, [%2], %10\n"
  2242. "vtrn.u8 q12, q14 \n" // _src67t_r
  2243. "vtrn.u8 q13, q15 \n"
  2244. "sub %1, %1, %10, lsl #2 \n" // restore src0
  2245. "vtrn.u16 q0, q4 \n" // _src02tt_r
  2246. "vtrn.u16 q1, q5 \n"
  2247. "sub %2, %2, %10, lsl #2 \n" // restore src1
  2248. "vtrn.u16 q2, q6 \n" // _src13tt_r
  2249. "vtrn.u16 q3, q7 \n"
  2250. "add %1, #32 \n" // src0 += 32
  2251. "vtrn.u16 q8, q12 \n" // _src46tt_r
  2252. "vtrn.u16 q9, q13 \n"
  2253. "add %2, #32 \n" // src1 += 32
  2254. "vtrn.u16 q10, q14 \n" // _src57tt_r
  2255. "vtrn.u16 q11, q15 \n"
  2256. "vtrn.u32 q0, q8 \n" // _src04ttt_r
  2257. "vtrn.u32 q1, q9 \n"
  2258. "vtrn.u32 q2, q10 \n" // _src15ttt_r
  2259. "vst4.u8 {d0-d3}, [%3], %11 \n"
  2260. "vtrn.u32 q3, q11 \n"
  2261. "vtrn.u32 q4, q12 \n" // _src26ttt_r
  2262. "vst4.u8 {d4-d7}, [%4], %11 \n"
  2263. "vtrn.u32 q5, q13 \n"
  2264. "vtrn.u32 q6, q14 \n" // _src37ttt_r
  2265. "vst4.u8 {d8-d11}, [%3], %11 \n"
  2266. "vtrn.u32 q7, q15 \n"
  2267. "subs %0, #1 \n"
  2268. "vst4.u8 {d16-d19}, [%3], %11\n"
  2269. "vst4.u8 {d12-d15}, [%4], %11\n"
  2270. "vst4.u8 {d20-d23}, [%4], %11\n"
  2271. "vst4.u8 {d24-d27}, [%3], %11\n"
  2272. "vst4.u8 {d28-d31}, [%4], %11\n"
  2273. "bne 0b \n"
  2274. : "=r"(nn), // %0
  2275. "=r"(src0), // %1
  2276. "=r"(src1), // %2
  2277. "=r"(dst0), // %3
  2278. "=r"(dst1) // %4
  2279. : "0"(nn),
  2280. "1"(src0),
  2281. "2"(src1),
  2282. "3"(dst0),
  2283. "4"(dst1),
  2284. "r"(src_step), // %10
  2285. "r"(dst_step) // %11
  2286. : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
  2287. }
  2288. #endif // __aarch64__
  2289. for (; remain > 0; remain--)
  2290. {
  2291. dst0[0] = src0[0];
  2292. dst0[1] = src0[1];
  2293. dst0[2] = src0[2];
  2294. dst0[3] = src0[3];
  2295. dst0[4] = src1[0];
  2296. dst0[5] = src1[1];
  2297. dst0[6] = src1[2];
  2298. dst0[7] = src1[3];
  2299. dst0[8] = src0[0 + src_step];
  2300. dst0[9] = src0[1 + src_step];
  2301. dst0[10] = src0[2 + src_step];
  2302. dst0[11] = src0[3 + src_step];
  2303. dst0[12] = src1[0 + src_step];
  2304. dst0[13] = src1[1 + src_step];
  2305. dst0[14] = src1[2 + src_step];
  2306. dst0[15] = src1[3 + src_step];
  2307. dst0[16] = src0[0 + 2 * src_step];
  2308. dst0[17] = src0[1 + 2 * src_step];
  2309. dst0[18] = src0[2 + 2 * src_step];
  2310. dst0[19] = src0[3 + 2 * src_step];
  2311. dst0[20] = src1[0 + 2 * src_step];
  2312. dst0[21] = src1[1 + 2 * src_step];
  2313. dst0[22] = src1[2 + 2 * src_step];
  2314. dst0[23] = src1[3 + 2 * src_step];
  2315. dst0[24] = src0[0 + 3 * src_step];
  2316. dst0[25] = src0[1 + 3 * src_step];
  2317. dst0[26] = src0[2 + 3 * src_step];
  2318. dst0[27] = src0[3 + 3 * src_step];
  2319. dst0[28] = src1[0 + 3 * src_step];
  2320. dst0[29] = src1[1 + 3 * src_step];
  2321. dst0[30] = src1[2 + 3 * src_step];
  2322. dst0[31] = src1[3 + 3 * src_step];
  2323. src0 += 4;
  2324. src1 += 4;
  2325. dst0 += stride;
  2326. }
  2327. src0 += srcwgap + 7 * srcstride;
  2328. }
  2329. #endif // __ARM_NEON
  2330. for (; y < srch; y++)
  2331. {
  2332. unsigned char* dst0 = dst + y * 4;
  2333. int x = 0;
  2334. for (; x < srcw; x++)
  2335. {
  2336. dst0[0] = src0[0];
  2337. dst0[1] = src0[1];
  2338. dst0[2] = src0[2];
  2339. dst0[3] = src0[3];
  2340. src0 += 4;
  2341. dst0 += stride;
  2342. }
  2343. src0 += srcwgap;
  2344. }
  2345. }
  // kanna_rotate_6_c1: copy a 1-byte-per-pixel image from src into dst so that every
  // source row becomes a dst column.  Source pixel (x, y) is written to
  //     dst[x * stride + (w - 1 - y)]
  // which is a 90-degree clockwise rotation when w == srch.
  // NOTE(review): presumably callers always pass w == srch (dst width = src height) -- confirm.
  //
  //   src, srcw, srch : source buffer, pixels per source row, number of source rows
  //   srcstride       : bytes from the start of one source row to the next
  //   dst, w          : destination buffer and the dst row width used to locate the row end
  //   (unnamed h)     : not used by this function
  //   stride          : bytes from the start of one dst row to the next
  //
  // With __ARM_NEON, source rows are consumed 8 at a time using 8x8 byte transposes
  // (intrinsics on aarch64, inline assembly otherwise); leftover columns and leftover
  // rows are copied by the scalar loops.
  2346. static void kanna_rotate_6_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  2347. {
  2348. const int srcwgap = srcstride - srcw; // bytes between the last pixel of a source row and the start of the next row
  2349. // dst + w: one byte past the first w bytes of the first dst row (row end, assuming w is the dst width in pixels)
  2350. unsigned char* dstend = dst + w;
  2351. const unsigned char* src0 = src; // read cursor; walks the source in row-major order
  2352. int y = 0;
  2353. #if __ARM_NEON
  // Blocked path: source rows y .. y+7 per iteration.
  2354. for (; y + 7 < srch; y += 8)
  2355. {
  2356. const unsigned char* src1 = src0 + srcstride; // source row y+1; src0/src1 + k*src_step reach rows y+2k / y+2k+1
  2357. unsigned char* dst0 = dstend - y - 8; // first dst row, 8-byte slot that receives source rows y+7 .. y
  2358. unsigned char* dst1 = dstend - y - 8 + stride; // same slot in the next dst row
  2359. int src_step = 2 * srcstride; // two source rows
  2360. int dst_step = 2 * stride; // two dst rows
  2361. int nn = srcw >> 3; // number of full 8-column blocks
  2362. int remain = srcw - (nn << 3); // columns left for the scalar loop
  2363. #if __aarch64__
  2364. for (; nn > 0; nn--)
  2365. {
  // Load 8 consecutive bytes from each of the 8 source rows (_srcK = row y+K).
  2366. uint8x8_t _src0 = vld1_u8(src0);
  2367. uint8x8_t _src1 = vld1_u8(src1);
  2368. uint8x8_t _src2 = vld1_u8(src0 + src_step);
  2369. uint8x8_t _src3 = vld1_u8(src1 + src_step);
  2370. uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
  2371. uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
  2372. uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
  2373. uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
  // 8x8 byte transpose in three vtrn stages (8-, 16-, then 32-bit lanes).
  // The operands are passed higher-row-first at every stage, so each final
  // vector holds one source column with its bytes ordered from row y+7 down to row y.
  2374. uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
  2375. uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
  2376. uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
  2377. uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);
  2378. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
  2379. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
  2380. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
  2381. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
  2382. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
  2383. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
  2384. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
  2385. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
  // _dstN ends up holding source column (7 - N) of this block.
  2386. uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  2387. uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  2388. uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  2389. uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  2390. uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  2391. uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  2392. uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  2393. uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  // Stored in reverse _dst order so that dst rows 0..7 of the block receive source columns 0..7.
  2394. vst1_u8(dst0, _dst7);
  2395. vst1_u8(dst1, _dst6);
  2396. vst1_u8(dst0 + dst_step, _dst5);
  2397. vst1_u8(dst1 + dst_step, _dst4);
  2398. vst1_u8(dst0 + 2 * dst_step, _dst3);
  2399. vst1_u8(dst1 + 2 * dst_step, _dst2);
  2400. vst1_u8(dst0 + 3 * dst_step, _dst1);
  2401. vst1_u8(dst1 + 3 * dst_step, _dst0);
  2402. src0 += 8; // next 8 source columns
  2403. src1 += 8;
  2404. dst0 += 4 * dst_step; // 8 dst rows further down
  2405. dst1 += 4 * dst_step;
  2406. }
  2407. #else
  // ARMv7 path: hand-written assembly for the same per-block step.
  // The loop body runs before the counter is tested, hence the nn > 0 guard.
  2408. if (nn > 0)
  2409. {
  // Operands: %0 = nn, %1 = src0, %2 = src1, %3 = dst0, %4 = dst1,
  //           %10 = src_step, %11 = dst_step.
  // Every vld1 post-increments its pointer by src_step (4 loads per pointer);
  // the "sub ..., lsl #2" instructions undo those increments and "add #8"
  // moves to the next 8 columns.  Every vst1 post-increments by dst_step
  // (4 stores per pointer), leaving dst0/dst1 8 dst rows further down.
  2410. asm volatile(
  2411. "0: \n"
  2412. "pld [%1, #64] \n"
  2413. "vld1.u8 {d0}, [%1], %10 \n"
  2414. "pld [%2, #64] \n"
  2415. "vld1.u8 {d1}, [%2], %10 \n"
  2416. "pld [%1, #64] \n"
  2417. "vld1.u8 {d2}, [%1], %10 \n"
  2418. "vtrn.u8 d1, d0 \n" // _src01t_r
  2419. "pld [%2, #64] \n"
  2420. "vld1.u8 {d3}, [%2], %10 \n"
  2421. "pld [%1, #64] \n"
  2422. "vld1.u8 {d4}, [%1], %10 \n"
  2423. "vtrn.u8 d3, d2 \n" // _src23t_r
  2424. "pld [%2, #64] \n"
  2425. "vld1.u8 {d5}, [%2], %10 \n"
  2426. "pld [%1, #64] \n"
  2427. "vld1.u8 {d6}, [%1], %10 \n"
  2428. "vtrn.u8 d5, d4 \n" // _src45t_r
  2429. "pld [%2, #64] \n"
  2430. "vld1.u8 {d7}, [%2], %10 \n"
  2431. "vtrn.u8 d7, d6 \n" // _src67t_r
  2432. "sub %1, %1, %10, lsl #2 \n" // restore src0
  2433. "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
  2434. "sub %2, %2, %10, lsl #2 \n" // restore src1
  2435. "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
  2436. "add %1, #8 \n" // src0 += 8
  2437. "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
  2438. "add %2, #8 \n" // src1 += 8
  2439. "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
  2440. "vst1.u8 {d6}, [%4], %11 \n"
  2441. "vst1.u8 {d7}, [%3], %11 \n"
  2442. "subs %0, #1 \n" // nn--, sets the flags tested by bne below
  2443. "vst1.u8 {d4}, [%4], %11 \n"
  2444. "vst1.u8 {d5}, [%3], %11 \n"
  2445. "vst1.u8 {d2}, [%4], %11 \n"
  2446. "vst1.u8 {d3}, [%3], %11 \n"
  2447. "vst1.u8 {d0}, [%4], %11 \n"
  2448. "vst1.u8 {d1}, [%3], %11 \n"
  2449. "bne 0b \n" // repeat while nn != 0
  2450. : "=r"(nn), // %0
  2451. "=r"(src0), // %1
  2452. "=r"(src1), // %2
  2453. "=r"(dst0), // %3
  2454. "=r"(dst1) // %4
  2455. : "0"(nn),
  2456. "1"(src0),
  2457. "2"(src1),
  2458. "3"(dst0),
  2459. "4"(dst1),
  2460. "r"(src_step), // %10
  2461. "r"(dst_step) // %11
  2462. : "cc", "memory", "q0", "q1", "q2", "q3");
  2463. }
  2464. #endif // __aarch64__
  // Columns not covered by the 8-wide blocks: one source column (8 rows) per
  // iteration, written as 8 bytes of one dst row, source row y+7 first and row y last.
  2465. for (; remain > 0; remain--)
  2466. {
  2467. dst0[0] = src1[0 + 3 * src_step];
  2468. dst0[1] = src0[0 + 3 * src_step];
  2469. dst0[2] = src1[0 + 2 * src_step];
  2470. dst0[3] = src0[0 + 2 * src_step];
  2471. dst0[4] = src1[0 + src_step];
  2472. dst0[5] = src0[0 + src_step];
  2473. dst0[6] = src1[0];
  2474. dst0[7] = src0[0];
  2475. src0 += 1;
  2476. src1 += 1;
  2477. dst0 += stride;
  2478. }
  2479. src0 += srcwgap + 7 * srcstride; // src0 has moved srcw bytes along row y; jump to the start of row y+8
  2480. }
  2481. #endif // __ARM_NEON
  // Scalar path: rows left over from the blocked loop (all rows when NEON is disabled).
  2482. for (; y < srch; y++)
  2483. {
  2484. unsigned char* dst0 = dstend - y - 1; // byte (w - 1 - y) of the first dst row
  2485. int x = 0;
  2486. for (; x < srcw; x++)
  2487. {
  2488. *dst0 = *src0;
  2489. src0 += 1;
  2490. dst0 += stride; // next dst row, same byte offset
  2491. }
  2492. src0 += srcwgap; // skip row padding to reach the next source row
  2493. }
  2494. }
  // kanna_rotate_6_c2: copy a 2-byte-per-pixel image from src into dst so that every
  // source row becomes a dst column.  Source pixel (x, y) is written (both bytes, in
  // their original order) to
  //     dst + x * stride + (w - 1 - y) * 2
  // which is a 90-degree clockwise rotation when w == srch.
  // NOTE(review): presumably callers always pass w == srch (dst width = src height) -- confirm.
  //
  //   src, srcw, srch : source buffer, pixels per source row, number of source rows
  //   srcstride       : bytes from the start of one source row to the next
  //   dst, w          : destination buffer and the dst row width (in pixels) used to locate the row end
  //   (unnamed h)     : not used by this function
  //   stride          : bytes from the start of one dst row to the next
  //
  // With __ARM_NEON, source rows are consumed 8 at a time: vld2 splits the two bytes of
  // each pixel into separate 8-byte planes, each plane goes through an 8x8 byte
  // transpose, and vst2 interleaves the planes again.  Leftover columns and leftover
  // rows are copied by the scalar loops.
  2495. static void kanna_rotate_6_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  2496. {
  2497. const int srcwgap = srcstride - srcw * 2; // bytes between the last pixel of a source row and the start of the next row
  2498. // dst + w * 2: one byte past the first w two-byte pixels of the first dst row (row end, assuming w is the dst width in pixels)
  2499. unsigned char* dstend = dst + w * 2;
  2500. const unsigned char* src0 = src; // read cursor; walks the source in row-major order
  2501. int y = 0;
  2502. #if __ARM_NEON
  // Blocked path: source rows y .. y+7 per iteration.
  2503. for (; y + 7 < srch; y += 8)
  2504. {
  2505. const unsigned char* src1 = src0 + srcstride; // source row y+1; src0/src1 + k*src_step reach rows y+2k / y+2k+1
  2506. unsigned char* dst0 = dstend - y * 2 - 8 * 2; // first dst row, 8-pixel slot that receives source rows y+7 .. y
  2507. unsigned char* dst1 = dstend - y * 2 - 8 * 2 + stride; // same slot in the next dst row
  2508. int src_step = 2 * srcstride; // two source rows
  2509. int dst_step = 2 * stride; // two dst rows
  2510. int nn = srcw >> 3; // number of full 8-column blocks
  2511. int remain = srcw - (nn << 3); // columns left for the scalar loop
  2512. #if __aarch64__
  2513. for (; nn > 0; nn--)
  2514. {
  // Load 8 pixels from each of the 8 source rows; val[0] collects the first
  // byte of every pixel, val[1] the second (_srcK = row y+K).
  2515. uint8x8x2_t _src0 = vld2_u8(src0);
  2516. uint8x8x2_t _src1 = vld2_u8(src1);
  2517. uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
  2518. uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
  2519. uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
  2520. uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
  2521. uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
  2522. uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
  // 8x8 byte transpose per plane in three vtrn stages (8-, 16-, then 32-bit lanes).
  // The _r suffix marks the first-byte plane (val[0]) and _g the second-byte plane (val[1]).
  // The operands are passed higher-row-first at every stage, so each final
  // vector holds one source column with its bytes ordered from row y+7 down to row y.
  2523. uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
  2524. uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
  2525. uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
  2526. uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
  2527. uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
  2528. uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
  2529. uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
  2530. uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
  2531. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
  2532. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
  2533. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
  2534. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
  2535. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
  2536. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
  2537. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
  2538. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
  2539. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
  2540. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
  2541. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
  2542. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
  2543. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
  2544. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
  2545. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
  2546. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
  // Re-pair the two planes: _dstN ends up holding source column (7 - N) of this block.
  2547. uint8x8x2_t _dst0;
  2548. uint8x8x2_t _dst1;
  2549. uint8x8x2_t _dst2;
  2550. uint8x8x2_t _dst3;
  2551. uint8x8x2_t _dst4;
  2552. uint8x8x2_t _dst5;
  2553. uint8x8x2_t _dst6;
  2554. uint8x8x2_t _dst7;
  2555. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  2556. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  2557. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  2558. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  2559. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  2560. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  2561. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  2562. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  2563. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  2564. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  2565. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  2566. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  2567. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  2568. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  2569. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  2570. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  // vst2 interleaves val[0]/val[1] back into 2-byte pixels.  Stored in reverse
  // _dst order so that dst rows 0..7 of the block receive source columns 0..7.
  2571. vst2_u8(dst0, _dst7);
  2572. vst2_u8(dst1, _dst6);
  2573. vst2_u8(dst0 + dst_step, _dst5);
  2574. vst2_u8(dst1 + dst_step, _dst4);
  2575. vst2_u8(dst0 + 2 * dst_step, _dst3);
  2576. vst2_u8(dst1 + 2 * dst_step, _dst2);
  2577. vst2_u8(dst0 + 3 * dst_step, _dst1);
  2578. vst2_u8(dst1 + 3 * dst_step, _dst0);
  2579. src0 += 2 * 8; // next 8 source pixels (2 bytes each)
  2580. src1 += 2 * 8;
  2581. dst0 += 4 * dst_step; // 8 dst rows further down
  2582. dst1 += 4 * dst_step;
  2583. }
  2584. #else
  // ARMv7 path: hand-written assembly for the same per-block step.
  // The loop body runs before the counter is tested, hence the nn > 0 guard.
  2585. if (nn > 0)
  2586. {
  // Operands: %0 = nn, %1 = src0, %2 = src1, %3 = dst0, %4 = dst1,
  //           %10 = src_step, %11 = dst_step.
  // Each vld2 fills one q register (d pair) with the two byte planes of a row,
  // so every vtrn on q registers handles both planes at once.
  // Every vld2 post-increments its pointer by src_step (4 loads per pointer);
  // the "sub ..., lsl #2" instructions undo those increments and "add #16"
  // moves to the next 8 pixels.  Every vst2 post-increments by dst_step
  // (4 stores per pointer), leaving dst0/dst1 8 dst rows further down.
  2587. asm volatile(
  2588. "0: \n"
  2589. "pld [%1, #128] \n"
  2590. "vld2.u8 {d0-d1}, [%1], %10 \n"
  2591. "pld [%2, #128] \n"
  2592. "vld2.u8 {d2-d3}, [%2], %10 \n"
  2593. "pld [%1, #128] \n"
  2594. "vld2.u8 {d4-d5}, [%1], %10 \n"
  2595. "vtrn.u8 q1, q0 \n" // _src01t_r
  2596. "pld [%2, #128] \n"
  2597. "vld2.u8 {d6-d7}, [%2], %10 \n"
  2598. "pld [%1, #128] \n"
  2599. "vld2.u8 {d16-d17}, [%1], %10\n"
  2600. "vtrn.u8 q3, q2 \n" // _src23t_r
  2601. "pld [%2, #128] \n"
  2602. "vld2.u8 {d18-d19}, [%2], %10\n"
  2603. "pld [%1, #128] \n"
  2604. "vld2.u8 {d20-d21}, [%1], %10\n"
  2605. "vtrn.u8 q9, q8 \n" // _src45t_r
  2606. "pld [%2, #128] \n"
  2607. "vld2.u8 {d22-d23}, [%2], %10\n"
  2608. "vtrn.u8 q11, q10 \n" // _src67t_r
  2609. "sub %1, %1, %10, lsl #2 \n" // restore src0
  2610. "vtrn.u16 q2, q0 \n" // _src02tt_r
  2611. "sub %2, %2, %10, lsl #2 \n" // restore src1
  2612. "vtrn.u16 q3, q1 \n" // _src13tt_r
  2613. "add %1, #16 \n" // src0 += 16
  2614. "vtrn.u16 q10, q8 \n" // _src46tt_r
  2615. "add %2, #16 \n" // src1 += 16
  2616. "vtrn.u16 q11, q9 \n" // _src57tt_r
  2617. "vtrn.u32 q10, q2 \n" // _src26ttt_r
  2618. "vtrn.u32 q11, q3 \n" // _src37ttt_r
  2619. "vst2.u8 {d20-d21}, [%4], %11\n"
  2620. "vtrn.u32 q8, q0 \n" // _src04ttt_r
  2621. "vst2.u8 {d22-d23}, [%3], %11\n"
  2622. "vtrn.u32 q9, q1 \n" // _src15ttt_r
  2623. "vst2.u8 {d16-d17}, [%4], %11\n"
  2624. "subs %0, #1 \n" // nn--, sets the flags tested by bne below
  2625. "vst2.u8 {d18-d19}, [%3], %11\n"
  2626. "vst2.u8 {d4-d5}, [%4], %11 \n"
  2627. "vst2.u8 {d6-d7}, [%3], %11 \n"
  2628. "vst2.u8 {d0-d1}, [%4], %11 \n"
  2629. "vst2.u8 {d2-d3}, [%3], %11 \n"
  2630. "bne 0b \n" // repeat while nn != 0
  2631. : "=r"(nn), // %0
  2632. "=r"(src0), // %1
  2633. "=r"(src1), // %2
  2634. "=r"(dst0), // %3
  2635. "=r"(dst1) // %4
  2636. : "0"(nn),
  2637. "1"(src0),
  2638. "2"(src1),
  2639. "3"(dst0),
  2640. "4"(dst1),
  2641. "r"(src_step), // %10
  2642. "r"(dst_step) // %11
  2643. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  2644. }
  2645. #endif // __aarch64__
  // Columns not covered by the 8-wide blocks: one source column (8 rows) per
  // iteration, written as 8 two-byte pixels of one dst row, source row y+7 first
  // and row y last.
  2646. for (; remain > 0; remain--)
  2647. {
  2648. dst0[0] = src1[0 + 3 * src_step];
  2649. dst0[1] = src1[1 + 3 * src_step];
  2650. dst0[2] = src0[0 + 3 * src_step];
  2651. dst0[3] = src0[1 + 3 * src_step];
  2652. dst0[4] = src1[0 + 2 * src_step];
  2653. dst0[5] = src1[1 + 2 * src_step];
  2654. dst0[6] = src0[0 + 2 * src_step];
  2655. dst0[7] = src0[1 + 2 * src_step];
  2656. dst0[8] = src1[0 + src_step];
  2657. dst0[9] = src1[1 + src_step];
  2658. dst0[10] = src0[0 + src_step];
  2659. dst0[11] = src0[1 + src_step];
  2660. dst0[12] = src1[0];
  2661. dst0[13] = src1[1];
  2662. dst0[14] = src0[0];
  2663. dst0[15] = src0[1];
  2664. src0 += 2;
  2665. src1 += 2;
  2666. dst0 += stride;
  2667. }
  2668. src0 += srcwgap + 7 * srcstride; // src0 has moved srcw * 2 bytes along row y; jump to the start of row y+8
  2669. }
  2670. #endif // __ARM_NEON
  // Scalar path: rows left over from the blocked loop (all rows when NEON is disabled).
  2671. for (; y < srch; y++)
  2672. {
  2673. unsigned char* dst0 = dstend - y * 2 - 2; // pixel (w - 1 - y) of the first dst row
  2674. int x = 0;
  2675. for (; x < srcw; x++)
  2676. {
  2677. dst0[0] = src0[0];
  2678. dst0[1] = src0[1];
  2679. src0 += 2;
  2680. dst0 += stride; // next dst row, same pixel offset
  2681. }
  2682. src0 += srcwgap; // skip row padding to reach the next source row
  2683. }
  2684. }
  2685. static void kanna_rotate_6_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  2686. {
  2687. const int srcwgap = srcstride - srcw * 3;
  2688. // point to the last dst pixel in row
  2689. unsigned char* dstend = dst + w * 3;
  2690. const unsigned char* src0 = src;
  2691. int y = 0;
  2692. #if __ARM_NEON
  2693. for (; y + 7 < srch; y += 8)
  2694. {
  2695. const unsigned char* src1 = src0 + srcstride;
  2696. unsigned char* dst0 = dstend - y * 3 - 8 * 3;
  2697. unsigned char* dst1 = dstend - y * 3 - 8 * 3 + stride;
  2698. int src_step = 2 * srcstride;
  2699. int dst_step = 2 * stride;
  2700. int nn = srcw >> 3;
  2701. int remain = srcw - (nn << 3);
  2702. #if __aarch64__
  2703. for (; nn > 0; nn--)
  2704. {
  2705. uint8x8x3_t _src0 = vld3_u8(src0);
  2706. uint8x8x3_t _src1 = vld3_u8(src1);
  2707. uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
  2708. uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
  2709. uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
  2710. uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
  2711. uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
  2712. uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
  2713. uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
  2714. uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
  2715. uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
  2716. uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
  2717. uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
  2718. uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
  2719. uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
  2720. uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
  2721. uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
  2722. uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
  2723. uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
  2724. uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
  2725. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
  2726. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
  2727. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
  2728. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
  2729. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
  2730. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
  2731. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
  2732. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
  2733. uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
  2734. uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
  2735. uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
  2736. uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
  2737. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
  2738. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
  2739. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
  2740. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
  2741. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
  2742. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
  2743. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
  2744. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
  2745. uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
  2746. uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
  2747. uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
  2748. uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
  2749. uint8x8x3_t _dst0;
  2750. uint8x8x3_t _dst1;
  2751. uint8x8x3_t _dst2;
  2752. uint8x8x3_t _dst3;
  2753. uint8x8x3_t _dst4;
  2754. uint8x8x3_t _dst5;
  2755. uint8x8x3_t _dst6;
  2756. uint8x8x3_t _dst7;
  2757. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  2758. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  2759. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  2760. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  2761. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  2762. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  2763. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  2764. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  2765. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  2766. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  2767. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  2768. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  2769. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  2770. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  2771. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  2772. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  2773. _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
  2774. _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
  2775. _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
  2776. _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
  2777. _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
  2778. _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
  2779. _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
  2780. _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
  2781. vst3_u8(dst0, _dst7);
  2782. vst3_u8(dst1, _dst6);
  2783. vst3_u8(dst0 + dst_step, _dst5);
  2784. vst3_u8(dst1 + dst_step, _dst4);
  2785. vst3_u8(dst0 + 2 * dst_step, _dst3);
  2786. vst3_u8(dst1 + 2 * dst_step, _dst2);
  2787. vst3_u8(dst0 + 3 * dst_step, _dst1);
  2788. vst3_u8(dst1 + 3 * dst_step, _dst0);
  2789. src0 += 3 * 8;
  2790. src1 += 3 * 8;
  2791. dst0 += 4 * dst_step;
  2792. dst1 += 4 * dst_step;
  2793. }
  2794. #else
  2795. if (nn > 0)
  2796. {
  2797. asm volatile(
  2798. "0: \n"
  2799. "pld [%1, #192] \n"
  2800. "vld3.u8 {d0-d2}, [%1], %10 \n"
  2801. "pld [%2, #192] \n"
  2802. "vld3.u8 {d4-d6}, [%2], %10 \n"
  2803. "pld [%1, #192] \n"
  2804. "vld3.u8 {d8-d10}, [%1], %10 \n"
  2805. "vtrn.u8 q2, q0 \n" // _src01t_r
  2806. "vtrn.u8 d6, d2 \n"
  2807. "pld [%2, #192] \n"
  2808. "vld3.u8 {d12-d14}, [%2], %10\n"
  2809. "pld [%1, #192] \n"
  2810. "vld3.u8 {d16-d18}, [%1], %10\n"
  2811. "vtrn.u8 q6, q4 \n" // _src23t_r
  2812. "vtrn.u8 d14, d10 \n"
  2813. "pld [%2, #192] \n"
  2814. "vld3.u8 {d20-d22}, [%2], %10\n"
  2815. "pld [%1, #192] \n"
  2816. "vld3.u8 {d24-d26}, [%1], %10\n"
  2817. "vtrn.u8 q10, q8 \n" // _src45t_r
  2818. "vtrn.u8 d22, d18 \n"
  2819. "pld [%2, #192] \n"
  2820. "vld3.u8 {d28-d30}, [%2], %10\n"
  2821. "vtrn.u8 q14, q12 \n" // _src67t_r
  2822. "vtrn.u8 d30, d26 \n"
  2823. "sub %1, %1, %10, lsl #2 \n" // restore src0
  2824. "vtrn.u16 q4, q0 \n" // _src02tt_r
  2825. "vtrn.u16 d10, d2 \n"
  2826. "sub %2, %2, %10, lsl #2 \n" // restore src1
  2827. "vtrn.u16 q6, q2 \n" // _src13tt_r
  2828. "vtrn.u16 d14, d6 \n"
  2829. "add %1, #24 \n" // src0 += 24
  2830. "vtrn.u16 q12, q8 \n" // _src46tt_r
  2831. "vtrn.u16 d26, d18 \n"
  2832. "add %2, #24 \n" // src1 += 24
  2833. "vtrn.u16 q14, q10 \n" // _src57tt_r
  2834. "vtrn.u16 d30, d22 \n"
  2835. "vtrn.u32 q12, q4 \n" // _src26ttt_r
  2836. "vtrn.u32 d26, d10 \n"
  2837. "vtrn.u32 q14, q6 \n" // _src37ttt_r
  2838. "vst3.u8 {d24-d26}, [%4], %11\n"
  2839. "vtrn.u32 d30, d14 \n"
  2840. "vtrn.u32 q8, q0 \n" // _src04ttt_r
  2841. "vst3.u8 {d28-d30}, [%3], %11\n"
  2842. "vtrn.u32 d18, d2 \n"
  2843. "vtrn.u32 q10, q2 \n" // _src15ttt_r
  2844. "vst3.u8 {d16-d18}, [%4], %11\n"
  2845. "vtrn.u32 d22, d6 \n"
  2846. "subs %0, #1 \n"
  2847. "vst3.u8 {d20-d22}, [%3], %11\n"
  2848. "vst3.u8 {d8-d10}, [%4], %11 \n"
  2849. "vst3.u8 {d12-d14}, [%3], %11\n"
  2850. "vst3.u8 {d0-d2}, [%4], %11 \n"
  2851. "vst3.u8 {d4-d6}, [%3], %11 \n"
  2852. "bne 0b \n"
  2853. : "=r"(nn), // %0
  2854. "=r"(src0), // %1
  2855. "=r"(src1), // %2
  2856. "=r"(dst0), // %3
  2857. "=r"(dst1) // %4
  2858. : "0"(nn),
  2859. "1"(src0),
  2860. "2"(src1),
  2861. "3"(dst0),
  2862. "4"(dst1),
  2863. "r"(src_step), // %10
  2864. "r"(dst_step) // %11
  2865. : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
  2866. }
  2867. #endif // __aarch64__
  2868. for (; remain > 0; remain--)
  2869. {
  2870. dst0[0] = src1[0 + 3 * src_step];
  2871. dst0[1] = src1[1 + 3 * src_step];
  2872. dst0[2] = src1[2 + 3 * src_step];
  2873. dst0[3] = src0[0 + 3 * src_step];
  2874. dst0[4] = src0[1 + 3 * src_step];
  2875. dst0[5] = src0[2 + 3 * src_step];
  2876. dst0[6] = src1[0 + 2 * src_step];
  2877. dst0[7] = src1[1 + 2 * src_step];
  2878. dst0[8] = src1[2 + 2 * src_step];
  2879. dst0[9] = src0[0 + 2 * src_step];
  2880. dst0[10] = src0[1 + 2 * src_step];
  2881. dst0[11] = src0[2 + 2 * src_step];
  2882. dst0[12] = src1[0 + src_step];
  2883. dst0[13] = src1[1 + src_step];
  2884. dst0[14] = src1[2 + src_step];
  2885. dst0[15] = src0[0 + src_step];
  2886. dst0[16] = src0[1 + src_step];
  2887. dst0[17] = src0[2 + src_step];
  2888. dst0[18] = src1[0];
  2889. dst0[19] = src1[1];
  2890. dst0[20] = src1[2];
  2891. dst0[21] = src0[0];
  2892. dst0[22] = src0[1];
  2893. dst0[23] = src0[2];
  2894. src0 += 3;
  2895. src1 += 3;
  2896. dst0 += stride;
  2897. }
  2898. src0 += srcwgap + 7 * srcstride;
  2899. }
  2900. #endif // __ARM_NEON
  2901. for (; y < srch; y++)
  2902. {
  2903. unsigned char* dst0 = dstend - y * 3 - 3;
  2904. int x = 0;
  2905. for (; x < srcw; x++)
  2906. {
  2907. dst0[0] = src0[0];
  2908. dst0[1] = src0[1];
  2909. dst0[2] = src0[2];
  2910. src0 += 3;
  2911. dst0 += stride;
  2912. }
  2913. src0 += srcwgap;
  2914. }
  2915. }
  2916. static void kanna_rotate_6_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
  2917. {
  2918. const int srcwgap = srcstride - srcw * 4;
  2919. // point to the last dst pixel in row
  2920. unsigned char* dstend = dst + w * 4;
  2921. const unsigned char* src0 = src;
  2922. int y = 0;
  2923. #if __ARM_NEON
  2924. for (; y + 7 < srch; y += 8)
  2925. {
  2926. const unsigned char* src1 = src0 + srcstride;
  2927. unsigned char* dst0 = dstend - y * 4 - 8 * 4;
  2928. unsigned char* dst1 = dstend - y * 4 - 8 * 4 + stride;
  2929. int src_step = 2 * srcstride;
  2930. int dst_step = 2 * stride;
  2931. int nn = srcw >> 3;
  2932. int remain = srcw - (nn << 3);
  2933. #if __aarch64__
  2934. for (; nn > 0; nn--)
  2935. {
  2936. uint8x8x4_t _src0 = vld4_u8(src0);
  2937. uint8x8x4_t _src1 = vld4_u8(src1);
  2938. uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
  2939. uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
  2940. uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
  2941. uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
  2942. uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
  2943. uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
  2944. uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
  2945. uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
  2946. uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
  2947. uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
  2948. uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
  2949. uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
  2950. uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
  2951. uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
  2952. uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
  2953. uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
  2954. uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
  2955. uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
  2956. uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
  2957. uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
  2958. uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
  2959. uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);
  2960. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
  2961. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
  2962. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
  2963. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
  2964. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
  2965. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
  2966. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
  2967. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
  2968. uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
  2969. uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
  2970. uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
  2971. uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
  2972. uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
  2973. uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
  2974. uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
  2975. uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));
  2976. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
  2977. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
  2978. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
  2979. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
  2980. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
  2981. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
  2982. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
  2983. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
  2984. uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
  2985. uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
  2986. uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
  2987. uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
  2988. uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
  2989. uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
  2990. uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
  2991. uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));
  2992. uint8x8x4_t _dst0;
  2993. uint8x8x4_t _dst1;
  2994. uint8x8x4_t _dst2;
  2995. uint8x8x4_t _dst3;
  2996. uint8x8x4_t _dst4;
  2997. uint8x8x4_t _dst5;
  2998. uint8x8x4_t _dst6;
  2999. uint8x8x4_t _dst7;
  3000. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  3001. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  3002. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  3003. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  3004. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  3005. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  3006. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  3007. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  3008. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  3009. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  3010. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  3011. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  3012. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  3013. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  3014. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  3015. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  3016. _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
  3017. _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
  3018. _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
  3019. _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
  3020. _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
  3021. _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
  3022. _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
  3023. _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
  3024. _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
  3025. _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
  3026. _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
  3027. _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
  3028. _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
  3029. _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
  3030. _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
  3031. _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
  3032. vst4_u8(dst0, _dst7);
  3033. vst4_u8(dst1, _dst6);
  3034. vst4_u8(dst0 + dst_step, _dst5);
  3035. vst4_u8(dst1 + dst_step, _dst4);
  3036. vst4_u8(dst0 + 2 * dst_step, _dst3);
  3037. vst4_u8(dst1 + 2 * dst_step, _dst2);
  3038. vst4_u8(dst0 + 3 * dst_step, _dst1);
  3039. vst4_u8(dst1 + 3 * dst_step, _dst0);
  3040. src0 += 4 * 8;
  3041. src1 += 4 * 8;
  3042. dst0 += 4 * dst_step;
  3043. dst1 += 4 * dst_step;
  3044. }
  3045. #else
  3046. if (nn > 0)
  3047. {
  3048. asm volatile(
  3049. "0: \n"
  3050. "pld [%1, #256] \n"
  3051. "vld4.u8 {d0-d3}, [%1], %10 \n"
  3052. "pld [%2, #256] \n"
  3053. "vld4.u8 {d4-d7}, [%2], %10 \n"
  3054. "pld [%1, #256] \n"
  3055. "vld4.u8 {d8-d11}, [%1], %10 \n"
  3056. "vtrn.u8 q2, q0 \n" // _src01t_r
  3057. "vtrn.u8 q3, q1 \n"
  3058. "pld [%2, #256] \n"
  3059. "vld4.u8 {d12-d15}, [%2], %10\n"
  3060. "pld [%1, #256] \n"
  3061. "vld4.u8 {d16-d19}, [%1], %10\n"
  3062. "vtrn.u8 q6, q4 \n" // _src23t_r
  3063. "vtrn.u8 q7, q5 \n"
  3064. "pld [%2, #256] \n"
  3065. "vld4.u8 {d20-d23}, [%2], %10\n"
  3066. "pld [%1, #256] \n"
  3067. "vld4.u8 {d24-d27}, [%1], %10\n"
  3068. "vtrn.u8 q10, q8 \n" // _src45t_r
  3069. "vtrn.u8 q11, q9 \n"
  3070. "pld [%2, #256] \n"
  3071. "vld4.u8 {d28-d31}, [%2], %10\n"
  3072. "vtrn.u8 q14, q12 \n" // _src67t_r
  3073. "vtrn.u8 q15, q13 \n"
  3074. "sub %1, %1, %10, lsl #2 \n" // restore src0
  3075. "vtrn.u16 q4, q0 \n" // _src02tt_r
  3076. "vtrn.u16 q5, q1 \n"
  3077. "sub %2, %2, %10, lsl #2 \n" // restore src1
  3078. "vtrn.u16 q6, q2 \n" // _src13tt_r
  3079. "vtrn.u16 q7, q3 \n"
  3080. "add %1, #32 \n" // src0 += 32
  3081. "vtrn.u16 q12, q8 \n" // _src46tt_r
  3082. "vtrn.u16 q13, q9 \n"
  3083. "add %2, #32 \n" // src1 += 32
  3084. "vtrn.u16 q14, q10 \n" // _src57tt_r
  3085. "vtrn.u16 q15, q11 \n"
  3086. "vtrn.u32 q12, q4 \n" // _src26ttt_r
  3087. "vtrn.u32 q13, q5 \n"
  3088. "vtrn.u32 q14, q6 \n" // _src37ttt_r
  3089. "vst4.u8 {d24-d27}, [%4], %11\n"
  3090. "vtrn.u32 q15, q7 \n"
  3091. "vtrn.u32 q8, q0 \n" // _src04ttt_r
  3092. "vst4.u8 {d28-d31}, [%3], %11\n"
  3093. "vtrn.u32 q9, q1 \n"
  3094. "vtrn.u32 q10, q2 \n" // _src15ttt_r
  3095. "vst4.u8 {d16-d19}, [%4], %11\n"
  3096. "vtrn.u32 q11, q3 \n"
  3097. "subs %0, #1 \n"
  3098. "vst4.u8 {d8-d11}, [%4], %11 \n"
  3099. "vst4.u8 {d20-d23}, [%3], %11\n"
  3100. "vst4.u8 {d12-d15}, [%3], %11\n"
  3101. "vst4.u8 {d0-d3}, [%4], %11 \n"
  3102. "vst4.u8 {d4-d7}, [%3], %11 \n"
  3103. "bne 0b \n"
  3104. : "=r"(nn), // %0
  3105. "=r"(src0), // %1
  3106. "=r"(src1), // %2
  3107. "=r"(dst0), // %3
  3108. "=r"(dst1) // %4
  3109. : "0"(nn),
  3110. "1"(src0),
  3111. "2"(src1),
  3112. "3"(dst0),
  3113. "4"(dst1),
  3114. "r"(src_step), // %10
  3115. "r"(dst_step) // %11
  3116. : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
  3117. }
  3118. #endif // __aarch64__
  3119. for (; remain > 0; remain--)
  3120. {
  3121. dst0[0] = src1[0 + 3 * src_step];
  3122. dst0[1] = src1[1 + 3 * src_step];
  3123. dst0[2] = src1[2 + 3 * src_step];
  3124. dst0[3] = src1[3 + 3 * src_step];
  3125. dst0[4] = src0[0 + 3 * src_step];
  3126. dst0[5] = src0[1 + 3 * src_step];
  3127. dst0[6] = src0[2 + 3 * src_step];
  3128. dst0[7] = src0[3 + 3 * src_step];
  3129. dst0[8] = src1[0 + 2 * src_step];
  3130. dst0[9] = src1[1 + 2 * src_step];
  3131. dst0[10] = src1[2 + 2 * src_step];
  3132. dst0[11] = src1[3 + 2 * src_step];
  3133. dst0[12] = src0[0 + 2 * src_step];
  3134. dst0[13] = src0[1 + 2 * src_step];
  3135. dst0[14] = src0[2 + 2 * src_step];
  3136. dst0[15] = src0[3 + 2 * src_step];
  3137. dst0[16] = src1[0 + src_step];
  3138. dst0[17] = src1[1 + src_step];
  3139. dst0[18] = src1[2 + src_step];
  3140. dst0[19] = src1[3 + src_step];
  3141. dst0[20] = src0[0 + src_step];
  3142. dst0[21] = src0[1 + src_step];
  3143. dst0[22] = src0[2 + src_step];
  3144. dst0[23] = src0[3 + src_step];
  3145. dst0[24] = src1[0];
  3146. dst0[25] = src1[1];
  3147. dst0[26] = src1[2];
  3148. dst0[27] = src1[3];
  3149. dst0[28] = src0[0];
  3150. dst0[29] = src0[1];
  3151. dst0[30] = src0[2];
  3152. dst0[31] = src0[3];
  3153. src0 += 4;
  3154. src1 += 4;
  3155. dst0 += stride;
  3156. }
  3157. src0 += srcwgap + 7 * srcstride;
  3158. }
  3159. #endif // __ARM_NEON
  3160. for (; y < srch; y++)
  3161. {
  3162. unsigned char* dst0 = dstend - y * 4 - 4;
  3163. int x = 0;
  3164. for (; x < srcw; x++)
  3165. {
  3166. dst0[0] = src0[0];
  3167. dst0[1] = src0[1];
  3168. dst0[2] = src0[2];
  3169. dst0[3] = src0[3];
  3170. src0 += 4;
  3171. dst0 += stride;
  3172. }
  3173. src0 += srcwgap;
  3174. }
  3175. }
  3176. static void kanna_rotate_7_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  3177. {
  3178. const int srcwgap = srcstride - srcw;
  3179. // point to the last dst pixel
  3180. unsigned char* dstend = dst + stride * (h - 1) + w;
  3181. const unsigned char* src0 = src;
  3182. int y = 0;
  3183. #if __ARM_NEON
  3184. for (; y + 7 < srch; y += 8)
  3185. {
  3186. const unsigned char* src1 = src0 + srcstride;
  3187. unsigned char* dst6 = dstend - y - 8 - stride;
  3188. unsigned char* dst7 = dstend - y - 8;
  3189. int src_step = 2 * srcstride;
  3190. int dst_step = -2 * stride;
  3191. int nn = srcw >> 3;
  3192. int remain = srcw - (nn << 3);
  3193. #if __aarch64__
  3194. for (; nn > 0; nn--)
  3195. {
  3196. uint8x8_t _src0 = vld1_u8(src0);
  3197. uint8x8_t _src1 = vld1_u8(src1);
  3198. uint8x8_t _src2 = vld1_u8(src0 + src_step);
  3199. uint8x8_t _src3 = vld1_u8(src1 + src_step);
  3200. uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
  3201. uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
  3202. uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
  3203. uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
  3204. uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
  3205. uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
  3206. uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
  3207. uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);
  3208. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
  3209. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
  3210. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
  3211. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
  3212. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
  3213. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
  3214. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
  3215. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
  3216. uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  3217. uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  3218. uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  3219. uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  3220. uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  3221. uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  3222. uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  3223. uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  3224. vst1_u8(dst7, _dst7);
  3225. vst1_u8(dst6, _dst6);
  3226. vst1_u8(dst7 + dst_step, _dst5);
  3227. vst1_u8(dst6 + dst_step, _dst4);
  3228. vst1_u8(dst7 + 2 * dst_step, _dst3);
  3229. vst1_u8(dst6 + 2 * dst_step, _dst2);
  3230. vst1_u8(dst7 + 3 * dst_step, _dst1);
  3231. vst1_u8(dst6 + 3 * dst_step, _dst0);
  3232. src0 += 8;
  3233. src1 += 8;
  3234. dst7 += 4 * dst_step;
  3235. dst6 += 4 * dst_step;
  3236. }
  3237. #else
  3238. if (nn > 0)
  3239. {
  3240. asm volatile(
  3241. "0: \n"
  3242. "pld [%1, #64] \n"
  3243. "vld1.u8 {d0}, [%1], %10 \n"
  3244. "pld [%2, #64] \n"
  3245. "vld1.u8 {d1}, [%2], %10 \n"
  3246. "pld [%1, #64] \n"
  3247. "vld1.u8 {d2}, [%1], %10 \n"
  3248. "vtrn.u8 d1, d0 \n" // _src01t_r
  3249. "pld [%2, #64] \n"
  3250. "vld1.u8 {d3}, [%2], %10 \n"
  3251. "pld [%1, #64] \n"
  3252. "vld1.u8 {d4}, [%1], %10 \n"
  3253. "vtrn.u8 d3, d2 \n" // _src23t_r
  3254. "pld [%2, #64] \n"
  3255. "vld1.u8 {d5}, [%2], %10 \n"
  3256. "pld [%1, #64] \n"
  3257. "vld1.u8 {d6}, [%1], %10 \n"
  3258. "vtrn.u8 d5, d4 \n" // _src45t_r
  3259. "pld [%2, #64] \n"
  3260. "vld1.u8 {d7}, [%2], %10 \n"
  3261. "vtrn.u8 d7, d6 \n" // _src67t_r
  3262. "sub %1, %1, %10, lsl #2 \n" // restore src0
  3263. "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
  3264. "sub %2, %2, %10, lsl #2 \n" // restore src1
  3265. "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
  3266. "add %1, #8 \n" // src0 += 8
  3267. "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
  3268. "add %2, #8 \n" // src1 += 8
  3269. "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
  3270. "vst1.u8 {d6}, [%4], %11 \n"
  3271. "vst1.u8 {d7}, [%3], %11 \n"
  3272. "subs %0, #1 \n"
  3273. "vst1.u8 {d4}, [%4], %11 \n"
  3274. "vst1.u8 {d5}, [%3], %11 \n"
  3275. "vst1.u8 {d2}, [%4], %11 \n"
  3276. "vst1.u8 {d3}, [%3], %11 \n"
  3277. "vst1.u8 {d0}, [%4], %11 \n"
  3278. "vst1.u8 {d1}, [%3], %11 \n"
  3279. "bne 0b \n"
  3280. : "=r"(nn), // %0
  3281. "=r"(src0), // %1
  3282. "=r"(src1), // %2
  3283. "=r"(dst7), // %3
  3284. "=r"(dst6) // %4
  3285. : "0"(nn),
  3286. "1"(src0),
  3287. "2"(src1),
  3288. "3"(dst7),
  3289. "4"(dst6),
  3290. "r"(src_step), // %10
  3291. "r"(dst_step) // %11
  3292. : "cc", "memory", "q0", "q1", "q2", "q3");
  3293. }
  3294. #endif // __aarch64__
  3295. for (; remain > 0; remain--)
  3296. {
  3297. dst7[0] = src1[0 + 3 * src_step];
  3298. dst7[1] = src0[0 + 3 * src_step];
  3299. dst7[2] = src1[0 + 2 * src_step];
  3300. dst7[3] = src0[0 + 2 * src_step];
  3301. dst7[4] = src1[0 + src_step];
  3302. dst7[5] = src0[0 + src_step];
  3303. dst7[6] = src1[0];
  3304. dst7[7] = src0[0];
  3305. src0 += 1;
  3306. src1 += 1;
  3307. dst7 -= stride;
  3308. }
  3309. src0 += srcwgap + 7 * srcstride;
  3310. }
  3311. #endif // __ARM_NEON
  3312. for (; y < srch; y++)
  3313. {
  3314. unsigned char* dst0 = dstend - y - 1;
  3315. int x = 0;
  3316. for (; x < srcw; x++)
  3317. {
  3318. *dst0 = *src0;
  3319. src0 += 1;
  3320. dst0 -= stride;
  3321. }
  3322. src0 += srcwgap;
  3323. }
  3324. }
  3325. static void kanna_rotate_7_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  3326. {
  3327. const int srcwgap = srcstride - srcw * 2;
  3328. // point to the last dst pixel
  3329. unsigned char* dstend = dst + stride * (h - 1) + w * 2;
  3330. const unsigned char* src0 = src;
  3331. int y = 0;
  3332. #if __ARM_NEON
  3333. for (; y + 7 < srch; y += 8)
  3334. {
  3335. const unsigned char* src1 = src0 + srcstride;
  3336. unsigned char* dst6 = dstend - y * 2 - 8 * 2 - stride;
  3337. unsigned char* dst7 = dstend - y * 2 - 8 * 2;
  3338. int src_step = 2 * srcstride;
  3339. int dst_step = -2 * stride;
  3340. int nn = srcw >> 3;
  3341. int remain = srcw - (nn << 3);
  3342. #if __aarch64__
  3343. for (; nn > 0; nn--)
  3344. {
  3345. uint8x8x2_t _src0 = vld2_u8(src0);
  3346. uint8x8x2_t _src1 = vld2_u8(src1);
  3347. uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
  3348. uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
  3349. uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
  3350. uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
  3351. uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
  3352. uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
  3353. uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
  3354. uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
  3355. uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
  3356. uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
  3357. uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
  3358. uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
  3359. uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
  3360. uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
  3361. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
  3362. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
  3363. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
  3364. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
  3365. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
  3366. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
  3367. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
  3368. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
  3369. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
  3370. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
  3371. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
  3372. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
  3373. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
  3374. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
  3375. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
  3376. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
  3377. uint8x8x2_t _dst0;
  3378. uint8x8x2_t _dst1;
  3379. uint8x8x2_t _dst2;
  3380. uint8x8x2_t _dst3;
  3381. uint8x8x2_t _dst4;
  3382. uint8x8x2_t _dst5;
  3383. uint8x8x2_t _dst6;
  3384. uint8x8x2_t _dst7;
  3385. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  3386. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  3387. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  3388. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  3389. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  3390. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  3391. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  3392. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  3393. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  3394. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  3395. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  3396. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  3397. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  3398. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  3399. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  3400. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  3401. vst2_u8(dst7, _dst7);
  3402. vst2_u8(dst6, _dst6);
  3403. vst2_u8(dst7 + dst_step, _dst5);
  3404. vst2_u8(dst6 + dst_step, _dst4);
  3405. vst2_u8(dst7 + 2 * dst_step, _dst3);
  3406. vst2_u8(dst6 + 2 * dst_step, _dst2);
  3407. vst2_u8(dst7 + 3 * dst_step, _dst1);
  3408. vst2_u8(dst6 + 3 * dst_step, _dst0);
  3409. src0 += 2 * 8;
  3410. src1 += 2 * 8;
  3411. dst7 += 4 * dst_step;
  3412. dst6 += 4 * dst_step;
  3413. }
  3414. #else
  3415. if (nn > 0)
  3416. {
  3417. asm volatile(
  3418. "0: \n"
  3419. "pld [%1, #128] \n"
  3420. "vld2.u8 {d0-d1}, [%1], %10 \n"
  3421. "pld [%2, #128] \n"
  3422. "vld2.u8 {d2-d3}, [%2], %10 \n"
  3423. "pld [%1, #128] \n"
  3424. "vld2.u8 {d4-d5}, [%1], %10 \n"
  3425. "vtrn.u8 q1, q0 \n" // _src01t_r
  3426. "pld [%2, #128] \n"
  3427. "vld2.u8 {d6-d7}, [%2], %10 \n"
  3428. "pld [%1, #128] \n"
  3429. "vld2.u8 {d16-d17}, [%1], %10\n"
  3430. "vtrn.u8 q3, q2 \n" // _src23t_r
  3431. "pld [%2, #128] \n"
  3432. "vld2.u8 {d18-d19}, [%2], %10\n"
  3433. "pld [%1, #128] \n"
  3434. "vld2.u8 {d20-d21}, [%1], %10\n"
  3435. "vtrn.u8 q9, q8 \n" // _src45t_r
  3436. "pld [%2, #128] \n"
  3437. "vld2.u8 {d22-d23}, [%2], %10\n"
  3438. "vtrn.u8 q11, q10 \n" // _src67t_r
  3439. "sub %1, %1, %10, lsl #2 \n" // restore src0
  3440. "vtrn.u16 q2, q0 \n" // _src02tt_r
  3441. "sub %2, %2, %10, lsl #2 \n" // restore src1
  3442. "vtrn.u16 q3, q1 \n" // _src13tt_r
  3443. "add %1, #16 \n" // src0 += 16
  3444. "vtrn.u16 q10, q8 \n" // _src46tt_r
  3445. "add %2, #16 \n" // src1 += 16
  3446. "vtrn.u16 q11, q9 \n" // _src57tt_r
  3447. "vtrn.u32 q10, q2 \n" // _src26ttt_r
  3448. "vtrn.u32 q11, q3 \n" // _src37ttt_r
  3449. "vst2.u8 {d20-d21}, [%4], %11\n"
  3450. "vtrn.u32 q8, q0 \n" // _src04ttt_r
  3451. "vst2.u8 {d22-d23}, [%3], %11\n"
  3452. "vtrn.u32 q9, q1 \n" // _src15ttt_r
  3453. "vst2.u8 {d16-d17}, [%4], %11\n"
  3454. "subs %0, #1 \n"
  3455. "vst2.u8 {d4-d5}, [%4], %11 \n"
  3456. "vst2.u8 {d18-d19}, [%3], %11\n"
  3457. "vst2.u8 {d6-d7}, [%3], %11 \n"
  3458. "vst2.u8 {d0-d1}, [%4], %11 \n"
  3459. "vst2.u8 {d2-d3}, [%3], %11 \n"
  3460. "bne 0b \n"
  3461. : "=r"(nn), // %0
  3462. "=r"(src0), // %1
  3463. "=r"(src1), // %2
  3464. "=r"(dst7), // %3
  3465. "=r"(dst6) // %4
  3466. : "0"(nn),
  3467. "1"(src0),
  3468. "2"(src1),
  3469. "3"(dst7),
  3470. "4"(dst6),
  3471. "r"(src_step), // %10
  3472. "r"(dst_step) // %11
  3473. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  3474. }
  3475. #endif // __aarch64__
  3476. for (; remain > 0; remain--)
  3477. {
  3478. dst7[0] = src1[0 + 3 * src_step];
  3479. dst7[1] = src1[1 + 3 * src_step];
  3480. dst7[2] = src0[0 + 3 * src_step];
  3481. dst7[3] = src0[1 + 3 * src_step];
  3482. dst7[4] = src1[0 + 2 * src_step];
  3483. dst7[5] = src1[1 + 2 * src_step];
  3484. dst7[6] = src0[0 + 2 * src_step];
  3485. dst7[7] = src0[1 + 2 * src_step];
  3486. dst7[8] = src1[0 + src_step];
  3487. dst7[9] = src1[1 + src_step];
  3488. dst7[10] = src0[0 + src_step];
  3489. dst7[11] = src0[1 + src_step];
  3490. dst7[12] = src1[0];
  3491. dst7[13] = src1[1];
  3492. dst7[14] = src0[0];
  3493. dst7[15] = src0[1];
  3494. src0 += 2;
  3495. src1 += 2;
  3496. dst7 -= stride;
  3497. }
  3498. src0 += srcwgap + 7 * srcstride;
  3499. }
  3500. #endif // __ARM_NEON
  3501. for (; y < srch; y++)
  3502. {
  3503. unsigned char* dst0 = dstend - y * 2 - 2;
  3504. int x = 0;
  3505. for (; x < srcw; x++)
  3506. {
  3507. dst0[0] = src0[0];
  3508. dst0[1] = src0[1];
  3509. src0 += 2;
  3510. dst0 -= stride;
  3511. }
  3512. src0 += srcwgap;
  3513. }
  3514. }
  // Copies a 3-channel (3 bytes per pixel) image into dst flipped across its
  // anti-diagonal: source pixel (x, y) is written to destination row (h - 1 - x),
  // column (w - 1 - y).
  //
  //   src        first byte of the source image
  //   srcw, srch source width and height in pixels
  //   srcstride  distance in bytes between the starts of two source rows
  //   dst        first byte of the destination image
  //   w, h       destination width and height in pixels
  //   stride     distance in bytes between the starts of two destination rows
  //
  // NOTE(review): no size is validated here; presumably the caller guarantees
  // w == srch and h == srcw - confirm at the call site.
  static void kanna_rotate_7_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  {
      // bytes to skip from the end of one source row to the start of the next
      const int srcwgap = srcstride - srcw * 3;

      // one pixel past the last pixel of the last dst row
      unsigned char* dstend = dst + stride * (h - 1) + w * 3;

      const unsigned char* src0 = src;

      int y = 0;
  #if __ARM_NEON
      // Bands of 8 source rows: each source column of the band becomes 8 consecutive
      // pixels (24 bytes) of one destination row.
      for (; y + 7 < srch; y += 8)
      {
          const unsigned char* src1 = src0 + srcstride;

          // dst7 is the bottom-most destination row touched by the band, dst6 the row above it
          unsigned char* dst6 = dstend - y * 3 - 8 * 3 - stride;
          unsigned char* dst7 = dstend - y * 3 - 8 * 3;

          // src0/src1 and dst7/dst6 each cover every other row, hence the factor 2;
          // the destination is walked upwards, hence the negative step
          int src_step = 2 * srcstride;
          int dst_step = -2 * stride;

          int nn = srcw >> 3;            // number of full 8x8 pixel tiles in the band
          int remain = srcw - (nn << 3); // leftover source columns

  #if __aarch64__
          for (; nn > 0; nn--)
          {
              // de-interleaved load of 8 pixels from each of the 8 rows
              uint8x8x3_t _src0 = vld3_u8(src0);
              uint8x8x3_t _src1 = vld3_u8(src1);

              uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
              uint8x8x3_t _src3 = vld3_u8(src1 + src_step);

              uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
              uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);

              uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
              uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);

              // 8x8 byte transpose per channel in three vtrn stages (8-, 16-, 32-bit lanes).
              // Operands are passed as (odd row, even row) so the rows end up in
              // reverse order inside each result vector.
              uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
              uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
              uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
              uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);

              uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
              uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
              uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
              uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);

              uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
              uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
              uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
              uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);

              uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
              uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
              uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
              uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

              uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
              uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
              uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
              uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));

              uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
              uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
              uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
              uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));

              uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
              uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
              uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
              uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

              uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
              uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
              uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
              uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));

              uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
              uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
              uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
              uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));

              // _dstN ends up holding tile column (7 - N), rows in reverse order
              uint8x8x3_t _dst0;
              uint8x8x3_t _dst1;
              uint8x8x3_t _dst2;
              uint8x8x3_t _dst3;
              uint8x8x3_t _dst4;
              uint8x8x3_t _dst5;
              uint8x8x3_t _dst6;
              uint8x8x3_t _dst7;

              _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
              _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
              _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
              _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
              _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
              _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
              _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
              _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);

              _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
              _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
              _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
              _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
              _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
              _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
              _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
              _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);

              _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
              _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
              _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
              _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
              _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
              _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
              _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
              _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);

              // tile column 0 goes to the lowest destination row, column 7 to the highest
              vst3_u8(dst7, _dst7);
              vst3_u8(dst6, _dst6);
              vst3_u8(dst7 + dst_step, _dst5);
              vst3_u8(dst6 + dst_step, _dst4);
              vst3_u8(dst7 + 2 * dst_step, _dst3);
              vst3_u8(dst6 + 2 * dst_step, _dst2);
              vst3_u8(dst7 + 3 * dst_step, _dst1);
              vst3_u8(dst6 + 3 * dst_step, _dst0);

              src0 += 3 * 8;
              src1 += 3 * 8;

              dst7 += 4 * dst_step;
              dst6 += 4 * dst_step;
          }
  #else
          if (nn > 0)
          {
              // ARMv7 version of the tile loop above: loads are interleaved with the
              // transposes, and the source pointers are rewound after the 4 post-incremented loads.
              asm volatile(
                  "0: \n"
                  "pld [%1, #192] \n"
                  "vld3.u8 {d0-d2}, [%1], %10 \n"

                  "pld [%2, #192] \n"
                  "vld3.u8 {d4-d6}, [%2], %10 \n"

                  "pld [%1, #192] \n"
                  "vld3.u8 {d8-d10}, [%1], %10 \n"

                  "vtrn.u8 q2, q0 \n" // _src01t_r
                  "vtrn.u8 d6, d2 \n"

                  "pld [%2, #192] \n"
                  "vld3.u8 {d12-d14}, [%2], %10\n"

                  "pld [%1, #192] \n"
                  "vld3.u8 {d16-d18}, [%1], %10\n"

                  "vtrn.u8 q6, q4 \n" // _src23t_r
                  "vtrn.u8 d14, d10 \n"

                  "pld [%2, #192] \n"
                  "vld3.u8 {d20-d22}, [%2], %10\n"

                  "pld [%1, #192] \n"
                  "vld3.u8 {d24-d26}, [%1], %10\n"

                  "vtrn.u8 q10, q8 \n" // _src45t_r
                  "vtrn.u8 d22, d18 \n"

                  "pld [%2, #192] \n"
                  "vld3.u8 {d28-d30}, [%2], %10\n"

                  "vtrn.u8 q14, q12 \n" // _src67t_r
                  "vtrn.u8 d30, d26 \n"

                  "sub %1, %1, %10, lsl #2 \n" // restore src0

                  "vtrn.u16 q4, q0 \n" // _src02tt_r
                  "vtrn.u16 d10, d2 \n"

                  "sub %2, %2, %10, lsl #2 \n" // restore src1

                  "vtrn.u16 q6, q2 \n" // _src13tt_r
                  "vtrn.u16 d14, d6 \n"

                  "add %1, #24 \n" // src0 += 24

                  "vtrn.u16 q12, q8 \n" // _src46tt_r
                  "vtrn.u16 d26, d18 \n"

                  "add %2, #24 \n" // src1 += 24

                  "vtrn.u16 q14, q10 \n" // _src57tt_r
                  "vtrn.u16 d30, d22 \n"

                  "vtrn.u32 q12, q4 \n" // _src26ttt_r
                  "vtrn.u32 d26, d10 \n"

                  "vtrn.u32 q14, q6 \n" // _src37ttt_r
                  "vst3.u8 {d24-d26}, [%4], %11\n"
                  "vtrn.u32 d30, d14 \n"

                  "vtrn.u32 q8, q0 \n" // _src04ttt_r
                  "vst3.u8 {d28-d30}, [%3], %11\n"
                  "vtrn.u32 d18, d2 \n"

                  "vtrn.u32 q10, q2 \n" // _src15ttt_r
                  "vst3.u8 {d16-d18}, [%4], %11\n"
                  "vtrn.u32 d22, d6 \n"

                  "subs %0, #1 \n"

                  "vst3.u8 {d8-d10}, [%4], %11 \n"
                  "vst3.u8 {d20-d22}, [%3], %11\n"
                  "vst3.u8 {d12-d14}, [%3], %11\n"
                  "vst3.u8 {d0-d2}, [%4], %11 \n"
                  "vst3.u8 {d4-d6}, [%3], %11 \n"

                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(src1), // %2
                  "=r"(dst7), // %3
                  "=r"(dst6)  // %4
                  : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst7),
                  "4"(dst6),
                  "r"(src_step), // %10
                  "r"(dst_step)  // %11
                  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
          }
  #endif // __aarch64__
          // Leftover columns of the band: one source column per iteration,
          // written as 8 pixels in row order 7..0 into a single destination row.
          for (; remain > 0; remain--)
          {
              dst7[0] = src1[0 + 3 * src_step];
              dst7[1] = src1[1 + 3 * src_step];
              dst7[2] = src1[2 + 3 * src_step];
              dst7[3] = src0[0 + 3 * src_step];
              dst7[4] = src0[1 + 3 * src_step];
              dst7[5] = src0[2 + 3 * src_step];
              dst7[6] = src1[0 + 2 * src_step];
              dst7[7] = src1[1 + 2 * src_step];
              dst7[8] = src1[2 + 2 * src_step];
              dst7[9] = src0[0 + 2 * src_step];
              dst7[10] = src0[1 + 2 * src_step];
              dst7[11] = src0[2 + 2 * src_step];
              dst7[12] = src1[0 + src_step];
              dst7[13] = src1[1 + src_step];
              dst7[14] = src1[2 + src_step];
              dst7[15] = src0[0 + src_step];
              dst7[16] = src0[1 + src_step];
              dst7[17] = src0[2 + src_step];
              dst7[18] = src1[0];
              dst7[19] = src1[1];
              dst7[20] = src1[2];
              dst7[21] = src0[0];
              dst7[22] = src0[1];
              dst7[23] = src0[2];

              src0 += 3;
              src1 += 3;

              dst7 -= stride;
          }

          // src0 sits at the end of the band's first row; jump to the start of the next band
          src0 += srcwgap + 7 * srcstride;
      }
  #endif // __ARM_NEON
      // Remaining rows (all rows when NEON is unavailable): one pixel at a time,
      // each source row filling one destination column from the bottom up.
      for (; y < srch; y++)
      {
          unsigned char* dst0 = dstend - y * 3 - 3;

          int x = 0;
          for (; x < srcw; x++)
          {
              dst0[0] = src0[0];
              dst0[1] = src0[1];
              dst0[2] = src0[2];

              src0 += 3;
              dst0 -= stride;
          }

          src0 += srcwgap;
      }
  }
  // Copies a 4-channel (4 bytes per pixel) image into dst flipped across its
  // anti-diagonal: source pixel (x, y) is written to destination row (h - 1 - x),
  // column (w - 1 - y).
  //
  //   src        first byte of the source image
  //   srcw, srch source width and height in pixels
  //   srcstride  distance in bytes between the starts of two source rows
  //   dst        first byte of the destination image
  //   w, h       destination width and height in pixels
  //   stride     distance in bytes between the starts of two destination rows
  //
  // NOTE(review): no size is validated here; presumably the caller guarantees
  // w == srch and h == srcw - confirm at the call site.
  static void kanna_rotate_7_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
  {
      // bytes to skip from the end of one source row to the start of the next
      const int srcwgap = srcstride - srcw * 4;

      // one pixel past the last pixel of the last dst row
      unsigned char* dstend = dst + stride * (h - 1) + w * 4;

      const unsigned char* src0 = src;

      int y = 0;
  #if __ARM_NEON
      // Bands of 8 source rows: each source column of the band becomes 8 consecutive
      // pixels (32 bytes) of one destination row.
      for (; y + 7 < srch; y += 8)
      {
          const unsigned char* src1 = src0 + srcstride;

          // dst7 is the bottom-most destination row touched by the band, dst6 the row above it
          unsigned char* dst6 = dstend - y * 4 - 8 * 4 - stride;
          unsigned char* dst7 = dstend - y * 4 - 8 * 4;

          // src0/src1 and dst7/dst6 each cover every other row, hence the factor 2;
          // the destination is walked upwards, hence the negative step
          int src_step = 2 * srcstride;
          int dst_step = -2 * stride;

          int nn = srcw >> 3;            // number of full 8x8 pixel tiles in the band
          int remain = srcw - (nn << 3); // leftover source columns

  #if __aarch64__
          for (; nn > 0; nn--)
          {
              // de-interleaved load of 8 pixels from each of the 8 rows
              uint8x8x4_t _src0 = vld4_u8(src0);
              uint8x8x4_t _src1 = vld4_u8(src1);

              uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
              uint8x8x4_t _src3 = vld4_u8(src1 + src_step);

              uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
              uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);

              uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
              uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);

              // 8x8 byte transpose per channel in three vtrn stages (8-, 16-, 32-bit lanes).
              // Operands are passed as (odd row, even row) so the rows end up in
              // reverse order inside each result vector.
              uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
              uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
              uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
              uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);

              uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
              uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
              uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
              uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);

              uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
              uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
              uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
              uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);

              uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
              uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
              uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
              uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);

              uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
              uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
              uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
              uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));

              uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
              uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
              uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
              uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));

              uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
              uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
              uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
              uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));

              uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
              uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
              uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
              uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));

              uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
              uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
              uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
              uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));

              uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
              uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
              uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
              uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));

              uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
              uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
              uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
              uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));

              uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
              uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
              uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
              uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));

              // _dstN ends up holding tile column (7 - N), rows in reverse order
              uint8x8x4_t _dst0;
              uint8x8x4_t _dst1;
              uint8x8x4_t _dst2;
              uint8x8x4_t _dst3;
              uint8x8x4_t _dst4;
              uint8x8x4_t _dst5;
              uint8x8x4_t _dst6;
              uint8x8x4_t _dst7;

              _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
              _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
              _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
              _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
              _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
              _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
              _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
              _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);

              _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
              _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
              _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
              _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
              _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
              _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
              _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
              _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);

              _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
              _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
              _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
              _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
              _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
              _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
              _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
              _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);

              _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
              _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
              _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
              _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
              _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
              _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
              _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
              _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);

              // tile column 0 goes to the lowest destination row, column 7 to the highest
              vst4_u8(dst7, _dst7);
              vst4_u8(dst6, _dst6);
              vst4_u8(dst7 + dst_step, _dst5);
              vst4_u8(dst6 + dst_step, _dst4);
              vst4_u8(dst7 + 2 * dst_step, _dst3);
              vst4_u8(dst6 + 2 * dst_step, _dst2);
              vst4_u8(dst7 + 3 * dst_step, _dst1);
              vst4_u8(dst6 + 3 * dst_step, _dst0);

              src0 += 4 * 8;
              src1 += 4 * 8;

              dst7 += 4 * dst_step;
              dst6 += 4 * dst_step;
          }
  #else
          if (nn > 0)
          {
              // ARMv7 version of the tile loop above: loads are interleaved with the
              // transposes, and the source pointers are rewound after the 4 post-incremented loads.
              asm volatile(
                  "0: \n"
                  "pld [%1, #256] \n"
                  "vld4.u8 {d0-d3}, [%1], %10 \n"

                  "pld [%2, #256] \n"
                  "vld4.u8 {d4-d7}, [%2], %10 \n"

                  "pld [%1, #256] \n"
                  "vld4.u8 {d8-d11}, [%1], %10 \n"

                  "vtrn.u8 q2, q0 \n" // _src01t_r
                  "vtrn.u8 q3, q1 \n"

                  "pld [%2, #256] \n"
                  "vld4.u8 {d12-d15}, [%2], %10\n"

                  "pld [%1, #256] \n"
                  "vld4.u8 {d16-d19}, [%1], %10\n"

                  "vtrn.u8 q6, q4 \n" // _src23t_r
                  "vtrn.u8 q7, q5 \n"

                  "pld [%2, #256] \n"
                  "vld4.u8 {d20-d23}, [%2], %10\n"

                  "pld [%1, #256] \n"
                  "vld4.u8 {d24-d27}, [%1], %10\n"

                  "vtrn.u8 q10, q8 \n" // _src45t_r
                  "vtrn.u8 q11, q9 \n"

                  "pld [%2, #256] \n"
                  "vld4.u8 {d28-d31}, [%2], %10\n"

                  "vtrn.u8 q14, q12 \n" // _src67t_r
                  "vtrn.u8 q15, q13 \n"

                  "sub %1, %1, %10, lsl #2 \n" // restore src0

                  "vtrn.u16 q4, q0 \n" // _src02tt_r
                  "vtrn.u16 q5, q1 \n"

                  "sub %2, %2, %10, lsl #2 \n" // restore src1

                  "vtrn.u16 q6, q2 \n" // _src13tt_r
                  "vtrn.u16 q7, q3 \n"

                  "add %1, #32 \n" // src0 += 32

                  "vtrn.u16 q12, q8 \n" // _src46tt_r
                  "vtrn.u16 q13, q9 \n"

                  "add %2, #32 \n" // src1 += 32

                  "vtrn.u16 q14, q10 \n" // _src57tt_r
                  "vtrn.u16 q15, q11 \n"

                  "vtrn.u32 q12, q4 \n" // _src26ttt_r
                  "vtrn.u32 q13, q5 \n"

                  "vtrn.u32 q14, q6 \n" // _src37ttt_r
                  "vst4.u8 {d24-d27}, [%4], %11\n"
                  "vtrn.u32 q15, q7 \n"

                  "vtrn.u32 q8, q0 \n" // _src04ttt_r
                  "vst4.u8 {d28-d31}, [%3], %11\n"
                  "vtrn.u32 q9, q1 \n"

                  "vtrn.u32 q10, q2 \n" // _src15ttt_r
                  "vst4.u8 {d16-d19}, [%4], %11\n"
                  "vtrn.u32 q11, q3 \n"

                  "subs %0, #1 \n"

                  "vst4.u8 {d8-d11}, [%4], %11 \n"
                  "vst4.u8 {d20-d23}, [%3], %11\n"
                  "vst4.u8 {d12-d15}, [%3], %11\n"
                  "vst4.u8 {d0-d3}, [%4], %11 \n"
                  "vst4.u8 {d4-d7}, [%3], %11 \n"

                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(src1), // %2
                  "=r"(dst7), // %3
                  "=r"(dst6)  // %4
                  : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst7),
                  "4"(dst6),
                  "r"(src_step), // %10
                  "r"(dst_step)  // %11
                  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
          }
  #endif // __aarch64__
          // Leftover columns of the band: one source column per iteration,
          // written as 8 pixels in row order 7..0 into a single destination row.
          for (; remain > 0; remain--)
          {
              dst7[0] = src1[0 + 3 * src_step];
              dst7[1] = src1[1 + 3 * src_step];
              dst7[2] = src1[2 + 3 * src_step];
              dst7[3] = src1[3 + 3 * src_step];
              dst7[4] = src0[0 + 3 * src_step];
              dst7[5] = src0[1 + 3 * src_step];
              dst7[6] = src0[2 + 3 * src_step];
              dst7[7] = src0[3 + 3 * src_step];
              dst7[8] = src1[0 + 2 * src_step];
              dst7[9] = src1[1 + 2 * src_step];
              dst7[10] = src1[2 + 2 * src_step];
              dst7[11] = src1[3 + 2 * src_step];
              dst7[12] = src0[0 + 2 * src_step];
              dst7[13] = src0[1 + 2 * src_step];
              dst7[14] = src0[2 + 2 * src_step];
              dst7[15] = src0[3 + 2 * src_step];
              dst7[16] = src1[0 + src_step];
              dst7[17] = src1[1 + src_step];
              dst7[18] = src1[2 + src_step];
              dst7[19] = src1[3 + src_step];
              dst7[20] = src0[0 + src_step];
              dst7[21] = src0[1 + src_step];
              dst7[22] = src0[2 + src_step];
              dst7[23] = src0[3 + src_step];
              dst7[24] = src1[0];
              dst7[25] = src1[1];
              dst7[26] = src1[2];
              dst7[27] = src1[3];
              dst7[28] = src0[0];
              dst7[29] = src0[1];
              dst7[30] = src0[2];
              dst7[31] = src0[3];

              src0 += 4;
              src1 += 4;

              dst7 -= stride;
          }

          // src0 sits at the end of the band's first row; jump to the start of the next band
          src0 += srcwgap + 7 * srcstride;
      }
  #endif // __ARM_NEON
      // Remaining rows (all rows when NEON is unavailable): one pixel at a time,
      // each source row filling one destination column from the bottom up.
      for (; y < srch; y++)
      {
          unsigned char* dst0 = dstend - y * 4 - 4;

          int x = 0;
          for (; x < srcw; x++)
          {
              dst0[0] = src0[0];
              dst0[1] = src0[1];
              dst0[2] = src0[2];
              dst0[3] = src0[3];

              src0 += 4;
              dst0 -= stride;
          }

          src0 += srcwgap;
      }
  }
  // Copies a 1-channel (1 byte per pixel) image into dst so that source pixel
  // (x, y) is written to destination row (h - 1 - x), column y. Each source row
  // becomes a destination column read bottom-to-top, i.e. a quarter turn
  // counter-clockwise when h == srcw.
  //
  //   src        first byte of the source image
  //   srcw, srch source width and height in pixels
  //   srcstride  distance in bytes between the starts of two source rows
  //   dst        first byte of the destination image
  //   (w)        destination width; unused, kept for a uniform signature
  //   h          destination height in pixels
  //   stride     distance in bytes between the starts of two destination rows
  //
  // NOTE(review): no size is validated here; presumably the caller guarantees
  // h == srcw and a destination at least srch pixels wide - confirm at the call site.
  static void kanna_rotate_8_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
  {
      // bytes to skip from the end of one source row to the start of the next
      const int srcwgap = srcstride - srcw;

      // point to the last dst pixel row
      unsigned char* dstend = dst + stride * (h - 1);

      const unsigned char* src0 = src;

      int y = 0;
  #if __ARM_NEON
      // Bands of 8 source rows: each source column of the band becomes 8 consecutive
      // bytes of one destination row.
      for (; y + 7 < srch; y += 8)
      {
          const unsigned char* src1 = src0 + srcstride;

          // dst7 is the bottom-most destination row touched by the band, dst6 the row above it
          unsigned char* dst7 = dstend + y;
          unsigned char* dst6 = dstend + y - stride;

          // src0/src1 and dst7/dst6 each cover every other row, hence the factor 2;
          // the destination is walked upwards, hence the negative step
          int src_step = 2 * srcstride;
          int dst_step = -2 * stride;

          int nn = srcw >> 3;            // number of full 8x8 tiles in the band
          int remain = srcw - (nn << 3); // leftover source columns

  #if __aarch64__
          for (; nn > 0; nn--)
          {
              uint8x8_t _src0 = vld1_u8(src0);
              uint8x8_t _src1 = vld1_u8(src1);

              uint8x8_t _src2 = vld1_u8(src0 + src_step);
              uint8x8_t _src3 = vld1_u8(src1 + src_step);

              uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
              uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);

              uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
              uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);

              // 8x8 byte transpose in three vtrn stages (8-, 16-, 32-bit lanes)
              uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
              uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
              uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
              uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);

              uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
              uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
              uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
              uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

              uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
              uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
              uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
              uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

              // _dstN holds tile column N, rows in order 0..7
              uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
              uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
              uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
              uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
              uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
              uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
              uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
              uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);

              // tile column 0 goes to the lowest destination row, column 7 to the highest
              vst1_u8(dst7, _dst0);
              vst1_u8(dst6, _dst1);
              vst1_u8(dst7 + dst_step, _dst2);
              vst1_u8(dst6 + dst_step, _dst3);
              vst1_u8(dst7 + 2 * dst_step, _dst4);
              vst1_u8(dst6 + 2 * dst_step, _dst5);
              vst1_u8(dst7 + 3 * dst_step, _dst6);
              vst1_u8(dst6 + 3 * dst_step, _dst7);

              src0 += 8;
              src1 += 8;

              dst7 += 4 * dst_step;
              dst6 += 4 * dst_step;
          }
  #else
          if (nn > 0)
          {
              // ARMv7 version of the tile loop above: loads are interleaved with the
              // transposes, and the source pointers are rewound after the 4 post-incremented loads.
              asm volatile(
                  "0: \n"
                  "pld [%1, #64] \n"
                  "vld1.u8 {d0}, [%1], %10 \n"

                  "pld [%2, #64] \n"
                  "vld1.u8 {d1}, [%2], %10 \n"

                  "pld [%1, #64] \n"
                  "vld1.u8 {d2}, [%1], %10 \n"

                  "vtrn.u8 d0, d1 \n" // _src01t_r

                  "pld [%2, #64] \n"
                  "vld1.u8 {d3}, [%2], %10 \n"

                  "pld [%1, #64] \n"
                  "vld1.u8 {d4}, [%1], %10 \n"

                  "vtrn.u8 d2, d3 \n" // _src23t_r

                  "pld [%2, #64] \n"
                  "vld1.u8 {d5}, [%2], %10 \n"

                  "pld [%1, #64] \n"
                  "vld1.u8 {d6}, [%1], %10 \n"

                  "vtrn.u8 d4, d5 \n" // _src45t_r

                  "pld [%2, #64] \n"
                  "vld1.u8 {d7}, [%2], %10 \n"

                  "vtrn.u8 d6, d7 \n" // _src67t_r

                  "sub %1, %1, %10, lsl #2 \n" // restore src0

                  "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r

                  "sub %2, %2, %10, lsl #2 \n" // restore src1

                  "vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r

                  "add %1, #8 \n" // src0 += 8

                  "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r

                  "add %2, #8 \n" // src1 += 8

                  "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
                  "vst1.u8 {d0}, [%3], %11 \n"
                  "vst1.u8 {d1}, [%4], %11 \n"

                  "subs %0, #1 \n"

                  "vst1.u8 {d2}, [%3], %11 \n"
                  "vst1.u8 {d3}, [%4], %11 \n"
                  "vst1.u8 {d4}, [%3], %11 \n"
                  "vst1.u8 {d5}, [%4], %11 \n"
                  "vst1.u8 {d6}, [%3], %11 \n"
                  "vst1.u8 {d7}, [%4], %11 \n"

                  "bne 0b \n"
                  : "=r"(nn),   // %0
                  "=r"(src0), // %1
                  "=r"(src1), // %2
                  "=r"(dst7), // %3
                  "=r"(dst6)  // %4
                  : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst7),
                  "4"(dst6),
                  "r"(src_step), // %10
                  "r"(dst_step)  // %11
                  : "cc", "memory", "q0", "q1", "q2", "q3");
          }
  #endif // __aarch64__
          // Leftover columns of the band: one source column per iteration,
          // written as 8 bytes in row order 0..7 into a single destination row.
          for (; remain > 0; remain--)
          {
              dst7[0] = src0[0];
              dst7[1] = src1[0];
              dst7[2] = src0[0 + src_step];
              dst7[3] = src1[0 + src_step];
              dst7[4] = src0[0 + 2 * src_step];
              dst7[5] = src1[0 + 2 * src_step];
              dst7[6] = src0[0 + 3 * src_step];
              dst7[7] = src1[0 + 3 * src_step];

              src0 += 1;
              src1 += 1;

              dst7 -= stride;
          }

          // src0 sits at the end of the band's first row; jump to the start of the next band
          src0 += srcwgap + 7 * srcstride;
      }
  #endif // __ARM_NEON
      // Remaining rows (all rows when NEON is unavailable): one pixel at a time,
      // each source row filling one destination column from the bottom up.
      for (; y < srch; y++)
      {
          unsigned char* dst0 = dstend + y;

          int x = 0;
          for (; x < srcw; x++)
          {
              *dst0 = *src0;

              src0 += 1;
              dst0 -= stride;
          }

          src0 += srcwgap;
      }
  }
  4155. static void kanna_rotate_8_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
  4156. {
  4157. const int srcwgap = srcstride - srcw * 2;
  4158. // point to the last dst pixel row
  4159. unsigned char* dstend = dst + stride * (h - 1);
  4160. const unsigned char* src0 = src;
  4161. int y = 0;
  4162. #if __ARM_NEON
  4163. for (; y + 7 < srch; y += 8)
  4164. {
  4165. const unsigned char* src1 = src0 + srcstride;
  4166. unsigned char* dst7 = dstend + y * 2;
  4167. unsigned char* dst6 = dstend + y * 2 - stride;
  4168. int src_step = 2 * srcstride;
  4169. int dst_step = -2 * stride;
  4170. int nn = srcw >> 3;
  4171. int remain = srcw - (nn << 3);
  4172. #if __aarch64__
  4173. for (; nn > 0; nn--)
  4174. {
  4175. uint8x8x2_t _src0 = vld2_u8(src0);
  4176. uint8x8x2_t _src1 = vld2_u8(src1);
  4177. uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
  4178. uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
  4179. uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
  4180. uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
  4181. uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
  4182. uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
  4183. uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
  4184. uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
  4185. uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
  4186. uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
  4187. uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
  4188. uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
  4189. uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
  4190. uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
  4191. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
  4192. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
  4193. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
  4194. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
  4195. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
  4196. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
  4197. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
  4198. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
  4199. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
  4200. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
  4201. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
  4202. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
  4203. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
  4204. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
  4205. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
  4206. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
  4207. uint8x8x2_t _dst0;
  4208. uint8x8x2_t _dst1;
  4209. uint8x8x2_t _dst2;
  4210. uint8x8x2_t _dst3;
  4211. uint8x8x2_t _dst4;
  4212. uint8x8x2_t _dst5;
  4213. uint8x8x2_t _dst6;
  4214. uint8x8x2_t _dst7;
  4215. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  4216. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  4217. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  4218. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  4219. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  4220. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  4221. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  4222. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  4223. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  4224. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  4225. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  4226. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  4227. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  4228. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  4229. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  4230. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  4231. vst2_u8(dst7, _dst0);
  4232. vst2_u8(dst6, _dst1);
  4233. vst2_u8(dst7 + dst_step, _dst2);
  4234. vst2_u8(dst6 + dst_step, _dst3);
  4235. vst2_u8(dst7 + 2 * dst_step, _dst4);
  4236. vst2_u8(dst6 + 2 * dst_step, _dst5);
  4237. vst2_u8(dst7 + 3 * dst_step, _dst6);
  4238. vst2_u8(dst6 + 3 * dst_step, _dst7);
  4239. src0 += 2 * 8;
  4240. src1 += 2 * 8;
  4241. dst7 += 4 * dst_step;
  4242. dst6 += 4 * dst_step;
  4243. }
  4244. #else
  4245. if (nn > 0)
  4246. {
  4247. asm volatile(
  4248. "0: \n"
  4249. "pld [%1, #128] \n"
  4250. "vld2.u8 {d0-d1}, [%1], %10 \n"
  4251. "pld [%2, #128] \n"
  4252. "vld2.u8 {d2-d3}, [%2], %10 \n"
  4253. "pld [%1, #128] \n"
  4254. "vld2.u8 {d4-d5}, [%1], %10 \n"
  4255. "vtrn.u8 q0, q1 \n" // _src01t_r
  4256. "pld [%2, #128] \n"
  4257. "vld2.u8 {d6-d7}, [%2], %10 \n"
  4258. "pld [%1, #128] \n"
  4259. "vld2.u8 {d16-d17}, [%1], %10\n"
  4260. "vtrn.u8 q2, q3 \n" // _src23t_r
  4261. "pld [%2, #128] \n"
  4262. "vld2.u8 {d18-d19}, [%2], %10\n"
  4263. "pld [%1, #128] \n"
  4264. "vld2.u8 {d20-d21}, [%1], %10\n"
  4265. "vtrn.u8 q8, q9 \n" // _src45t_r
  4266. "pld [%2, #128] \n"
  4267. "vld2.u8 {d22-d23}, [%2], %10\n"
  4268. "vtrn.u8 q10, q11 \n" // _src67t_r
  4269. "sub %1, %1, %10, lsl #2 \n" // restore src0
  4270. "vtrn.u16 q0, q2 \n" // _src02tt_r
  4271. "sub %2, %2, %10, lsl #2 \n" // restore src1
  4272. "vtrn.u16 q1, q3 \n" // _src13tt_r
  4273. "add %1, #16 \n" // src0 += 16
  4274. "vtrn.u16 q8, q10 \n" // _src46tt_r
  4275. "add %2, #16 \n" // src1 += 16
  4276. "vtrn.u16 q9, q11 \n" // _src57tt_r
  4277. "vtrn.u32 q0, q8 \n" // _src04ttt_r
  4278. "vtrn.u32 q1, q9 \n" // _src15ttt_r
  4279. "vst2.u8 {d0-d1}, [%3], %11 \n"
  4280. "vtrn.u32 q2, q10 \n" // _src26ttt_r
  4281. "vst2.u8 {d2-d3}, [%4], %11 \n"
  4282. "vtrn.u32 q3, q11 \n" // _src37ttt_r
  4283. "vst2.u8 {d4-d5}, [%3], %11 \n"
  4284. "subs %0, #1 \n"
  4285. "vst2.u8 {d16-d17}, [%3], %11\n"
  4286. "vst2.u8 {d6-d7}, [%4], %11 \n"
  4287. "vst2.u8 {d18-d19}, [%4], %11\n"
  4288. "vst2.u8 {d20-d21}, [%3], %11\n"
  4289. "vst2.u8 {d22-d23}, [%4], %11\n"
  4290. "bne 0b \n"
  4291. : "=r"(nn), // %0
  4292. "=r"(src0), // %1
  4293. "=r"(src1), // %2
  4294. "=r"(dst7), // %3
  4295. "=r"(dst6) // %4
  4296. : "0"(nn),
  4297. "1"(src0),
  4298. "2"(src1),
  4299. "3"(dst7),
  4300. "4"(dst6),
  4301. "r"(src_step), // %10
  4302. "r"(dst_step) // %11
  4303. : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  4304. }
  4305. #endif // __aarch64__
  4306. for (; remain > 0; remain--)
  4307. {
  4308. dst7[0] = src0[0];
  4309. dst7[1] = src0[1];
  4310. dst7[2] = src1[0];
  4311. dst7[3] = src1[1];
  4312. dst7[4] = src0[0 + src_step];
  4313. dst7[5] = src0[1 + src_step];
  4314. dst7[6] = src1[0 + src_step];
  4315. dst7[7] = src1[1 + src_step];
  4316. dst7[8] = src0[0 + 2 * src_step];
  4317. dst7[9] = src0[1 + 2 * src_step];
  4318. dst7[10] = src1[0 + 2 * src_step];
  4319. dst7[11] = src1[1 + 2 * src_step];
  4320. dst7[12] = src0[0 + 3 * src_step];
  4321. dst7[13] = src0[1 + 3 * src_step];
  4322. dst7[14] = src1[0 + 3 * src_step];
  4323. dst7[15] = src1[1 + 3 * src_step];
  4324. src0 += 2;
  4325. src1 += 2;
  4326. dst7 -= stride;
  4327. }
  4328. src0 += srcwgap + 7 * srcstride;
  4329. }
  4330. #endif // __ARM_NEON
  4331. for (; y < srch; y++)
  4332. {
  4333. unsigned char* dst0 = dstend + y * 2;
  4334. int x = 0;
  4335. for (; x < srcw; x++)
  4336. {
  4337. dst0[0] = src0[0];
  4338. dst0[1] = src0[1];
  4339. src0 += 2;
  4340. dst0 -= stride;
  4341. }
  4342. src0 += srcwgap;
  4343. }
  4344. }
  // Rotation type 8 for 3-byte pixels: source pixel (x, y) is copied to destination
  // row (h - 1 - x), pixel column y, so each source row becomes a destination column
  // that is filled from the bottom row upwards.
  //   src, srcw, srch, srcstride: source buffer, size in pixels and row pitch in bytes
  //   dst, h, stride: destination buffer, height in rows and row pitch in bytes
  //   the destination width argument is unused
  // With NEON, 8 source rows are handled per pass using 8x8 byte transposes of each channel.
  4345. static void kanna_rotate_8_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
  4346. {
  // bytes between the end of a source row's pixel data and the start of the next row
  4347. const int srcwgap = srcstride - srcw * 3;
  4348. // point to the last dst pixel row
  4349. unsigned char* dstend = dst + stride * (h - 1);
  4350. const unsigned char* src0 = src;
  4351. int y = 0;
  4352. #if __ARM_NEON
  // process 8 source rows (y .. y+7) per pass; they land in destination pixel columns y .. y+7
  4353. for (; y + 7 < srch; y += 8)
  4354. {
  4355. const unsigned char* src1 = src0 + srcstride; // second row of each row pair
  4356. unsigned char* dst7 = dstend + y * 3; // bottom destination row, receives source column x
  4357. unsigned char* dst6 = dstend + y * 3 - stride; // one row above dst7, receives source column x + 1
  4358. int src_step = 2 * srcstride; // src0/src1 each advance two source rows at a time
  4359. int dst_step = -2 * stride; // dst7/dst6 each move two destination rows upwards
  4360. int nn = srcw >> 3; // number of 8-pixel groups in a row
  4361. int remain = srcw - (nn << 3); // leftover pixels handled by the scalar loop below
  4362. #if __aarch64__
  4363. for (; nn > 0; nn--)
  4364. {
  // load 8 pixels from each of the 8 rows, de-interleaved into one vector per channel
  4365. uint8x8x3_t _src0 = vld3_u8(src0);
  4366. uint8x8x3_t _src1 = vld3_u8(src1);
  4367. uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
  4368. uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
  4369. uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
  4370. uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
  4371. uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
  4372. uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
  // transpose stage 1: swap bytes between adjacent rows (suffix _r/_g/_b = channel 0/1/2)
  4373. uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
  4374. uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
  4375. uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
  4376. uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
  4377. uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
  4378. uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
  4379. uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
  4380. uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
  4381. uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
  4382. uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
  4383. uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
  4384. uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
  // transpose stage 2: swap 16-bit lanes between row pairs two apart
  4385. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
  4386. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
  4387. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
  4388. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
  4389. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
  4390. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
  4391. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
  4392. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
  4393. uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
  4394. uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
  4395. uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
  4396. uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
  // transpose stage 3: swap 32-bit lanes between the upper and lower four rows
  4397. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
  4398. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
  4399. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
  4400. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
  4401. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
  4402. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
  4403. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
  4404. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
  4405. uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
  4406. uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
  4407. uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
  4408. uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
  // _dstK gathers the three transposed channels for source column x + K
  4409. uint8x8x3_t _dst0;
  4410. uint8x8x3_t _dst1;
  4411. uint8x8x3_t _dst2;
  4412. uint8x8x3_t _dst3;
  4413. uint8x8x3_t _dst4;
  4414. uint8x8x3_t _dst5;
  4415. uint8x8x3_t _dst6;
  4416. uint8x8x3_t _dst7;
  4417. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  4418. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  4419. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  4420. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  4421. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  4422. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  4423. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  4424. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  4425. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  4426. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  4427. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  4428. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  4429. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  4430. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  4431. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  4432. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  4433. _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
  4434. _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
  4435. _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
  4436. _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
  4437. _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
  4438. _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
  4439. _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
  4440. _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
  // re-interleave and store: even columns go through dst7, odd columns through dst6,
  // each successive pair two destination rows further up
  4441. vst3_u8(dst7, _dst0);
  4442. vst3_u8(dst6, _dst1);
  4443. vst3_u8(dst7 + dst_step, _dst2);
  4444. vst3_u8(dst6 + dst_step, _dst3);
  4445. vst3_u8(dst7 + 2 * dst_step, _dst4);
  4446. vst3_u8(dst6 + 2 * dst_step, _dst5);
  4447. vst3_u8(dst7 + 3 * dst_step, _dst6);
  4448. vst3_u8(dst6 + 3 * dst_step, _dst7);
  // advance 8 pixels along the source rows and 8 rows up in the destination
  4449. src0 += 3 * 8;
  4450. src1 += 3 * 8;
  4451. dst7 += 4 * dst_step;
  4452. dst6 += 4 * dst_step;
  4453. }
  4454. #else
  // ARMv7 path: the same load / three-stage vtrn / store sequence written as inline assembly.
  // Every vld3/vst3 post-increments its pointer register by src_step (%10) or dst_step (%11).
  4455. if (nn > 0)
  4456. {
  4457. asm volatile(
  4458. "0: \n"
  4459. "pld [%1, #192] \n"
  4460. "vld3.u8 {d0-d2}, [%1], %10 \n"
  4461. "pld [%2, #192] \n"
  4462. "vld3.u8 {d4-d6}, [%2], %10 \n"
  4463. "pld [%1, #192] \n"
  4464. "vld3.u8 {d8-d10}, [%1], %10 \n"
  4465. "vtrn.u8 q0, q2 \n" // _src01t_r
  4466. "vtrn.u8 d2, d6 \n"
  4467. "pld [%2, #192] \n"
  4468. "vld3.u8 {d12-d14}, [%2], %10\n"
  4469. "pld [%1, #192] \n"
  4470. "vld3.u8 {d16-d18}, [%1], %10\n"
  4471. "vtrn.u8 q4, q6 \n" // _src23t_r
  4472. "vtrn.u8 d10, d14 \n"
  4473. "pld [%2, #192] \n"
  4474. "vld3.u8 {d20-d22}, [%2], %10\n"
  4475. "pld [%1, #192] \n"
  4476. "vld3.u8 {d24-d26}, [%1], %10\n"
  4477. "vtrn.u8 q8, q10 \n" // _src45t_r
  4478. "vtrn.u8 d18, d22 \n"
  4479. "pld [%2, #192] \n"
  4480. "vld3.u8 {d28-d30}, [%2], %10\n"
  4481. "vtrn.u8 q12, q14 \n" // _src67t_r
  4482. "vtrn.u8 d26, d30 \n"
  4483. "sub %1, %1, %10, lsl #2 \n" // restore src0
  4484. "vtrn.u16 q0, q4 \n" // _src02tt_r
  4485. "vtrn.u16 d2, d10 \n"
  4486. "sub %2, %2, %10, lsl #2 \n" // restore src1
  4487. "vtrn.u16 q2, q6 \n" // _src13tt_r
  4488. "vtrn.u16 d6, d14 \n"
  4489. "add %1, #24 \n" // src0 += 24
  4490. "vtrn.u16 q8, q12 \n" // _src46tt_r
  4491. "vtrn.u16 d18, d26 \n"
  4492. "add %2, #24 \n" // src1 += 24
  4493. "vtrn.u16 q10, q14 \n" // _src57tt_r
  4494. "vtrn.u16 d22, d30 \n"
  4495. "vtrn.u32 q0, q8 \n" // _src04ttt_r
  4496. "vtrn.u32 d2, d18 \n"
  4497. "vtrn.u32 q2, q10 \n" // _src15ttt_r
  4498. "vst3.u8 {d0-d2}, [%3], %11 \n"
  4499. "vtrn.u32 d6, d22 \n"
  4500. "vtrn.u32 q4, q12 \n" // _src26ttt_r
  4501. "vst3.u8 {d4-d6}, [%4], %11 \n"
  4502. "vtrn.u32 d10, d26 \n"
  4503. "vtrn.u32 q6, q14 \n" // _src37ttt_r
  4504. "vst3.u8 {d8-d10}, [%3], %11 \n"
  4505. "vtrn.u32 d14, d30 \n"
  4506. "subs %0, #1 \n"
  4507. "vst3.u8 {d16-d18}, [%3], %11\n"
  4508. "vst3.u8 {d12-d14}, [%4], %11\n"
  4509. "vst3.u8 {d20-d22}, [%4], %11\n"
  4510. "vst3.u8 {d24-d26}, [%3], %11\n"
  4511. "vst3.u8 {d28-d30}, [%4], %11\n"
  4512. "bne 0b \n"
  4513. : "=r"(nn), // %0
  4514. "=r"(src0), // %1
  4515. "=r"(src1), // %2
  4516. "=r"(dst7), // %3
  4517. "=r"(dst6) // %4
  4518. : "0"(nn),
  4519. "1"(src0),
  4520. "2"(src1),
  4521. "3"(dst7),
  4522. "4"(dst6),
  4523. "r"(src_step), // %10
  4524. "r"(dst_step) // %11
  4525. : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
  4526. }
  4527. #endif // __aarch64__
  // scalar tail: one source column of 8 rows becomes 8 consecutive pixels of one destination row
  4528. for (; remain > 0; remain--)
  4529. {
  4530. dst7[0] = src0[0];
  4531. dst7[1] = src0[1];
  4532. dst7[2] = src0[2];
  4533. dst7[3] = src1[0];
  4534. dst7[4] = src1[1];
  4535. dst7[5] = src1[2];
  4536. dst7[6] = src0[0 + src_step];
  4537. dst7[7] = src0[1 + src_step];
  4538. dst7[8] = src0[2 + src_step];
  4539. dst7[9] = src1[0 + src_step];
  4540. dst7[10] = src1[1 + src_step];
  4541. dst7[11] = src1[2 + src_step];
  4542. dst7[12] = src0[0 + 2 * src_step];
  4543. dst7[13] = src0[1 + 2 * src_step];
  4544. dst7[14] = src0[2 + 2 * src_step];
  4545. dst7[15] = src1[0 + 2 * src_step];
  4546. dst7[16] = src1[1 + 2 * src_step];
  4547. dst7[17] = src1[2 + 2 * src_step];
  4548. dst7[18] = src0[0 + 3 * src_step];
  4549. dst7[19] = src0[1 + 3 * src_step];
  4550. dst7[20] = src0[2 + 3 * src_step];
  4551. dst7[21] = src1[0 + 3 * src_step];
  4552. dst7[22] = src1[1 + 3 * src_step];
  4553. dst7[23] = src1[2 + 3 * src_step];
  4554. src0 += 3;
  4555. src1 += 3;
  4556. dst7 -= stride;
  4557. }
  // src0 now sits at the end of the first row's pixels: skip the row gap plus the 7 other rows of this pass
  4558. src0 += srcwgap + 7 * srcstride;
  4559. }
  4560. #endif // __ARM_NEON
  // remaining source rows (all rows when NEON is unavailable), one pixel at a time
  4561. for (; y < srch; y++)
  4562. {
  4563. unsigned char* dst0 = dstend + y * 3;
  4564. int x = 0;
  4565. for (; x < srcw; x++)
  4566. {
  4567. dst0[0] = src0[0];
  4568. dst0[1] = src0[1];
  4569. dst0[2] = src0[2];
  4570. src0 += 3;
  4571. dst0 -= stride; // next source pixel goes one destination row up
  4572. }
  4573. src0 += srcwgap;
  4574. }
  4575. }
  // Rotation type 8 for 4-byte pixels: source pixel (x, y) is copied to destination
  // row (h - 1 - x), pixel column y, so each source row becomes a destination column
  // that is filled from the bottom row upwards.
  //   src, srcw, srch, srcstride: source buffer, size in pixels and row pitch in bytes
  //   dst, h, stride: destination buffer, height in rows and row pitch in bytes
  //   the destination width argument is unused
  // With NEON, 8 source rows are handled per pass using 8x8 byte transposes of each channel.
  4576. static void kanna_rotate_8_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
  4577. {
  // bytes between the end of a source row's pixel data and the start of the next row
  4578. const int srcwgap = srcstride - srcw * 4;
  4579. // point to the last dst pixel row
  4580. unsigned char* dstend = dst + stride * (h - 1);
  4581. const unsigned char* src0 = src;
  4582. int y = 0;
  4583. #if __ARM_NEON
  // process 8 source rows (y .. y+7) per pass; they land in destination pixel columns y .. y+7
  4584. for (; y + 7 < srch; y += 8)
  4585. {
  4586. const unsigned char* src1 = src0 + srcstride; // second row of each row pair
  4587. unsigned char* dst7 = dstend + y * 4; // bottom destination row, receives source column x
  4588. unsigned char* dst6 = dstend + y * 4 - stride; // one row above dst7, receives source column x + 1
  4589. int src_step = 2 * srcstride; // src0/src1 each advance two source rows at a time
  4590. int dst_step = -2 * stride; // dst7/dst6 each move two destination rows upwards
  4591. int nn = srcw >> 3; // number of 8-pixel groups in a row
  4592. int remain = srcw - (nn << 3); // leftover pixels handled by the scalar loop below
  4593. #if __aarch64__
  4594. for (; nn > 0; nn--)
  4595. {
  // load 8 pixels from each of the 8 rows, de-interleaved into one vector per channel
  4596. uint8x8x4_t _src0 = vld4_u8(src0);
  4597. uint8x8x4_t _src1 = vld4_u8(src1);
  4598. uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
  4599. uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
  4600. uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
  4601. uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
  4602. uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
  4603. uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
  // transpose stage 1: swap bytes between adjacent rows (suffix _r/_g/_b/_a = channel 0/1/2/3)
  4604. uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
  4605. uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
  4606. uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
  4607. uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
  4608. uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
  4609. uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
  4610. uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
  4611. uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
  4612. uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
  4613. uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
  4614. uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
  4615. uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
  4616. uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
  4617. uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
  4618. uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
  4619. uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);
  // transpose stage 2: swap 16-bit lanes between row pairs two apart
  4620. uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
  4621. uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
  4622. uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
  4623. uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
  4624. uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
  4625. uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
  4626. uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
  4627. uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
  4628. uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
  4629. uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
  4630. uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
  4631. uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
  4632. uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
  4633. uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
  4634. uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
  4635. uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));
  // transpose stage 3: swap 32-bit lanes between the upper and lower four rows
  4636. uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
  4637. uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
  4638. uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
  4639. uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
  4640. uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
  4641. uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
  4642. uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
  4643. uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
  4644. uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
  4645. uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
  4646. uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
  4647. uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
  4648. uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
  4649. uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
  4650. uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
  4651. uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));
  // _dstK gathers the four transposed channels for source column x + K
  4652. uint8x8x4_t _dst0;
  4653. uint8x8x4_t _dst1;
  4654. uint8x8x4_t _dst2;
  4655. uint8x8x4_t _dst3;
  4656. uint8x8x4_t _dst4;
  4657. uint8x8x4_t _dst5;
  4658. uint8x8x4_t _dst6;
  4659. uint8x8x4_t _dst7;
  4660. _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
  4661. _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
  4662. _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
  4663. _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
  4664. _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
  4665. _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
  4666. _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
  4667. _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
  4668. _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
  4669. _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
  4670. _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
  4671. _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
  4672. _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
  4673. _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
  4674. _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
  4675. _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
  4676. _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
  4677. _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
  4678. _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
  4679. _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
  4680. _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
  4681. _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
  4682. _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
  4683. _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
  4684. _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
  4685. _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
  4686. _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
  4687. _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
  4688. _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
  4689. _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
  4690. _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
  4691. _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
  // re-interleave and store: even columns go through dst7, odd columns through dst6,
  // each successive pair two destination rows further up
  4692. vst4_u8(dst7, _dst0);
  4693. vst4_u8(dst6, _dst1);
  4694. vst4_u8(dst7 + dst_step, _dst2);
  4695. vst4_u8(dst6 + dst_step, _dst3);
  4696. vst4_u8(dst7 + 2 * dst_step, _dst4);
  4697. vst4_u8(dst6 + 2 * dst_step, _dst5);
  4698. vst4_u8(dst7 + 3 * dst_step, _dst6);
  4699. vst4_u8(dst6 + 3 * dst_step, _dst7);
  // advance 8 pixels along the source rows and 8 rows up in the destination
  4700. src0 += 4 * 8;
  4701. src1 += 4 * 8;
  4702. dst7 += 4 * dst_step;
  4703. dst6 += 4 * dst_step;
  4704. }
  4705. #else
  // ARMv7 path: the same load / three-stage vtrn / store sequence written as inline assembly.
  // Every vld4/vst4 post-increments its pointer register by src_step (%10) or dst_step (%11).
  4706. if (nn > 0)
  4707. {
  4708. asm volatile(
  4709. "0: \n"
  4710. "pld [%1, #256] \n"
  4711. "vld4.u8 {d0-d3}, [%1], %10 \n"
  4712. "pld [%2, #256] \n"
  4713. "vld4.u8 {d4-d7}, [%2], %10 \n"
  4714. "pld [%1, #256] \n"
  4715. "vld4.u8 {d8-d11}, [%1], %10 \n"
  4716. "vtrn.u8 q0, q2 \n" // _src01t_r
  4717. "vtrn.u8 q1, q3 \n"
  4718. "pld [%2, #256] \n"
  4719. "vld4.u8 {d12-d15}, [%2], %10\n"
  4720. "pld [%1, #256] \n"
  4721. "vld4.u8 {d16-d19}, [%1], %10\n"
  4722. "vtrn.u8 q4, q6 \n" // _src23t_r
  4723. "vtrn.u8 q5, q7 \n"
  4724. "pld [%2, #256] \n"
  4725. "vld4.u8 {d20-d23}, [%2], %10\n"
  4726. "pld [%1, #256] \n"
  4727. "vld4.u8 {d24-d27}, [%1], %10\n"
  4728. "vtrn.u8 q8, q10 \n" // _src45t_r
  4729. "vtrn.u8 q9, q11 \n"
  4730. "pld [%2, #256] \n"
  4731. "vld4.u8 {d28-d31}, [%2], %10\n"
  4732. "vtrn.u8 q12, q14 \n" // _src67t_r
  4733. "vtrn.u8 q13, q15 \n"
  4734. "sub %1, %1, %10, lsl #2 \n" // restore src0
  4735. "vtrn.u16 q0, q4 \n" // _src02tt_r
  4736. "vtrn.u16 q1, q5 \n"
  4737. "sub %2, %2, %10, lsl #2 \n" // restore src1
  4738. "vtrn.u16 q2, q6 \n" // _src13tt_r
  4739. "vtrn.u16 q3, q7 \n"
  4740. "add %1, #32 \n" // src0 += 32
  4741. "vtrn.u16 q8, q12 \n" // _src46tt_r
  4742. "vtrn.u16 q9, q13 \n"
  4743. "add %2, #32 \n" // src1 += 32
  4744. "vtrn.u16 q10, q14 \n" // _src57tt_r
  4745. "vtrn.u16 q11, q15 \n"
  4746. "vtrn.u32 q0, q8 \n" // _src04ttt_r
  4747. "vtrn.u32 q1, q9 \n"
  4748. "vtrn.u32 q2, q10 \n" // _src15ttt_r
  4749. "vst4.u8 {d0-d3}, [%3], %11 \n"
  4750. "vtrn.u32 q3, q11 \n"
  4751. "vtrn.u32 q4, q12 \n" // _src26ttt_r
  4752. "vst4.u8 {d4-d7}, [%4], %11 \n"
  4753. "vtrn.u32 q5, q13 \n"
  4754. "vtrn.u32 q6, q14 \n" // _src37ttt_r
  4755. "vst4.u8 {d8-d11}, [%3], %11 \n"
  4756. "vtrn.u32 q7, q15 \n"
  4757. "subs %0, #1 \n"
  4758. "vst4.u8 {d16-d19}, [%3], %11\n"
  4759. "vst4.u8 {d12-d15}, [%4], %11\n"
  4760. "vst4.u8 {d20-d23}, [%4], %11\n"
  4761. "vst4.u8 {d24-d27}, [%3], %11\n"
  4762. "vst4.u8 {d28-d31}, [%4], %11\n"
  4763. "bne 0b \n"
  4764. : "=r"(nn), // %0
  4765. "=r"(src0), // %1
  4766. "=r"(src1), // %2
  4767. "=r"(dst7), // %3
  4768. "=r"(dst6) // %4
  4769. : "0"(nn),
  4770. "1"(src0),
  4771. "2"(src1),
  4772. "3"(dst7),
  4773. "4"(dst6),
  4774. "r"(src_step), // %10
  4775. "r"(dst_step) // %11
  4776. : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
  4777. }
  4778. #endif // __aarch64__
  // scalar tail: one source column of 8 rows becomes 8 consecutive pixels of one destination row
  4779. for (; remain > 0; remain--)
  4780. {
  4781. dst7[0] = src0[0];
  4782. dst7[1] = src0[1];
  4783. dst7[2] = src0[2];
  4784. dst7[3] = src0[3];
  4785. dst7[4] = src1[0];
  4786. dst7[5] = src1[1];
  4787. dst7[6] = src1[2];
  4788. dst7[7] = src1[3];
  4789. dst7[8] = src0[0 + src_step];
  4790. dst7[9] = src0[1 + src_step];
  4791. dst7[10] = src0[2 + src_step];
  4792. dst7[11] = src0[3 + src_step];
  4793. dst7[12] = src1[0 + src_step];
  4794. dst7[13] = src1[1 + src_step];
  4795. dst7[14] = src1[2 + src_step];
  4796. dst7[15] = src1[3 + src_step];
  4797. dst7[16] = src0[0 + 2 * src_step];
  4798. dst7[17] = src0[1 + 2 * src_step];
  4799. dst7[18] = src0[2 + 2 * src_step];
  4800. dst7[19] = src0[3 + 2 * src_step];
  4801. dst7[20] = src1[0 + 2 * src_step];
  4802. dst7[21] = src1[1 + 2 * src_step];
  4803. dst7[22] = src1[2 + 2 * src_step];
  4804. dst7[23] = src1[3 + 2 * src_step];
  4805. dst7[24] = src0[0 + 3 * src_step];
  4806. dst7[25] = src0[1 + 3 * src_step];
  4807. dst7[26] = src0[2 + 3 * src_step];
  4808. dst7[27] = src0[3 + 3 * src_step];
  4809. dst7[28] = src1[0 + 3 * src_step];
  4810. dst7[29] = src1[1 + 3 * src_step];
  4811. dst7[30] = src1[2 + 3 * src_step];
  4812. dst7[31] = src1[3 + 3 * src_step];
  4813. src0 += 4;
  4814. src1 += 4;
  4815. dst7 -= stride;
  4816. }
  // src0 now sits at the end of the first row's pixels: skip the row gap plus the 7 other rows of this pass
  4817. src0 += srcwgap + 7 * srcstride;
  4818. }
  4819. #endif // __ARM_NEON
  // remaining source rows (all rows when NEON is unavailable), one pixel at a time
  4820. for (; y < srch; y++)
  4821. {
  4822. unsigned char* dst0 = dstend + y * 4;
  4823. int x = 0;
  4824. for (; x < srcw; x++)
  4825. {
  4826. dst0[0] = src0[0];
  4827. dst0[1] = src0[1];
  4828. dst0[2] = src0[2];
  4829. dst0[3] = src0[3];
  4830. src0 += 4;
  4831. dst0 -= stride; // next source pixel goes one destination row up
  4832. }
  4833. src0 += srcwgap;
  4834. }
  4835. }
  4836. void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
  4837. {
  4838. return kanna_rotate_c1(src, srcw, srch, srcw, dst, w, h, w, type);
  4839. }
  4840. void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
  4841. {
  4842. return kanna_rotate_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, type);
  4843. }
  4844. void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
  4845. {
  4846. return kanna_rotate_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, type);
  4847. }
  4848. void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
  4849. {
  4850. return kanna_rotate_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, type);
  4851. }
  4852. void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
  4853. {
  4854. // assert srcw == w && srch == h for type 1234
  4855. // assert srcw == h && srch == w for type 5678
  4856. switch (type)
  4857. {
  4858. case 1:
  4859. kanna_rotate_1_c1(src, srcw, srch, srcstride, dst, w, h, stride);
  4860. break;
  4861. case 2:
  4862. kanna_rotate_2_c1(src, srcw, srch, srcstride, dst, w, h, stride);
  4863. break;
  4864. case 3:
  4865. kanna_rotate_3_c1(src, srcw, srch, srcstride, dst, w, h, stride);
  4866. break;
  4867. case 4:
  4868. kanna_rotate_4_c1(src, srcw, srch, srcstride, dst, w, h, stride);
  4869. break;
  4870. case 5:
  4871. kanna_rotate_5_c1(src, srcw, srch, srcstride, dst, w, h, stride);
  4872. break;
  4873. case 6:
  4874. kanna_rotate_6_c1(src, srcw, srch, srcstride, dst, w, h, stride);
  4875. break;
  4876. case 7:
  4877. kanna_rotate_7_c1(src, srcw, srch, srcstride, dst, w, h, stride);
  4878. break;
  4879. case 8:
  4880. kanna_rotate_8_c1(src, srcw, srch, srcstride, dst, w, h, stride);
  4881. break;
  4882. default:
  4883. // unsupported rotate type
  4884. break;
  4885. }
  4886. }
  4887. void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
  4888. {
  4889. // assert srcw == w && srch == h for type 1234
  4890. // assert srcw == h && srch == w for type 5678
  4891. switch (type)
  4892. {
  4893. case 1:
  4894. kanna_rotate_1_c2(src, srcw, srch, srcstride, dst, w, h, stride);
  4895. break;
  4896. case 2:
  4897. kanna_rotate_2_c2(src, srcw, srch, srcstride, dst, w, h, stride);
  4898. break;
  4899. case 3:
  4900. kanna_rotate_3_c2(src, srcw, srch, srcstride, dst, w, h, stride);
  4901. break;
  4902. case 4:
  4903. kanna_rotate_4_c2(src, srcw, srch, srcstride, dst, w, h, stride);
  4904. break;
  4905. case 5:
  4906. kanna_rotate_5_c2(src, srcw, srch, srcstride, dst, w, h, stride);
  4907. break;
  4908. case 6:
  4909. kanna_rotate_6_c2(src, srcw, srch, srcstride, dst, w, h, stride);
  4910. break;
  4911. case 7:
  4912. kanna_rotate_7_c2(src, srcw, srch, srcstride, dst, w, h, stride);
  4913. break;
  4914. case 8:
  4915. kanna_rotate_8_c2(src, srcw, srch, srcstride, dst, w, h, stride);
  4916. break;
  4917. default:
  4918. // unsupported rotate type
  4919. break;
  4920. }
  4921. }
  4922. void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
  4923. {
  4924. // assert srcw == w && srch == h for type 1234
  4925. // assert srcw == h && srch == w for type 5678
  4926. switch (type)
  4927. {
  4928. case 1:
  4929. kanna_rotate_1_c3(src, srcw, srch, srcstride, dst, w, h, stride);
  4930. break;
  4931. case 2:
  4932. kanna_rotate_2_c3(src, srcw, srch, srcstride, dst, w, h, stride);
  4933. break;
  4934. case 3:
  4935. kanna_rotate_3_c3(src, srcw, srch, srcstride, dst, w, h, stride);
  4936. break;
  4937. case 4:
  4938. kanna_rotate_4_c3(src, srcw, srch, srcstride, dst, w, h, stride);
  4939. break;
  4940. case 5:
  4941. kanna_rotate_5_c3(src, srcw, srch, srcstride, dst, w, h, stride);
  4942. break;
  4943. case 6:
  4944. kanna_rotate_6_c3(src, srcw, srch, srcstride, dst, w, h, stride);
  4945. break;
  4946. case 7:
  4947. kanna_rotate_7_c3(src, srcw, srch, srcstride, dst, w, h, stride);
  4948. break;
  4949. case 8:
  4950. kanna_rotate_8_c3(src, srcw, srch, srcstride, dst, w, h, stride);
  4951. break;
  4952. default:
  4953. // unsupported rotate type
  4954. break;
  4955. }
  4956. }
  4957. void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
  4958. {
  4959. // assert srcw == w && srch == h for type 1234
  4960. // assert srcw == h && srch == w for type 5678
  4961. switch (type)
  4962. {
  4963. case 1:
  4964. kanna_rotate_1_c4(src, srcw, srch, srcstride, dst, w, h, stride);
  4965. break;
  4966. case 2:
  4967. kanna_rotate_2_c4(src, srcw, srch, srcstride, dst, w, h, stride);
  4968. break;
  4969. case 3:
  4970. kanna_rotate_3_c4(src, srcw, srch, srcstride, dst, w, h, stride);
  4971. break;
  4972. case 4:
  4973. kanna_rotate_4_c4(src, srcw, srch, srcstride, dst, w, h, stride);
  4974. break;
  4975. case 5:
  4976. kanna_rotate_5_c4(src, srcw, srch, srcstride, dst, w, h, stride);
  4977. break;
  4978. case 6:
  4979. kanna_rotate_6_c4(src, srcw, srch, srcstride, dst, w, h, stride);
  4980. break;
  4981. case 7:
  4982. kanna_rotate_7_c4(src, srcw, srch, srcstride, dst, w, h, stride);
  4983. break;
  4984. case 8:
  4985. kanna_rotate_8_c4(src, srcw, srch, srcstride, dst, w, h, stride);
  4986. break;
  4987. default:
  4988. // unsupported rotate type
  4989. break;
  4990. }
  4991. }
  4992. void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
  4993. {
  4994. // assert srcw % 2 == 0
  4995. // assert srch % 2 == 0
  4996. // assert w % 2 == 0
  4997. // assert h % 2 == 0
  4998. const unsigned char* srcY = src;
  4999. unsigned char* dstY = dst;
  5000. kanna_rotate_c1(srcY, srcw, srch, dstY, w, h, type);
  5001. const unsigned char* srcUV = src + srcw * srch;
  5002. unsigned char* dstUV = dst + w * h;
  5003. kanna_rotate_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, type);
  5004. }
  5005. #endif // NCNN_PIXEL_ROTATE
  5006. } // namespace ncnn